Compare commits

..

1 Commits

Author SHA1 Message Date
Aryan 98771d3611 update 2025-02-23 13:21:01 +01:00
83 changed files with 1163 additions and 3094 deletions
+6 -6
View File
@@ -53,9 +53,9 @@ jobs:
HEADREF: ${{ steps.pr_info.outputs.headRef }}
PRNUMBER: ${{ steps.pr_info.outputs.prNumber }}
run: |
echo "PR number: $PRNUMBER"
echo "Head Ref: $HEADREF"
echo "Head Repo Full Name: $HEADREPOFULLNAME"
echo "PR number: ${{ env.PRNUMBER }}"
echo "Head Ref: ${{ env.HEADREF }}"
echo "Head Repo Full Name: ${{ env.HEADREPOFULLNAME }}"
- name: Set up Python
uses: actions/setup-python@v4
@@ -89,20 +89,20 @@ jobs:
PRNUMBER: ${{ steps.pr_info.outputs.prNumber }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
echo "HEADREPOFULLNAME: $HEADREPOFULLNAME, HEADREF: $HEADREF"
echo "HEADREPOFULLNAME: ${{ env.HEADREPOFULLNAME }}, HEADREF: ${{ env.HEADREF }}"
# Configure git with the Actions bot user
git config user.name "github-actions[bot]"
git config user.email "github-actions[bot]@users.noreply.github.com"
# Make sure your 'origin' remote is set to the contributor's fork
git remote set-url origin "https://x-access-token:${GITHUB_TOKEN}@github.com/$HEADREPOFULLNAME.git"
git remote set-url origin "https://x-access-token:${GITHUB_TOKEN}@github.com/${{ env.HEADREPOFULLNAME }}.git"
# If there are changes after running style/quality, commit them
if [ -n "$(git status --porcelain)" ]; then
git add .
git commit -m "Apply style fixes"
# Push to the original contributor's forked branch
git push origin HEAD:$HEADREF
git push origin HEAD:${{ env.HEADREF }}
echo "changes_pushed=true" >> $GITHUB_OUTPUT
else
echo "No changes to commit."
-243
View File
@@ -1,243 +0,0 @@
name: Fast GPU Tests on PR
on:
pull_request:
branches: main
paths:
- "src/diffusers/models/modeling_utils.py"
- "src/diffusers/models/model_loading_utils.py"
- "src/diffusers/pipelines/pipeline_utils.py"
- "src/diffusers/pipeline_loading_utils.py"
- "src/diffusers/loaders/lora_base.py"
- "src/diffusers/loaders/lora_pipeline.py"
- "src/diffusers/loaders/peft.py"
- "tests/pipelines/test_pipelines_common.py"
- "tests/models/test_modeling_common.py"
workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
env:
DIFFUSERS_IS_CI: yes
OMP_NUM_THREADS: 8
MKL_NUM_THREADS: 8
HF_HUB_ENABLE_HF_TRANSFER: 1
PYTEST_TIMEOUT: 600
PIPELINE_USAGE_CUTOFF: 1000000000 # set high cutoff so that only always-test pipelines run
jobs:
setup_torch_cuda_pipeline_matrix:
name: Setup Torch Pipelines CUDA Slow Tests Matrix
runs-on:
group: aws-general-8-plus
container:
image: diffusers/diffusers-pytorch-cpu
outputs:
pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
steps:
- name: Checkout diffusers
uses: actions/checkout@v3
with:
fetch-depth: 2
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
- name: Environment
run: |
python utils/print_env.py
- name: Fetch Pipeline Matrix
id: fetch_pipeline_matrix
run: |
matrix=$(python utils/fetch_torch_cuda_pipeline_test_matrix.py)
echo $matrix
echo "pipeline_test_matrix=$matrix" >> $GITHUB_OUTPUT
- name: Pipeline Tests Artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: test-pipelines.json
path: reports
torch_pipelines_cuda_tests:
name: Torch Pipelines CUDA Tests
needs: setup_torch_cuda_pipeline_matrix
strategy:
fail-fast: false
max-parallel: 8
matrix:
module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
runs-on:
group: aws-g4dn-2xlarge
container:
image: diffusers/diffusers-pytorch-cuda
options: --shm-size "16gb" --ipc host --gpus 0
steps:
- name: Checkout diffusers
uses: actions/checkout@v3
with:
fetch-depth: 2
- name: NVIDIA-SMI
run: |
nvidia-smi
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps
- name: Environment
run: |
python utils/print_env.py
- name: Extract tests
id: extract_tests
run: |
pattern=$(python utils/extract_tests_from_mixin.py --type pipeline)
echo "$pattern" > /tmp/test_pattern.txt
echo "pattern_file=/tmp/test_pattern.txt" >> $GITHUB_OUTPUT
- name: PyTorch CUDA checkpoint tests on Ubuntu
env:
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
pattern=$(cat ${{ steps.extract_tests.outputs.pattern_file }})
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx and $pattern" \
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
tests/pipelines/${{ matrix.module }}
- name: Failure short reports
if: ${{ failure() }}
run: |
cat reports/tests_pipeline_${{ matrix.module }}_cuda_stats.txt
cat reports/tests_pipeline_${{ matrix.module }}_cuda_failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: pipeline_${{ matrix.module }}_test_reports
path: reports
torch_cuda_tests:
name: Torch CUDA Tests
runs-on:
group: aws-g4dn-2xlarge
container:
image: diffusers/diffusers-pytorch-cuda
options: --shm-size "16gb" --ipc host --gpus 0
defaults:
run:
shell: bash
strategy:
fail-fast: false
max-parallel: 2
matrix:
module: [models, schedulers, lora, others]
steps:
- name: Checkout diffusers
uses: actions/checkout@v3
with:
fetch-depth: 2
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install peft@git+https://github.com/huggingface/peft.git
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps
- name: Environment
run: |
python utils/print_env.py
- name: Extract tests
id: extract_tests
run: |
pattern=$(python utils/extract_tests_from_mixin.py --type ${{ matrix.module }})
echo "$pattern" > /tmp/test_pattern.txt
echo "pattern_file=/tmp/test_pattern.txt" >> $GITHUB_OUTPUT
- name: Run PyTorch CUDA tests
env:
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
pattern=$(cat ${{ steps.extract_tests.outputs.pattern_file }})
if [ -z "$pattern" ]; then
python -m pytest -n 1 -sv --max-worker-restart=0 --dist=loadfile -k "not Flax and not Onnx" tests/${{ matrix.module }} \
--make-reports=tests_torch_cuda_${{ matrix.module }}
else
python -m pytest -n 1 -sv --max-worker-restart=0 --dist=loadfile -k "not Flax and not Onnx and $pattern" tests/${{ matrix.module }} \
--make-reports=tests_torch_cuda_${{ matrix.module }}
fi
- name: Failure short reports
if: ${{ failure() }}
run: |
cat reports/tests_torch_cuda_${{ matrix.module }}_stats.txt
cat reports/tests_torch_cuda_${{ matrix.module }}_failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: torch_cuda_test_reports_${{ matrix.module }}
path: reports
run_examples_tests:
name: Examples PyTorch CUDA tests on Ubuntu
pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps
runs-on:
group: aws-g4dn-2xlarge
container:
image: diffusers/diffusers-pytorch-cuda
options: --gpus 0 --shm-size "16gb" --ipc host
steps:
- name: Checkout diffusers
uses: actions/checkout@v3
with:
fetch-depth: 2
- name: NVIDIA-SMI
run: |
nvidia-smi
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test,training]
- name: Environment
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python utils/print_env.py
- name: Run example tests on GPU
env:
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install timm
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
- name: Failure short reports
if: ${{ failure() }}
run: |
cat reports/examples_torch_cuda_stats.txt
cat reports/examples_torch_cuda_failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: examples_test_reports
path: reports
+11
View File
@@ -1,6 +1,13 @@
name: Fast GPU Tests on main
on:
pull_request:
branches: main
paths:
- "src/diffusers/models/modeling_utils.py"
- "src/diffusers/models/model_loading_utils.py"
- "src/diffusers/pipelines/pipeline_utils.py"
- "src/diffusers/pipeline_loading_utils.py"
workflow_dispatch:
push:
branches:
@@ -160,6 +167,7 @@ jobs:
path: reports
flax_tpu_tests:
if: ${{ github.event_name != 'pull_request' }}
name: Flax TPU Tests
runs-on:
group: gcp-ct5lp-hightpu-8t
@@ -208,6 +216,7 @@ jobs:
path: reports
onnx_cuda_tests:
if: ${{ github.event_name != 'pull_request' }}
name: ONNX CUDA Tests
runs-on:
group: aws-g4dn-2xlarge
@@ -256,6 +265,7 @@ jobs:
path: reports
run_torch_compile_tests:
if: ${{ github.event_name != 'pull_request' }}
name: PyTorch Compile CUDA tests
runs-on:
@@ -299,6 +309,7 @@ jobs:
path: reports
run_xformers_tests:
if: ${{ github.event_name != 'pull_request' }}
name: PyTorch xformers CUDA tests
runs-on:
-4
View File
@@ -543,10 +543,6 @@
title: Overview
- local: api/schedulers/cm_stochastic_iterative
title: CMStochasticIterativeScheduler
- local: api/schedulers/ddim_cogvideox
title: CogVideoXDDIMScheduler
- local: api/schedulers/multistep_dpm_solver_cogvideox
title: CogVideoXDPMScheduler
- local: api/schedulers/consistency_decoder
title: ConsistencyDecoderScheduler
- local: api/schedulers/cosine_dpm
+2 -68
View File
@@ -359,74 +359,8 @@ image.save('flux_ip_adapter_output.jpg')
<figcaption class="mt-2 text-sm text-center text-gray-500">IP-Adapter examples with prompt "wearing sunglasses"</figcaption>
</div>
## Optimize
Flux is a very large model and requires ~50GB of RAM/VRAM to load all the modeling components. Enable some of the optimizations below to lower the memory requirements.
### Group offloading
[Group offloading](../../optimization/memory#group-offloading) lowers VRAM usage by offloading groups of internal layers rather than the whole model or weights. You need to use [`~hooks.apply_group_offloading`] on all the model components of a pipeline. The `offload_type` parameter allows you to toggle between block and leaf-level offloading. Setting it to `leaf_level` offloads the lowest leaf-level parameters to the CPU instead of offloading at the module-level.
On CUDA devices that support asynchronous data streaming, set `use_stream=True` to overlap data transfer and computation to accelerate inference.
> [!TIP]
> It is possible to mix block and leaf-level offloading for different components in a pipeline.
```py
import torch
from diffusers import FluxPipeline
from diffusers.hooks import apply_group_offloading
model_id = "black-forest-labs/FLUX.1-dev"
dtype = torch.bfloat16
pipe = FluxPipeline.from_pretrained(
model_id,
torch_dtype=dtype,
)
apply_group_offloading(
pipe.transformer,
offload_type="leaf_level",
offload_device=torch.device("cpu"),
onload_device=torch.device("cuda"),
use_stream=True,
)
apply_group_offloading(
pipe.text_encoder,
offload_device=torch.device("cpu"),
onload_device=torch.device("cuda"),
offload_type="leaf_level",
use_stream=True,
)
apply_group_offloading(
pipe.text_encoder_2,
offload_device=torch.device("cpu"),
onload_device=torch.device("cuda"),
offload_type="leaf_level",
use_stream=True,
)
apply_group_offloading(
pipe.vae,
offload_device=torch.device("cpu"),
onload_device=torch.device("cuda"),
offload_type="leaf_level",
use_stream=True,
)
prompt="A cat wearing sunglasses and working as a lifeguard at pool."
generator = torch.Generator().manual_seed(181201)
image = pipe(
prompt,
width=576,
height=1024,
num_inference_steps=30,
generator=generator
).images[0]
image
```
### Running FP16 inference
## Running FP16 inference
Flux can generate high-quality images with FP16 (i.e. to accelerate inference on Turing/Volta GPUs) but produces different outputs compared to FP32/BF16. The issue is that some activations in the text encoders have to be clipped when running in FP16, which affects the overall image. Forcing text encoders to run with FP32 inference thus removes this output difference. See [here](https://github.com/huggingface/diffusers/pull/9097#issuecomment-2272292516) for details.
@@ -455,7 +389,7 @@ out = pipe(
out.save("image.png")
```
### Quantization
## Quantization
Quantization helps reduce the memory requirements of very large models by storing model weights in a lower precision data type. However, quantization may have varying impact on video quality depending on the video model.
+36 -91
View File
@@ -1,6 +1,4 @@
<!--
Copyright 2023-2025 Marigold Team, ETH Zürich. All rights reserved.
Copyright 2024-2025 The HuggingFace Team. All rights reserved.
<!--Copyright 2024 Marigold authors and The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
@@ -12,120 +10,67 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
specific language governing permissions and limitations under the License.
-->
# Marigold Computer Vision
# Marigold Pipelines for Computer Vision Tasks
![marigold](https://marigoldmonodepth.github.io/images/teaser_collage_compressed.jpg)
Marigold was proposed in
[Repurposing Diffusion-Based Image Generators for Monocular Depth Estimation](https://huggingface.co/papers/2312.02145),
a CVPR 2024 Oral paper by
[Bingxin Ke](http://www.kebingxin.com/),
[Anton Obukhov](https://www.obukhov.ai/),
[Shengyu Huang](https://shengyuh.github.io/),
[Nando Metzger](https://nandometzger.github.io/),
[Rodrigo Caye Daudt](https://rcdaudt.github.io/), and
[Konrad Schindler](https://scholar.google.com/citations?user=FZuNgqIAAAAJ&hl=en).
The core idea is to **repurpose the generative prior of Text-to-Image Latent Diffusion Models (LDMs) for traditional
computer vision tasks**.
This approach was explored by fine-tuning Stable Diffusion for **Monocular Depth Estimation**, as demonstrated in the
teaser above.
Marigold was proposed in [Repurposing Diffusion-Based Image Generators for Monocular Depth Estimation](https://huggingface.co/papers/2312.02145), a CVPR 2024 Oral paper by [Bingxin Ke](http://www.kebingxin.com/), [Anton Obukhov](https://www.obukhov.ai/), [Shengyu Huang](https://shengyuh.github.io/), [Nando Metzger](https://nandometzger.github.io/), [Rodrigo Caye Daudt](https://rcdaudt.github.io/), and [Konrad Schindler](https://scholar.google.com/citations?user=FZuNgqIAAAAJ&hl=en).
The idea is to repurpose the rich generative prior of Text-to-Image Latent Diffusion Models (LDMs) for traditional computer vision tasks.
Initially, this idea was explored to fine-tune Stable Diffusion for Monocular Depth Estimation, as shown in the teaser above.
Later,
- [Tianfu Wang](https://tianfwang.github.io/) trained the first Latent Consistency Model (LCM) of Marigold, which unlocked fast single-step inference;
- [Kevin Qu](https://www.linkedin.com/in/kevin-qu-b3417621b/?locale=en_US) extended the approach to Surface Normals Estimation;
- [Anton Obukhov](https://www.obukhov.ai/) contributed the pipelines and documentation into diffusers (enabled and supported by [YiYi Xu](https://yiyixuxu.github.io/) and [Sayak Paul](https://sayak.dev/)).
Marigold was later extended in the follow-up paper,
[Marigold: Affordable Adaptation of Diffusion-Based Image Generators for Image Analysis](https://huggingface.co/papers/2312.02145),
authored by
[Bingxin Ke](http://www.kebingxin.com/),
[Kevin Qu](https://www.linkedin.com/in/kevin-qu-b3417621b/?locale=en_US),
[Tianfu Wang](https://tianfwang.github.io/),
[Nando Metzger](https://nandometzger.github.io/),
[Shengyu Huang](https://shengyuh.github.io/),
[Bo Li](https://www.linkedin.com/in/bobboli0202/),
[Anton Obukhov](https://www.obukhov.ai/), and
[Konrad Schindler](https://scholar.google.com/citations?user=FZuNgqIAAAAJ&hl=en).
This work expanded Marigold to support new modalities such as **Surface Normals** and **Intrinsic Image Decomposition**
(IID), introduced a training protocol for **Latent Consistency Models** (LCM), and demonstrated **High-Resolution** (HR)
processing capability.
The abstract from the paper is:
<Tip>
The early Marigold models (`v1-0` and earlier) were optimized for best results with at least 10 inference steps.
LCM models were later developed to enable high-quality inference in just 1 to 4 steps.
Marigold models `v1-1` and later use the DDIM scheduler to achieve optimal
results in as few as 1 to 4 steps.
</Tip>
*Monocular depth estimation is a fundamental computer vision task. Recovering 3D depth from a single image is geometrically ill-posed and requires scene understanding, so it is not surprising that the rise of deep learning has led to a breakthrough. The impressive progress of monocular depth estimators has mirrored the growth in model capacity, from relatively modest CNNs to large Transformer architectures. Still, monocular depth estimators tend to struggle when presented with images with unfamiliar content and layout, since their knowledge of the visual world is restricted by the data seen during training, and challenged by zero-shot generalization to new domains. This motivates us to explore whether the extensive priors captured in recent generative diffusion models can enable better, more generalizable depth estimation. We introduce Marigold, a method for affine-invariant monocular depth estimation that is derived from Stable Diffusion and retains its rich prior knowledge. The estimator can be fine-tuned in a couple of days on a single GPU using only synthetic training data. It delivers state-of-the-art performance across a wide range of datasets, including over 20% performance gains in specific cases. Project page: https://marigoldmonodepth.github.io.*
## Available Pipelines
Each pipeline is tailored for a specific computer vision task, processing an input RGB image and generating a
corresponding prediction.
Currently, the following computer vision tasks are implemented:
Each pipeline supports one Computer Vision task, which takes an input RGB image as input and produces a *prediction* of the modality of interest, such as a depth map of the input image.
Currently, the following tasks are implemented:
| Pipeline | Predicted Modalities | Demos |
|---------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------:|
| [MarigoldDepthPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py) | [Depth](https://en.wikipedia.org/wiki/Depth_map), [Disparity](https://en.wikipedia.org/wiki/Binocular_disparity) | [Fast Demo (LCM)](https://huggingface.co/spaces/prs-eth/marigold-lcm), [Slow Original Demo (DDIM)](https://huggingface.co/spaces/prs-eth/marigold) |
| [MarigoldNormalsPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py) | [Surface normals](https://en.wikipedia.org/wiki/Normal_mapping) | [Fast Demo (LCM)](https://huggingface.co/spaces/prs-eth/marigold-normals-lcm) |
| Pipeline | Recommended Model Checkpoints | Spaces (Interactive Apps) | Predicted Modalities |
|---------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------:|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| [MarigoldDepthPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py) | [prs-eth/marigold-depth-v1-1](https://huggingface.co/prs-eth/marigold-depth-v1-1) | [Depth Estimation](https://huggingface.co/spaces/prs-eth/marigold) | [Depth](https://en.wikipedia.org/wiki/Depth_map), [Disparity](https://en.wikipedia.org/wiki/Binocular_disparity) |
| [MarigoldNormalsPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py) | [prs-eth/marigold-normals-v1-1](https://huggingface.co/prs-eth/marigold-normals-v1-1) | [Surface Normals Estimation](https://huggingface.co/spaces/prs-eth/marigold-normals) | [Surface normals](https://en.wikipedia.org/wiki/Normal_mapping) |
| [MarigoldIntrinsicsPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_intrinsics.py) | [prs-eth/marigold-iid-appearance-v1-1](https://huggingface.co/prs-eth/marigold-iid-appearance-v1-1),<br>[prs-eth/marigold-iid-lighting-v1-1](https://huggingface.co/prs-eth/marigold-iid-lighting-v1-1) | [Intrinsic Image Decomposition](https://huggingface.co/spaces/prs-eth/marigold-iid) | [Albedo](https://en.wikipedia.org/wiki/Albedo), [Materials](https://www.n.aiq3d.com/wiki/roughnessmetalnessao-map), [Lighting](https://en.wikipedia.org/wiki/Diffuse_reflection) |
## Available Checkpoints
All original checkpoints are available under the [PRS-ETH](https://huggingface.co/prs-eth/) organization on Hugging Face.
They are designed for use with diffusers pipelines and the [original codebase](https://github.com/prs-eth/marigold), which can also be used to train
new model checkpoints.
The following is a summary of the recommended checkpoints, all of which produce reliable results with 1 to 4 steps.
| Checkpoint | Modality | Comment |
|-----------------------------------------------------------------------------------------------------|--------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| [prs-eth/marigold-depth-v1-1](https://huggingface.co/prs-eth/marigold-depth-v1-1) | Depth | Affine-invariant depth prediction assigns each pixel a value between 0 (near plane) and 1 (far plane), with both planes determined by the model during inference. |
| [prs-eth/marigold-normals-v0-1](https://huggingface.co/prs-eth/marigold-normals-v0-1) | Normals | The surface normals predictions are unit-length 3D vectors in the screen space camera, with values in the range from -1 to 1. |
| [prs-eth/marigold-iid-appearance-v1-1](https://huggingface.co/prs-eth/marigold-iid-appearance-v1-1) | Intrinsics | InteriorVerse decomposition is comprised of Albedo and two BRDF material properties: Roughness and Metallicity. |
| [prs-eth/marigold-iid-lighting-v1-1](https://huggingface.co/prs-eth/marigold-iid-lighting-v1-1) | Intrinsics | HyperSim decomposition of an image &nbsp\\(I\\)&nbsp is comprised of Albedo &nbsp\\(A\\), Diffuse shading &nbsp\\(S\\), and Non-diffuse residual &nbsp\\(R\\): &nbsp\\(I = A*S+R\\). |
The original checkpoints can be found under the [PRS-ETH](https://huggingface.co/prs-eth/) Hugging Face organization.
<Tip>
Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff
between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to
efficiently load the same components into multiple pipelines.
Also, to know more about reducing the memory usage of this pipeline, refer to the ["Reduce memory usage"] section
[here](../../using-diffusers/svd#reduce-memory-usage).
Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. Also, to know more about reducing the memory usage of this pipeline, refer to the ["Reduce memory usage"] section [here](../../using-diffusers/svd#reduce-memory-usage).
</Tip>
<Tip warning={true}>
Marigold pipelines were designed and tested with the scheduler embedded in the model checkpoint.
The optimal number of inference steps varies by scheduler, with no universal value that works best across all cases.
To accommodate this, the `num_inference_steps` parameter in the pipeline's `__call__` method defaults to `None` (see the
API reference).
Unless set explicitly, it inherits the value from the `default_denoising_steps` field in the checkpoint configuration
file (`model_index.json`).
This ensures high-quality predictions when invoking the pipeline with only the `image` argument.
Marigold pipelines were designed and tested only with `DDIMScheduler` and `LCMScheduler`.
Depending on the scheduler, the number of inference steps required to get reliable predictions varies, and there is no universal value that works best across schedulers.
Because of that, the default value of `num_inference_steps` in the `__call__` method of the pipeline is set to `None` (see the API reference).
Unless set explicitly, its value will be taken from the checkpoint configuration `model_index.json`.
This is done to ensure high-quality predictions when calling the pipeline with just the `image` argument.
</Tip>
See also Marigold [usage examples](../../using-diffusers/marigold_usage).
## Marigold Depth Prediction API
See also Marigold [usage examples](marigold_usage).
## MarigoldDepthPipeline
[[autodoc]] MarigoldDepthPipeline
- all
- __call__
## MarigoldNormalsPipeline
[[autodoc]] MarigoldNormalsPipeline
- all
- __call__
## MarigoldDepthOutput
[[autodoc]] pipelines.marigold.pipeline_marigold_depth.MarigoldDepthOutput
[[autodoc]] pipelines.marigold.marigold_image_processing.MarigoldImageProcessor.visualize_depth
## Marigold Normals Estimation API
[[autodoc]] MarigoldNormalsPipeline
- __call__
[[autodoc]] pipelines.marigold.pipeline_marigold_normals.MarigoldNormalsOutput
[[autodoc]] pipelines.marigold.marigold_image_processing.MarigoldImageProcessor.visualize_normals
## Marigold Intrinsic Image Decomposition API
[[autodoc]] MarigoldIntrinsicsPipeline
- __call__
[[autodoc]] pipelines.marigold.pipeline_marigold_intrinsics.MarigoldIntrinsicsOutput
[[autodoc]] pipelines.marigold.marigold_image_processing.MarigoldImageProcessor.visualize_intrinsics
## MarigoldNormalsOutput
[[autodoc]] pipelines.marigold.pipeline_marigold_normals.MarigoldNormalsOutput
+1 -1
View File
@@ -65,7 +65,7 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
| [Latte](latte) | text2image |
| [LEDITS++](ledits_pp) | image editing |
| [Lumina-T2X](lumina) | text2image |
| [Marigold](marigold) | depth-estimation, normals-estimation, intrinsic-decomposition |
| [Marigold](marigold) | depth |
| [MultiDiffusion](panorama) | text2image |
| [MusicLDM](musicldm) | text2audio |
| [PAG](pag) | text2image |
@@ -1,19 +0,0 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# CogVideoXDDIMScheduler
`CogVideoXDDIMScheduler` is based on [Denoising Diffusion Implicit Models](https://huggingface.co/papers/2010.02502), specifically for CogVideoX models.
## CogVideoXDDIMScheduler
[[autodoc]] CogVideoXDDIMScheduler
@@ -1,19 +0,0 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# CogVideoXDPMScheduler
`CogVideoXDPMScheduler` is based on [DPM-Solver: A Fast ODE Solver for Diffusion Probabilistic Model Sampling in Around 10 Steps](https://huggingface.co/papers/2206.00927) and [DPM-Solver++: Fast Solver for Guided Sampling of Diffusion Probabilistic Models](https://huggingface.co/papers/2211.01095), specifically for CogVideoX models.
## CogVideoXDPMScheduler
[[autodoc]] CogVideoXDPMScheduler
+179 -318
View File
@@ -1,6 +1,4 @@
<!--
Copyright 2023-2025 Marigold Team, ETH Zürich. All rights reserved.
Copyright 2024-2025 The HuggingFace Team. All rights reserved.
<!--Copyright 2024 Marigold authors and The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
@@ -12,38 +10,31 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
specific language governing permissions and limitations under the License.
-->
# Marigold Computer Vision
# Marigold Pipelines for Computer Vision Tasks
**Marigold** is a diffusion-based [method](https://huggingface.co/papers/2312.02145) and a collection of [pipelines](../api/pipelines/marigold) designed for
dense computer vision tasks, including **monocular depth prediction**, **surface normals estimation**, and **intrinsic
image decomposition**.
[Marigold](../api/pipelines/marigold) is a novel diffusion-based dense prediction approach, and a set of pipelines for various computer vision tasks, such as monocular depth estimation.
This guide will walk you through using Marigold to generate fast and high-quality predictions for images and videos.
This guide will show you how to use Marigold to obtain fast and high-quality predictions for images and videos.
Each pipeline is tailored for a specific computer vision task, processing an input RGB image and generating a
corresponding prediction.
Currently, the following computer vision tasks are implemented:
Each pipeline supports one Computer Vision task, which takes an input RGB image as input and produces a *prediction* of the modality of interest, such as a depth map of the input image.
Currently, the following tasks are implemented:
| Pipeline | Recommended Model Checkpoints | Spaces (Interactive Apps) | Predicted Modalities |
|---------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------:|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| [MarigoldDepthPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py) | [prs-eth/marigold-depth-v1-1](https://huggingface.co/prs-eth/marigold-depth-v1-1) | [Depth Estimation](https://huggingface.co/spaces/prs-eth/marigold) | [Depth](https://en.wikipedia.org/wiki/Depth_map), [Disparity](https://en.wikipedia.org/wiki/Binocular_disparity) |
| [MarigoldNormalsPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py) | [prs-eth/marigold-normals-v1-1](https://huggingface.co/prs-eth/marigold-normals-v1-1) | [Surface Normals Estimation](https://huggingface.co/spaces/prs-eth/marigold-normals) | [Surface normals](https://en.wikipedia.org/wiki/Normal_mapping) |
| [MarigoldIntrinsicsPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_intrinsics.py) | [prs-eth/marigold-iid-appearance-v1-1](https://huggingface.co/prs-eth/marigold-iid-appearance-v1-1),<br>[prs-eth/marigold-iid-lighting-v1-1](https://huggingface.co/prs-eth/marigold-iid-lighting-v1-1) | [Intrinsic Image Decomposition](https://huggingface.co/spaces/prs-eth/marigold-iid) | [Albedo](https://en.wikipedia.org/wiki/Albedo), [Materials](https://www.n.aiq3d.com/wiki/roughnessmetalnessao-map), [Lighting](https://en.wikipedia.org/wiki/Diffuse_reflection) |
| Pipeline | Predicted Modalities | Demos |
|---------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------:|
| [MarigoldDepthPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py) | [Depth](https://en.wikipedia.org/wiki/Depth_map), [Disparity](https://en.wikipedia.org/wiki/Binocular_disparity) | [Fast Demo (LCM)](https://huggingface.co/spaces/prs-eth/marigold-lcm), [Slow Original Demo (DDIM)](https://huggingface.co/spaces/prs-eth/marigold) |
| [MarigoldNormalsPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py) | [Surface normals](https://en.wikipedia.org/wiki/Normal_mapping) | [Fast Demo (LCM)](https://huggingface.co/spaces/prs-eth/marigold-normals-lcm) |
All original checkpoints are available under the [PRS-ETH](https://huggingface.co/prs-eth/) organization on Hugging Face.
They are designed for use with diffusers pipelines and the [original codebase](https://github.com/prs-eth/marigold), which can also be used to train
new model checkpoints.
The following is a summary of the recommended checkpoints, all of which produce reliable results with 1 to 4 steps.
The original checkpoints can be found under the [PRS-ETH](https://huggingface.co/prs-eth/) Hugging Face organization.
These checkpoints are meant to work with diffusers pipelines and the [original codebase](https://github.com/prs-eth/marigold).
The original code can also be used to train new checkpoints.
| Checkpoint | Modality | Comment |
|-----------------------------------------------------------------------------------------------------|--------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| [prs-eth/marigold-depth-v1-1](https://huggingface.co/prs-eth/marigold-depth-v1-1) | Depth | Affine-invariant depth prediction assigns each pixel a value between 0 (near plane) and 1 (far plane), with both planes determined by the model during inference. |
| [prs-eth/marigold-normals-v0-1](https://huggingface.co/prs-eth/marigold-normals-v0-1) | Normals | The surface normals predictions are unit-length 3D vectors in the screen space camera, with values in the range from -1 to 1. |
| [prs-eth/marigold-iid-appearance-v1-1](https://huggingface.co/prs-eth/marigold-iid-appearance-v1-1) | Intrinsics | InteriorVerse decomposition is comprised of Albedo and two BRDF material properties: Roughness and Metallicity. |
| [prs-eth/marigold-iid-lighting-v1-1](https://huggingface.co/prs-eth/marigold-iid-lighting-v1-1) | Intrinsics | HyperSim decomposition of an image \\(I\\) is comprised of Albedo \\(A\\), Diffuse shading \\(S\\), and Non-diffuse residual \\(R\\): \\(I = A*S+R\\). |
The examples below are mostly given for depth prediction, but they can be universally applied to other supported
modalities.
| Checkpoint | Modality | Comment |
|-----------------------------------------------------------------------------------------------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| [prs-eth/marigold-v1-0](https://huggingface.co/prs-eth/marigold-v1-0) | Depth | The first Marigold Depth checkpoint, which predicts *affine-invariant depth* maps. The performance of this checkpoint in benchmarks was studied in the original [paper](https://huggingface.co/papers/2312.02145). Designed to be used with the `DDIMScheduler` at inference, it requires at least 10 steps to get reliable predictions. Affine-invariant depth prediction has a range of values in each pixel between 0 (near plane) and 1 (far plane); both planes are chosen by the model as part of the inference process. See the `MarigoldImageProcessor` reference for visualization utilities. |
| [prs-eth/marigold-depth-lcm-v1-0](https://huggingface.co/prs-eth/marigold-depth-lcm-v1-0) | Depth | The fast Marigold Depth checkpoint, fine-tuned from `prs-eth/marigold-v1-0`. Designed to be used with the `LCMScheduler` at inference, it requires as little as 1 step to get reliable predictions. The prediction reliability saturates at 4 steps and declines after that. |
| [prs-eth/marigold-normals-v0-1](https://huggingface.co/prs-eth/marigold-normals-v0-1) | Normals | A preview checkpoint for the Marigold Normals pipeline. Designed to be used with the `DDIMScheduler` at inference, it requires at least 10 steps to get reliable predictions. The surface normals predictions are unit-length 3D vectors with values in the range from -1 to 1. *This checkpoint will be phased out after the release of `v1-0` version.* |
| [prs-eth/marigold-normals-lcm-v0-1](https://huggingface.co/prs-eth/marigold-normals-lcm-v0-1) | Normals | The fast Marigold Normals checkpoint, fine-tuned from `prs-eth/marigold-normals-v0-1`. Designed to be used with the `LCMScheduler` at inference, it requires as little as 1 step to get reliable predictions. The prediction reliability saturates at 4 steps and declines after that. *This checkpoint will be phased out after the release of `v1-0` version.* |
The examples below are mostly given for depth prediction, but they can be universally applied with other supported modalities.
We showcase the predictions using the same input image of Albert Einstein generated by Midjourney.
This makes it easier to compare visualizations of the predictions across various modalities and checkpoints.
@@ -56,21 +47,19 @@ This makes it easier to compare visualizations of the predictions across various
</div>
</div>
## Depth Prediction
### Depth Prediction Quick Start
To get a depth prediction, load the `prs-eth/marigold-depth-v1-1` checkpoint into [`MarigoldDepthPipeline`],
put the image through the pipeline, and save the predictions:
To get the first depth prediction, load `prs-eth/marigold-depth-lcm-v1-0` checkpoint into `MarigoldDepthPipeline` pipeline, put the image through the pipeline, and save the predictions:
```python
import diffusers
import torch
pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
"prs-eth/marigold-depth-v1-1", variant="fp16", torch_dtype=torch.float16
"prs-eth/marigold-depth-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
).to("cuda")
image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
depth = pipe(image)
vis = pipe.image_processor.visualize_depth(depth.prediction)
@@ -80,13 +69,10 @@ depth_16bit = pipe.image_processor.export_depth_to_16bit_png(depth.prediction)
depth_16bit[0].save("einstein_depth_16bit.png")
```
The [`~pipelines.marigold.marigold_image_processing.MarigoldImageProcessor.visualize_depth`] function applies one of
[matplotlib's colormaps](https://matplotlib.org/stable/users/explain/colors/colormaps.html) (`Spectral` by default) to map the predicted pixel values from a single-channel `[0, 1]`
depth range into an RGB image.
With the `Spectral` colormap, pixels with near depth are painted red, and far pixels are blue.
The visualization function for depth [`~pipelines.marigold.marigold_image_processing.MarigoldImageProcessor.visualize_depth`] applies one of [matplotlib's colormaps](https://matplotlib.org/stable/users/explain/colors/colormaps.html) (`Spectral` by default) to map the predicted pixel values from a single-channel `[0, 1]` depth range into an RGB image.
With the `Spectral` colormap, pixels with near depth are painted red, and far pixels are assigned blue color.
The 16-bit PNG file stores the single channel values mapped linearly from the `[0, 1]` range into `[0, 65535]`.
Below are the raw and the visualized predictions. The darker and closer areas (mustache) are easier to distinguish in
the visualization.
Below are the raw and the visualized predictions; as can be seen, dark areas (mustache) are easier to distinguish in the visualization:
<div class="flex gap-4">
<div style="flex: 1 1 50%; max-width: 50%;">
@@ -103,33 +89,28 @@ the visualization.
</div>
</div>
## Surface Normals Estimation
### Surface Normals Prediction Quick Start
Load the `prs-eth/marigold-normals-v1-1` checkpoint into [`MarigoldNormalsPipeline`], put the image through the
pipeline, and save the predictions:
Load `prs-eth/marigold-normals-lcm-v0-1` checkpoint into `MarigoldNormalsPipeline` pipeline, put the image through the pipeline, and save the predictions:
```python
import diffusers
import torch
pipe = diffusers.MarigoldNormalsPipeline.from_pretrained(
"prs-eth/marigold-normals-v1-1", variant="fp16", torch_dtype=torch.float16
"prs-eth/marigold-normals-lcm-v0-1", variant="fp16", torch_dtype=torch.float16
).to("cuda")
image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
normals = pipe(image)
vis = pipe.image_processor.visualize_normals(normals.prediction)
vis[0].save("einstein_normals.png")
```
The [`~pipelines.marigold.marigold_image_processing.MarigoldImageProcessor.visualize_normals`] maps the three-dimensional
prediction with pixel values in the range `[-1, 1]` into an RGB image.
The visualization function supports flipping surface normals axes to make the visualization compatible with other
choices of the frame of reference.
Conceptually, each pixel is painted according to the surface normal vector in the frame of reference, where `X` axis
points right, `Y` axis points up, and `Z` axis points at the viewer.
The visualization function for normals [`~pipelines.marigold.marigold_image_processing.MarigoldImageProcessor.visualize_normals`] maps the three-dimensional prediction with pixel values in the range `[-1, 1]` into an RGB image.
The visualization function supports flipping surface normals axes to make the visualization compatible with other choices of the frame of reference.
Conceptually, each pixel is painted according to the surface normal vector in the frame of reference, where `X` axis points right, `Y` axis points up, and `Z` axis points at the viewer.
Below is the visualized prediction:
<div class="flex gap-4" style="justify-content: center; width: 100%;">
@@ -141,121 +122,25 @@ Below is the visualized prediction:
</div>
</div>
In this example, the nose tip almost certainly has a point on the surface, in which the surface normal vector points
straight at the viewer, meaning that its coordinates are `[0, 0, 1]`.
In this example, the nose tip almost certainly has a point on the surface, in which the surface normal vector points straight at the viewer, meaning that its coordinates are `[0, 0, 1]`.
This vector maps to the RGB `[128, 128, 255]`, which corresponds to the violet-blue color.
Similarly, a surface normal on the cheek in the right part of the image has a large `X` component, which increases the
red hue.
Similarly, a surface normal on the cheek in the right part of the image has a large `X` component, which increases the red hue.
Points on the shoulders pointing up with a large `Y` promote green color.
## Intrinsic Image Decomposition
### Speeding up inference
Marigold provides two models for Intrinsic Image Decomposition (IID): "Appearance" and "Lighting".
Each model produces Albedo maps, derived from InteriorVerse and Hypersim annotations, respectively.
- The "Appearance" model also estimates Material properties: Roughness and Metallicity.
- The "Lighting" model generates Diffuse Shading and Non-diffuse Residual.
Here is the sample code saving predictions made by the "Appearance" model:
```python
import diffusers
import torch
pipe = diffusers.MarigoldIntrinsicsPipeline.from_pretrained(
"prs-eth/marigold-iid-appearance-v1-1", variant="fp16", torch_dtype=torch.float16
).to("cuda")
image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
intrinsics = pipe(image)
vis = pipe.image_processor.visualize_intrinsics(intrinsics.prediction, pipe.target_properties)
vis[0]["albedo"].save("einstein_albedo.png")
vis[0]["roughness"].save("einstein_roughness.png")
vis[0]["metallicity"].save("einstein_metallicity.png")
```
Another example demonstrating the predictions made by the "Lighting" model:
```python
import diffusers
import torch
pipe = diffusers.MarigoldIntrinsicsPipeline.from_pretrained(
"prs-eth/marigold-iid-lighting-v1-1", variant="fp16", torch_dtype=torch.float16
).to("cuda")
image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
intrinsics = pipe(image)
vis = pipe.image_processor.visualize_intrinsics(intrinsics.prediction, pipe.target_properties)
vis[0]["albedo"].save("einstein_albedo.png")
vis[0]["shading"].save("einstein_shading.png")
vis[0]["residual"].save("einstein_residual.png")
```
Both models share the same pipeline while supporting different decomposition types.
The exact decomposition parameterization (e.g., sRGB vs. linear space) is stored in the
`pipe.target_properties` dictionary, which is passed into the
[`~pipelines.marigold.marigold_image_processing.MarigoldImageProcessor.visualize_intrinsics`] function.
Below are some examples showcasing the predicted decomposition outputs.
All modalities can be inspected in the
[Intrinsic Image Decomposition](https://huggingface.co/spaces/prs-eth/marigold-iid) Space.
<div class="flex gap-4">
<div style="flex: 1 1 50%; max-width: 50%;">
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/8c7986eaaab5eb9604eb88336311f46a7b0ff5ab/marigold/marigold_einstein_albedo.png"/>
<figcaption class="mt-1 text-center text-sm text-gray-500">
Predicted albedo ("Appearance" model)
</figcaption>
</div>
<div style="flex: 1 1 50%; max-width: 50%;">
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/8c7986eaaab5eb9604eb88336311f46a7b0ff5ab/marigold/marigold_einstein_diffuse.png"/>
<figcaption class="mt-1 text-center text-sm text-gray-500">
Predicted diffuse shading ("Lighting" model)
</figcaption>
</div>
</div>
## Speeding up inference
The above quick start snippets are already optimized for quality and speed, loading the checkpoint, utilizing the
`fp16` variant of weights and computation, and performing the default number (4) of denoising diffusion steps.
The first step to accelerate inference, at the expense of prediction quality, is to reduce the denoising diffusion
steps to the minimum:
The above quick start snippets are already optimized for speed: they load the LCM checkpoint, use the `fp16` variant of weights and computation, and perform just one denoising diffusion step.
The `pipe(image)` call completes in 280ms on RTX 3090 GPU.
Internally, the input image is encoded with the Stable Diffusion VAE encoder, then the U-Net performs one denoising step, and finally, the prediction latent is decoded with the VAE decoder into pixel space.
In this case, two out of three module calls are dedicated to converting between pixel and latent space of LDM.
Because Marigold's latent space is compatible with the base Stable Diffusion, it is possible to speed up the pipeline call by more than 3x (85ms on RTX 3090) by using a [lightweight replacement of the SD VAE](../api/models/autoencoder_tiny):
```diff
import diffusers
import torch
pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
"prs-eth/marigold-depth-v1-1", variant="fp16", torch_dtype=torch.float16
).to("cuda")
image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
- depth = pipe(image)
+ depth = pipe(image, num_inference_steps=1)
```
With this change, the `pipe` call completes in 280ms on RTX 3090 GPU.
Internally, the input image is first encoded using the Stable Diffusion VAE encoder, followed by a single denoising
step performed by the U-Net.
Finally, the prediction latent is decoded with the VAE decoder into pixel space.
In this setup, two out of three module calls are dedicated to converting between the pixel and latent spaces of the LDM.
Since Marigold's latent space is compatible with Stable Diffusion 2.0, inference can be accelerated by more than 3x,
reducing the call time to 85ms on an RTX 3090, by using a [lightweight replacement of the SD VAE](../api/models/autoencoder_tiny).
Note that using a lightweight VAE may slightly reduce the visual quality of the predictions.
```diff
import diffusers
import torch
pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
"prs-eth/marigold-depth-v1-1", variant="fp16", torch_dtype=torch.float16
"prs-eth/marigold-depth-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
).to("cuda")
+ pipe.vae = diffusers.AutoencoderTiny.from_pretrained(
@@ -263,77 +148,78 @@ Note that using a lightweight VAE may slightly reduce the visual quality of the
+ ).cuda()
image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
depth = pipe(image, num_inference_steps=1)
depth = pipe(image)
```
So far, we have optimized the number of diffusion steps and model components. Self-attention operations account for a
significant portion of computations.
Speeding them up can be achieved by using a more efficient attention processor:
As suggested in [Optimizations](../optimization/torch2.0#torch.compile), adding `torch.compile` may squeeze extra performance depending on the target hardware:
```diff
import diffusers
import torch
+ from diffusers.models.attention_processor import AttnProcessor2_0
pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
"prs-eth/marigold-depth-v1-1", variant="fp16", torch_dtype=torch.float16
"prs-eth/marigold-depth-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
).to("cuda")
+ pipe.vae.set_attn_processor(AttnProcessor2_0())
+ pipe.unet.set_attn_processor(AttnProcessor2_0())
image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
depth = pipe(image, num_inference_steps=1)
```
Finally, as suggested in [Optimizations](../optimization/torch2.0#torch.compile), enabling `torch.compile` can further enhance performance depending on
the target hardware.
However, compilation incurs a significant overhead during the first pipeline invocation, making it beneficial only when
the same pipeline instance is called repeatedly, such as within a loop.
```diff
import diffusers
import torch
from diffusers.models.attention_processor import AttnProcessor2_0
pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
"prs-eth/marigold-depth-v1-1", variant="fp16", torch_dtype=torch.float16
).to("cuda")
pipe.vae.set_attn_processor(AttnProcessor2_0())
pipe.unet.set_attn_processor(AttnProcessor2_0())
+ pipe.vae = torch.compile(pipe.vae, mode="reduce-overhead", fullgraph=True)
+ pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
depth = pipe(image, num_inference_steps=1)
depth = pipe(image)
```
## Qualitative Comparison with Depth Anything
With the above speed optimizations, Marigold delivers predictions with more details and faster than [Depth Anything](https://huggingface.co/docs/transformers/main/en/model_doc/depth_anything) with the largest checkpoint [LiheYoung/depth-anything-large-hf](https://huggingface.co/LiheYoung/depth-anything-large-hf):
<div class="flex gap-4">
<div style="flex: 1 1 50%; max-width: 50%;">
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/marigold/marigold_einstein_lcm_depth.png"/>
<figcaption class="mt-1 text-center text-sm text-gray-500">
Marigold LCM fp16 with Tiny AutoEncoder
</figcaption>
</div>
<div style="flex: 1 1 50%; max-width: 50%;">
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/marigold/einstein_depthanything_large.png"/>
<figcaption class="mt-1 text-center text-sm text-gray-500">
Depth Anything Large
</figcaption>
</div>
</div>
## Maximizing Precision and Ensembling
Marigold pipelines have a built-in ensembling mechanism combining multiple predictions from different random latents.
This is a brute-force way of improving the precision of predictions, capitalizing on the generative nature of diffusion.
The ensembling path is activated automatically when the `ensemble_size` argument is set greater or equal than `3`.
The ensembling path is activated automatically when the `ensemble_size` argument is set greater than `1`.
When aiming for maximum precision, it makes sense to adjust `num_inference_steps` simultaneously with `ensemble_size`.
The recommended values vary across checkpoints but primarily depend on the scheduler type.
The effect of ensembling is particularly well-seen with surface normals:
```diff
import diffusers
```python
import diffusers
pipe = diffusers.MarigoldNormalsPipeline.from_pretrained("prs-eth/marigold-normals-v1-1").to("cuda")
model_path = "prs-eth/marigold-normals-v1-0"
image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
model_paper_kwargs = {
diffusers.schedulers.DDIMScheduler: {
"num_inference_steps": 10,
"ensemble_size": 10,
},
diffusers.schedulers.LCMScheduler: {
"num_inference_steps": 4,
"ensemble_size": 5,
},
}
- depth = pipe(image)
+ depth = pipe(image, num_inference_steps=10, ensemble_size=5)
image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
vis = pipe.image_processor.visualize_normals(depth.prediction)
vis[0].save("einstein_normals.png")
pipe = diffusers.MarigoldNormalsPipeline.from_pretrained(model_path).to("cuda")
pipe_kwargs = model_paper_kwargs[type(pipe.scheduler)]
depth = pipe(image, **pipe_kwargs)
vis = pipe.image_processor.visualize_normals(depth.prediction)
vis[0].save("einstein_normals.png")
```
<div class="flex gap-4">
@@ -351,16 +237,93 @@ The effect of ensembling is particularly well-seen with surface normals:
</div>
</div>
As can be seen, all areas with fine-grained structurers, such as hair, got more conservative and on average more
correct predictions.
As can be seen, all areas with fine-grained structurers, such as hair, got more conservative and on average more correct predictions.
Such a result is more suitable for precision-sensitive downstream tasks, such as 3D reconstruction.
## Quantitative Evaluation
To evaluate Marigold quantitatively in standard leaderboards and benchmarks (such as NYU, KITTI, and other datasets), follow the evaluation protocol outlined in the paper: load the full precision fp32 model and use appropriate values for `num_inference_steps` and `ensemble_size`.
Optionally seed randomness to ensure reproducibility. Maximizing `batch_size` will deliver maximum device utilization.
```python
import diffusers
import torch
device = "cuda"
seed = 2024
model_path = "prs-eth/marigold-v1-0"
model_paper_kwargs = {
diffusers.schedulers.DDIMScheduler: {
"num_inference_steps": 50,
"ensemble_size": 10,
},
diffusers.schedulers.LCMScheduler: {
"num_inference_steps": 4,
"ensemble_size": 10,
},
}
image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
generator = torch.Generator(device=device).manual_seed(seed)
pipe = diffusers.MarigoldDepthPipeline.from_pretrained(model_path).to(device)
pipe_kwargs = model_paper_kwargs[type(pipe.scheduler)]
depth = pipe(image, generator=generator, **pipe_kwargs)
# evaluate metrics
```
## Using Predictive Uncertainty
The ensembling mechanism built into Marigold pipelines combines multiple predictions obtained from different random latents.
As a side effect, it can be used to quantify epistemic (model) uncertainty; simply specify `ensemble_size` greater than 1 and set `output_uncertainty=True`.
The resulting uncertainty will be available in the `uncertainty` field of the output.
It can be visualized as follows:
```python
import diffusers
import torch
pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
"prs-eth/marigold-depth-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
).to("cuda")
image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
depth = pipe(
image,
ensemble_size=10, # any number greater than 1; higher values yield higher precision
output_uncertainty=True,
)
uncertainty = pipe.image_processor.visualize_uncertainty(depth.uncertainty)
uncertainty[0].save("einstein_depth_uncertainty.png")
```
<div class="flex gap-4">
<div style="flex: 1 1 50%; max-width: 50%;">
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/marigold/marigold_einstein_depth_uncertainty.png"/>
<figcaption class="mt-1 text-center text-sm text-gray-500">
Depth uncertainty
</figcaption>
</div>
<div style="flex: 1 1 50%; max-width: 50%;">
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/marigold/marigold_einstein_normals_uncertainty.png"/>
<figcaption class="mt-1 text-center text-sm text-gray-500">
Surface normals uncertainty
</figcaption>
</div>
</div>
The interpretation of uncertainty is easy: higher values (white) correspond to pixels, where the model struggles to make consistent predictions.
Evidently, the depth model is the least confident around edges with discontinuity, where the object depth changes drastically.
The surface normals model is the least confident in fine-grained structures, such as hair, and dark areas, such as the collar.
## Frame-by-frame Video Processing with Temporal Consistency
Due to Marigold's generative nature, each prediction is unique and defined by the random noise sampled for the latent
initialization.
This becomes an obvious drawback compared to traditional end-to-end dense regression networks, as exemplified in the
following videos:
Due to Marigold's generative nature, each prediction is unique and defined by the random noise sampled for the latent initialization.
This becomes an obvious drawback compared to traditional end-to-end dense regression networks, as exemplified in the following videos:
<div class="flex gap-4">
<div style="flex: 1 1 50%; max-width: 50%;">
@@ -373,32 +336,26 @@ following videos:
</div>
</div>
To address this issue, it is possible to pass `latents` argument to the pipelines, which defines the starting point of
diffusion.
Empirically, we found that a convex combination of the very same starting point noise latent and the latent
corresponding to the previous frame prediction give sufficiently smooth results, as implemented in the snippet below:
To address this issue, it is possible to pass `latents` argument to the pipelines, which defines the starting point of diffusion.
Empirically, we found that a convex combination of the very same starting point noise latent and the latent corresponding to the previous frame prediction give sufficiently smooth results, as implemented in the snippet below:
```python
import imageio
import diffusers
import torch
from diffusers.models.attention_processor import AttnProcessor2_0
from PIL import Image
from tqdm import tqdm
import diffusers
import torch
device = "cuda"
path_in = "https://huggingface.co/spaces/prs-eth/marigold-lcm/resolve/c7adb5427947d2680944f898cd91d386bf0d4924/files/video/obama.mp4"
path_in = "obama.mp4"
path_out = "obama_depth.gif"
pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
"prs-eth/marigold-depth-v1-1", variant="fp16", torch_dtype=torch.float16
"prs-eth/marigold-depth-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
).to(device)
pipe.vae = diffusers.AutoencoderTiny.from_pretrained(
"madebyollin/taesd", torch_dtype=torch.float16
).to(device)
pipe.unet.set_attn_processor(AttnProcessor2_0())
pipe.vae = torch.compile(pipe.vae, mode="reduce-overhead", fullgraph=True)
pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
pipe.set_progress_bar_config(disable=True)
with imageio.get_reader(path_in) as reader:
@@ -416,11 +373,7 @@ with imageio.get_reader(path_in) as reader:
latents = 0.9 * latents + 0.1 * last_frame_latent
depth = pipe(
frame,
num_inference_steps=1,
match_input_resolution=False,
latents=latents,
output_latent=True,
frame, match_input_resolution=False, latents=latents, output_latent=True
)
last_frame_latent = depth.latent
out.append(pipe.image_processor.visualize_depth(depth.prediction)[0])
@@ -429,8 +382,7 @@ with imageio.get_reader(path_in) as reader:
```
Here, the diffusion process starts from the given computed latent.
The pipeline sets `output_latent=True` to access `out.latent` and computes its contribution to the next frame's latent
initialization.
The pipeline sets `output_latent=True` to access `out.latent` and computes its contribution to the next frame's latent initialization.
The result is much more stable now:
<div class="flex gap-4">
@@ -462,7 +414,7 @@ image = diffusers.utils.load_image(
)
pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
"prs-eth/marigold-depth-v1-1", torch_dtype=torch.float16, variant="fp16"
"prs-eth/marigold-depth-lcm-v1-0", torch_dtype=torch.float16, variant="fp16"
).to(device)
depth_image = pipe(image, generator=generator).prediction
@@ -511,95 +463,4 @@ controlnet_out[0].save("motorcycle_controlnet_out.png")
</div>
</div>
## Quantitative Evaluation
To evaluate Marigold quantitatively in standard leaderboards and benchmarks (such as NYU, KITTI, and other datasets),
follow the evaluation protocol outlined in the paper: load the full precision fp32 model and use appropriate values
for `num_inference_steps` and `ensemble_size`.
Optionally seed randomness to ensure reproducibility.
Maximizing `batch_size` will deliver maximum device utilization.
```python
import diffusers
import torch
device = "cuda"
seed = 2024
generator = torch.Generator(device=device).manual_seed(seed)
pipe = diffusers.MarigoldDepthPipeline.from_pretrained("prs-eth/marigold-depth-v1-1").to(device)
image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
depth = pipe(
image,
num_inference_steps=4, # set according to the evaluation protocol from the paper
ensemble_size=10, # set according to the evaluation protocol from the paper
generator=generator,
)
# evaluate metrics
```
## Using Predictive Uncertainty
The ensembling mechanism built into Marigold pipelines combines multiple predictions obtained from different random
latents.
As a side effect, it can be used to quantify epistemic (model) uncertainty; simply specify `ensemble_size` greater
or equal than 3 and set `output_uncertainty=True`.
The resulting uncertainty will be available in the `uncertainty` field of the output.
It can be visualized as follows:
```python
import diffusers
import torch
pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
"prs-eth/marigold-depth-v1-1", variant="fp16", torch_dtype=torch.float16
).to("cuda")
image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
depth = pipe(
image,
ensemble_size=10, # any number >= 3
output_uncertainty=True,
)
uncertainty = pipe.image_processor.visualize_uncertainty(depth.uncertainty)
uncertainty[0].save("einstein_depth_uncertainty.png")
```
<div class="flex gap-4">
<div style="flex: 1 1 33%; max-width: 33%;">
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/marigold/marigold_einstein_depth_uncertainty.png"/>
<figcaption class="mt-1 text-center text-sm text-gray-500">
Depth uncertainty
</figcaption>
</div>
<div style="flex: 1 1 33%; max-width: 33%;">
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/marigold/marigold_einstein_normals_uncertainty.png"/>
<figcaption class="mt-1 text-center text-sm text-gray-500">
Surface normals uncertainty
</figcaption>
</div>
<div style="flex: 1 1 33%; max-width: 33%;">
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/4f83035d84a24e5ec44fdda129b1d51eba12ce04/marigold/marigold_einstein_albedo_uncertainty.png"/>
<figcaption class="mt-1 text-center text-sm text-gray-500">
Albedo uncertainty
</figcaption>
</div>
</div>
The interpretation of uncertainty is easy: higher values (white) correspond to pixels, where the model struggles to
make consistent predictions.
- The depth model exhibits the most uncertainty around discontinuities, where object depth changes abruptly.
- The surface normals model is least confident in fine-grained structures like hair and in dark regions such as the
collar area.
- Albedo uncertainty is represented as an RGB image, as it captures uncertainty independently for each color channel,
unlike depth and surface normals. It is also higher in shaded regions and at discontinuities.
## Conclusion
We hope Marigold proves valuable for your downstream tasks, whether as part of a broader generative workflow or for
perception-based applications like 3D reconstruction.
Hopefully, you will find Marigold useful for solving your downstream tasks, be it a part of a more broad generative workflow, or a perception task, such as 3D reconstruction.
+184 -149
View File
@@ -215,7 +215,7 @@ image
Prompt weighting provides a way to emphasize or de-emphasize certain parts of a prompt, allowing for more control over the generated image. A prompt can include several concepts, which gets turned into contextualized text embeddings. The embeddings are used by the model to condition its cross-attention layers to generate an image (read the Stable Diffusion [blog post](https://huggingface.co/blog/stable_diffusion) to learn more about how it works).
Prompt weighting works by increasing or decreasing the scale of the text embedding vector that corresponds to its concept in the prompt because you may not necessarily want the model to focus on all concepts equally. The easiest way to prepare the prompt embeddings is to use [Stable Diffusion Long Prompt Weighted Embedding](https://github.com/xhinker/sd_embed) (sd_embed). Once you have the prompt-weighted embeddings, you can pass them to any pipeline that has a [prompt_embeds](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline.__call__.prompt_embeds) (and optionally [negative_prompt_embeds](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline.__call__.negative_prompt_embeds)) parameter, such as [`StableDiffusionPipeline`], [`StableDiffusionControlNetPipeline`], and [`StableDiffusionXLPipeline`].
Prompt weighting works by increasing or decreasing the scale of the text embedding vector that corresponds to its concept in the prompt because you may not necessarily want the model to focus on all concepts equally. The easiest way to prepare the prompt-weighted embeddings is to use [Compel](https://github.com/damian0815/compel), a text prompt-weighting and blending library. Once you have the prompt-weighted embeddings, you can pass them to any pipeline that has a [`prompt_embeds`](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline.__call__.prompt_embeds) (and optionally [`negative_prompt_embeds`](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline.__call__.negative_prompt_embeds)) parameter, such as [`StableDiffusionPipeline`], [`StableDiffusionControlNetPipeline`], and [`StableDiffusionXLPipeline`].
<Tip>
@@ -223,99 +223,136 @@ If your favorite pipeline doesn't have a `prompt_embeds` parameter, please open
</Tip>
This guide will show you how to weight your prompts with sd_embed.
This guide will show you how to weight and blend your prompts with Compel in 🤗 Diffusers.
Before you begin, make sure you have the latest version of sd_embed installed:
```bash
pip install git+https://github.com/xhinker/sd_embed.git@main
```
For this example, let's use [`StableDiffusionXLPipeline`].
Before you begin, make sure you have the latest version of Compel installed:
```py
from diffusers import StableDiffusionXLPipeline, UniPCMultistepScheduler
# uncomment to install in Colab
#!pip install compel --upgrade
```
For this guide, let's generate an image with the prompt `"a red cat playing with a ball"` using the [`StableDiffusionPipeline`]:
```py
from diffusers import StableDiffusionPipeline, UniPCMultistepScheduler
import torch
pipe = StableDiffusionXLPipeline.from_pretrained("Lykon/dreamshaper-xl-1-0", torch_dtype=torch.float16)
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", use_safetensors=True)
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe.to("cuda")
```
To upweight or downweight a concept, surround the text with parentheses. More parentheses applies a heavier weight on the text. You can also append a numerical multiplier to the text to indicate how much you want to increase or decrease its weights by.
prompt = "a red cat playing with a ball"
| format | multiplier |
|---|---|
| `(hippo)` | increase by 1.1x |
| `((hippo))` | increase by 1.21x |
| `(hippo:1.5)` | increase by 1.5x |
| `(hippo:0.5)` | decrease by 4x |
generator = torch.Generator(device="cpu").manual_seed(33)
Create a prompt and use a combination of parentheses and numerical multipliers to upweight various text.
```py
from sd_embed.embedding_funcs import get_weighted_text_embeddings_sdxl
prompt = """A whimsical and creative image depicting a hybrid creature that is a mix of a waffle and a hippopotamus.
This imaginative creature features the distinctive, bulky body of a hippo,
but with a texture and appearance resembling a golden-brown, crispy waffle.
The creature might have elements like waffle squares across its skin and a syrup-like sheen.
It's set in a surreal environment that playfully combines a natural water habitat of a hippo with elements of a breakfast table setting,
possibly including oversized utensils or plates in the background.
The image should evoke a sense of playful absurdity and culinary fantasy.
"""
neg_prompt = """\
skin spots,acnes,skin blemishes,age spot,(ugly:1.2),(duplicate:1.2),(morbid:1.21),(mutilated:1.2),\
(tranny:1.2),mutated hands,(poorly drawn hands:1.5),blurry,(bad anatomy:1.2),(bad proportions:1.3),\
extra limbs,(disfigured:1.2),(missing arms:1.2),(extra legs:1.2),(fused fingers:1.5),\
(too many fingers:1.5),(unclear eyes:1.2),lowers,bad hands,missing fingers,extra digit,\
bad hands,missing fingers,(extra arms and legs),(worst quality:2),(low quality:2),\
(normal quality:2),lowres,((monochrome)),((grayscale))
"""
```
Use the `get_weighted_text_embeddings_sdxl` function to generate the prompt embeddings and the negative prompt embeddings. It'll also generated the pooled and negative pooled prompt embeddings since you're using the SDXL model.
> [!TIP]
> You can safely ignore the error message below about the token index length exceeding the models maximum sequence length. All your tokens will be used in the embedding process.
>
> ```
> Token indices sequence length is longer than the specified maximum sequence length for this model
> ```
```py
(
prompt_embeds,
prompt_neg_embeds,
pooled_prompt_embeds,
negative_pooled_prompt_embeds
) = get_weighted_text_embeddings_sdxl(
pipe,
prompt=prompt,
neg_prompt=neg_prompt
)
image = pipe(
prompt_embeds=prompt_embeds,
negative_prompt_embeds=prompt_neg_embeds,
pooled_prompt_embeds=pooled_prompt_embeds,
negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
num_inference_steps=30,
height=1024,
width=1024 + 512,
guidance_scale=4.0,
generator=torch.Generator("cuda").manual_seed(2)
).images[0]
image = pipe(prompt, generator=generator, num_inference_steps=20).images[0]
image
```
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sd_embed_sdxl.png"/>
<img class="rounded-xl" src="https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/compel/forest_0.png"/>
</div>
> [!TIP]
> Refer to the [sd_embed](https://github.com/xhinker/sd_embed) repository for additional details about long prompt weighting for FLUX.1, Stable Cascade, and Stable Diffusion 1.5.
### Weighting
You'll notice there is no "ball" in the image! Let's use compel to upweight the concept of "ball" in the prompt. Create a [`Compel`](https://github.com/damian0815/compel/blob/main/doc/compel.md#compel-objects) object, and pass it a tokenizer and text encoder:
```py
from compel import Compel
compel_proc = Compel(tokenizer=pipe.tokenizer, text_encoder=pipe.text_encoder)
```
compel uses `+` or `-` to increase or decrease the weight of a word in the prompt. To increase the weight of "ball":
<Tip>
`+` corresponds to the value `1.1`, `++` corresponds to `1.1^2`, and so on. Similarly, `-` corresponds to `0.9` and `--` corresponds to `0.9^2`. Feel free to experiment with adding more `+` or `-` in your prompt!
</Tip>
```py
prompt = "a red cat playing with a ball++"
```
Pass the prompt to `compel_proc` to create the new prompt embeddings which are passed to the pipeline:
```py
prompt_embeds = compel_proc(prompt)
generator = torch.manual_seed(33)
image = pipe(prompt_embeds=prompt_embeds, generator=generator, num_inference_steps=20).images[0]
image
```
<div class="flex justify-center">
<img class="rounded-xl" src="https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/compel/forest_1.png"/>
</div>
To downweight parts of the prompt, use the `-` suffix:
```py
prompt = "a red------- cat playing with a ball"
prompt_embeds = compel_proc(prompt)
generator = torch.manual_seed(33)
image = pipe(prompt_embeds=prompt_embeds, generator=generator, num_inference_steps=20).images[0]
image
```
<div class="flex justify-center">
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"/>
</div>
You can even up or downweight multiple concepts in the same prompt:
```py
prompt = "a red cat++ playing with a ball----"
prompt_embeds = compel_proc(prompt)
generator = torch.manual_seed(33)
image = pipe(prompt_embeds=prompt_embeds, generator=generator, num_inference_steps=20).images[0]
image
```
<div class="flex justify-center">
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-pos-neg.png"/>
</div>
### Blending
You can also create a weighted *blend* of prompts by adding `.blend()` to a list of prompts and passing it some weights. Your blend may not always produce the result you expect because it breaks some assumptions about how the text encoder functions, so just have fun and experiment with it!
```py
prompt_embeds = compel_proc('("a red cat playing with a ball", "jungle").blend(0.7, 0.8)')
generator = torch.Generator(device="cuda").manual_seed(33)
image = pipe(prompt_embeds=prompt_embeds, generator=generator, num_inference_steps=20).images[0]
image
```
<div class="flex justify-center">
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-blend.png"/>
</div>
### Conjunction
A conjunction diffuses each prompt independently and concatenates their results by their weighted sum. Add `.and()` to the end of a list of prompts to create a conjunction:
```py
prompt_embeds = compel_proc('["a red cat", "playing with a", "ball"].and()')
generator = torch.Generator(device="cuda").manual_seed(55)
image = pipe(prompt_embeds=prompt_embeds, generator=generator, num_inference_steps=20).images[0]
image
```
<div class="flex justify-center">
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-conj.png"/>
</div>
### Textual inversion
@@ -326,63 +363,35 @@ Create a pipeline and use the [`~loaders.TextualInversionLoaderMixin.load_textua
```py
import torch
from diffusers import StableDiffusionPipeline
from compel import Compel, DiffusersTextualInversionManager
pipe = StableDiffusionPipeline.from_pretrained(
"stable-diffusion-v1-5/stable-diffusion-v1-5",
torch_dtype=torch.float16,
).to("cuda")
"stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16,
use_safetensors=True, variant="fp16").to("cuda")
pipe.load_textual_inversion("sd-concepts-library/midjourney-style")
```
Add the `<midjourney-style>` text to the prompt to trigger the textual inversion.
Compel provides a `DiffusersTextualInversionManager` class to simplify prompt weighting with textual inversion. Instantiate `DiffusersTextualInversionManager` and pass it to the `Compel` class:
```py
from sd_embed.embedding_funcs import get_weighted_text_embeddings_sd15
prompt = """<midjourney-style> A whimsical and creative image depicting a hybrid creature that is a mix of a waffle and a hippopotamus.
This imaginative creature features the distinctive, bulky body of a hippo,
but with a texture and appearance resembling a golden-brown, crispy waffle.
The creature might have elements like waffle squares across its skin and a syrup-like sheen.
It's set in a surreal environment that playfully combines a natural water habitat of a hippo with elements of a breakfast table setting,
possibly including oversized utensils or plates in the background.
The image should evoke a sense of playful absurdity and culinary fantasy.
"""
neg_prompt = """\
skin spots,acnes,skin blemishes,age spot,(ugly:1.2),(duplicate:1.2),(morbid:1.21),(mutilated:1.2),\
(tranny:1.2),mutated hands,(poorly drawn hands:1.5),blurry,(bad anatomy:1.2),(bad proportions:1.3),\
extra limbs,(disfigured:1.2),(missing arms:1.2),(extra legs:1.2),(fused fingers:1.5),\
(too many fingers:1.5),(unclear eyes:1.2),lowers,bad hands,missing fingers,extra digit,\
bad hands,missing fingers,(extra arms and legs),(worst quality:2),(low quality:2),\
(normal quality:2),lowres,((monochrome)),((grayscale))
"""
textual_inversion_manager = DiffusersTextualInversionManager(pipe)
compel_proc = Compel(
tokenizer=pipe.tokenizer,
text_encoder=pipe.text_encoder,
textual_inversion_manager=textual_inversion_manager)
```
Use the `get_weighted_text_embeddings_sd15` function to generate the prompt embeddings and the negative prompt embeddings.
Incorporate the concept to condition a prompt with using the `<concept>` syntax:
```py
(
prompt_embeds,
prompt_neg_embeds,
) = get_weighted_text_embeddings_sd15(
pipe,
prompt=prompt,
neg_prompt=neg_prompt
)
prompt_embeds = compel_proc('("A red cat++ playing with a ball <midjourney-style>")')
image = pipe(
prompt_embeds=prompt_embeds,
negative_prompt_embeds=prompt_neg_embeds,
height=768,
width=896,
guidance_scale=4.0,
generator=torch.Generator("cuda").manual_seed(2)
).images[0]
image = pipe(prompt_embeds=prompt_embeds).images[0]
image
```
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sd_embed_textual_inversion.png"/>
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-text-inversion.png"/>
</div>
### DreamBooth
@@ -392,44 +401,70 @@ image
```py
import torch
from diffusers import DiffusionPipeline, UniPCMultistepScheduler
from compel import Compel
pipe = DiffusionPipeline.from_pretrained("sd-dreambooth-library/dndcoverart-v1", torch_dtype=torch.float16).to("cuda")
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
```
Depending on the model you use, you'll need to incorporate the model's unique identifier into your prompt. For example, the `dndcoverart-v1` model uses the identifier `dndcoverart`:
Create a `Compel` class with a tokenizer and text encoder, and pass your prompt to it. Depending on the model you use, you'll need to incorporate the model's unique identifier into your prompt. For example, the `dndcoverart-v1` model uses the identifier `dndcoverart`:
```py
from sd_embed.embedding_funcs import get_weighted_text_embeddings_sd15
prompt = """dndcoverart of A whimsical and creative image depicting a hybrid creature that is a mix of a waffle and a hippopotamus.
This imaginative creature features the distinctive, bulky body of a hippo,
but with a texture and appearance resembling a golden-brown, crispy waffle.
The creature might have elements like waffle squares across its skin and a syrup-like sheen.
It's set in a surreal environment that playfully combines a natural water habitat of a hippo with elements of a breakfast table setting,
possibly including oversized utensils or plates in the background.
The image should evoke a sense of playful absurdity and culinary fantasy.
"""
neg_prompt = """\
skin spots,acnes,skin blemishes,age spot,(ugly:1.2),(duplicate:1.2),(morbid:1.21),(mutilated:1.2),\
(tranny:1.2),mutated hands,(poorly drawn hands:1.5),blurry,(bad anatomy:1.2),(bad proportions:1.3),\
extra limbs,(disfigured:1.2),(missing arms:1.2),(extra legs:1.2),(fused fingers:1.5),\
(too many fingers:1.5),(unclear eyes:1.2),lowers,bad hands,missing fingers,extra digit,\
bad hands,missing fingers,(extra arms and legs),(worst quality:2),(low quality:2),\
(normal quality:2),lowres,((monochrome)),((grayscale))
"""
(
prompt_embeds
, prompt_neg_embeds
) = get_weighted_text_embeddings_sd15(
pipe
, prompt = prompt
, neg_prompt = neg_prompt
)
compel_proc = Compel(tokenizer=pipe.tokenizer, text_encoder=pipe.text_encoder)
prompt_embeds = compel_proc('("magazine cover of a dndcoverart dragon, high quality, intricate details, larry elmore art style").and()')
image = pipe(prompt_embeds=prompt_embeds).images[0]
image
```
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sd_embed_dreambooth.png"/>
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-dreambooth.png"/>
</div>
### Stable Diffusion XL
Stable Diffusion XL (SDXL) has two tokenizers and text encoders so it's usage is a bit different. To address this, you should pass both tokenizers and encoders to the `Compel` class:
```py
from compel import Compel, ReturnedEmbeddingsType
from diffusers import DiffusionPipeline
from diffusers.utils import make_image_grid
import torch
pipeline = DiffusionPipeline.from_pretrained(
"stabilityai/stable-diffusion-xl-base-1.0",
variant="fp16",
use_safetensors=True,
torch_dtype=torch.float16
).to("cuda")
compel = Compel(
tokenizer=[pipeline.tokenizer, pipeline.tokenizer_2] ,
text_encoder=[pipeline.text_encoder, pipeline.text_encoder_2],
returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,
requires_pooled=[False, True]
)
```
This time, let's upweight "ball" by a factor of 1.5 for the first prompt, and downweight "ball" by 0.6 for the second prompt. The [`StableDiffusionXLPipeline`] also requires [`pooled_prompt_embeds`](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLInpaintPipeline.__call__.pooled_prompt_embeds) (and optionally [`negative_pooled_prompt_embeds`](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLInpaintPipeline.__call__.negative_pooled_prompt_embeds)) so you should pass those to the pipeline along with the conditioning tensors:
```py
# apply weights
prompt = ["a red cat playing with a (ball)1.5", "a red cat playing with a (ball)0.6"]
conditioning, pooled = compel(prompt)
# generate image
generator = [torch.Generator().manual_seed(33) for _ in range(len(prompt))]
images = pipeline(prompt_embeds=conditioning, pooled_prompt_embeds=pooled, generator=generator, num_inference_steps=30).images
make_image_grid(images, rows=1, cols=2)
```
<div class="flex gap-4">
<div>
<img class="rounded-xl" src="https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/compel/sdxl_ball1.png"/>
<figcaption class="mt-2 text-center text-sm text-gray-500">"a red cat playing with a (ball)1.5"</figcaption>
</div>
<div>
<img class="rounded-xl" src="https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/compel/sdxl_ball2.png"/>
<figcaption class="mt-2 text-center text-sm text-gray-500">"a red cat playing with a (ball)0.6"</figcaption>
</div>
</div>
+1 -5
View File
@@ -92,13 +92,9 @@ class CheckpointMergerPipeline(DiffusionPipeline):
token = kwargs.pop("token", None)
variant = kwargs.pop("variant", None)
revision = kwargs.pop("revision", None)
torch_dtype = kwargs.pop("torch_dtype", torch.float32)
torch_dtype = kwargs.pop("torch_dtype", None)
device_map = kwargs.pop("device_map", None)
if not isinstance(torch_dtype, torch.dtype):
torch_dtype = torch.float32
print(f"Passed `torch_dtype` {torch_dtype} is not a `torch.dtype`. Defaulting to `torch.float32`.")
alpha = kwargs.pop("alpha", 0.5)
interp = kwargs.pop("interp", None)
@@ -203,7 +203,7 @@ def log_validation(
pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, **scheduler_args)
pipeline = pipeline.to(accelerator.device)
pipeline = pipeline.to(accelerator.device, dtype=torch_dtype)
pipeline.set_progress_bar_config(disable=True)
# run inference
@@ -213,7 +213,7 @@ def log_validation(
if torch.backends.mps.is_available() or "playground" in args.pretrained_model_name_or_path:
autocast_ctx = nullcontext()
else:
autocast_ctx = torch.autocast(accelerator.device.type) if not is_final_validation else nullcontext()
autocast_ctx = torch.autocast(accelerator.device.type)
with autocast_ctx:
images = [pipeline(**pipeline_args, generator=generator).images[0] for _ in range(args.num_validation_images)]
-3
View File
@@ -345,7 +345,6 @@ else:
"Lumina2Text2ImgPipeline",
"LuminaText2ImgPipeline",
"MarigoldDepthPipeline",
"MarigoldIntrinsicsPipeline",
"MarigoldNormalsPipeline",
"MochiPipeline",
"MusicLDMPipeline",
@@ -846,7 +845,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
Lumina2Text2ImgPipeline,
LuminaText2ImgPipeline,
MarigoldDepthPipeline,
MarigoldIntrinsicsPipeline,
MarigoldNormalsPipeline,
MochiPipeline,
MusicLDMPipeline,
@@ -867,7 +865,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
StableCascadeCombinedPipeline,
StableCascadeDecoderPipeline,
StableCascadePriorPipeline,
StableDiffusion3ControlNetInpaintingPipeline,
StableDiffusion3ControlNetPipeline,
StableDiffusion3Img2ImgPipeline,
StableDiffusion3InpaintPipeline,
+30
View File
@@ -0,0 +1,30 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ..models.attention_processor import Attention, MochiAttention
_ATTENTION_CLASSES = (Attention, MochiAttention)
_SPATIAL_TRANSFORMER_BLOCK_IDENTIFIERS = ("blocks", "transformer_blocks", "single_transformer_blocks", "layers")
_TEMPORAL_TRANSFORMER_BLOCK_IDENTIFIERS = ("temporal_transformer_blocks",)
_CROSS_TRANSFORMER_BLOCK_IDENTIFIERS = ("blocks", "transformer_blocks", "layers")
_ALL_TRANSFORMER_BLOCK_IDENTIFIERS = tuple(
{
*_SPATIAL_TRANSFORMER_BLOCK_IDENTIFIERS,
*_TEMPORAL_TRANSFORMER_BLOCK_IDENTIFIERS,
*_CROSS_TRANSFORMER_BLOCK_IDENTIFIERS,
}
)
+262
View File
@@ -0,0 +1,262 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
from dataclasses import dataclass
from typing import Tuple, Union
import torch
from ..utils import get_logger
from ._common import _ALL_TRANSFORMER_BLOCK_IDENTIFIERS
from .hooks import HookRegistry, ModelHook
from .utils import _extract_return_information
logger = get_logger(__name__) # pylint: disable=invalid-name
_FBC_LEADER_BLOCK_HOOK = "fbc_leader_block_hook"
_FBC_BLOCK_HOOK = "fbc_block_hook"
@dataclass
class FirstBlockCacheConfig:
r"""
Configuration for [First Block
Cache](https://github.com/chengzeyi/ParaAttention/blob/7a266123671b55e7e5a2fe9af3121f07a36afc78/README.md#first-block-cache-our-dynamic-caching).
Args:
threshold (`float`, defaults to `0.05`):
The threshold to determine whether or not a forward pass through all layers of the model is required. A
higher threshold usually results in lower number of forward passes and faster inference, but might lead to
poorer generation quality. A lower threshold may not result in significant generation speedup. The
threshold is compared against the absmean difference of the residuals between the current and cached
outputs from the first transformer block. If the difference is below the threshold, the forward pass is
skipped.
"""
threshold: float = 0.05
class FBCSharedBlockState:
def __init__(self) -> None:
self.head_block_output: Union[torch.Tensor, Tuple[torch.Tensor, ...]] = None
self.head_block_residual: torch.Tensor = None
self.tail_block_residuals: Union[torch.Tensor, Tuple[torch.Tensor, ...]] = None
self.should_compute: bool = True
def reset(self):
self.tail_block_residuals = None
self.should_compute = True
def __repr__(self):
return f"FirstBlockCacheSharedState(cache={self.cache})"
class FBCHeadBlockHook(ModelHook):
_is_stateful = True
def __init__(self, shared_state: FBCSharedBlockState, threshold: float):
self.shared_state = shared_state
self.threshold = threshold
def initialize_hook(self, module: torch.nn.Module) -> torch.nn.Module:
inputs = inspect.signature(module.__class__.forward)
inputs_index_to_str = dict(enumerate(inputs.parameters.keys()))
inputs_str_to_index = {v: k for k, v in inputs_index_to_str.items()}
try:
outputs = _extract_return_information(module.__class__.forward)
outputs_index_to_str = dict(enumerate(outputs))
outputs_str_to_index = {v: k for k, v in outputs_index_to_str.items()}
except RuntimeError:
logger.error(f"Failed to extract return information for {module.__class__}")
raise NotImplementedError(
f"Module {module.__class__} is not supported with FirstBlockCache. Please open an issue at "
f"https://github.com/huggingface/diffusers to notify us about the error with a minimal example "
f"in order for us to add support for this module."
)
self._inputs_index_to_str = inputs_index_to_str
self._inputs_str_to_index = inputs_str_to_index
self._outputs_index_to_str = outputs_index_to_str
self._outputs_str_to_index = outputs_str_to_index
return module
def new_forward(self, module: torch.nn.Module, *args, **kwargs):
hs_input_idx = self._inputs_str_to_index.get("hidden_states")
ehs_input_idx = self._inputs_str_to_index.get("encoder_hidden_states", None)
original_hs = kwargs.get("hidden_states", None)
original_ehs = kwargs.get("encoder_hidden_states", None)
original_hs = original_hs if original_hs is not None else args[hs_input_idx]
if ehs_input_idx is not None:
original_ehs = original_ehs if original_ehs is not None else args[ehs_input_idx]
hs_output_idx = self._outputs_str_to_index.get("hidden_states")
ehs_output_idx = self._outputs_str_to_index.get("encoder_hidden_states", None)
assert (ehs_input_idx is None) == (ehs_output_idx is None)
output = self.fn_ref.original_forward(*args, **kwargs)
hs_residual = None
if isinstance(output, tuple):
hs_residual = output[hs_output_idx] - original_hs
else:
hs_residual = output - original_hs
should_compute = self._should_compute_remaining_blocks(hs_residual)
self.shared_state.should_compute = should_compute
hs, ehs = None, None
if not should_compute:
# Apply caching
logger.info("Skipping forward pass through remaining blocks")
hs = self.shared_state.tail_block_residuals[0] + output[hs_output_idx]
if ehs_output_idx is not None:
ehs = self.shared_state.tail_block_residuals[1] + output[ehs_output_idx]
if isinstance(output, tuple):
return_output = [None] * len(output)
return_output[hs_output_idx] = hs
return_output[ehs_output_idx] = ehs
return_output = tuple(return_output)
else:
return_output = hs
return return_output
else:
logger.info("Computing forward pass through remaining blocks")
if isinstance(output, tuple):
head_block_output = [None] * len(output)
head_block_output[0] = output[hs_output_idx]
head_block_output[1] = output[ehs_output_idx]
else:
head_block_output = output
self.shared_state.head_block_output = head_block_output
self.shared_state.head_block_residual = hs_residual
return output
def reset_state(self, module):
self.shared_state.reset()
return module
def _should_compute_remaining_blocks(self, hs_residual: torch.Tensor) -> bool:
if self.shared_state.head_block_residual is None:
return True
prev_hs_residual = self.shared_state.head_block_residual
hs_absmean = (hs_residual - prev_hs_residual).abs().mean()
prev_hs_mean = prev_hs_residual.abs().mean()
diff = (hs_absmean / prev_hs_mean).item()
logger.info(f"Diff: {diff}, Threshold: {self.threshold}")
return diff > self.threshold
class FBCBlockHook(ModelHook):
def __init__(self, shared_state: FBCSharedBlockState, is_tail: bool = False):
super().__init__()
self.shared_state = shared_state
self.is_tail = is_tail
def initialize_hook(self, module):
inputs = inspect.signature(module.__class__.forward)
inputs_index_to_str = dict(enumerate(inputs.parameters.keys()))
inputs_str_to_index = {v: k for k, v in inputs_index_to_str.items()}
try:
outputs = _extract_return_information(module.__class__.forward)
outputs_index_to_str = dict(enumerate(outputs))
outputs_str_to_index = {v: k for k, v in outputs_index_to_str.items()}
except RuntimeError:
logger.error(f"Failed to extract return information for {module.__class__}")
raise NotImplementedError(
f"Module {module.__class__} is not supported with FirstBlockCache. Please open an issue at "
f"https://github.com/huggingface/diffusers to notify us about the error with a minimal example "
f"in order for us to add support for this module."
)
self._inputs_index_to_str = inputs_index_to_str
self._inputs_str_to_index = inputs_str_to_index
self._outputs_index_to_str = outputs_index_to_str
self._outputs_str_to_index = outputs_str_to_index
return module
def new_forward(self, module: torch.nn.Module, *args, **kwargs):
hs_input_idx = self._inputs_str_to_index.get("hidden_states")
ehs_input_idx = self._inputs_str_to_index.get("encoder_hidden_states", None)
original_hs = kwargs.get("hidden_states", None)
original_ehs = kwargs.get("encoder_hidden_states", None)
original_hs = original_hs if original_hs is not None else args[hs_input_idx]
if ehs_input_idx is not None:
original_ehs = original_ehs if original_ehs is not None else args[ehs_input_idx]
hs_output_idx = self._outputs_str_to_index.get("hidden_states")
ehs_output_idx = self._outputs_str_to_index.get("encoder_hidden_states", None)
assert (ehs_input_idx is None) == (ehs_output_idx is None)
if self.shared_state.should_compute:
output = self.fn_ref.original_forward(*args, **kwargs)
if self.is_tail:
hs_residual, ehs_residual = None, None
if isinstance(output, tuple):
hs_residual = output[hs_output_idx] - self.shared_state.head_block_output[0]
ehs_residual = output[ehs_output_idx] - self.shared_state.head_block_output[1]
else:
hs_residual = output - self.shared_state.head_block_output
self.shared_state.tail_block_residuals = (hs_residual, ehs_residual)
return output
output_count = len(self._outputs_index_to_str.keys())
return_output = [None] * output_count if output_count > 1 else original_hs
if output_count == 1:
return_output = original_hs
else:
return_output[hs_output_idx] = original_hs
return_output[ehs_output_idx] = original_ehs
return return_output
def apply_first_block_cache(module: torch.nn.Module, config: FirstBlockCacheConfig) -> None:
shared_state = FBCSharedBlockState()
remaining_blocks = []
for name, submodule in module.named_children():
if name not in _ALL_TRANSFORMER_BLOCK_IDENTIFIERS or not isinstance(submodule, torch.nn.ModuleList):
continue
for block in submodule:
remaining_blocks.append((name, block))
head_block_name, head_block = remaining_blocks.pop(0)
tail_block_name, tail_block = remaining_blocks.pop(-1)
logger.debug(f"Apply FBCHeadBlockHook to '{head_block_name}'")
apply_fbc_head_block_hook(head_block, shared_state, config.threshold)
for name, block in remaining_blocks:
logger.debug(f"Apply FBCBlockHook to '{name}'")
apply_fbc_block_hook(block, shared_state)
logger.debug(f"Apply FBCBlockHook to tail block '{tail_block_name}'")
apply_fbc_block_hook(tail_block, shared_state, is_tail=True)
def apply_fbc_head_block_hook(block: torch.nn.Module, state: FBCSharedBlockState, threshold: float) -> None:
registry = HookRegistry.check_if_exists_or_initialize(block)
hook = FBCHeadBlockHook(state, threshold)
registry.register_hook(hook, _FBC_LEADER_BLOCK_HOOK)
def apply_fbc_block_hook(block: torch.nn.Module, state: FBCSharedBlockState, is_tail: bool = False) -> None:
registry = HookRegistry.check_if_exists_or_initialize(block)
hook = FBCBlockHook(state, is_tail)
registry.register_hook(hook, _FBC_BLOCK_HOOK)
@@ -20,19 +20,18 @@ import torch
from ..models.attention_processor import Attention, MochiAttention
from ..utils import logging
from ._common import (
_ATTENTION_CLASSES,
_CROSS_TRANSFORMER_BLOCK_IDENTIFIERS,
_SPATIAL_TRANSFORMER_BLOCK_IDENTIFIERS,
_TEMPORAL_TRANSFORMER_BLOCK_IDENTIFIERS,
)
from .hooks import HookRegistry, ModelHook
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
_ATTENTION_CLASSES = (Attention, MochiAttention)
_SPATIAL_ATTENTION_BLOCK_IDENTIFIERS = ("blocks", "transformer_blocks", "single_transformer_blocks")
_TEMPORAL_ATTENTION_BLOCK_IDENTIFIERS = ("temporal_transformer_blocks",)
_CROSS_ATTENTION_BLOCK_IDENTIFIERS = ("blocks", "transformer_blocks")
@dataclass
class PyramidAttentionBroadcastConfig:
r"""
@@ -76,9 +75,9 @@ class PyramidAttentionBroadcastConfig:
temporal_attention_timestep_skip_range: Tuple[int, int] = (100, 800)
cross_attention_timestep_skip_range: Tuple[int, int] = (100, 800)
spatial_attention_block_identifiers: Tuple[str, ...] = _SPATIAL_ATTENTION_BLOCK_IDENTIFIERS
temporal_attention_block_identifiers: Tuple[str, ...] = _TEMPORAL_ATTENTION_BLOCK_IDENTIFIERS
cross_attention_block_identifiers: Tuple[str, ...] = _CROSS_ATTENTION_BLOCK_IDENTIFIERS
spatial_attention_block_identifiers: Tuple[str, ...] = _SPATIAL_TRANSFORMER_BLOCK_IDENTIFIERS
temporal_attention_block_identifiers: Tuple[str, ...] = _TEMPORAL_TRANSFORMER_BLOCK_IDENTIFIERS
cross_attention_block_identifiers: Tuple[str, ...] = _CROSS_TRANSFORMER_BLOCK_IDENTIFIERS
current_timestep_callback: Callable[[], int] = None
+59
View File
@@ -0,0 +1,59 @@
import ast
import inspect
import textwrap
from typing import List
def _extract_return_information(func) -> List[str]:
"""Extracts return variable names in order from a function."""
try:
source = inspect.getsource(func)
source = textwrap.dedent(source) # Modify indentation to make parsing compatible
except (OSError, TypeError):
try:
source_file = inspect.getfile(func)
with open(source_file, "r", encoding="utf-8") as f:
source = f.read()
# Extract function definition manually
source_lines = source.splitlines()
func_name = func.__name__
start_line = None
indent_level = None
extracted_lines = []
for i, line in enumerate(source_lines):
stripped = line.strip()
if stripped.startswith(f"def {func_name}("):
start_line = i
indent_level = len(line) - len(line.lstrip())
extracted_lines.append(line)
continue
if start_line is not None:
# Stop when indentation level decreases (end of function)
current_indent = len(line) - len(line.lstrip())
if current_indent <= indent_level and line.strip():
break
extracted_lines.append(line)
source = "\n".join(extracted_lines)
except Exception as e:
raise RuntimeError(f"Failed to retrieve function source: {e}")
# Parse source code using AST
tree = ast.parse(source)
return_vars = []
class ReturnVisitor(ast.NodeVisitor):
def visit_Return(self, node):
if isinstance(node.value, ast.Tuple):
# Multiple return values
return_vars.extend(var.id for var in node.value.elts if isinstance(var, ast.Name))
elif isinstance(node.value, ast.Name):
# Single return value
return_vars.append(node.value.id)
visitor = ReturnVisitor()
visitor.visit(tree)
return return_vars
+20 -29
View File
@@ -23,9 +23,7 @@ from safetensors import safe_open
from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, load_state_dict
from ..utils import (
USE_PEFT_BACKEND,
_get_detailed_type,
_get_model_file,
_is_valid_type,
is_accelerate_available,
is_torch_version,
is_transformers_available,
@@ -579,36 +577,29 @@ class FluxIPAdapterMixin:
pipeline.set_ip_adapter_scale(ip_strengths)
```
"""
scale_type = Union[int, float]
num_ip_adapters = self.transformer.encoder_hid_proj.num_ip_adapters
num_layers = self.transformer.config.num_layers
# Single value for all layers of all IP-Adapters
if isinstance(scale, scale_type):
scale = [scale for _ in range(num_ip_adapters)]
# List of per-layer scales for a single IP-Adapter
elif _is_valid_type(scale, List[scale_type]) and num_ip_adapters == 1:
transformer = self.transformer
if not isinstance(scale, list):
scale = [[scale] * transformer.config.num_layers]
elif isinstance(scale, list) and isinstance(scale[0], int) or isinstance(scale[0], float):
if len(scale) != transformer.config.num_layers:
raise ValueError(f"Expected list of {transformer.config.num_layers} scales, got {len(scale)}.")
scale = [scale]
# Invalid scale type
elif not _is_valid_type(scale, List[Union[scale_type, List[scale_type]]]):
raise TypeError(f"Unexpected type {_get_detailed_type(scale)} for scale.")
if len(scale) != num_ip_adapters:
raise ValueError(f"Cannot assign {len(scale)} scales to {num_ip_adapters} IP-Adapters.")
scale_configs = scale
if any(len(s) != num_layers for s in scale if isinstance(s, list)):
invalid_scale_sizes = {len(s) for s in scale if isinstance(s, list)} - {num_layers}
raise ValueError(
f"Expected list of {num_layers} scales, got {', '.join(str(x) for x in invalid_scale_sizes)}."
)
# Scalars are transformed to lists with length num_layers
scale_configs = [[s] * num_layers if isinstance(s, scale_type) else s for s in scale]
# Set scales. zip over scale_configs prevents going into single transformer layers
for attn_processor, *scale in zip(self.transformer.attn_processors.values(), *scale_configs):
attn_processor.scale = scale
key_id = 0
for attn_name, attn_processor in transformer.attn_processors.items():
if isinstance(attn_processor, (FluxIPAdapterJointAttnProcessor2_0)):
if len(scale_configs) != len(attn_processor.scale):
raise ValueError(
f"Cannot assign {len(scale_configs)} scale_configs to "
f"{len(attn_processor.scale)} IP-Adapter."
)
elif len(scale_configs) == 1:
scale_configs = scale_configs * len(attn_processor.scale)
for i, scale_config in enumerate(scale_configs):
attn_processor.scale[i] = scale_config[key_id]
key_id += 1
def unload_ip_adapter(self):
"""
+9 -26
View File
@@ -63,9 +63,6 @@ def _maybe_adjust_config(config):
method removes the ambiguity by following what is described here:
https://github.com/huggingface/diffusers/pull/9985#issuecomment-2493840028.
"""
# Track keys that have been explicitly removed to prevent re-adding them.
deleted_keys = set()
rank_pattern = config["rank_pattern"].copy()
target_modules = config["target_modules"]
original_r = config["r"]
@@ -83,22 +80,21 @@ def _maybe_adjust_config(config):
ambiguous_key = key
if exact_matches and substring_matches:
# if ambiguous, update the rank associated with the ambiguous key (`proj_out`, for example)
# if ambiguous we update the rank associated with the ambiguous key (`proj_out`, for example)
config["r"] = key_rank
# remove the ambiguous key from `rank_pattern` and record it as deleted
# remove the ambiguous key from `rank_pattern` and update its rank to `r`, instead
del config["rank_pattern"][key]
deleted_keys.add(key)
# For substring matches, add them with the original rank only if they haven't been assigned already
for mod in substring_matches:
if mod not in config["rank_pattern"] and mod not in deleted_keys:
# avoid overwriting if the module already has a specific rank
if mod not in config["rank_pattern"]:
config["rank_pattern"][mod] = original_r
# Update the rest of the target modules with the original rank if not already set and not deleted
# update the rest of the keys with the `original_r`
for mod in target_modules:
if mod != ambiguous_key and mod not in config["rank_pattern"] and mod not in deleted_keys:
if mod != ambiguous_key and mod not in config["rank_pattern"]:
config["rank_pattern"][mod] = original_r
# Handle alphas to deal with cases like:
# handle alphas to deal with cases like
# https://github.com/huggingface/diffusers/pull/9999#issuecomment-2516180777
has_different_ranks = len(config["rank_pattern"]) > 1 and list(config["rank_pattern"])[0] != config["r"]
if has_different_ranks:
@@ -191,11 +187,6 @@ class PeftAdapterMixin:
from peft import LoraConfig, inject_adapter_in_model, set_peft_model_state_dict
from peft.tuners.tuners_utils import BaseTunerLayer
try:
from peft.utils.constants import FULLY_QUALIFIED_PATTERN_KEY_PREFIX
except ImportError:
FULLY_QUALIFIED_PATTERN_KEY_PREFIX = None
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
proxies = kwargs.pop("proxies", None)
@@ -260,22 +251,14 @@ class PeftAdapterMixin:
# Cannot figure out rank from lora layers that don't have atleast 2 dimensions.
# Bias layers in LoRA only have a single dimension
if "lora_B" in key and val.ndim > 1:
# Support to handle cases where layer patterns are treated as full layer names
# was added later in PEFT. So, we handle it accordingly.
# TODO: when we fix the minimal PEFT version for Diffusers,
# we should remove `_maybe_adjust_config()`.
if FULLY_QUALIFIED_PATTERN_KEY_PREFIX:
rank[f"{FULLY_QUALIFIED_PATTERN_KEY_PREFIX}{key}"] = val.shape[1]
else:
rank[key] = val.shape[1]
rank[key] = val.shape[1]
if network_alphas is not None and len(network_alphas) >= 1:
alpha_keys = [k for k in network_alphas.keys() if k.startswith(f"{prefix}.")]
network_alphas = {k.replace(f"{prefix}.", ""): v for k, v in network_alphas.items() if k in alpha_keys}
lora_config_kwargs = get_peft_kwargs(rank, network_alpha_dict=network_alphas, peft_state_dict=state_dict)
if not FULLY_QUALIFIED_PATTERN_KEY_PREFIX:
lora_config_kwargs = _maybe_adjust_config(lora_config_kwargs)
lora_config_kwargs = _maybe_adjust_config(lora_config_kwargs)
if "use_dora" in lora_config_kwargs:
if lora_config_kwargs["use_dora"]:
+1 -7
View File
@@ -360,17 +360,11 @@ class FromSingleFileMixin:
cache_dir = kwargs.pop("cache_dir", None)
local_files_only = kwargs.pop("local_files_only", False)
revision = kwargs.pop("revision", None)
torch_dtype = kwargs.pop("torch_dtype", torch.float32)
torch_dtype = kwargs.pop("torch_dtype", None)
disable_mmap = kwargs.pop("disable_mmap", False)
is_legacy_loading = False
if not isinstance(torch_dtype, torch.dtype):
torch_dtype = torch.float32
logger.warning(
f"Passed `torch_dtype` {torch_dtype} is not a `torch.dtype`. Defaulting to `torch.float32`."
)
# We shouldn't allow configuring individual models components through a Pipeline creation method
# These model kwargs should be deprecated
scaling_factor = kwargs.get("scaling_factor", None)
+1 -7
View File
@@ -240,17 +240,11 @@ class FromOriginalModelMixin:
subfolder = kwargs.pop("subfolder", None)
revision = kwargs.pop("revision", None)
config_revision = kwargs.pop("config_revision", None)
torch_dtype = kwargs.pop("torch_dtype", torch.float32)
torch_dtype = kwargs.pop("torch_dtype", None)
quantization_config = kwargs.pop("quantization_config", None)
device = kwargs.pop("device", None)
disable_mmap = kwargs.pop("disable_mmap", False)
if not isinstance(torch_dtype, torch.dtype):
torch_dtype = torch.float32
logger.warning(
f"Passed `torch_dtype` {torch_dtype} is not a `torch.dtype`. Defaulting to `torch.float32`."
)
if isinstance(pretrained_model_link_or_path_or_dict, dict):
checkpoint = pretrained_model_link_or_path_or_dict
else:
+9 -12
View File
@@ -213,9 +213,7 @@ class Attention(nn.Module):
self.norm_q = LpNorm(p=2, dim=-1, eps=eps)
self.norm_k = LpNorm(p=2, dim=-1, eps=eps)
else:
raise ValueError(
f"unknown qk_norm: {qk_norm}. Should be one of None, 'layer_norm', 'fp32_layer_norm', 'layer_norm_across_heads', 'rms_norm', 'rms_norm_across_heads', 'l2'."
)
raise ValueError(f"unknown qk_norm: {qk_norm}. Should be None,'layer_norm','fp32_layer_norm','rms_norm'")
if cross_attention_norm is None:
self.norm_cross = None
@@ -1410,7 +1408,7 @@ class JointAttnProcessor2_0:
def __init__(self):
if not hasattr(F, "scaled_dot_product_attention"):
raise ImportError("JointAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
def __call__(
self,
@@ -2780,8 +2778,9 @@ class FluxIPAdapterJointAttnProcessor2_0(torch.nn.Module):
# IP-adapter
ip_query = hidden_states_query_proj
ip_attn_output = torch.zeros_like(hidden_states)
ip_attn_output = None
# for ip-adapter
# TODO: support for multiple adapters
for current_ip_hidden_states, scale, to_k_ip, to_v_ip in zip(
ip_hidden_states, self.scale, self.to_k_ip, self.to_v_ip
):
@@ -2792,14 +2791,12 @@ class FluxIPAdapterJointAttnProcessor2_0(torch.nn.Module):
ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
# the output of sdp = (batch, num_heads, seq_len, head_dim)
# TODO: add support for attn.scale when we move to Torch 2.1
current_ip_hidden_states = F.scaled_dot_product_attention(
ip_attn_output = F.scaled_dot_product_attention(
ip_query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False
)
current_ip_hidden_states = current_ip_hidden_states.transpose(1, 2).reshape(
batch_size, -1, attn.heads * head_dim
)
current_ip_hidden_states = current_ip_hidden_states.to(ip_query.dtype)
ip_attn_output += scale * current_ip_hidden_states
ip_attn_output = ip_attn_output.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
ip_attn_output = scale * ip_attn_output
ip_attn_output = ip_attn_output.to(ip_query.dtype)
return hidden_states, encoder_hidden_states, ip_attn_output
else:
@@ -40,48 +40,6 @@ class SD3ControlNetOutput(BaseOutput):
class SD3ControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
r"""
ControlNet model for [Stable Diffusion 3](https://huggingface.co/papers/2403.03206).
Parameters:
sample_size (`int`, defaults to `128`):
The width/height of the latents. This is fixed during training since it is used to learn a number of
position embeddings.
patch_size (`int`, defaults to `2`):
Patch size to turn the input data into small patches.
in_channels (`int`, defaults to `16`):
The number of latent channels in the input.
num_layers (`int`, defaults to `18`):
The number of layers of transformer blocks to use.
attention_head_dim (`int`, defaults to `64`):
The number of channels in each head.
num_attention_heads (`int`, defaults to `18`):
The number of heads to use for multi-head attention.
joint_attention_dim (`int`, defaults to `4096`):
The embedding dimension to use for joint text-image attention.
caption_projection_dim (`int`, defaults to `1152`):
The embedding dimension of caption embeddings.
pooled_projection_dim (`int`, defaults to `2048`):
The embedding dimension of pooled text projections.
out_channels (`int`, defaults to `16`):
The number of latent channels in the output.
pos_embed_max_size (`int`, defaults to `96`):
The maximum latent height/width of positional embeddings.
extra_conditioning_channels (`int`, defaults to `0`):
The number of extra channels to use for conditioning for patch embedding.
dual_attention_layers (`Tuple[int, ...]`, defaults to `()`):
The number of dual-stream transformer blocks to use.
qk_norm (`str`, *optional*, defaults to `None`):
The normalization to use for query and key in the attention layer. If `None`, no normalization is used.
pos_embed_type (`str`, defaults to `"sincos"`):
The type of positional embedding to use. Choose between `"sincos"` and `None`.
use_pos_embed (`bool`, defaults to `True`):
Whether to use positional embeddings.
force_zeros_for_pooled_projection (`bool`, defaults to `True`):
Whether to force zeros for pooled projection embeddings. This is handled in the pipelines by reading the
config value of the ControlNet model.
"""
_supports_gradient_checkpointing = True
@register_to_config
@@ -135,7 +93,7 @@ class SD3ControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginal
JointTransformerBlock(
dim=self.inner_dim,
num_attention_heads=num_attention_heads,
attention_head_dim=attention_head_dim,
attention_head_dim=self.config.attention_head_dim,
context_pre_only=False,
qk_norm=qk_norm,
use_dual_attention=True if i in dual_attention_layers else False,
@@ -150,7 +108,7 @@ class SD3ControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginal
SD3SingleTransformerBlock(
dim=self.inner_dim,
num_attention_heads=num_attention_heads,
attention_head_dim=attention_head_dim,
attention_head_dim=self.config.attention_head_dim,
)
for _ in range(num_layers)
]
@@ -339,28 +297,28 @@ class SD3ControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginal
def forward(
self,
hidden_states: torch.Tensor,
hidden_states: torch.FloatTensor,
controlnet_cond: torch.Tensor,
conditioning_scale: float = 1.0,
encoder_hidden_states: torch.Tensor = None,
pooled_projections: torch.Tensor = None,
encoder_hidden_states: torch.FloatTensor = None,
pooled_projections: torch.FloatTensor = None,
timestep: torch.LongTensor = None,
joint_attention_kwargs: Optional[Dict[str, Any]] = None,
return_dict: bool = True,
) -> Union[torch.Tensor, Transformer2DModelOutput]:
) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
"""
The [`SD3Transformer2DModel`] forward method.
Args:
hidden_states (`torch.Tensor` of shape `(batch size, channel, height, width)`):
hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
Input `hidden_states`.
controlnet_cond (`torch.Tensor`):
The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`.
conditioning_scale (`float`, defaults to `1.0`):
The scale factor for ControlNet outputs.
encoder_hidden_states (`torch.Tensor` of shape `(batch size, sequence_len, embed_dims)`):
encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`):
Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
pooled_projections (`torch.Tensor` of shape `(batch_size, projection_dim)`): Embeddings projected
pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`): Embeddings projected
from the embeddings of input conditions.
timestep ( `torch.LongTensor`):
Used to indicate denoising step.
@@ -479,11 +437,11 @@ class SD3MultiControlNetModel(ModelMixin):
def forward(
self,
hidden_states: torch.Tensor,
hidden_states: torch.FloatTensor,
controlnet_cond: List[torch.tensor],
conditioning_scale: List[float],
pooled_projections: torch.Tensor,
encoder_hidden_states: torch.Tensor = None,
pooled_projections: torch.FloatTensor,
encoder_hidden_states: torch.FloatTensor = None,
timestep: torch.LongTensor = None,
joint_attention_kwargs: Optional[Dict[str, Any]] = None,
return_dict: bool = True,
-5
View File
@@ -2583,11 +2583,6 @@ class MultiIPAdapterImageProjection(nn.Module):
super().__init__()
self.image_projection_layers = nn.ModuleList(IPAdapterImageProjectionLayers)
@property
def num_ip_adapters(self) -> int:
"""Number of IP-Adapters loaded."""
return len(self.image_projection_layers)
def forward(self, image_embeds: List[torch.Tensor]):
projected_image_embeds = []
+1 -7
View File
@@ -866,7 +866,7 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
local_files_only = kwargs.pop("local_files_only", None)
token = kwargs.pop("token", None)
revision = kwargs.pop("revision", None)
torch_dtype = kwargs.pop("torch_dtype", torch.float32)
torch_dtype = kwargs.pop("torch_dtype", None)
subfolder = kwargs.pop("subfolder", None)
device_map = kwargs.pop("device_map", None)
max_memory = kwargs.pop("max_memory", None)
@@ -879,12 +879,6 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
dduf_entries: Optional[Dict[str, DDUFEntry]] = kwargs.pop("dduf_entries", None)
disable_mmap = kwargs.pop("disable_mmap", False)
if not isinstance(torch_dtype, torch.dtype):
torch_dtype = torch.float32
logger.warning(
f"Passed `torch_dtype` {torch_dtype} is not a `torch.dtype`. Defaulting to `torch.float32`."
)
allow_pickle = False
if use_safetensors is None:
use_safetensors = True
@@ -18,6 +18,7 @@ from typing import Any, Dict, Optional, Tuple, Union
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import FluxTransformer2DLoadersMixin, FromOriginalModelMixin, PeftAdapterMixin
@@ -31,7 +32,7 @@ from ...models.attention_processor import (
)
from ...models.modeling_utils import ModelMixin
from ...models.normalization import AdaLayerNormContinuous, AdaLayerNormZero, AdaLayerNormZeroSingle
from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
from ...utils.import_utils import is_torch_npu_available
from ...utils.torch_utils import maybe_allow_in_graph
from ..cache_utils import CacheMixin
@@ -44,7 +45,20 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
@maybe_allow_in_graph
class FluxSingleTransformerBlock(nn.Module):
def __init__(self, dim: int, num_attention_heads: int, attention_head_dim: int, mlp_ratio: float = 4.0):
r"""
A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3.
Reference: https://arxiv.org/abs/2403.03206
Parameters:
dim (`int`): The number of channels in the input and output.
num_attention_heads (`int`): The number of heads to use for multi-head attention.
attention_head_dim (`int`): The number of channels in each head.
context_pre_only (`bool`): Boolean to determine if we should add some blocks associated with the
processing of `context` conditions.
"""
def __init__(self, dim, num_attention_heads, attention_head_dim, mlp_ratio=4.0):
super().__init__()
self.mlp_hidden_dim = int(dim * mlp_ratio)
@@ -54,15 +68,9 @@ class FluxSingleTransformerBlock(nn.Module):
self.proj_out = nn.Linear(dim + self.mlp_hidden_dim, dim)
if is_torch_npu_available():
deprecation_message = (
"Defaulting to FluxAttnProcessor2_0_NPU for NPU devices will be removed. Attention processors "
"should be set explicitly using the `set_attn_processor` method."
)
deprecate("npu_processor", "0.34.0", deprecation_message)
processor = FluxAttnProcessor2_0_NPU()
else:
processor = FluxAttnProcessor2_0()
self.attn = Attention(
query_dim=dim,
cross_attention_dim=None,
@@ -79,10 +87,13 @@ class FluxSingleTransformerBlock(nn.Module):
def forward(
self,
hidden_states: torch.Tensor,
encoder_hidden_states: torch.Tensor,
temb: torch.Tensor,
image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
joint_attention_kwargs: Optional[Dict[str, Any]] = None,
) -> torch.Tensor:
hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
residual = hidden_states
norm_hidden_states, gate = self.norm(hidden_states, emb=temb)
mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))
@@ -100,19 +111,47 @@ class FluxSingleTransformerBlock(nn.Module):
if hidden_states.dtype == torch.float16:
hidden_states = hidden_states.clip(-65504, 65504)
return hidden_states
encoder_hidden_states, hidden_states = hidden_states.split(
[encoder_hidden_states.size(1), hidden_states.size(1) - encoder_hidden_states.size(1)], dim=1
)
return hidden_states, encoder_hidden_states
@maybe_allow_in_graph
class FluxTransformerBlock(nn.Module):
r"""
A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3.
Reference: https://arxiv.org/abs/2403.03206
Args:
dim (`int`):
The embedding dimension of the block.
num_attention_heads (`int`):
The number of attention heads to use.
attention_head_dim (`int`):
The number of dimensions to use for each attention head.
qk_norm (`str`, defaults to `"rms_norm"`):
The normalization to use for the query and key tensors.
eps (`float`, defaults to `1e-6`):
The epsilon value to use for the normalization.
"""
def __init__(
self, dim: int, num_attention_heads: int, attention_head_dim: int, qk_norm: str = "rms_norm", eps: float = 1e-6
):
super().__init__()
self.norm1 = AdaLayerNormZero(dim)
self.norm1_context = AdaLayerNormZero(dim)
if hasattr(F, "scaled_dot_product_attention"):
processor = FluxAttnProcessor2_0()
else:
raise ValueError(
"The current PyTorch version does not support the `scaled_dot_product_attention` function."
)
self.attn = Attention(
query_dim=dim,
cross_attention_dim=None,
@@ -122,7 +161,7 @@ class FluxTransformerBlock(nn.Module):
out_dim=dim,
context_pre_only=False,
bias=True,
processor=FluxAttnProcessor2_0(),
processor=processor,
qk_norm=qk_norm,
eps=eps,
)
@@ -133,6 +172,10 @@ class FluxTransformerBlock(nn.Module):
self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
self.ff_context = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
# let chunk size default to None
self._chunk_size = None
self._chunk_dim = 0
def forward(
self,
hidden_states: torch.Tensor,
@@ -187,7 +230,7 @@ class FluxTransformerBlock(nn.Module):
if encoder_hidden_states.dtype == torch.float16:
encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)
return encoder_hidden_states, hidden_states
return hidden_states, encoder_hidden_states
class FluxTransformer2DModel(
@@ -480,7 +523,7 @@ class FluxTransformer2DModel(
for index_block, block in enumerate(self.transformer_blocks):
if torch.is_grad_enabled() and self.gradient_checkpointing:
encoder_hidden_states, hidden_states = self._gradient_checkpointing_func(
hidden_states, encoder_hidden_states = self._gradient_checkpointing_func(
block,
hidden_states,
encoder_hidden_states,
@@ -489,7 +532,7 @@ class FluxTransformer2DModel(
)
else:
encoder_hidden_states, hidden_states = block(
hidden_states, encoder_hidden_states = block(
hidden_states=hidden_states,
encoder_hidden_states=encoder_hidden_states,
temb=temb,
@@ -508,20 +551,21 @@ class FluxTransformer2DModel(
)
else:
hidden_states = hidden_states + controlnet_block_samples[index_block // interval_control]
hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
for index_block, block in enumerate(self.single_transformer_blocks):
if torch.is_grad_enabled() and self.gradient_checkpointing:
hidden_states = self._gradient_checkpointing_func(
hidden_states, encoder_hidden_states = self._gradient_checkpointing_func(
block,
hidden_states,
encoder_hidden_states,
temb,
image_rotary_emb,
)
else:
hidden_states = block(
hidden_states, encoder_hidden_states = block(
hidden_states=hidden_states,
encoder_hidden_states=encoder_hidden_states,
temb=temb,
image_rotary_emb=image_rotary_emb,
joint_attention_kwargs=joint_attention_kwargs,
@@ -531,12 +575,7 @@ class FluxTransformer2DModel(
if controlnet_single_block_samples is not None:
interval_control = len(self.single_transformer_blocks) / len(controlnet_single_block_samples)
interval_control = int(np.ceil(interval_control))
hidden_states[:, encoder_hidden_states.shape[1] :, ...] = (
hidden_states[:, encoder_hidden_states.shape[1] :, ...]
+ controlnet_single_block_samples[index_block // interval_control]
)
hidden_states = hidden_states[:, encoder_hidden_states.shape[1] :, ...]
hidden_states = hidden_states + controlnet_single_block_samples[index_block // interval_control]
hidden_states = self.norm_out(hidden_states, temb)
output = self.proj_out(hidden_states)
@@ -15,6 +15,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import FromOriginalModelMixin, PeftAdapterMixin, SD3Transformer2DLoadersMixin
@@ -38,6 +39,17 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
@maybe_allow_in_graph
class SD3SingleTransformerBlock(nn.Module):
r"""
A Single Transformer block as part of the MMDiT architecture, used in Stable Diffusion 3 ControlNet.
Reference: https://arxiv.org/abs/2403.03206
Parameters:
dim (`int`): The number of channels in the input and output.
num_attention_heads (`int`): The number of heads to use for multi-head attention.
attention_head_dim (`int`): The number of channels in each head.
"""
def __init__(
self,
dim: int,
@@ -47,13 +59,21 @@ class SD3SingleTransformerBlock(nn.Module):
super().__init__()
self.norm1 = AdaLayerNormZero(dim)
if hasattr(F, "scaled_dot_product_attention"):
processor = JointAttnProcessor2_0()
else:
raise ValueError(
"The current PyTorch version does not support the `scaled_dot_product_attention` function."
)
self.attn = Attention(
query_dim=dim,
dim_head=attention_head_dim,
heads=num_attention_heads,
out_dim=dim,
bias=True,
processor=JointAttnProcessor2_0(),
processor=processor,
eps=1e-6,
)
@@ -61,17 +81,23 @@ class SD3SingleTransformerBlock(nn.Module):
self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
def forward(self, hidden_states: torch.Tensor, temb: torch.Tensor):
# 1. Attention
norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
attn_output = self.attn(hidden_states=norm_hidden_states, encoder_hidden_states=None)
# Attention.
attn_output = self.attn(
hidden_states=norm_hidden_states,
encoder_hidden_states=None,
)
# Process attention outputs for the `hidden_states`.
attn_output = gate_msa.unsqueeze(1) * attn_output
hidden_states = hidden_states + attn_output
# 2. Feed Forward
norm_hidden_states = self.norm2(hidden_states)
norm_hidden_states = norm_hidden_states * (1 + scale_mlp.unsqueeze(1)) + shift_mlp.unsqueeze(1)
norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
ff_output = self.ff(norm_hidden_states)
ff_output = gate_mlp.unsqueeze(1) * ff_output
hidden_states = hidden_states + ff_output
return hidden_states
@@ -81,40 +107,26 @@ class SD3Transformer2DModel(
ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, SD3Transformer2DLoadersMixin
):
"""
The Transformer model introduced in [Stable Diffusion 3](https://huggingface.co/papers/2403.03206).
The Transformer model introduced in Stable Diffusion 3.
Reference: https://arxiv.org/abs/2403.03206
Parameters:
sample_size (`int`, defaults to `128`):
The width/height of the latents. This is fixed during training since it is used to learn a number of
position embeddings.
patch_size (`int`, defaults to `2`):
Patch size to turn the input data into small patches.
in_channels (`int`, defaults to `16`):
The number of latent channels in the input.
num_layers (`int`, defaults to `18`):
The number of layers of transformer blocks to use.
attention_head_dim (`int`, defaults to `64`):
The number of channels in each head.
num_attention_heads (`int`, defaults to `18`):
The number of heads to use for multi-head attention.
joint_attention_dim (`int`, defaults to `4096`):
The embedding dimension to use for joint text-image attention.
caption_projection_dim (`int`, defaults to `1152`):
The embedding dimension of caption embeddings.
pooled_projection_dim (`int`, defaults to `2048`):
The embedding dimension of pooled text projections.
out_channels (`int`, defaults to `16`):
The number of latent channels in the output.
pos_embed_max_size (`int`, defaults to `96`):
The maximum latent height/width of positional embeddings.
dual_attention_layers (`Tuple[int, ...]`, defaults to `()`):
The number of dual-stream transformer blocks to use.
qk_norm (`str`, *optional*, defaults to `None`):
The normalization to use for query and key in the attention layer. If `None`, no normalization is used.
sample_size (`int`): The width of the latent images. This is fixed during training since
it is used to learn a number of position embeddings.
patch_size (`int`): Patch size to turn the input data into small patches.
in_channels (`int`, *optional*, defaults to 16): The number of channels in the input.
num_layers (`int`, *optional*, defaults to 18): The number of layers of Transformer blocks to use.
attention_head_dim (`int`, *optional*, defaults to 64): The number of channels in each head.
num_attention_heads (`int`, *optional*, defaults to 18): The number of heads to use for multi-head attention.
cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
caption_projection_dim (`int`): Number of dimensions to use when projecting the `encoder_hidden_states`.
pooled_projection_dim (`int`): Number of dimensions to use when projecting the `pooled_projections`.
out_channels (`int`, defaults to 16): Number of output channels.
"""
_supports_gradient_checkpointing = True
_no_split_modules = ["JointTransformerBlock"]
_skip_layerwise_casting_patterns = ["pos_embed", "norm"]
@register_to_config
@@ -137,33 +149,36 @@ class SD3Transformer2DModel(
qk_norm: Optional[str] = None,
):
super().__init__()
self.out_channels = out_channels if out_channels is not None else in_channels
self.inner_dim = num_attention_heads * attention_head_dim
default_out_channels = in_channels
self.out_channels = out_channels if out_channels is not None else default_out_channels
self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim
self.pos_embed = PatchEmbed(
height=sample_size,
width=sample_size,
patch_size=patch_size,
in_channels=in_channels,
height=self.config.sample_size,
width=self.config.sample_size,
patch_size=self.config.patch_size,
in_channels=self.config.in_channels,
embed_dim=self.inner_dim,
pos_embed_max_size=pos_embed_max_size, # hard-code for now.
)
self.time_text_embed = CombinedTimestepTextProjEmbeddings(
embedding_dim=self.inner_dim, pooled_projection_dim=pooled_projection_dim
embedding_dim=self.inner_dim, pooled_projection_dim=self.config.pooled_projection_dim
)
self.context_embedder = nn.Linear(joint_attention_dim, caption_projection_dim)
self.context_embedder = nn.Linear(self.config.joint_attention_dim, self.config.caption_projection_dim)
# `attention_head_dim` is doubled to account for the mixing.
# It needs to crafted when we get the actual checkpoints.
self.transformer_blocks = nn.ModuleList(
[
JointTransformerBlock(
dim=self.inner_dim,
num_attention_heads=num_attention_heads,
attention_head_dim=attention_head_dim,
num_attention_heads=self.config.num_attention_heads,
attention_head_dim=self.config.attention_head_dim,
context_pre_only=i == num_layers - 1,
qk_norm=qk_norm,
use_dual_attention=True if i in dual_attention_layers else False,
)
for i in range(num_layers)
for i in range(self.config.num_layers)
]
)
@@ -316,24 +331,24 @@ class SD3Transformer2DModel(
def forward(
self,
hidden_states: torch.Tensor,
encoder_hidden_states: torch.Tensor = None,
pooled_projections: torch.Tensor = None,
hidden_states: torch.FloatTensor,
encoder_hidden_states: torch.FloatTensor = None,
pooled_projections: torch.FloatTensor = None,
timestep: torch.LongTensor = None,
block_controlnet_hidden_states: List = None,
joint_attention_kwargs: Optional[Dict[str, Any]] = None,
return_dict: bool = True,
skip_layers: Optional[List[int]] = None,
) -> Union[torch.Tensor, Transformer2DModelOutput]:
) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
"""
The [`SD3Transformer2DModel`] forward method.
Args:
hidden_states (`torch.Tensor` of shape `(batch size, channel, height, width)`):
hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
Input `hidden_states`.
encoder_hidden_states (`torch.Tensor` of shape `(batch size, sequence_len, embed_dims)`):
encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`):
Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
pooled_projections (`torch.Tensor` of shape `(batch_size, projection_dim)`):
pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`):
Embeddings projected from the embeddings of input conditions.
timestep (`torch.LongTensor`):
Used to indicate denoising step.
-2
View File
@@ -261,7 +261,6 @@ else:
_import_structure["marigold"].extend(
[
"MarigoldDepthPipeline",
"MarigoldIntrinsicsPipeline",
"MarigoldNormalsPipeline",
]
)
@@ -604,7 +603,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
from .lumina2 import Lumina2Text2ImgPipeline
from .marigold import (
MarigoldDepthPipeline,
MarigoldIntrinsicsPipeline,
MarigoldNormalsPipeline,
)
from .mochi import MochiPipeline
@@ -224,7 +224,7 @@ class AnimateDiffVideoToVideoPipeline(
vae: AutoencoderKL,
text_encoder: CLIPTextModel,
tokenizer: CLIPTokenizer,
unet: Union[UNet2DConditionModel, UNetMotionModel],
unet: UNet2DConditionModel,
motion_adapter: MotionAdapter,
scheduler: Union[
DDIMScheduler,
@@ -246,7 +246,7 @@ class AnimateDiffVideoToVideoControlNetPipeline(
vae: AutoencoderKL,
text_encoder: CLIPTextModel,
tokenizer: CLIPTokenizer,
unet: Union[UNet2DConditionModel, UNetMotionModel],
unet: UNet2DConditionModel,
motion_adapter: MotionAdapter,
controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
scheduler: Union[
-6
View File
@@ -34,10 +34,6 @@ from .controlnet import (
StableDiffusionXLControlNetUnionInpaintPipeline,
StableDiffusionXLControlNetUnionPipeline,
)
from .controlnet_sd3 import (
StableDiffusion3ControlNetInpaintingPipeline,
StableDiffusion3ControlNetPipeline,
)
from .deepfloyd_if import IFImg2ImgPipeline, IFInpaintingPipeline, IFPipeline
from .flux import (
FluxControlImg2ImgPipeline,
@@ -124,7 +120,6 @@ AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
("stable-diffusion-controlnet", StableDiffusionControlNetPipeline),
("stable-diffusion-xl-controlnet", StableDiffusionXLControlNetPipeline),
("stable-diffusion-xl-controlnet-union", StableDiffusionXLControlNetUnionPipeline),
("stable-diffusion-3-controlnet", StableDiffusion3ControlNetPipeline),
("wuerstchen", WuerstchenCombinedPipeline),
("cascade", StableCascadeCombinedPipeline),
("lcm", LatentConsistencyModelPipeline),
@@ -183,7 +178,6 @@ AUTO_INPAINT_PIPELINES_MAPPING = OrderedDict(
("stable-diffusion-controlnet-pag", StableDiffusionControlNetPAGInpaintPipeline),
("stable-diffusion-xl-controlnet", StableDiffusionXLControlNetInpaintPipeline),
("stable-diffusion-xl-controlnet-union", StableDiffusionXLControlNetUnionInpaintPipeline),
("stable-diffusion-3-controlnet", StableDiffusion3ControlNetInpaintingPipeline),
("stable-diffusion-xl-pag", StableDiffusionXLPAGInpaintPipeline),
("flux", FluxInpaintPipeline),
("flux-controlnet", FluxControlNetInpaintPipeline),
@@ -207,7 +207,7 @@ class StableDiffusionControlNetPipeline(
model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
_optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
_exclude_from_cpu_offload = ["safety_checker"]
_callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "image"]
_callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
def __init__(
self,
@@ -1323,7 +1323,6 @@ class StableDiffusionControlNetPipeline(
latents = callback_outputs.pop("latents", latents)
prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
image = callback_outputs.pop("image", image)
# call the callback, if provided
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
@@ -185,7 +185,7 @@ class StableDiffusionControlNetImg2ImgPipeline(
model_cpu_offload_seq = "text_encoder->unet->vae"
_optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
_exclude_from_cpu_offload = ["safety_checker"]
_callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "control_image"]
_callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
def __init__(
self,
@@ -1294,7 +1294,6 @@ class StableDiffusionControlNetImg2ImgPipeline(
latents = callback_outputs.pop("latents", latents)
prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
control_image = callback_outputs.pop("control_image", control_image)
# call the callback, if provided
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
@@ -184,7 +184,7 @@ class StableDiffusionControlNetInpaintPipeline(
model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
_optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
_exclude_from_cpu_offload = ["safety_checker"]
_callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "control_image"]
_callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
def __init__(
self,
@@ -1476,7 +1476,6 @@ class StableDiffusionControlNetInpaintPipeline(
latents = callback_outputs.pop("latents", latents)
prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
control_image = callback_outputs.pop("control_image", control_image)
# call the callback, if provided
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
@@ -237,7 +237,6 @@ class StableDiffusionXLControlNetInpaintPipeline(
"add_neg_time_ids",
"mask",
"masked_image_latents",
"control_image",
]
def __init__(
@@ -744,7 +743,7 @@ class StableDiffusionXLControlNetInpaintPipeline(
if padding_mask_crop is not None:
if not isinstance(image, PIL.Image.Image):
raise ValueError(
f"The image should be a PIL image when inpainting mask crop, but is of type {type(image)}."
f"The image should be a PIL image when inpainting mask crop, but is of type" f" {type(image)}."
)
if not isinstance(mask_image, PIL.Image.Image):
raise ValueError(
@@ -752,7 +751,7 @@ class StableDiffusionXLControlNetInpaintPipeline(
f" {type(mask_image)}."
)
if output_type != "pil":
raise ValueError(f"The output type should be PIL when inpainting mask crop, but is {output_type}.")
raise ValueError(f"The output type should be PIL when inpainting mask crop, but is" f" {output_type}.")
if prompt_embeds is not None and pooled_prompt_embeds is None:
raise ValueError(
@@ -1645,7 +1644,7 @@ class StableDiffusionXLControlNetInpaintPipeline(
f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
f" = {num_channels_latents + num_channels_masked_image + num_channels_mask}. Please verify the config of"
f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
" `pipeline.unet` or your `mask_image` or `image` input."
)
elif num_channels_unet != 4:
@@ -1836,7 +1835,6 @@ class StableDiffusionXLControlNetInpaintPipeline(
latents = callback_outputs.pop("latents", latents)
prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
control_image = callback_outputs.pop("control_image", control_image)
# call the callback, if provided
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
@@ -242,7 +242,6 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
"add_time_ids",
"negative_pooled_prompt_embeds",
"add_neg_time_ids",
"control_image",
]
def __init__(
@@ -1615,7 +1614,6 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
)
add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
add_neg_time_ids = callback_outputs.pop("add_neg_time_ids", add_neg_time_ids)
control_image = callback_outputs.pop("control_image", control_image)
# call the callback, if provided
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
@@ -219,7 +219,6 @@ class StableDiffusionXLControlNetUnionInpaintPipeline(
"add_time_ids",
"mask",
"masked_image_latents",
"control_image",
]
def __init__(
@@ -727,7 +726,7 @@ class StableDiffusionXLControlNetUnionInpaintPipeline(
if padding_mask_crop is not None:
if not isinstance(image, PIL.Image.Image):
raise ValueError(
f"The image should be a PIL image when inpainting mask crop, but is of type {type(image)}."
f"The image should be a PIL image when inpainting mask crop, but is of type" f" {type(image)}."
)
if not isinstance(mask_image, PIL.Image.Image):
raise ValueError(
@@ -735,7 +734,7 @@ class StableDiffusionXLControlNetUnionInpaintPipeline(
f" {type(mask_image)}."
)
if output_type != "pil":
raise ValueError(f"The output type should be PIL when inpainting mask crop, but is {output_type}.")
raise ValueError(f"The output type should be PIL when inpainting mask crop, but is" f" {output_type}.")
if prompt_embeds is not None and pooled_prompt_embeds is None:
raise ValueError(
@@ -1744,7 +1743,6 @@ class StableDiffusionXLControlNetUnionInpaintPipeline(
latents = callback_outputs.pop("latents", latents)
prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
control_image = callback_outputs.pop("control_image", control_image)
# call the callback, if provided
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
@@ -252,7 +252,12 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
"feature_extractor",
"image_encoder",
]
_callback_tensor_inputs = ["latents", "prompt_embeds", "add_text_embeds", "add_time_ids", "control_image"]
_callback_tensor_inputs = [
"latents",
"prompt_embeds",
"add_text_embeds",
"add_time_ids",
]
def __init__(
self,
@@ -1557,7 +1562,6 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
control_image = callback_outputs.pop("control_image", control_image)
# call the callback, if provided
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
@@ -232,8 +232,8 @@ class HunyuanDiTControlNetPipeline(DiffusionPipeline):
Tuple[HunyuanDiT2DControlNetModel],
HunyuanDiT2DMultiControlNetModel,
],
text_encoder_2: Optional[T5EncoderModel] = None,
tokenizer_2: Optional[MT5Tokenizer] = None,
text_encoder_2=T5EncoderModel,
tokenizer_2=MT5Tokenizer,
requires_safety_checker: bool = True,
):
super().__init__()
@@ -17,10 +17,10 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import torch
from transformers import (
BaseImageProcessor,
CLIPTextModelWithProjection,
CLIPTokenizer,
SiglipImageProcessor,
SiglipVisionModel,
PreTrainedModel,
T5EncoderModel,
T5TokenizerFast,
)
@@ -178,9 +178,9 @@ class StableDiffusion3ControlNetPipeline(
Provides additional conditioning to the `unet` during the denoising process. If you set multiple
ControlNets as a list, the outputs from each ControlNet are added together to create one combined
additional conditioning.
image_encoder (`SiglipVisionModel`, *optional*):
image_encoder (`PreTrainedModel`, *optional*):
Pre-trained Vision Model for IP Adapter.
feature_extractor (`SiglipImageProcessor`, *optional*):
feature_extractor (`BaseImageProcessor`, *optional*):
Image processor for IP Adapter.
"""
@@ -202,8 +202,8 @@ class StableDiffusion3ControlNetPipeline(
controlnet: Union[
SD3ControlNetModel, List[SD3ControlNetModel], Tuple[SD3ControlNetModel], SD3MultiControlNetModel
],
image_encoder: Optional[SiglipVisionModel] = None,
feature_extractor: Optional[SiglipImageProcessor] = None,
image_encoder: PreTrainedModel = None,
feature_extractor: BaseImageProcessor = None,
):
super().__init__()
if isinstance(controlnet, (list, tuple)):
@@ -17,10 +17,10 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import torch
from transformers import (
BaseImageProcessor,
CLIPTextModelWithProjection,
CLIPTokenizer,
SiglipImageProcessor,
SiglipModel,
PreTrainedModel,
T5EncoderModel,
T5TokenizerFast,
)
@@ -223,8 +223,8 @@ class StableDiffusion3ControlNetInpaintingPipeline(
controlnet: Union[
SD3ControlNetModel, List[SD3ControlNetModel], Tuple[SD3ControlNetModel], SD3MultiControlNetModel
],
image_encoder: SiglipModel = None,
feature_extractor: Optional[SiglipImageProcessor] = None,
image_encoder: PreTrainedModel = None,
feature_extractor: BaseImageProcessor = None,
):
super().__init__()
@@ -17,8 +17,6 @@ from typing import List, Optional, Tuple, Union
import torch
from ...models import UNet1DModel
from ...schedulers import SchedulerMixin
from ...utils import is_torch_xla_available, logging
from ...utils.torch_utils import randn_tensor
from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline
@@ -51,7 +49,7 @@ class DanceDiffusionPipeline(DiffusionPipeline):
model_cpu_offload_seq = "unet"
def __init__(self, unet: UNet1DModel, scheduler: SchedulerMixin):
def __init__(self, unet, scheduler):
super().__init__()
self.register_modules(unet=unet, scheduler=scheduler)
@@ -16,7 +16,6 @@ from typing import List, Optional, Tuple, Union
import torch
from ...models import UNet2DModel
from ...schedulers import DDIMScheduler
from ...utils import is_torch_xla_available
from ...utils.torch_utils import randn_tensor
@@ -48,7 +47,7 @@ class DDIMPipeline(DiffusionPipeline):
model_cpu_offload_seq = "unet"
def __init__(self, unet: UNet2DModel, scheduler: DDIMScheduler):
def __init__(self, unet, scheduler):
super().__init__()
# make sure scheduler can always be converted to DDIM
@@ -17,8 +17,6 @@ from typing import List, Optional, Tuple, Union
import torch
from ...models import UNet2DModel
from ...schedulers import DDPMScheduler
from ...utils import is_torch_xla_available
from ...utils.torch_utils import randn_tensor
from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
@@ -49,7 +47,7 @@ class DDPMPipeline(DiffusionPipeline):
model_cpu_offload_seq = "unet"
def __init__(self, unet: UNet2DModel, scheduler: DDPMScheduler):
def __init__(self, unet, scheduler):
super().__init__()
self.register_modules(unet=unet, scheduler=scheduler)
@@ -91,7 +91,7 @@ class RePaintPipeline(DiffusionPipeline):
scheduler: RePaintScheduler
model_cpu_offload_seq = "unet"
def __init__(self, unet: UNet2DModel, scheduler: RePaintScheduler):
def __init__(self, unet, scheduler):
super().__init__()
self.register_modules(unet=unet, scheduler=scheduler)
+7 -15
View File
@@ -405,28 +405,23 @@ class FluxPipeline(
if not isinstance(ip_adapter_image, list):
ip_adapter_image = [ip_adapter_image]
if len(ip_adapter_image) != self.transformer.encoder_hid_proj.num_ip_adapters:
if len(ip_adapter_image) != len(self.transformer.encoder_hid_proj.image_projection_layers):
raise ValueError(
f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {self.transformer.encoder_hid_proj.num_ip_adapters} IP Adapters."
f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.transformer.encoder_hid_proj.image_projection_layers)} IP Adapters."
)
for single_ip_adapter_image in ip_adapter_image:
for single_ip_adapter_image, image_proj_layer in zip(
ip_adapter_image, self.transformer.encoder_hid_proj.image_projection_layers
):
single_image_embeds = self.encode_image(single_ip_adapter_image, device, 1)
image_embeds.append(single_image_embeds[None, :])
else:
if not isinstance(ip_adapter_image_embeds, list):
ip_adapter_image_embeds = [ip_adapter_image_embeds]
if len(ip_adapter_image_embeds) != self.transformer.encoder_hid_proj.num_ip_adapters:
raise ValueError(
f"`ip_adapter_image_embeds` must have same length as the number of IP Adapters. Got {len(ip_adapter_image_embeds)} image embeds and {self.transformer.encoder_hid_proj.num_ip_adapters} IP Adapters."
)
for single_image_embeds in ip_adapter_image_embeds:
image_embeds.append(single_image_embeds)
ip_adapter_image_embeds = []
for single_image_embeds in image_embeds:
for i, single_image_embeds in enumerate(image_embeds):
single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
single_image_embeds = single_image_embeds.to(device=device)
ip_adapter_image_embeds.append(single_image_embeds)
@@ -877,13 +872,10 @@ class FluxPipeline(
negative_ip_adapter_image is None and negative_ip_adapter_image_embeds is None
):
negative_ip_adapter_image = np.zeros((width, height, 3), dtype=np.uint8)
negative_ip_adapter_image = [negative_ip_adapter_image] * self.transformer.encoder_hid_proj.num_ip_adapters
elif (ip_adapter_image is None and ip_adapter_image_embeds is None) and (
negative_ip_adapter_image is not None or negative_ip_adapter_image_embeds is not None
):
ip_adapter_image = np.zeros((width, height, 3), dtype=np.uint8)
ip_adapter_image = [ip_adapter_image] * self.transformer.encoder_hid_proj.num_ip_adapters
if self.joint_attention_kwargs is None:
self._joint_attention_kwargs = {}
@@ -207,8 +207,8 @@ class HunyuanDiTPipeline(DiffusionPipeline):
safety_checker: StableDiffusionSafetyChecker,
feature_extractor: CLIPImageProcessor,
requires_safety_checker: bool = True,
text_encoder_2: Optional[T5EncoderModel] = None,
tokenizer_2: Optional[MT5Tokenizer] = None,
text_encoder_2=T5EncoderModel,
tokenizer_2=MT5Tokenizer,
):
super().__init__()
@@ -20,7 +20,7 @@ import urllib.parse as ul
from typing import Callable, Dict, List, Optional, Tuple, Union
import torch
from transformers import GemmaPreTrainedModel, GemmaTokenizer, GemmaTokenizerFast
from transformers import AutoModel, AutoTokenizer
from ...callbacks import MultiPipelineCallbacks, PipelineCallback
from ...image_processor import VaeImageProcessor
@@ -144,10 +144,13 @@ class LuminaText2ImgPipeline(DiffusionPipeline):
Args:
vae ([`AutoencoderKL`]):
Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
text_encoder ([`GemmaPreTrainedModel`]):
Frozen Gemma text-encoder.
tokenizer (`GemmaTokenizer` or `GemmaTokenizerFast`):
Gemma tokenizer.
text_encoder ([`AutoModel`]):
Frozen text-encoder. Lumina-T2I uses
[T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.AutoModel), specifically the
[t5-v1_1-xxl](https://huggingface.co/Alpha-VLLM/tree/main/t5-v1_1-xxl) variant.
tokenizer (`AutoModel`):
Tokenizer of class
[AutoModel](https://huggingface.co/docs/transformers/model_doc/t5#transformers.AutoModel).
transformer ([`Transformer2DModel`]):
A text conditioned `Transformer2DModel` to denoise the encoded image latents.
scheduler ([`SchedulerMixin`]):
@@ -182,8 +185,8 @@ class LuminaText2ImgPipeline(DiffusionPipeline):
transformer: LuminaNextDiT2DModel,
scheduler: FlowMatchEulerDiscreteScheduler,
vae: AutoencoderKL,
text_encoder: GemmaPreTrainedModel,
tokenizer: Union[GemmaTokenizer, GemmaTokenizerFast],
text_encoder: AutoModel,
tokenizer: AutoTokenizer,
):
super().__init__()
@@ -17,7 +17,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import numpy as np
import torch
from transformers import Gemma2PreTrainedModel, GemmaTokenizer, GemmaTokenizerFast
from transformers import AutoModel, AutoTokenizer
from ...image_processor import VaeImageProcessor
from ...loaders import Lumina2LoraLoaderMixin
@@ -143,10 +143,13 @@ class Lumina2Text2ImgPipeline(DiffusionPipeline, Lumina2LoraLoaderMixin):
Args:
vae ([`AutoencoderKL`]):
Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
text_encoder ([`Gemma2PreTrainedModel`]):
Frozen Gemma2 text-encoder.
tokenizer (`GemmaTokenizer` or `GemmaTokenizerFast`):
Gemma tokenizer.
text_encoder ([`AutoModel`]):
Frozen text-encoder. Lumina-T2I uses
[T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.AutoModel), specifically the
[t5-v1_1-xxl](https://huggingface.co/Alpha-VLLM/tree/main/t5-v1_1-xxl) variant.
tokenizer (`AutoModel`):
Tokenizer of class
[AutoModel](https://huggingface.co/docs/transformers/model_doc/t5#transformers.AutoModel).
transformer ([`Transformer2DModel`]):
A text conditioned `Transformer2DModel` to denoise the encoded image latents.
scheduler ([`SchedulerMixin`]):
@@ -162,8 +165,8 @@ class Lumina2Text2ImgPipeline(DiffusionPipeline, Lumina2LoraLoaderMixin):
transformer: Lumina2Transformer2DModel,
scheduler: FlowMatchEulerDiscreteScheduler,
vae: AutoencoderKL,
text_encoder: Gemma2PreTrainedModel,
tokenizer: Union[GemmaTokenizer, GemmaTokenizerFast],
text_encoder: AutoModel,
tokenizer: AutoTokenizer,
):
super().__init__()
@@ -23,7 +23,6 @@ except OptionalDependencyNotAvailable:
else:
_import_structure["marigold_image_processing"] = ["MarigoldImageProcessor"]
_import_structure["pipeline_marigold_depth"] = ["MarigoldDepthOutput", "MarigoldDepthPipeline"]
_import_structure["pipeline_marigold_intrinsics"] = ["MarigoldIntrinsicsOutput", "MarigoldIntrinsicsPipeline"]
_import_structure["pipeline_marigold_normals"] = ["MarigoldNormalsOutput", "MarigoldNormalsPipeline"]
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
@@ -36,7 +35,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
else:
from .marigold_image_processing import MarigoldImageProcessor
from .pipeline_marigold_depth import MarigoldDepthOutput, MarigoldDepthPipeline
from .pipeline_marigold_intrinsics import MarigoldIntrinsicsOutput, MarigoldIntrinsicsPipeline
from .pipeline_marigold_normals import MarigoldNormalsOutput, MarigoldNormalsPipeline
else:
@@ -1,22 +1,4 @@
# Copyright 2023-2025 Marigold Team, ETH Zürich. All rights reserved.
# Copyright 2024-2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --------------------------------------------------------------------------
# More information and citation instructions are available on the
# Marigold project website: https://marigoldcomputervision.github.io
# --------------------------------------------------------------------------
from typing import Any, Dict, List, Optional, Tuple, Union
from typing import List, Optional, Tuple, Union
import numpy as np
import PIL
@@ -397,7 +379,7 @@ class MarigoldImageProcessor(ConfigMixin):
val_min: float = 0.0,
val_max: float = 1.0,
color_map: str = "Spectral",
) -> List[PIL.Image.Image]:
) -> Union[PIL.Image.Image, List[PIL.Image.Image]]:
"""
Visualizes depth maps, such as predictions of the `MarigoldDepthPipeline`.
@@ -409,7 +391,7 @@ class MarigoldImageProcessor(ConfigMixin):
color_map (`str`, *optional*, defaults to `"Spectral"`): Color map used to convert a single-channel
depth prediction into colored representation.
Returns: `List[PIL.Image.Image]` with depth maps visualization.
Returns: `PIL.Image.Image` or `List[PIL.Image.Image]` with depth maps visualization.
"""
if val_max <= val_min:
raise ValueError(f"Invalid values range: [{val_min}, {val_max}].")
@@ -454,7 +436,7 @@ class MarigoldImageProcessor(ConfigMixin):
depth: Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]],
val_min: float = 0.0,
val_max: float = 1.0,
) -> List[PIL.Image.Image]:
) -> Union[PIL.Image.Image, List[PIL.Image.Image]]:
def export_depth_to_16bit_png_one(img, idx=None):
prefix = "Depth" + (f"[{idx}]" if idx else "")
if not isinstance(img, np.ndarray) and not torch.is_tensor(img):
@@ -496,7 +478,7 @@ class MarigoldImageProcessor(ConfigMixin):
flip_x: bool = False,
flip_y: bool = False,
flip_z: bool = False,
) -> List[PIL.Image.Image]:
) -> Union[PIL.Image.Image, List[PIL.Image.Image]]:
"""
Visualizes surface normals, such as predictions of the `MarigoldNormalsPipeline`.
@@ -510,7 +492,7 @@ class MarigoldImageProcessor(ConfigMixin):
flip_z (`bool`, *optional*, defaults to `False`): Flips the Z axis of the normals frame of reference.
Default direction is facing the observer.
Returns: `List[PIL.Image.Image]` with surface normals visualization.
Returns: `PIL.Image.Image` or `List[PIL.Image.Image]` with surface normals visualization.
"""
flip_vec = None
if any((flip_x, flip_y, flip_z)):
@@ -546,99 +528,6 @@ class MarigoldImageProcessor(ConfigMixin):
else:
raise ValueError(f"Unexpected input type: {type(normals)}")
@staticmethod
def visualize_intrinsics(
prediction: Union[
np.ndarray,
torch.Tensor,
List[np.ndarray],
List[torch.Tensor],
],
target_properties: Dict[str, Any],
color_map: Union[str, Dict[str, str]] = "binary",
) -> List[Dict[str, PIL.Image.Image]]:
"""
Visualizes intrinsic image decomposition, such as predictions of the `MarigoldIntrinsicsPipeline`.
Args:
prediction (`Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]]`):
Intrinsic image decomposition.
target_properties (`Dict[str, Any]`):
Decomposition properties. Expected entries: `target_names: List[str]` and a dictionary with keys
`prediction_space: str`, `sub_target_names: List[Union[str, Null]]` (must have 3 entries, null for
missing modalities), `up_to_scale: bool`, one for each target and sub-target.
color_map (`Union[str, Dict[str, str]]`, *optional*, defaults to `"Spectral"`):
Color map used to convert a single-channel predictions into colored representations. When a dictionary
is passed, each modality can be colored with its own color map.
Returns: `List[Dict[str, PIL.Image.Image]]` with intrinsic image decomposition visualization.
"""
if "target_names" not in target_properties:
raise ValueError("Missing `target_names` in target_properties")
if not isinstance(color_map, str) and not (
isinstance(color_map, dict)
and all(isinstance(k, str) and isinstance(v, str) for k, v in color_map.items())
):
raise ValueError("`color_map` must be a string or a dictionary of strings")
n_targets = len(target_properties["target_names"])
def visualize_targets_one(images, idx=None):
# img: [T, 3, H, W]
out = {}
for target_name, img in zip(target_properties["target_names"], images):
img = img.permute(1, 2, 0) # [H, W, 3]
prediction_space = target_properties[target_name].get("prediction_space", "srgb")
if prediction_space == "stack":
sub_target_names = target_properties[target_name]["sub_target_names"]
if len(sub_target_names) != 3 or any(
not (isinstance(s, str) or s is None) for s in sub_target_names
):
raise ValueError(f"Unexpected target sub-names {sub_target_names} in {target_name}")
for i, sub_target_name in enumerate(sub_target_names):
if sub_target_name is None:
continue
sub_img = img[:, :, i]
sub_prediction_space = target_properties[sub_target_name].get("prediction_space", "srgb")
if sub_prediction_space == "linear":
sub_up_to_scale = target_properties[sub_target_name].get("up_to_scale", False)
if sub_up_to_scale:
sub_img = sub_img / max(sub_img.max().item(), 1e-6)
sub_img = sub_img ** (1 / 2.2)
cmap_name = (
color_map if isinstance(color_map, str) else color_map.get(sub_target_name, "binary")
)
sub_img = MarigoldImageProcessor.colormap(sub_img, cmap=cmap_name, bytes=True)
sub_img = PIL.Image.fromarray(sub_img.cpu().numpy())
out[sub_target_name] = sub_img
elif prediction_space == "linear":
up_to_scale = target_properties[target_name].get("up_to_scale", False)
if up_to_scale:
img = img / max(img.max().item(), 1e-6)
img = img ** (1 / 2.2)
elif prediction_space == "srgb":
pass
img = (img * 255).to(dtype=torch.uint8, device="cpu").numpy()
img = PIL.Image.fromarray(img)
out[target_name] = img
return out
if prediction is None or isinstance(prediction, list) and any(o is None for o in prediction):
raise ValueError("Input prediction is `None`")
if isinstance(prediction, (np.ndarray, torch.Tensor)):
prediction = MarigoldImageProcessor.expand_tensor_or_array(prediction)
if isinstance(prediction, np.ndarray):
prediction = MarigoldImageProcessor.numpy_to_pt(prediction) # [N*T,3,H,W]
if not (prediction.ndim == 4 and prediction.shape[1] == 3 and prediction.shape[0] % n_targets == 0):
raise ValueError(f"Unexpected input shape={prediction.shape}, expecting [N*T,3,H,W].")
N_T, _, H, W = prediction.shape
N = N_T // n_targets
prediction = prediction.reshape(N, n_targets, 3, H, W)
return [visualize_targets_one(img, idx) for idx, img in enumerate(prediction)]
elif isinstance(prediction, list):
return [visualize_targets_one(img, idx) for idx, img in enumerate(prediction)]
else:
raise ValueError(f"Unexpected input type: {type(prediction)}")
@staticmethod
def visualize_uncertainty(
uncertainty: Union[
@@ -648,10 +537,9 @@ class MarigoldImageProcessor(ConfigMixin):
List[torch.Tensor],
],
saturation_percentile=95,
) -> List[PIL.Image.Image]:
) -> Union[PIL.Image.Image, List[PIL.Image.Image]]:
"""
Visualizes dense uncertainties, such as produced by `MarigoldDepthPipeline`, `MarigoldNormalsPipeline`, or
`MarigoldIntrinsicsPipeline`.
Visualizes dense uncertainties, such as produced by `MarigoldDepthPipeline` or `MarigoldNormalsPipeline`.
Args:
uncertainty (`Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]]`):
@@ -659,15 +547,14 @@ class MarigoldImageProcessor(ConfigMixin):
saturation_percentile (`int`, *optional*, defaults to `95`):
Specifies the percentile uncertainty value visualized with maximum intensity.
Returns: `List[PIL.Image.Image]` with uncertainty visualization.
Returns: `PIL.Image.Image` or `List[PIL.Image.Image]` with uncertainty visualization.
"""
def visualize_uncertainty_one(img, idx=None):
prefix = "Uncertainty" + (f"[{idx}]" if idx else "")
if img.min() < 0:
raise ValueError(f"{prefix}: unexpected data range, min={img.min()}.")
img = img.permute(1, 2, 0) # [H,W,C]
img = img.squeeze(2).cpu().numpy() # [H,W] or [H,W,3]
raise ValueError(f"{prefix}: unexected data range, min={img.min()}.")
img = img.squeeze(0).cpu().numpy()
saturation_value = np.percentile(img, saturation_percentile)
img = np.clip(img * 255 / saturation_value, 0, 255)
img = img.astype(np.uint8)
@@ -679,9 +566,9 @@ class MarigoldImageProcessor(ConfigMixin):
if isinstance(uncertainty, (np.ndarray, torch.Tensor)):
uncertainty = MarigoldImageProcessor.expand_tensor_or_array(uncertainty)
if isinstance(uncertainty, np.ndarray):
uncertainty = MarigoldImageProcessor.numpy_to_pt(uncertainty) # [N,C,H,W]
if not (uncertainty.ndim == 4 and uncertainty.shape[1] in (1, 3)):
raise ValueError(f"Unexpected input shape={uncertainty.shape}, expecting [N,C,H,W] with C in (1,3).")
uncertainty = MarigoldImageProcessor.numpy_to_pt(uncertainty) # [N,1,H,W]
if not (uncertainty.ndim == 4 and uncertainty.shape[1] == 1):
raise ValueError(f"Unexpected input shape={uncertainty.shape}, expecting [N,1,H,W].")
return [visualize_uncertainty_one(img, idx) for idx, img in enumerate(uncertainty)]
elif isinstance(uncertainty, list):
return [visualize_uncertainty_one(img, idx) for idx, img in enumerate(uncertainty)]
@@ -1,5 +1,5 @@
# Copyright 2023-2025 Marigold Team, ETH Zürich. All rights reserved.
# Copyright 2024-2025 The HuggingFace Team. All rights reserved.
# Copyright 2024 Marigold authors, PRS ETH Zurich. All rights reserved.
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
# limitations under the License.
# --------------------------------------------------------------------------
# More information and citation instructions are available on the
# Marigold project website: https://marigoldcomputervision.github.io
# Marigold project website: https://marigoldmonodepth.github.io
# --------------------------------------------------------------------------
from dataclasses import dataclass
from functools import partial
@@ -64,7 +64,7 @@ Examples:
>>> import torch
>>> pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
... "prs-eth/marigold-depth-v1-1", variant="fp16", torch_dtype=torch.float16
... "prs-eth/marigold-depth-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
... ).to("cuda")
>>> image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
@@ -86,12 +86,11 @@ class MarigoldDepthOutput(BaseOutput):
Args:
prediction (`np.ndarray`, `torch.Tensor`):
Predicted depth maps with values in the range [0, 1]. The shape is $numimages \times 1 \times height \times
width$ for `torch.Tensor` or $numimages \times height \times width \times 1$ for `np.ndarray`.
Predicted depth maps with values in the range [0, 1]. The shape is always $numimages \times 1 \times height
\times width$, regardless of whether the images were passed as a 4D array or a list.
uncertainty (`None`, `np.ndarray`, `torch.Tensor`):
Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is $numimages
\times 1 \times height \times width$ for `torch.Tensor` or $numimages \times height \times width \times 1$
for `np.ndarray`.
\times 1 \times height \times width$.
latent (`None`, `torch.Tensor`):
Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
The shape is $numimages * numensemble \times 4 \times latentheight \times latentwidth$.
@@ -209,11 +208,6 @@ class MarigoldDepthPipeline(DiffusionPipeline):
output_type: str,
output_uncertainty: bool,
) -> int:
actual_vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
if actual_vae_scale_factor != self.vae_scale_factor:
raise ValueError(
f"`vae_scale_factor` computed at initialization ({self.vae_scale_factor}) differs from the actual one ({actual_vae_scale_factor})."
)
if num_inference_steps is None:
raise ValueError("`num_inference_steps` is not specified and could not be resolved from the model config.")
if num_inference_steps < 1:
@@ -326,7 +320,6 @@ class MarigoldDepthPipeline(DiffusionPipeline):
return num_images
@torch.compiler.disable
def progress_bar(self, iterable=None, total=None, desc=None, leave=True):
if not hasattr(self, "_progress_bar_config"):
self._progress_bar_config = {}
@@ -377,9 +370,11 @@ class MarigoldDepthPipeline(DiffusionPipeline):
same width and height.
num_inference_steps (`int`, *optional*, defaults to `None`):
Number of denoising diffusion steps during inference. The default value `None` results in automatic
selection.
selection. The number of steps should be at least 10 with the full Marigold models, and between 1 and 4
for Marigold-LCM models.
ensemble_size (`int`, defaults to `1`):
Number of ensemble predictions. Higher values result in measurable improvements and visual degradation.
Number of ensemble predictions. Recommended values are 5 and higher for better precision, or 1 for
faster inference.
processing_resolution (`int`, *optional*, defaults to `None`):
Effective processing resolution. When set to `0`, matches the larger input image dimension. This
produces crisper predictions, but may also lead to the overall loss of global context. The default
@@ -491,7 +486,9 @@ class MarigoldDepthPipeline(DiffusionPipeline):
# `pred_latent` variable. The variable `image_latent` is of the same shape: it contains each input image encoded
# into latent space and replicated `E` times. The latents can be either generated (see `generator` to ensure
# reproducibility), or passed explicitly via the `latents` argument. The latter can be set outside the pipeline
# code. This behavior can be achieved by setting the `output_latent` argument to `True`. The latent space
# code. For example, in the Marigold-LCM video processing demo, the latents initialization of a frame is taken
# as a convex combination of the latents output of the pipeline for the previous frame and a newly-sampled
# noise. This behavior can be achieved by setting the `output_latent` argument to `True`. The latent space
# dimensions are `(h, w)`. Encoding into latent space happens in batches of size `batch_size`.
# Model invocation: self.vae.encoder.
image_latent, pred_latent = self.prepare_latents(
@@ -736,7 +733,6 @@ class MarigoldDepthPipeline(DiffusionPipeline):
param = init_s.cpu().numpy()
else:
raise ValueError("Unrecognized alignment.")
param = param.astype(np.float64)
return param
@@ -779,7 +775,7 @@ class MarigoldDepthPipeline(DiffusionPipeline):
if regularizer_strength > 0:
prediction, _ = ensemble(depth_aligned, return_uncertainty=False)
err_near = prediction.min().abs().item()
err_near = (0.0 - prediction.min()).abs().item()
err_far = (1.0 - prediction.max()).abs().item()
cost += (err_near + err_far) * regularizer_strength
@@ -1,721 +0,0 @@
# Copyright 2023-2025 Marigold Team, ETH Zürich. All rights reserved.
# Copyright 2024-2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --------------------------------------------------------------------------
# More information and citation instructions are available on the
# Marigold project website: https://marigoldcomputervision.github.io
# --------------------------------------------------------------------------
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Union
import numpy as np
import torch
from PIL import Image
from tqdm.auto import tqdm
from transformers import CLIPTextModel, CLIPTokenizer
from ...image_processor import PipelineImageInput
from ...models import (
AutoencoderKL,
UNet2DConditionModel,
)
from ...schedulers import (
DDIMScheduler,
LCMScheduler,
)
from ...utils import (
BaseOutput,
is_torch_xla_available,
logging,
replace_example_docstring,
)
from ...utils.torch_utils import randn_tensor
from ..pipeline_utils import DiffusionPipeline
from .marigold_image_processing import MarigoldImageProcessor
if is_torch_xla_available():
import torch_xla.core.xla_model as xm
XLA_AVAILABLE = True
else:
XLA_AVAILABLE = False
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
EXAMPLE_DOC_STRING = """
Examples:
```py
>>> import diffusers
>>> import torch
>>> pipe = diffusers.MarigoldIntrinsicsPipeline.from_pretrained(
... "prs-eth/marigold-iid-appearance-v1-1", variant="fp16", torch_dtype=torch.float16
... ).to("cuda")
>>> image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
>>> intrinsics = pipe(image)
>>> vis = pipe.image_processor.visualize_intrinsics(intrinsics.prediction, pipe.target_properties)
>>> vis[0]["albedo"].save("einstein_albedo.png")
>>> vis[0]["roughness"].save("einstein_roughness.png")
>>> vis[0]["metallicity"].save("einstein_metallicity.png")
```
```py
>>> import diffusers
>>> import torch
>>> pipe = diffusers.MarigoldIntrinsicsPipeline.from_pretrained(
... "prs-eth/marigold-iid-lighting-v1-1", variant="fp16", torch_dtype=torch.float16
... ).to("cuda")
>>> image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
>>> intrinsics = pipe(image)
>>> vis = pipe.image_processor.visualize_intrinsics(intrinsics.prediction, pipe.target_properties)
>>> vis[0]["albedo"].save("einstein_albedo.png")
>>> vis[0]["shading"].save("einstein_shading.png")
>>> vis[0]["residual"].save("einstein_residual.png")
```
"""
@dataclass
class MarigoldIntrinsicsOutput(BaseOutput):
"""
Output class for Marigold Intrinsic Image Decomposition pipeline.
Args:
prediction (`np.ndarray`, `torch.Tensor`):
Predicted image intrinsics with values in the range [0, 1]. The shape is $(numimages * numtargets) \times 3
\times height \times width$ for `torch.Tensor` or $(numimages * numtargets) \times height \times width
\times 3$ for `np.ndarray`, where `numtargets` corresponds to the number of predicted target modalities of
the intrinsic image decomposition.
uncertainty (`None`, `np.ndarray`, `torch.Tensor`):
Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is $(numimages *
numtargets) \times 3 \times height \times width$ for `torch.Tensor` or $(numimages * numtargets) \times
height \times width \times 3$ for `np.ndarray`.
latent (`None`, `torch.Tensor`):
Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
The shape is $(numimages * numensemble) \times (numtargets * 4) \times latentheight \times latentwidth$.
"""
prediction: Union[np.ndarray, torch.Tensor]
uncertainty: Union[None, np.ndarray, torch.Tensor]
latent: Union[None, torch.Tensor]
class MarigoldIntrinsicsPipeline(DiffusionPipeline):
"""
Pipeline for Intrinsic Image Decomposition (IID) using the Marigold method:
https://marigoldcomputervision.github.io.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
Args:
unet (`UNet2DConditionModel`):
Conditional U-Net to denoise the targets latent, conditioned on image latent.
vae (`AutoencoderKL`):
Variational Auto-Encoder (VAE) Model to encode and decode images and predictions to and from latent
representations.
scheduler (`DDIMScheduler` or `LCMScheduler`):
A scheduler to be used in combination with `unet` to denoise the encoded image latents.
text_encoder (`CLIPTextModel`):
Text-encoder, for empty text embedding.
tokenizer (`CLIPTokenizer`):
CLIP tokenizer.
prediction_type (`str`, *optional*):
Type of predictions made by the model.
target_properties (`Dict[str, Any]`, *optional*):
Properties of the predicted modalities, such as `target_names`, a `List[str]` used to define the number,
order and names of the predicted modalities, and any other metadata that may be required to interpret the
predictions.
default_denoising_steps (`int`, *optional*):
The minimum number of denoising diffusion steps that are required to produce a prediction of reasonable
quality with the given model. This value must be set in the model config. When the pipeline is called
without explicitly setting `num_inference_steps`, the default value is used. This is required to ensure
reasonable results with various model flavors compatible with the pipeline, such as those relying on very
short denoising schedules (`LCMScheduler`) and those with full diffusion schedules (`DDIMScheduler`).
default_processing_resolution (`int`, *optional*):
The recommended value of the `processing_resolution` parameter of the pipeline. This value must be set in
the model config. When the pipeline is called without explicitly setting `processing_resolution`, the
default value is used. This is required to ensure reasonable results with various model flavors trained
with varying optimal processing resolution values.
"""
model_cpu_offload_seq = "text_encoder->unet->vae"
supported_prediction_types = ("intrinsics",)
def __init__(
self,
unet: UNet2DConditionModel,
vae: AutoencoderKL,
scheduler: Union[DDIMScheduler, LCMScheduler],
text_encoder: CLIPTextModel,
tokenizer: CLIPTokenizer,
prediction_type: Optional[str] = None,
target_properties: Optional[Dict[str, Any]] = None,
default_denoising_steps: Optional[int] = None,
default_processing_resolution: Optional[int] = None,
):
super().__init__()
if prediction_type not in self.supported_prediction_types:
logger.warning(
f"Potentially unsupported `prediction_type='{prediction_type}'`; values supported by the pipeline: "
f"{self.supported_prediction_types}."
)
self.register_modules(
unet=unet,
vae=vae,
scheduler=scheduler,
text_encoder=text_encoder,
tokenizer=tokenizer,
)
self.register_to_config(
prediction_type=prediction_type,
target_properties=target_properties,
default_denoising_steps=default_denoising_steps,
default_processing_resolution=default_processing_resolution,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
self.target_properties = target_properties
self.default_denoising_steps = default_denoising_steps
self.default_processing_resolution = default_processing_resolution
self.empty_text_embedding = None
self.image_processor = MarigoldImageProcessor(vae_scale_factor=self.vae_scale_factor)
@property
def n_targets(self):
return self.unet.config.out_channels // self.vae.config.latent_channels
def check_inputs(
self,
image: PipelineImageInput,
num_inference_steps: int,
ensemble_size: int,
processing_resolution: int,
resample_method_input: str,
resample_method_output: str,
batch_size: int,
ensembling_kwargs: Optional[Dict[str, Any]],
latents: Optional[torch.Tensor],
generator: Optional[Union[torch.Generator, List[torch.Generator]]],
output_type: str,
output_uncertainty: bool,
) -> int:
actual_vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
if actual_vae_scale_factor != self.vae_scale_factor:
raise ValueError(
f"`vae_scale_factor` computed at initialization ({self.vae_scale_factor}) differs from the actual one ({actual_vae_scale_factor})."
)
if num_inference_steps is None:
raise ValueError("`num_inference_steps` is not specified and could not be resolved from the model config.")
if num_inference_steps < 1:
raise ValueError("`num_inference_steps` must be positive.")
if ensemble_size < 1:
raise ValueError("`ensemble_size` must be positive.")
if ensemble_size == 2:
logger.warning(
"`ensemble_size` == 2 results are similar to no ensembling (1); "
"consider increasing the value to at least 3."
)
if ensemble_size == 1 and output_uncertainty:
raise ValueError(
"Computing uncertainty by setting `output_uncertainty=True` also requires setting `ensemble_size` "
"greater than 1."
)
if processing_resolution is None:
raise ValueError(
"`processing_resolution` is not specified and could not be resolved from the model config."
)
if processing_resolution < 0:
raise ValueError(
"`processing_resolution` must be non-negative: 0 for native resolution, or any positive value for "
"downsampled processing."
)
if processing_resolution % self.vae_scale_factor != 0:
raise ValueError(f"`processing_resolution` must be a multiple of {self.vae_scale_factor}.")
if resample_method_input not in ("nearest", "nearest-exact", "bilinear", "bicubic", "area"):
raise ValueError(
"`resample_method_input` takes string values compatible with PIL library: "
"nearest, nearest-exact, bilinear, bicubic, area."
)
if resample_method_output not in ("nearest", "nearest-exact", "bilinear", "bicubic", "area"):
raise ValueError(
"`resample_method_output` takes string values compatible with PIL library: "
"nearest, nearest-exact, bilinear, bicubic, area."
)
if batch_size < 1:
raise ValueError("`batch_size` must be positive.")
if output_type not in ["pt", "np"]:
raise ValueError("`output_type` must be one of `pt` or `np`.")
if latents is not None and generator is not None:
raise ValueError("`latents` and `generator` cannot be used together.")
if ensembling_kwargs is not None:
if not isinstance(ensembling_kwargs, dict):
raise ValueError("`ensembling_kwargs` must be a dictionary.")
if "reduction" in ensembling_kwargs and ensembling_kwargs["reduction"] not in ("median", "mean"):
raise ValueError("`ensembling_kwargs['reduction']` can be either `'median'` or `'mean'`.")
# image checks
num_images = 0
W, H = None, None
if not isinstance(image, list):
image = [image]
for i, img in enumerate(image):
if isinstance(img, np.ndarray) or torch.is_tensor(img):
if img.ndim not in (2, 3, 4):
raise ValueError(f"`image[{i}]` has unsupported dimensions or shape: {img.shape}.")
H_i, W_i = img.shape[-2:]
N_i = 1
if img.ndim == 4:
N_i = img.shape[0]
elif isinstance(img, Image.Image):
W_i, H_i = img.size
N_i = 1
else:
raise ValueError(f"Unsupported `image[{i}]` type: {type(img)}.")
if W is None:
W, H = W_i, H_i
elif (W, H) != (W_i, H_i):
raise ValueError(
f"Input `image[{i}]` has incompatible dimensions {(W_i, H_i)} with the previous images {(W, H)}"
)
num_images += N_i
# latents checks
if latents is not None:
if not torch.is_tensor(latents):
raise ValueError("`latents` must be a torch.Tensor.")
if latents.dim() != 4:
raise ValueError(f"`latents` has unsupported dimensions or shape: {latents.shape}.")
if processing_resolution > 0:
max_orig = max(H, W)
new_H = H * processing_resolution // max_orig
new_W = W * processing_resolution // max_orig
if new_H == 0 or new_W == 0:
raise ValueError(f"Extreme aspect ratio of the input image: [{W} x {H}]")
W, H = new_W, new_H
w = (W + self.vae_scale_factor - 1) // self.vae_scale_factor
h = (H + self.vae_scale_factor - 1) // self.vae_scale_factor
shape_expected = (num_images * ensemble_size, self.unet.config.out_channels, h, w)
if latents.shape != shape_expected:
raise ValueError(f"`latents` has unexpected shape={latents.shape} expected={shape_expected}.")
# generator checks
if generator is not None:
if isinstance(generator, list):
if len(generator) != num_images * ensemble_size:
raise ValueError(
"The number of generators must match the total number of ensemble members for all input images."
)
if not all(g.device.type == generator[0].device.type for g in generator):
raise ValueError("`generator` device placement is not consistent in the list.")
elif not isinstance(generator, torch.Generator):
raise ValueError(f"Unsupported generator type: {type(generator)}.")
return num_images
@torch.compiler.disable
def progress_bar(self, iterable=None, total=None, desc=None, leave=True):
if not hasattr(self, "_progress_bar_config"):
self._progress_bar_config = {}
elif not isinstance(self._progress_bar_config, dict):
raise ValueError(
f"`self._progress_bar_config` should be of type `dict`, but is {type(self._progress_bar_config)}."
)
progress_bar_config = dict(**self._progress_bar_config)
progress_bar_config["desc"] = progress_bar_config.get("desc", desc)
progress_bar_config["leave"] = progress_bar_config.get("leave", leave)
if iterable is not None:
return tqdm(iterable, **progress_bar_config)
elif total is not None:
return tqdm(total=total, **progress_bar_config)
else:
raise ValueError("Either `total` or `iterable` has to be defined.")
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
self,
image: PipelineImageInput,
num_inference_steps: Optional[int] = None,
ensemble_size: int = 1,
processing_resolution: Optional[int] = None,
match_input_resolution: bool = True,
resample_method_input: str = "bilinear",
resample_method_output: str = "bilinear",
batch_size: int = 1,
ensembling_kwargs: Optional[Dict[str, Any]] = None,
latents: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
output_type: str = "np",
output_uncertainty: bool = False,
output_latent: bool = False,
return_dict: bool = True,
):
"""
Function invoked when calling the pipeline.
Args:
image (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`),
`List[torch.Tensor]`: An input image or images used as an input for the intrinsic decomposition task.
For arrays and tensors, the expected value range is between `[0, 1]`. Passing a batch of images is
possible by providing a four-dimensional array or a tensor. Additionally, a list of images of two- or
three-dimensional arrays or tensors can be passed. In the latter case, all list elements must have the
same width and height.
num_inference_steps (`int`, *optional*, defaults to `None`):
Number of denoising diffusion steps during inference. The default value `None` results in automatic
selection.
ensemble_size (`int`, defaults to `1`):
Number of ensemble predictions. Higher values result in measurable improvements and visual degradation.
processing_resolution (`int`, *optional*, defaults to `None`):
Effective processing resolution. When set to `0`, matches the larger input image dimension. This
produces crisper predictions, but may also lead to the overall loss of global context. The default
value `None` resolves to the optimal value from the model config.
match_input_resolution (`bool`, *optional*, defaults to `True`):
When enabled, the output prediction is resized to match the input dimensions. When disabled, the longer
side of the output will equal to `processing_resolution`.
resample_method_input (`str`, *optional*, defaults to `"bilinear"`):
Resampling method used to resize input images to `processing_resolution`. The accepted values are:
`"nearest"`, `"nearest-exact"`, `"bilinear"`, `"bicubic"`, or `"area"`.
resample_method_output (`str`, *optional*, defaults to `"bilinear"`):
Resampling method used to resize output predictions to match the input resolution. The accepted values
are `"nearest"`, `"nearest-exact"`, `"bilinear"`, `"bicubic"`, or `"area"`.
batch_size (`int`, *optional*, defaults to `1`):
Batch size; only matters when setting `ensemble_size` or passing a tensor of images.
ensembling_kwargs (`dict`, *optional*, defaults to `None`)
Extra dictionary with arguments for precise ensembling control. The following options are available:
- reduction (`str`, *optional*, defaults to `"median"`): Defines the ensembling function applied in
every pixel location, can be either `"median"` or `"mean"`.
latents (`torch.Tensor`, *optional*, defaults to `None`):
Latent noise tensors to replace the random initialization. These can be taken from the previous
function call's output.
generator (`torch.Generator`, or `List[torch.Generator]`, *optional*, defaults to `None`):
Random number generator object to ensure reproducibility.
output_type (`str`, *optional*, defaults to `"np"`):
Preferred format of the output's `prediction` and the optional `uncertainty` fields. The accepted
values are: `"np"` (numpy array) or `"pt"` (torch tensor).
output_uncertainty (`bool`, *optional*, defaults to `False`):
When enabled, the output's `uncertainty` field contains the predictive uncertainty map, provided that
the `ensemble_size` argument is set to a value above 2.
output_latent (`bool`, *optional*, defaults to `False`):
When enabled, the output's `latent` field contains the latent codes corresponding to the predictions
within the ensemble. These codes can be saved, modified, and used for subsequent calls with the
`latents` argument.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~pipelines.marigold.MarigoldIntrinsicsOutput`] instead of a plain tuple.
Examples:
Returns:
[`~pipelines.marigold.MarigoldIntrinsicsOutput`] or `tuple`:
If `return_dict` is `True`, [`~pipelines.marigold.MarigoldIntrinsicsOutput`] is returned, otherwise a
`tuple` is returned where the first element is the prediction, the second element is the uncertainty
(or `None`), and the third is the latent (or `None`).
"""
# 0. Resolving variables.
device = self._execution_device
dtype = self.dtype
# Model-specific optimal default values leading to fast and reasonable results.
if num_inference_steps is None:
num_inference_steps = self.default_denoising_steps
if processing_resolution is None:
processing_resolution = self.default_processing_resolution
# 1. Check inputs.
num_images = self.check_inputs(
image,
num_inference_steps,
ensemble_size,
processing_resolution,
resample_method_input,
resample_method_output,
batch_size,
ensembling_kwargs,
latents,
generator,
output_type,
output_uncertainty,
)
# 2. Prepare empty text conditioning.
# Model invocation: self.tokenizer, self.text_encoder.
if self.empty_text_embedding is None:
prompt = ""
text_inputs = self.tokenizer(
prompt,
padding="do_not_pad",
max_length=self.tokenizer.model_max_length,
truncation=True,
return_tensors="pt",
)
text_input_ids = text_inputs.input_ids.to(device)
self.empty_text_embedding = self.text_encoder(text_input_ids)[0] # [1,2,1024]
# 3. Preprocess input images. This function loads input image or images of compatible dimensions `(H, W)`,
# optionally downsamples them to the `processing_resolution` `(PH, PW)`, where
# `max(PH, PW) == processing_resolution`, and pads the dimensions to `(PPH, PPW)` such that these values are
# divisible by the latent space downscaling factor (typically 8 in Stable Diffusion). The default value `None`
# of `processing_resolution` resolves to the optimal value from the model config. It is a recommended mode of
# operation and leads to the most reasonable results. Using the native image resolution or any other processing
# resolution can lead to loss of either fine details or global context in the output predictions.
image, padding, original_resolution = self.image_processor.preprocess(
image, processing_resolution, resample_method_input, device, dtype
) # [N,3,PPH,PPW]
# 4. Encode input image into latent space. At this step, each of the `N` input images is represented with `E`
# ensemble members. Each ensemble member is an independent diffused prediction, just initialized independently.
# Latents of each such predictions across all input images and all ensemble members are represented in the
# `pred_latent` variable. The variable `image_latent` contains each input image encoded into latent space and
# replicated `E` times. The variable `pred_latent` contains latents initialization, where the latent space is
# replicated `T` times relative to the single latent space of `image_latent`, where `T` is the number of the
# predicted targets. The latents can be either generated (see `generator` to ensure reproducibility), or passed
# explicitly via the `latents` argument. The latter can be set outside the pipeline code. This behavior can be
# achieved by setting the `output_latent` argument to `True`. The latent space dimensions are `(h, w)`. Encoding
# into latent space happens in batches of size `batch_size`.
# Model invocation: self.vae.encoder.
image_latent, pred_latent = self.prepare_latents(
image, latents, generator, ensemble_size, batch_size
) # [N*E,4,h,w], [N*E,T*4,h,w]
del image
batch_empty_text_embedding = self.empty_text_embedding.to(device=device, dtype=dtype).repeat(
batch_size, 1, 1
) # [B,1024,2]
# 5. Process the denoising loop. All `N * E` latents are processed sequentially in batches of size `batch_size`.
# The unet model takes concatenated latent spaces of the input image and the predicted modality as an input, and
# outputs noise for the predicted modality's latent space. The number of denoising diffusion steps is defined by
# `num_inference_steps`. It is either set directly, or resolves to the optimal value specific to the loaded
# model.
# Model invocation: self.unet.
pred_latents = []
for i in self.progress_bar(
range(0, num_images * ensemble_size, batch_size), leave=True, desc="Marigold predictions..."
):
batch_image_latent = image_latent[i : i + batch_size] # [B,4,h,w]
batch_pred_latent = pred_latent[i : i + batch_size] # [B,T*4,h,w]
effective_batch_size = batch_image_latent.shape[0]
text = batch_empty_text_embedding[:effective_batch_size] # [B,2,1024]
self.scheduler.set_timesteps(num_inference_steps, device=device)
for t in self.progress_bar(self.scheduler.timesteps, leave=False, desc="Diffusion steps..."):
batch_latent = torch.cat([batch_image_latent, batch_pred_latent], dim=1) # [B,(1+T)*4,h,w]
noise = self.unet(batch_latent, t, encoder_hidden_states=text, return_dict=False)[0] # [B,T*4,h,w]
batch_pred_latent = self.scheduler.step(
noise, t, batch_pred_latent, generator=generator
).prev_sample # [B,T*4,h,w]
if XLA_AVAILABLE:
xm.mark_step()
pred_latents.append(batch_pred_latent)
pred_latent = torch.cat(pred_latents, dim=0) # [N*E,T*4,h,w]
del (
pred_latents,
image_latent,
batch_empty_text_embedding,
batch_image_latent,
batch_pred_latent,
text,
batch_latent,
noise,
)
# 6. Decode predictions from latent into pixel space. The resulting `N * E` predictions have shape `(PPH, PPW)`,
# which requires slight postprocessing. Decoding into pixel space happens in batches of size `batch_size`.
# Model invocation: self.vae.decoder.
pred_latent_for_decoding = pred_latent.reshape(
num_images * ensemble_size * self.n_targets, self.vae.config.latent_channels, *pred_latent.shape[2:]
) # [N*E*T,4,PPH,PPW]
prediction = torch.cat(
[
self.decode_prediction(pred_latent_for_decoding[i : i + batch_size])
for i in range(0, pred_latent_for_decoding.shape[0], batch_size)
],
dim=0,
) # [N*E*T,3,PPH,PPW]
del pred_latent_for_decoding
if not output_latent:
pred_latent = None
# 7. Remove padding. The output shape is (PH, PW).
prediction = self.image_processor.unpad_image(prediction, padding) # [N*E*T,3,PH,PW]
# 8. Ensemble and compute uncertainty (when `output_uncertainty` is set). This code treats each of the `N*T`
# groups of `E` ensemble predictions independently. For each group it computes an ensembled prediction of shape
# `(PH, PW)` and an optional uncertainty map of the same dimensions. After computing this pair of outputs for
# each group independently, it stacks them respectively into batches of `N*T` almost final predictions and
# uncertainty maps.
uncertainty = None
if ensemble_size > 1:
prediction = prediction.reshape(
num_images, ensemble_size, self.n_targets, *prediction.shape[1:]
) # [N,E,T,3,PH,PW]
prediction = [
self.ensemble_intrinsics(prediction[i], output_uncertainty, **(ensembling_kwargs or {}))
for i in range(num_images)
] # [ [[T,3,PH,PW], [T,3,PH,PW]], ... ]
prediction, uncertainty = zip(*prediction) # [[T,3,PH,PW], ... ], [[T,3,PH,PW], ... ]
prediction = torch.cat(prediction, dim=0) # [N*T,3,PH,PW]
if output_uncertainty:
uncertainty = torch.cat(uncertainty, dim=0) # [N*T,3,PH,PW]
else:
uncertainty = None
# 9. If `match_input_resolution` is set, the output prediction and the uncertainty are upsampled to match the
# input resolution `(H, W)`. This step may introduce upsampling artifacts, and therefore can be disabled.
# Depending on the downstream use-case, upsampling can be also chosen based on the tolerated artifacts by
# setting the `resample_method_output` parameter (e.g., to `"nearest"`).
if match_input_resolution:
prediction = self.image_processor.resize_antialias(
prediction, original_resolution, resample_method_output, is_aa=False
) # [N*T,3,H,W]
if uncertainty is not None and output_uncertainty:
uncertainty = self.image_processor.resize_antialias(
uncertainty, original_resolution, resample_method_output, is_aa=False
) # [N*T,1,H,W]
# 10. Prepare the final outputs.
if output_type == "np":
prediction = self.image_processor.pt_to_numpy(prediction) # [N*T,H,W,3]
if uncertainty is not None and output_uncertainty:
uncertainty = self.image_processor.pt_to_numpy(uncertainty) # [N*T,H,W,3]
# 11. Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (prediction, uncertainty, pred_latent)
return MarigoldIntrinsicsOutput(
prediction=prediction,
uncertainty=uncertainty,
latent=pred_latent,
)
def prepare_latents(
self,
image: torch.Tensor,
latents: Optional[torch.Tensor],
generator: Optional[torch.Generator],
ensemble_size: int,
batch_size: int,
) -> Tuple[torch.Tensor, torch.Tensor]:
def retrieve_latents(encoder_output):
if hasattr(encoder_output, "latent_dist"):
return encoder_output.latent_dist.mode()
elif hasattr(encoder_output, "latents"):
return encoder_output.latents
else:
raise AttributeError("Could not access latents of provided encoder_output")
image_latent = torch.cat(
[
retrieve_latents(self.vae.encode(image[i : i + batch_size]))
for i in range(0, image.shape[0], batch_size)
],
dim=0,
) # [N,4,h,w]
image_latent = image_latent * self.vae.config.scaling_factor
image_latent = image_latent.repeat_interleave(ensemble_size, dim=0) # [N*E,4,h,w]
N_E, C, H, W = image_latent.shape
pred_latent = latents
if pred_latent is None:
pred_latent = randn_tensor(
(N_E, self.n_targets * C, H, W),
generator=generator,
device=image_latent.device,
dtype=image_latent.dtype,
) # [N*E,T*4,h,w]
return image_latent, pred_latent
def decode_prediction(self, pred_latent: torch.Tensor) -> torch.Tensor:
if pred_latent.dim() != 4 or pred_latent.shape[1] != self.vae.config.latent_channels:
raise ValueError(
f"Expecting 4D tensor of shape [B,{self.vae.config.latent_channels},H,W]; got {pred_latent.shape}."
)
prediction = self.vae.decode(pred_latent / self.vae.config.scaling_factor, return_dict=False)[0] # [B,3,H,W]
prediction = torch.clip(prediction, -1.0, 1.0) # [B,3,H,W]
prediction = (prediction + 1.0) / 2.0
return prediction # [B,3,H,W]
@staticmethod
def ensemble_intrinsics(
targets: torch.Tensor,
output_uncertainty: bool = False,
reduction: str = "median",
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
"""
Ensembles the intrinsic decomposition represented by the `targets` tensor with expected shape `(B, T, 3, H,
W)`, where B is the number of ensemble members for a given prediction of size `(H x W)`, and T is the number of
predicted targets.
Args:
targets (`torch.Tensor`):
Input ensemble of intrinsic image decomposition maps.
output_uncertainty (`bool`, *optional*, defaults to `False`):
Whether to output uncertainty map.
reduction (`str`, *optional*, defaults to `"mean"`):
Reduction method used to ensemble aligned predictions. The accepted values are: `"median"` and
`"mean"`.
Returns:
A tensor of aligned and ensembled intrinsic decomposition maps with shape `(T, 3, H, W)` and optionally a
tensor of uncertainties of shape `(T, 3, H, W)`.
"""
if targets.dim() != 5 or targets.shape[2] != 3:
raise ValueError(f"Expecting 4D tensor of shape [B,T,3,H,W]; got {targets.shape}.")
if reduction not in ("median", "mean"):
raise ValueError(f"Unrecognized reduction method: {reduction}.")
B, T, _, H, W = targets.shape
uncertainty = None
if reduction == "mean":
prediction = torch.mean(targets, dim=0) # [T,3,H,W]
if output_uncertainty:
uncertainty = torch.std(targets, dim=0) # [T,3,H,W]
elif reduction == "median":
prediction = torch.median(targets, dim=0, keepdim=True).values # [1,T,3,H,W]
if output_uncertainty:
uncertainty = torch.abs(targets - prediction) # [B,T,3,H,W]
uncertainty = torch.median(uncertainty, dim=0).values # [T,3,H,W]
prediction = prediction.squeeze(0) # [T,3,H,W]
else:
raise ValueError(f"Unrecognized reduction method: {reduction}.")
return prediction, uncertainty
@@ -1,5 +1,5 @@
# Copyright 2023-2025 Marigold Team, ETH Zürich. All rights reserved.
# Copyright 2024-2025 The HuggingFace Team. All rights reserved.
# Copyright 2024 Marigold authors, PRS ETH Zurich. All rights reserved.
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
# limitations under the License.
# --------------------------------------------------------------------------
# More information and citation instructions are available on the
# Marigold project website: https://marigoldcomputervision.github.io
# Marigold project website: https://marigoldmonodepth.github.io
# --------------------------------------------------------------------------
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Union
@@ -62,7 +62,7 @@ Examples:
>>> import torch
>>> pipe = diffusers.MarigoldNormalsPipeline.from_pretrained(
... "prs-eth/marigold-normals-v1-1", variant="fp16", torch_dtype=torch.float16
... "prs-eth/marigold-normals-lcm-v0-1", variant="fp16", torch_dtype=torch.float16
... ).to("cuda")
>>> image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
@@ -81,12 +81,11 @@ class MarigoldNormalsOutput(BaseOutput):
Args:
prediction (`np.ndarray`, `torch.Tensor`):
Predicted normals with values in the range [-1, 1]. The shape is $numimages \times 3 \times height \times
width$ for `torch.Tensor` or $numimages \times height \times width \times 3$ for `np.ndarray`.
Predicted normals with values in the range [-1, 1]. The shape is always $numimages \times 3 \times height
\times width$, regardless of whether the images were passed as a 4D array or a list.
uncertainty (`None`, `np.ndarray`, `torch.Tensor`):
Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is $numimages
\times 1 \times height \times width$ for `torch.Tensor` or $numimages \times height \times width \times 1$
for `np.ndarray`.
\times 1 \times height \times width$.
latent (`None`, `torch.Tensor`):
Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
The shape is $numimages * numensemble \times 4 \times latentheight \times latentwidth$.
@@ -165,7 +164,6 @@ class MarigoldNormalsPipeline(DiffusionPipeline):
tokenizer=tokenizer,
)
self.register_to_config(
prediction_type=prediction_type,
use_full_z_range=use_full_z_range,
default_denoising_steps=default_denoising_steps,
default_processing_resolution=default_processing_resolution,
@@ -196,11 +194,6 @@ class MarigoldNormalsPipeline(DiffusionPipeline):
output_type: str,
output_uncertainty: bool,
) -> int:
actual_vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
if actual_vae_scale_factor != self.vae_scale_factor:
raise ValueError(
f"`vae_scale_factor` computed at initialization ({self.vae_scale_factor}) differs from the actual one ({actual_vae_scale_factor})."
)
if num_inference_steps is None:
raise ValueError("`num_inference_steps` is not specified and could not be resolved from the model config.")
if num_inference_steps < 1:
@@ -311,7 +304,6 @@ class MarigoldNormalsPipeline(DiffusionPipeline):
return num_images
@torch.compiler.disable
def progress_bar(self, iterable=None, total=None, desc=None, leave=True):
if not hasattr(self, "_progress_bar_config"):
self._progress_bar_config = {}
@@ -362,9 +354,11 @@ class MarigoldNormalsPipeline(DiffusionPipeline):
same width and height.
num_inference_steps (`int`, *optional*, defaults to `None`):
Number of denoising diffusion steps during inference. The default value `None` results in automatic
selection.
selection. The number of steps should be at least 10 with the full Marigold models, and between 1 and 4
for Marigold-LCM models.
ensemble_size (`int`, defaults to `1`):
Number of ensemble predictions. Higher values result in measurable improvements and visual degradation.
Number of ensemble predictions. Recommended values are 5 and higher for better precision, or 1 for
faster inference.
processing_resolution (`int`, *optional*, defaults to `None`):
Effective processing resolution. When set to `0`, matches the larger input image dimension. This
produces crisper predictions, but may also lead to the overall loss of global context. The default
@@ -400,7 +394,7 @@ class MarigoldNormalsPipeline(DiffusionPipeline):
within the ensemble. These codes can be saved, modified, and used for subsequent calls with the
`latents` argument.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~pipelines.marigold.MarigoldNormalsOutput`] instead of a plain tuple.
Whether or not to return a [`~pipelines.marigold.MarigoldDepthOutput`] instead of a plain tuple.
Examples:
@@ -468,7 +462,9 @@ class MarigoldNormalsPipeline(DiffusionPipeline):
# `pred_latent` variable. The variable `image_latent` is of the same shape: it contains each input image encoded
# into latent space and replicated `E` times. The latents can be either generated (see `generator` to ensure
# reproducibility), or passed explicitly via the `latents` argument. The latter can be set outside the pipeline
# code. This behavior can be achieved by setting the `output_latent` argument to `True`. The latent space
# code. For example, in the Marigold-LCM video processing demo, the latents initialization of a frame is taken
# as a convex combination of the latents output of the pipeline for the previous frame and a newly-sampled
# noise. This behavior can be achieved by setting the `output_latent` argument to `True`. The latent space
# dimensions are `(h, w)`. Encoding into latent space happens in batches of size `batch_size`.
# Model invocation: self.vae.encoder.
image_latent, pred_latent = self.prepare_latents(
@@ -20,7 +20,7 @@ import warnings
from typing import Callable, Dict, List, Optional, Tuple, Union
import torch
from transformers import Gemma2PreTrainedModel, GemmaTokenizer, GemmaTokenizerFast
from transformers import AutoModelForCausalLM, AutoTokenizer
from ...callbacks import MultiPipelineCallbacks, PipelineCallback
from ...image_processor import PixArtImageProcessor
@@ -160,8 +160,8 @@ class SanaPAGPipeline(DiffusionPipeline, PAGMixin):
def __init__(
self,
tokenizer: Union[GemmaTokenizer, GemmaTokenizerFast],
text_encoder: Gemma2PreTrainedModel,
tokenizer: AutoTokenizer,
text_encoder: AutoModelForCausalLM,
vae: AutoencoderDC,
transformer: SanaTransformer2DModel,
scheduler: FlowMatchEulerDiscreteScheduler,
+25 -28
View File
@@ -13,6 +13,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import enum
import fnmatch
import importlib
import inspect
@@ -54,8 +55,6 @@ from ..utils import (
DEPRECATED_REVISION_ARGS,
BaseOutput,
PushToHubMixin,
_get_detailed_type,
_is_valid_type,
is_accelerate_available,
is_accelerate_version,
is_torch_npu_available,
@@ -685,7 +684,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
token = kwargs.pop("token", None)
revision = kwargs.pop("revision", None)
from_flax = kwargs.pop("from_flax", False)
torch_dtype = kwargs.pop("torch_dtype", torch.float32)
torch_dtype = kwargs.pop("torch_dtype", None)
custom_pipeline = kwargs.pop("custom_pipeline", None)
custom_revision = kwargs.pop("custom_revision", None)
provider = kwargs.pop("provider", None)
@@ -702,12 +701,6 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
use_onnx = kwargs.pop("use_onnx", None)
load_connected_pipeline = kwargs.pop("load_connected_pipeline", False)
if not isinstance(torch_dtype, torch.dtype):
torch_dtype = torch.float32
logger.warning(
f"Passed `torch_dtype` {torch_dtype} is not a `torch.dtype`. Defaulting to `torch.float32`."
)
if low_cpu_mem_usage and not is_accelerate_available():
low_cpu_mem_usage = False
logger.warning(
@@ -883,6 +876,26 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
init_dict = {k: v for k, v in init_dict.items() if load_module(k, v)}
for key in init_dict.keys():
if key not in passed_class_obj:
continue
if "scheduler" in key:
continue
class_obj = passed_class_obj[key]
_expected_class_types = []
for expected_type in expected_types[key]:
if isinstance(expected_type, enum.EnumMeta):
_expected_class_types.extend(expected_type.__members__.keys())
else:
_expected_class_types.append(expected_type.__name__)
_is_valid_type = class_obj.__class__.__name__ in _expected_class_types
if not _is_valid_type:
logger.warning(
f"Expected types for {key}: {_expected_class_types}, got {class_obj.__class__.__name__}."
)
# Special case: safety_checker must be loaded separately when using `from_flax`
if from_flax and "safety_checker" in init_dict and "safety_checker" not in passed_class_obj:
raise NotImplementedError(
@@ -1002,26 +1015,10 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
f"Pipeline {pipeline_class} expected {expected_modules}, but only {passed_modules} were passed."
)
# 10. Type checking init arguments
for kw, arg in init_kwargs.items():
# Too complex to validate with type annotation alone
if "scheduler" in kw:
continue
# Many tokenizer annotations don't include its "Fast" variant, so skip this
# e.g T5Tokenizer but not T5TokenizerFast
elif "tokenizer" in kw:
continue
elif (
arg is not None # Skip if None
and not expected_types[kw] == (inspect.Signature.empty,) # Skip if no type annotations
and not _is_valid_type(arg, expected_types[kw]) # Check type
):
logger.warning(f"Expected types for {kw}: {expected_types[kw]}, got {_get_detailed_type(arg)}.")
# 11. Instantiate the pipeline
# 10. Instantiate the pipeline
model = pipeline_class(**init_kwargs)
# 12. Save where the model was instantiated from
# 11. Save where the model was instantiated from
model.register_to_config(_name_or_path=pretrained_model_name_or_path)
if device_map is not None:
setattr(model, "hf_device_map", final_device_map)
@@ -1832,7 +1829,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
"""
original_config = dict(pipeline.config)
torch_dtype = kwargs.pop("torch_dtype", torch.float32)
torch_dtype = kwargs.pop("torch_dtype", None)
# derive the pipeline class to instantiate
custom_pipeline = kwargs.pop("custom_pipeline", None)
@@ -20,7 +20,7 @@ import warnings
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import torch
from transformers import Gemma2PreTrainedModel, GemmaTokenizer, GemmaTokenizerFast
from transformers import AutoModelForCausalLM, AutoTokenizer
from ...callbacks import MultiPipelineCallbacks, PipelineCallback
from ...image_processor import PixArtImageProcessor
@@ -200,8 +200,8 @@ class SanaPipeline(DiffusionPipeline, SanaLoraLoaderMixin):
def __init__(
self,
tokenizer: Union[GemmaTokenizer, GemmaTokenizerFast],
text_encoder: Gemma2PreTrainedModel,
tokenizer: AutoTokenizer,
text_encoder: AutoModelForCausalLM,
vae: AutoencoderDC,
transformer: SanaTransformer2DModel,
scheduler: DPMSolverMultistepScheduler,
@@ -15,7 +15,7 @@
from typing import Callable, Dict, List, Optional, Union
import torch
from transformers import CLIPTextModelWithProjection, CLIPTokenizer
from transformers import CLIPTextModel, CLIPTokenizer
from ...models import StableCascadeUNet
from ...schedulers import DDPMWuerstchenScheduler
@@ -65,7 +65,7 @@ class StableCascadeDecoderPipeline(DiffusionPipeline):
Args:
tokenizer (`CLIPTokenizer`):
The CLIP tokenizer.
text_encoder (`CLIPTextModelWithProjection`):
text_encoder (`CLIPTextModel`):
The CLIP text encoder.
decoder ([`StableCascadeUNet`]):
The Stable Cascade decoder unet.
@@ -93,7 +93,7 @@ class StableCascadeDecoderPipeline(DiffusionPipeline):
self,
decoder: StableCascadeUNet,
tokenizer: CLIPTokenizer,
text_encoder: CLIPTextModelWithProjection,
text_encoder: CLIPTextModel,
scheduler: DDPMWuerstchenScheduler,
vqgan: PaellaVQModel,
latent_dim_scale: float = 10.67,
@@ -15,7 +15,7 @@ from typing import Callable, Dict, List, Optional, Union
import PIL
import torch
from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
from ...models import StableCascadeUNet
from ...schedulers import DDPMWuerstchenScheduler
@@ -52,7 +52,7 @@ class StableCascadeCombinedPipeline(DiffusionPipeline):
Args:
tokenizer (`CLIPTokenizer`):
The decoder tokenizer to be used for text inputs.
text_encoder (`CLIPTextModelWithProjection`):
text_encoder (`CLIPTextModel`):
The decoder text encoder to be used for text inputs.
decoder (`StableCascadeUNet`):
The decoder model to be used for decoder image generation pipeline.
@@ -60,18 +60,14 @@ class StableCascadeCombinedPipeline(DiffusionPipeline):
The scheduler to be used for decoder image generation pipeline.
vqgan (`PaellaVQModel`):
The VQGAN model to be used for decoder image generation pipeline.
feature_extractor ([`~transformers.CLIPImageProcessor`]):
Model that extracts features from generated images to be used as inputs for the `image_encoder`.
image_encoder ([`CLIPVisionModelWithProjection`]):
Frozen CLIP image-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
prior_prior (`StableCascadeUNet`):
The prior model to be used for prior pipeline.
prior_text_encoder (`CLIPTextModelWithProjection`):
The prior text encoder to be used for text inputs.
prior_tokenizer (`CLIPTokenizer`):
The prior tokenizer to be used for text inputs.
prior_scheduler (`DDPMWuerstchenScheduler`):
The scheduler to be used for prior pipeline.
prior_feature_extractor ([`~transformers.CLIPImageProcessor`]):
Model that extracts features from generated images to be used as inputs for the `image_encoder`.
prior_image_encoder ([`CLIPVisionModelWithProjection`]):
Frozen CLIP image-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
"""
_load_connected_pipes = True
@@ -80,12 +76,12 @@ class StableCascadeCombinedPipeline(DiffusionPipeline):
def __init__(
self,
tokenizer: CLIPTokenizer,
text_encoder: CLIPTextModelWithProjection,
text_encoder: CLIPTextModel,
decoder: StableCascadeUNet,
scheduler: DDPMWuerstchenScheduler,
vqgan: PaellaVQModel,
prior_prior: StableCascadeUNet,
prior_text_encoder: CLIPTextModelWithProjection,
prior_text_encoder: CLIPTextModel,
prior_tokenizer: CLIPTokenizer,
prior_scheduler: DDPMWuerstchenScheduler,
prior_feature_extractor: Optional[CLIPImageProcessor] = None,
@@ -141,7 +141,7 @@ class StableUnCLIPPipeline(
image_noising_scheduler: KarrasDiffusionSchedulers,
# regular denoising components
tokenizer: CLIPTokenizer,
text_encoder: CLIPTextModel,
text_encoder: CLIPTextModelWithProjection,
unet: UNet2DConditionModel,
scheduler: KarrasDiffusionSchedulers,
# vae
@@ -17,10 +17,10 @@ from typing import Any, Callable, Dict, List, Optional, Union
import torch
from transformers import (
BaseImageProcessor,
CLIPTextModelWithProjection,
CLIPTokenizer,
SiglipImageProcessor,
SiglipVisionModel,
PreTrainedModel,
T5EncoderModel,
T5TokenizerFast,
)
@@ -176,9 +176,9 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
tokenizer_3 (`T5TokenizerFast`):
Tokenizer of class
[T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
image_encoder (`SiglipVisionModel`, *optional*):
image_encoder (`PreTrainedModel`, *optional*):
Pre-trained Vision Model for IP Adapter.
feature_extractor (`SiglipImageProcessor`, *optional*):
feature_extractor (`BaseImageProcessor`, *optional*):
Image processor for IP Adapter.
"""
@@ -197,8 +197,8 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
tokenizer_2: CLIPTokenizer,
text_encoder_3: T5EncoderModel,
tokenizer_3: T5TokenizerFast,
image_encoder: SiglipVisionModel = None,
feature_extractor: SiglipImageProcessor = None,
image_encoder: PreTrainedModel = None,
feature_extractor: BaseImageProcessor = None,
):
super().__init__()
@@ -18,10 +18,10 @@ from typing import Any, Callable, Dict, List, Optional, Union
import PIL.Image
import torch
from transformers import (
BaseImageProcessor,
CLIPTextModelWithProjection,
CLIPTokenizer,
SiglipImageProcessor,
SiglipVisionModel,
PreTrainedModel,
T5EncoderModel,
T5TokenizerFast,
)
@@ -197,10 +197,6 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
tokenizer_3 (`T5TokenizerFast`):
Tokenizer of class
[T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
image_encoder (`SiglipVisionModel`, *optional*):
Pre-trained Vision Model for IP Adapter.
feature_extractor (`SiglipImageProcessor`, *optional*):
Image processor for IP Adapter.
"""
model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->image_encoder->transformer->vae"
@@ -218,8 +214,8 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
tokenizer_2: CLIPTokenizer,
text_encoder_3: T5EncoderModel,
tokenizer_3: T5TokenizerFast,
image_encoder: Optional[SiglipVisionModel] = None,
feature_extractor: Optional[SiglipImageProcessor] = None,
image_encoder: PreTrainedModel = None,
feature_extractor: BaseImageProcessor = None,
):
super().__init__()
@@ -17,10 +17,10 @@ from typing import Any, Callable, Dict, List, Optional, Union
import torch
from transformers import (
BaseImageProcessor,
CLIPTextModelWithProjection,
CLIPTokenizer,
SiglipImageProcessor,
SiglipVisionModel,
PreTrainedModel,
T5EncoderModel,
T5TokenizerFast,
)
@@ -196,9 +196,9 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
tokenizer_3 (`T5TokenizerFast`):
Tokenizer of class
[T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
image_encoder (`SiglipVisionModel`, *optional*):
image_encoder (`PreTrainedModel`, *optional*):
Pre-trained Vision Model for IP Adapter.
feature_extractor (`SiglipImageProcessor`, *optional*):
feature_extractor (`BaseImageProcessor`, *optional*):
Image processor for IP Adapter.
"""
@@ -217,8 +217,8 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
tokenizer_2: CLIPTokenizer,
text_encoder_3: T5EncoderModel,
tokenizer_3: T5TokenizerFast,
image_encoder: Optional[SiglipVisionModel] = None,
feature_extractor: Optional[SiglipImageProcessor] = None,
image_encoder: PreTrainedModel = None,
feature_extractor: BaseImageProcessor = None,
):
super().__init__()
@@ -19,31 +19,15 @@ from typing import Callable, List, Optional, Union
import torch
from k_diffusion.external import CompVisDenoiser, CompVisVDenoiser
from k_diffusion.sampling import BrownianTreeNoiseSampler, get_sigmas_karras
from transformers import (
CLIPImageProcessor,
CLIPTextModel,
CLIPTokenizer,
CLIPTokenizerFast,
)
from ...image_processor import VaeImageProcessor
from ...loaders import (
StableDiffusionLoraLoaderMixin,
TextualInversionLoaderMixin,
)
from ...models import AutoencoderKL, UNet2DConditionModel
from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import KarrasDiffusionSchedulers, LMSDiscreteScheduler
from ...utils import (
USE_PEFT_BACKEND,
deprecate,
logging,
scale_lora_layers,
unscale_lora_layers,
)
from ...schedulers import LMSDiscreteScheduler
from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
from ...utils.torch_utils import randn_tensor
from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from ..stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
from ..stable_diffusion import StableDiffusionPipelineOutput
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
@@ -111,13 +95,13 @@ class StableDiffusionKDiffusionPipeline(
def __init__(
self,
vae: AutoencoderKL,
text_encoder: CLIPTextModel,
tokenizer: Union[CLIPTokenizer, CLIPTokenizerFast],
unet: UNet2DConditionModel,
scheduler: KarrasDiffusionSchedulers,
safety_checker: StableDiffusionSafetyChecker,
feature_extractor: CLIPImageProcessor,
vae,
text_encoder,
tokenizer,
unet,
scheduler,
safety_checker,
feature_extractor,
requires_safety_checker: bool = True,
):
super().__init__()
-1
View File
@@ -123,7 +123,6 @@ from .state_dict_utils import (
convert_state_dict_to_peft,
convert_unet_state_dict_to_peft,
)
from .typing_utils import _get_detailed_type, _is_valid_type
logger = get_logger(__name__)
@@ -1217,21 +1217,6 @@ class MarigoldDepthPipeline(metaclass=DummyObject):
requires_backends(cls, ["torch", "transformers"])
class MarigoldIntrinsicsPipeline(metaclass=DummyObject):
_backends = ["torch", "transformers"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch", "transformers"])
@classmethod
def from_config(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
class MarigoldNormalsPipeline(metaclass=DummyObject):
_backends = ["torch", "transformers"]
@@ -1532,21 +1517,6 @@ class StableCascadePriorPipeline(metaclass=DummyObject):
requires_backends(cls, ["torch", "transformers"])
class StableDiffusion3ControlNetInpaintingPipeline(metaclass=DummyObject):
_backends = ["torch", "transformers"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch", "transformers"])
@classmethod
def from_config(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
class StableDiffusion3ControlNetPipeline(metaclass=DummyObject):
_backends = ["torch", "transformers"]
-91
View File
@@ -1,91 +0,0 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Typing utilities: Utilities related to type checking and validation
"""
from typing import Any, Dict, List, Set, Tuple, Type, Union, get_args, get_origin
def _is_valid_type(obj: Any, class_or_tuple: Union[Type, Tuple[Type, ...]]) -> bool:
"""
Checks if an object is an instance of any of the provided types. For collections, it checks if every element is of
the correct type as well.
"""
if not isinstance(class_or_tuple, tuple):
class_or_tuple = (class_or_tuple,)
# Unpack unions
unpacked_class_or_tuple = []
for t in class_or_tuple:
if get_origin(t) is Union:
unpacked_class_or_tuple.extend(get_args(t))
else:
unpacked_class_or_tuple.append(t)
class_or_tuple = tuple(unpacked_class_or_tuple)
if Any in class_or_tuple:
return True
obj_type = type(obj)
# Classes with obj's type
class_or_tuple = {t for t in class_or_tuple if isinstance(obj, get_origin(t) or t)}
# Singular types (e.g. int, ControlNet, ...)
# Untyped collections (e.g. List, but not List[int])
elem_class_or_tuple = {get_args(t) for t in class_or_tuple}
if () in elem_class_or_tuple:
return True
# Typed lists or sets
elif obj_type in (list, set):
return any(all(_is_valid_type(x, t) for x in obj) for t in elem_class_or_tuple)
# Typed tuples
elif obj_type is tuple:
return any(
# Tuples with any length and single type (e.g. Tuple[int, ...])
(len(t) == 2 and t[-1] is Ellipsis and all(_is_valid_type(x, t[0]) for x in obj))
or
# Tuples with fixed length and any types (e.g. Tuple[int, str])
(len(obj) == len(t) and all(_is_valid_type(x, tt) for x, tt in zip(obj, t)))
for t in elem_class_or_tuple
)
# Typed dicts
elif obj_type is dict:
return any(
all(_is_valid_type(k, kt) and _is_valid_type(v, vt) for k, v in obj.items())
for kt, vt in elem_class_or_tuple
)
else:
return False
def _get_detailed_type(obj: Any) -> Type:
"""
Gets a detailed type for an object, including nested types for collections.
"""
obj_type = type(obj)
if obj_type in (list, set):
obj_origin_type = List if obj_type is list else Set
elems_type = Union[tuple({_get_detailed_type(x) for x in obj})]
return obj_origin_type[elems_type]
elif obj_type is tuple:
return Tuple[tuple(_get_detailed_type(x) for x in obj)]
elif obj_type is dict:
keys_type = Union[tuple({_get_detailed_type(k) for k in obj.keys()})]
values_type = Union[tuple({_get_detailed_type(k) for k in obj.values()})]
return Dict[keys_type, values_type]
else:
return obj_type
+2 -2
View File
@@ -18,7 +18,7 @@ from typing import Optional, Tuple, Union
import torch
from diffusers import DiffusionPipeline, ImagePipelineOutput, SchedulerMixin, UNet2DModel
from diffusers import DiffusionPipeline, ImagePipelineOutput
class CustomLocalPipeline(DiffusionPipeline):
@@ -33,7 +33,7 @@ class CustomLocalPipeline(DiffusionPipeline):
[`DDPMScheduler`], or [`DDIMScheduler`].
"""
def __init__(self, unet: UNet2DModel, scheduler: SchedulerMixin):
def __init__(self, unet, scheduler):
super().__init__()
self.register_modules(unet=unet, scheduler=scheduler)
+1 -2
View File
@@ -18,7 +18,6 @@ from typing import Optional, Tuple, Union
import torch
from diffusers import SchedulerMixin, UNet2DModel
from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
@@ -34,7 +33,7 @@ class CustomLocalPipeline(DiffusionPipeline):
[`DDPMScheduler`], or [`DDIMScheduler`].
"""
def __init__(self, unet: UNet2DModel, scheduler: SchedulerMixin):
def __init__(self, unet, scheduler):
super().__init__()
self.register_modules(unet=unet, scheduler=scheduler)
+2 -42
View File
@@ -15,8 +15,6 @@
import sys
import unittest
import numpy as np
import pytest
import torch
from transformers import AutoTokenizer, GemmaForCausalLM
@@ -26,12 +24,12 @@ from diffusers import (
Lumina2Text2ImgPipeline,
Lumina2Transformer2DModel,
)
from diffusers.utils.testing_utils import floats_tensor, is_torch_version, require_peft_backend, skip_mps, torch_device
from diffusers.utils.testing_utils import floats_tensor, require_peft_backend
sys.path.append(".")
from utils import PeftLoraLoaderMixinTests, check_if_lora_correctly_set # noqa: E402
from utils import PeftLoraLoaderMixinTests # noqa: E402
@require_peft_backend
@@ -132,41 +130,3 @@ class Lumina2LoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):
@unittest.skip("Text encoder LoRA is not supported in Lumina2.")
def test_simple_inference_with_text_lora_save_load(self):
pass
@skip_mps
@pytest.mark.xfail(
condition=torch.device(torch_device).type == "cpu" and is_torch_version(">=", "2.5"),
reason="Test currently fails on CPU and PyTorch 2.5.1 but not on PyTorch 2.4.1.",
strict=False,
)
def test_lora_fuse_nan(self):
for scheduler_cls in self.scheduler_classes:
components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls)
pipe = self.pipeline_class(**components)
pipe = pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
_, _, inputs = self.get_dummy_inputs(with_generator=False)
if "text_encoder" in self.pipeline_class._lora_loadable_modules:
pipe.text_encoder.add_adapter(text_lora_config, "adapter-1")
self.assertTrue(
check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder"
)
denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
denoiser.add_adapter(denoiser_lora_config, "adapter-1")
self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.")
# corrupt one LoRA weight with `inf` values
with torch.no_grad():
pipe.transformer.layers[0].attn.to_q.lora_A["adapter-1"].weight += float("inf")
# with `safe_fusing=True` we should see an Error
with self.assertRaises(ValueError):
pipe.fuse_lora(components=self.pipeline_class._lora_loadable_modules, safe_fusing=True)
# without we should not see an error, but every image will be black
pipe.fuse_lora(components=self.pipeline_class._lora_loadable_modules, safe_fusing=False)
out = pipe(**inputs)[0]
self.assertTrue(np.isnan(out).all())
+5 -4
View File
@@ -1169,16 +1169,17 @@ class ModelTesterMixin:
base_output = model(**inputs_dict)
model_size = compute_module_sizes(model)[""]
max_size = int(self.model_split_percents[0] * model_size)
# Force disk offload by setting very small CPU memory
max_memory = {0: max_size, "cpu": int(0.1 * max_size)}
with tempfile.TemporaryDirectory() as tmp_dir:
model.cpu().save_pretrained(tmp_dir, safe_serialization=False)
with self.assertRaises(ValueError):
max_size = int(self.model_split_percents[0] * model_size)
max_memory = {0: max_size, "cpu": max_size}
# This errors out because it's missing an offload folder
new_model = self.model_class.from_pretrained(tmp_dir, device_map="auto", max_memory=max_memory)
max_size = int(self.model_split_percents[0] * model_size)
max_memory = {0: max_size, "cpu": max_size}
new_model = self.model_class.from_pretrained(
tmp_dir, device_map="auto", max_memory=max_memory, offload_folder=tmp_dir
)
@@ -30,7 +30,6 @@ class OmniGenTransformerTests(ModelTesterMixin, unittest.TestCase):
model_class = OmniGenTransformer2DModel
main_input_name = "hidden_states"
uses_custom_attn_processor = True
model_split_percents = [0.1, 0.1, 0.1]
@property
def dummy_input(self):
@@ -74,9 +73,9 @@ class OmniGenTransformerTests(ModelTesterMixin, unittest.TestCase):
"num_attention_heads": 4,
"num_key_value_heads": 4,
"intermediate_size": 32,
"num_layers": 20,
"num_layers": 1,
"pad_token_id": 0,
"vocab_size": 1000,
"vocab_size": 100,
"in_channels": 4,
"time_step_dim": 4,
"rope_scaling": {"long_factor": list(range(1, 3)), "short_factor": list(range(1, 3))},
@@ -33,7 +33,6 @@ enable_full_determinism()
class SD3TransformerTests(ModelTesterMixin, unittest.TestCase):
model_class = SD3Transformer2DModel
main_input_name = "hidden_states"
model_split_percents = [0.8, 0.8, 0.9]
@property
def dummy_input(self):
@@ -68,7 +67,7 @@ class SD3TransformerTests(ModelTesterMixin, unittest.TestCase):
"sample_size": 32,
"patch_size": 1,
"in_channels": 4,
"num_layers": 4,
"num_layers": 1,
"attention_head_dim": 8,
"num_attention_heads": 4,
"caption_projection_dim": 32,
@@ -108,7 +107,6 @@ class SD3TransformerTests(ModelTesterMixin, unittest.TestCase):
class SD35TransformerTests(ModelTesterMixin, unittest.TestCase):
model_class = SD3Transformer2DModel
main_input_name = "hidden_states"
model_split_percents = [0.8, 0.8, 0.9]
@property
def dummy_input(self):
@@ -143,7 +141,7 @@ class SD35TransformerTests(ModelTesterMixin, unittest.TestCase):
"sample_size": 32,
"patch_size": 1,
"in_channels": 4,
"num_layers": 4,
"num_layers": 2,
"attention_head_dim": 8,
"num_attention_heads": 4,
"caption_projection_dim": 32,
+1 -3
View File
@@ -89,9 +89,7 @@ class KolorsPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
sample_size=128,
)
torch.manual_seed(0)
text_encoder = ChatGLMModel.from_pretrained(
"hf-internal-testing/tiny-random-chatglm3-6b", torch_dtype=torch.bfloat16
)
text_encoder = ChatGLMModel.from_pretrained("hf-internal-testing/tiny-random-chatglm3-6b")
tokenizer = ChatGLMTokenizer.from_pretrained("hf-internal-testing/tiny-random-chatglm3-6b")
components = {
@@ -93,9 +93,7 @@ class KolorsPipelineImg2ImgFastTests(PipelineTesterMixin, unittest.TestCase):
sample_size=128,
)
torch.manual_seed(0)
text_encoder = ChatGLMModel.from_pretrained(
"hf-internal-testing/tiny-random-chatglm3-6b", torch_dtype=torch.bfloat16
)
text_encoder = ChatGLMModel.from_pretrained("hf-internal-testing/tiny-random-chatglm3-6b")
tokenizer = ChatGLMTokenizer.from_pretrained("hf-internal-testing/tiny-random-chatglm3-6b")
components = {
@@ -91,10 +91,10 @@ class Lumina2Text2ImgPipelinePipelineFastTests(unittest.TestCase, PipelineTester
text_encoder = Gemma2Model(config)
components = {
"transformer": transformer,
"transformer": transformer.eval(),
"vae": vae.eval(),
"scheduler": scheduler,
"text_encoder": text_encoder,
"text_encoder": text_encoder.eval(),
"tokenizer": tokenizer,
}
return components
@@ -1,5 +1,5 @@
# Copyright 2023-2025 Marigold Team, ETH Zürich. All rights reserved.
# Copyright 2024-2025 The HuggingFace Team. All rights reserved.
# Copyright 2024 Marigold authors, PRS ETH Zurich. All rights reserved.
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
# limitations under the License.
# --------------------------------------------------------------------------
# More information and citation instructions are available on the
# Marigold project website: https://marigoldcomputervision.github.io
# Marigold project website: https://marigoldmonodepth.github.io
# --------------------------------------------------------------------------
import gc
import random
@@ -1,571 +0,0 @@
# Copyright 2023-2025 Marigold Team, ETH Zürich. All rights reserved.
# Copyright 2024-2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --------------------------------------------------------------------------
# More information and citation instructions are available on the
# Marigold project website: https://marigoldcomputervision.github.io
# --------------------------------------------------------------------------
import gc
import random
import unittest
import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
import diffusers
from diffusers import (
AutoencoderKL,
AutoencoderTiny,
DDIMScheduler,
MarigoldIntrinsicsPipeline,
UNet2DConditionModel,
)
from diffusers.utils.testing_utils import (
enable_full_determinism,
floats_tensor,
load_image,
require_torch_gpu,
slow,
torch_device,
)
from ..test_pipelines_common import PipelineTesterMixin, to_np
enable_full_determinism()
class MarigoldIntrinsicsPipelineTesterMixin(PipelineTesterMixin):
def _test_inference_batch_single_identical(
self,
batch_size=2,
expected_max_diff=1e-4,
additional_params_copy_to_batched_inputs=["num_inference_steps"],
):
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
for components in pipe.components.values():
if hasattr(components, "set_default_attn_processor"):
components.set_default_attn_processor()
pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(torch_device)
# Reset generator in case it is has been used in self.get_dummy_inputs
inputs["generator"] = self.get_generator(0)
logger = diffusers.logging.get_logger(pipe.__module__)
logger.setLevel(level=diffusers.logging.FATAL)
# batchify inputs
batched_inputs = {}
batched_inputs.update(inputs)
for name in self.batch_params:
if name not in inputs:
continue
value = inputs[name]
if name == "prompt":
len_prompt = len(value)
batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
batched_inputs[name][-1] = 100 * "very long"
else:
batched_inputs[name] = batch_size * [value]
if "generator" in inputs:
batched_inputs["generator"] = [self.get_generator(i) for i in range(batch_size)]
if "batch_size" in inputs:
batched_inputs["batch_size"] = batch_size
for arg in additional_params_copy_to_batched_inputs:
batched_inputs[arg] = inputs[arg]
output = pipe(**inputs)
output_batch = pipe(**batched_inputs)
assert output_batch[0].shape[0] == batch_size * output[0].shape[0] # only changed here
max_diff = np.abs(to_np(output_batch[0][0]) - to_np(output[0][0])).max()
assert max_diff < expected_max_diff
def _test_inference_batch_consistent(
self, batch_sizes=[2], additional_params_copy_to_batched_inputs=["num_inference_steps"], batch_generator=True
):
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(torch_device)
inputs["generator"] = self.get_generator(0)
logger = diffusers.logging.get_logger(pipe.__module__)
logger.setLevel(level=diffusers.logging.FATAL)
# prepare batched inputs
batched_inputs = []
for batch_size in batch_sizes:
batched_input = {}
batched_input.update(inputs)
for name in self.batch_params:
if name not in inputs:
continue
value = inputs[name]
if name == "prompt":
len_prompt = len(value)
# make unequal batch sizes
batched_input[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
# make last batch super long
batched_input[name][-1] = 100 * "very long"
else:
batched_input[name] = batch_size * [value]
if batch_generator and "generator" in inputs:
batched_input["generator"] = [self.get_generator(i) for i in range(batch_size)]
if "batch_size" in inputs:
batched_input["batch_size"] = batch_size
batched_inputs.append(batched_input)
logger.setLevel(level=diffusers.logging.WARNING)
for batch_size, batched_input in zip(batch_sizes, batched_inputs):
output = pipe(**batched_input)
assert len(output[0]) == batch_size * pipe.n_targets # only changed here
class MarigoldIntrinsicsPipelineFastTests(MarigoldIntrinsicsPipelineTesterMixin, unittest.TestCase):
pipeline_class = MarigoldIntrinsicsPipeline
params = frozenset(["image"])
batch_params = frozenset(["image"])
image_params = frozenset(["image"])
image_latents_params = frozenset(["latents"])
callback_cfg_params = frozenset([])
test_xformers_attention = False
required_optional_params = frozenset(
[
"num_inference_steps",
"generator",
"output_type",
]
)
def get_dummy_components(self, time_cond_proj_dim=None):
torch.manual_seed(0)
unet = UNet2DConditionModel(
block_out_channels=(32, 64),
layers_per_block=2,
time_cond_proj_dim=time_cond_proj_dim,
sample_size=32,
in_channels=12,
out_channels=8,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
cross_attention_dim=32,
)
torch.manual_seed(0)
scheduler = DDIMScheduler(
beta_start=0.00085,
beta_end=0.012,
prediction_type="v_prediction",
set_alpha_to_one=False,
steps_offset=1,
beta_schedule="scaled_linear",
clip_sample=False,
thresholding=False,
)
torch.manual_seed(0)
vae = AutoencoderKL(
block_out_channels=[32, 64],
in_channels=3,
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
latent_channels=4,
)
torch.manual_seed(0)
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
hidden_size=32,
intermediate_size=37,
layer_norm_eps=1e-05,
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
vocab_size=1000,
)
text_encoder = CLIPTextModel(text_encoder_config)
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
components = {
"unet": unet,
"scheduler": scheduler,
"vae": vae,
"text_encoder": text_encoder,
"tokenizer": tokenizer,
"prediction_type": "intrinsics",
}
return components
def get_dummy_tiny_autoencoder(self):
return AutoencoderTiny(in_channels=3, out_channels=3, latent_channels=4)
def get_dummy_inputs(self, device, seed=0):
image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
image = image / 2 + 0.5
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
inputs = {
"image": image,
"num_inference_steps": 1,
"processing_resolution": 0,
"generator": generator,
"output_type": "np",
}
return inputs
def _test_marigold_intrinsics(
self,
generator_seed: int = 0,
expected_slice: np.ndarray = None,
atol: float = 1e-4,
**pipe_kwargs,
):
device = "cpu"
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe.to(device)
pipe.set_progress_bar_config(disable=None)
pipe_inputs = self.get_dummy_inputs(device, seed=generator_seed)
pipe_inputs.update(**pipe_kwargs)
prediction = pipe(**pipe_inputs).prediction
prediction_slice = prediction[0, -3:, -3:, -1].flatten()
if pipe_inputs.get("match_input_resolution", True):
self.assertEqual(prediction.shape, (2, 32, 32, 3), "Unexpected output resolution")
else:
self.assertTrue(prediction.shape[0] == 2 and prediction.shape[3] == 3, "Unexpected output dimensions")
self.assertEqual(
max(prediction.shape[1:3]),
pipe_inputs.get("processing_resolution", 768),
"Unexpected output resolution",
)
np.set_printoptions(precision=5, suppress=True)
msg = f"{prediction_slice}"
self.assertTrue(np.allclose(prediction_slice, expected_slice, atol=atol), msg)
# self.assertTrue(np.allclose(prediction_slice, expected_slice, atol=atol))
def test_marigold_depth_dummy_defaults(self):
self._test_marigold_intrinsics(
expected_slice=np.array([0.6423, 0.40664, 0.41185, 0.65832, 0.63935, 0.43971, 0.51786, 0.55216, 0.47683]),
)
def test_marigold_depth_dummy_G0_S1_P32_E1_B1_M1(self):
self._test_marigold_intrinsics(
generator_seed=0,
expected_slice=np.array([0.6423, 0.40664, 0.41185, 0.65832, 0.63935, 0.43971, 0.51786, 0.55216, 0.47683]),
num_inference_steps=1,
processing_resolution=32,
ensemble_size=1,
batch_size=1,
match_input_resolution=True,
)
def test_marigold_depth_dummy_G0_S1_P16_E1_B1_M1(self):
self._test_marigold_intrinsics(
generator_seed=0,
expected_slice=np.array([0.53132, 0.44487, 0.40164, 0.5326, 0.49073, 0.46979, 0.53324, 0.51366, 0.50387]),
num_inference_steps=1,
processing_resolution=16,
ensemble_size=1,
batch_size=1,
match_input_resolution=True,
)
def test_marigold_depth_dummy_G2024_S1_P32_E1_B1_M1(self):
self._test_marigold_intrinsics(
generator_seed=2024,
expected_slice=np.array([0.40257, 0.39468, 0.51373, 0.4161, 0.40162, 0.58535, 0.43581, 0.47834, 0.48951]),
num_inference_steps=1,
processing_resolution=32,
ensemble_size=1,
batch_size=1,
match_input_resolution=True,
)
def test_marigold_depth_dummy_G0_S2_P32_E1_B1_M1(self):
self._test_marigold_intrinsics(
generator_seed=0,
expected_slice=np.array([0.49636, 0.4518, 0.42722, 0.59044, 0.6362, 0.39011, 0.53522, 0.55153, 0.48699]),
num_inference_steps=2,
processing_resolution=32,
ensemble_size=1,
batch_size=1,
match_input_resolution=True,
)
def test_marigold_depth_dummy_G0_S1_P64_E1_B1_M1(self):
self._test_marigold_intrinsics(
generator_seed=0,
expected_slice=np.array([0.55547, 0.43511, 0.4887, 0.56399, 0.63867, 0.56337, 0.47889, 0.52925, 0.49235]),
num_inference_steps=1,
processing_resolution=64,
ensemble_size=1,
batch_size=1,
match_input_resolution=True,
)
def test_marigold_depth_dummy_G0_S1_P32_E3_B1_M1(self):
self._test_marigold_intrinsics(
generator_seed=0,
expected_slice=np.array([0.57249, 0.49824, 0.54438, 0.57733, 0.52404, 0.5255, 0.56493, 0.56336, 0.48579]),
num_inference_steps=1,
processing_resolution=32,
ensemble_size=3,
ensembling_kwargs={"reduction": "mean"},
batch_size=1,
match_input_resolution=True,
)
def test_marigold_depth_dummy_G0_S1_P32_E4_B2_M1(self):
self._test_marigold_intrinsics(
generator_seed=0,
expected_slice=np.array([0.6294, 0.5575, 0.53414, 0.61077, 0.57156, 0.53974, 0.52956, 0.55467, 0.48751]),
num_inference_steps=1,
processing_resolution=32,
ensemble_size=4,
ensembling_kwargs={"reduction": "mean"},
batch_size=2,
match_input_resolution=True,
)
def test_marigold_depth_dummy_G0_S1_P16_E1_B1_M0(self):
self._test_marigold_intrinsics(
generator_seed=0,
expected_slice=np.array([0.63511, 0.68137, 0.48783, 0.46689, 0.58505, 0.36757, 0.58465, 0.54302, 0.50387]),
num_inference_steps=1,
processing_resolution=16,
ensemble_size=1,
batch_size=1,
match_input_resolution=False,
)
def test_marigold_depth_dummy_no_num_inference_steps(self):
with self.assertRaises(ValueError) as e:
self._test_marigold_intrinsics(
num_inference_steps=None,
expected_slice=np.array([0.0]),
)
self.assertIn("num_inference_steps", str(e))
def test_marigold_depth_dummy_no_processing_resolution(self):
with self.assertRaises(ValueError) as e:
self._test_marigold_intrinsics(
processing_resolution=None,
expected_slice=np.array([0.0]),
)
self.assertIn("processing_resolution", str(e))
@slow
@require_torch_gpu
class MarigoldIntrinsicsPipelineIntegrationTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
torch.cuda.empty_cache()
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
def _test_marigold_intrinsics(
self,
is_fp16: bool = True,
device: str = "cuda",
generator_seed: int = 0,
expected_slice: np.ndarray = None,
model_id: str = "prs-eth/marigold-iid-appearance-v1-1",
image_url: str = "https://marigoldmonodepth.github.io/images/einstein.jpg",
atol: float = 1e-4,
**pipe_kwargs,
):
from_pretrained_kwargs = {}
if is_fp16:
from_pretrained_kwargs["variant"] = "fp16"
from_pretrained_kwargs["torch_dtype"] = torch.float16
pipe = MarigoldIntrinsicsPipeline.from_pretrained(model_id, **from_pretrained_kwargs)
if device == "cuda":
pipe.enable_model_cpu_offload()
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device=device).manual_seed(generator_seed)
image = load_image(image_url)
width, height = image.size
prediction = pipe(image, generator=generator, **pipe_kwargs).prediction
prediction_slice = prediction[0, -3:, -3:, -1].flatten()
if pipe_kwargs.get("match_input_resolution", True):
self.assertEqual(prediction.shape, (2, height, width, 3), "Unexpected output resolution")
else:
self.assertTrue(prediction.shape[0] == 2 and prediction.shape[3] == 3, "Unexpected output dimensions")
self.assertEqual(
max(prediction.shape[1:3]),
pipe_kwargs.get("processing_resolution", 768),
"Unexpected output resolution",
)
msg = f"{prediction_slice}"
self.assertTrue(np.allclose(prediction_slice, expected_slice, atol=atol), msg)
# self.assertTrue(np.allclose(prediction_slice, expected_slice, atol=atol))
def test_marigold_intrinsics_einstein_f32_cpu_G0_S1_P32_E1_B1_M1(self):
self._test_marigold_intrinsics(
is_fp16=False,
device="cpu",
generator_seed=0,
expected_slice=np.array([0.9162, 0.9162, 0.9162, 0.9162, 0.9162, 0.9162, 0.9162, 0.9162, 0.9162]),
num_inference_steps=1,
processing_resolution=32,
ensemble_size=1,
batch_size=1,
match_input_resolution=True,
)
def test_marigold_intrinsics_einstein_f32_cuda_G0_S1_P768_E1_B1_M1(self):
self._test_marigold_intrinsics(
is_fp16=False,
device="cuda",
generator_seed=0,
expected_slice=np.array([0.62127, 0.61906, 0.61687, 0.61946, 0.61903, 0.61961, 0.61808, 0.62099, 0.62894]),
num_inference_steps=1,
processing_resolution=768,
ensemble_size=1,
batch_size=1,
match_input_resolution=True,
)
def test_marigold_intrinsics_einstein_f16_cuda_G0_S1_P768_E1_B1_M1(self):
self._test_marigold_intrinsics(
is_fp16=True,
device="cuda",
generator_seed=0,
expected_slice=np.array([0.62109, 0.61914, 0.61719, 0.61963, 0.61914, 0.61963, 0.61816, 0.62109, 0.62891]),
num_inference_steps=1,
processing_resolution=768,
ensemble_size=1,
batch_size=1,
match_input_resolution=True,
)
def test_marigold_intrinsics_einstein_f16_cuda_G2024_S1_P768_E1_B1_M1(self):
self._test_marigold_intrinsics(
is_fp16=True,
device="cuda",
generator_seed=2024,
expected_slice=np.array([0.64111, 0.63916, 0.63623, 0.63965, 0.63916, 0.63965, 0.6377, 0.64062, 0.64941]),
num_inference_steps=1,
processing_resolution=768,
ensemble_size=1,
batch_size=1,
match_input_resolution=True,
)
def test_marigold_intrinsics_einstein_f16_cuda_G0_S2_P768_E1_B1_M1(self):
self._test_marigold_intrinsics(
is_fp16=True,
device="cuda",
generator_seed=0,
expected_slice=np.array([0.60254, 0.60059, 0.59961, 0.60156, 0.60107, 0.60205, 0.60254, 0.60449, 0.61133]),
num_inference_steps=2,
processing_resolution=768,
ensemble_size=1,
batch_size=1,
match_input_resolution=True,
)
def test_marigold_intrinsics_einstein_f16_cuda_G0_S1_P512_E1_B1_M1(self):
self._test_marigold_intrinsics(
is_fp16=True,
device="cuda",
generator_seed=0,
expected_slice=np.array([0.64551, 0.64453, 0.64404, 0.64502, 0.64844, 0.65039, 0.64502, 0.65039, 0.65332]),
num_inference_steps=1,
processing_resolution=512,
ensemble_size=1,
batch_size=1,
match_input_resolution=True,
)
def test_marigold_intrinsics_einstein_f16_cuda_G0_S1_P768_E3_B1_M1(self):
self._test_marigold_intrinsics(
is_fp16=True,
device="cuda",
generator_seed=0,
expected_slice=np.array([0.61572, 0.61377, 0.61182, 0.61426, 0.61377, 0.61426, 0.61279, 0.61572, 0.62354]),
num_inference_steps=1,
processing_resolution=768,
ensemble_size=3,
ensembling_kwargs={"reduction": "mean"},
batch_size=1,
match_input_resolution=True,
)
def test_marigold_intrinsics_einstein_f16_cuda_G0_S1_P768_E4_B2_M1(self):
self._test_marigold_intrinsics(
is_fp16=True,
device="cuda",
generator_seed=0,
expected_slice=np.array([0.61914, 0.6167, 0.61475, 0.61719, 0.61719, 0.61768, 0.61572, 0.61914, 0.62695]),
num_inference_steps=1,
processing_resolution=768,
ensemble_size=4,
ensembling_kwargs={"reduction": "mean"},
batch_size=2,
match_input_resolution=True,
)
def test_marigold_intrinsics_einstein_f16_cuda_G0_S1_P512_E1_B1_M0(self):
self._test_marigold_intrinsics(
is_fp16=True,
device="cuda",
generator_seed=0,
expected_slice=np.array([0.65332, 0.64697, 0.64648, 0.64844, 0.64697, 0.64111, 0.64941, 0.64209, 0.65332]),
num_inference_steps=1,
processing_resolution=512,
ensemble_size=1,
batch_size=1,
match_input_resolution=False,
)
@@ -1,5 +1,5 @@
# Copyright 2023-2025 Marigold Team, ETH Zürich. All rights reserved.
# Copyright 2024-2025 The HuggingFace Team. All rights reserved.
# Copyright 2024 Marigold authors, PRS ETH Zurich. All rights reserved.
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
# limitations under the License.
# --------------------------------------------------------------------------
# More information and citation instructions are available on the
# Marigold project website: https://marigoldcomputervision.github.io
# Marigold project website: https://marigoldmonodepth.github.io
# --------------------------------------------------------------------------
import gc
import random
+1 -3
View File
@@ -98,9 +98,7 @@ class KolorsPAGPipelineFastTests(
sample_size=128,
)
torch.manual_seed(0)
text_encoder = ChatGLMModel.from_pretrained(
"hf-internal-testing/tiny-random-chatglm3-6b", torch_dtype=torch.bfloat16
)
text_encoder = ChatGLMModel.from_pretrained("hf-internal-testing/tiny-random-chatglm3-6b")
tokenizer = ChatGLMTokenizer.from_pretrained("hf-internal-testing/tiny-random-chatglm3-6b")
components = {
-41
View File
@@ -527,9 +527,7 @@ class FluxIPAdapterTesterMixin:
The following scenarios are tested:
- Single IP-Adapter with scale=0 should produce same output as no IP-Adapter.
- Multi IP-Adapter with scale=0 should produce same output as no IP-Adapter.
- Single IP-Adapter with scale!=0 should produce different output compared to no IP-Adapter.
- Multi IP-Adapter with scale!=0 should produce different output compared to no IP-Adapter.
"""
# Raising the tolerance for this test when it's run on a CPU because we
# compare against static slices and that can be shaky (with a VVVV low probability).
@@ -547,7 +545,6 @@ class FluxIPAdapterTesterMixin:
else:
output_without_adapter = expected_pipe_slice
# 1. Single IP-Adapter test cases
adapter_state_dict = create_flux_ip_adapter_state_dict(pipe.transformer)
pipe.transformer._load_ip_adapter_weights(adapter_state_dict)
@@ -581,44 +578,6 @@ class FluxIPAdapterTesterMixin:
max_diff_with_adapter_scale, 1e-2, "Output with ip-adapter must be different from normal inference"
)
# 2. Multi IP-Adapter test cases
adapter_state_dict_1 = create_flux_ip_adapter_state_dict(pipe.transformer)
adapter_state_dict_2 = create_flux_ip_adapter_state_dict(pipe.transformer)
pipe.transformer._load_ip_adapter_weights([adapter_state_dict_1, adapter_state_dict_2])
# forward pass with multi ip adapter, but scale=0 which should have no effect
inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
inputs["ip_adapter_image_embeds"] = [self._get_dummy_image_embeds(image_embed_dim)] * 2
inputs["negative_ip_adapter_image_embeds"] = [self._get_dummy_image_embeds(image_embed_dim)] * 2
pipe.set_ip_adapter_scale([0.0, 0.0])
output_without_multi_adapter_scale = pipe(**inputs)[0]
if expected_pipe_slice is not None:
output_without_multi_adapter_scale = output_without_multi_adapter_scale[0, -3:, -3:, -1].flatten()
# forward pass with multi ip adapter, but with scale of adapter weights
inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
inputs["ip_adapter_image_embeds"] = [self._get_dummy_image_embeds(image_embed_dim)] * 2
inputs["negative_ip_adapter_image_embeds"] = [self._get_dummy_image_embeds(image_embed_dim)] * 2
pipe.set_ip_adapter_scale([42.0, 42.0])
output_with_multi_adapter_scale = pipe(**inputs)[0]
if expected_pipe_slice is not None:
output_with_multi_adapter_scale = output_with_multi_adapter_scale[0, -3:, -3:, -1].flatten()
max_diff_without_multi_adapter_scale = np.abs(
output_without_multi_adapter_scale - output_without_adapter
).max()
max_diff_with_multi_adapter_scale = np.abs(output_with_multi_adapter_scale - output_without_adapter).max()
self.assertLess(
max_diff_without_multi_adapter_scale,
expected_max_diff,
"Output without multi-ip-adapter must be same as normal inference",
)
self.assertGreater(
max_diff_with_multi_adapter_scale,
1e-2,
"Output with multi-ip-adapter scale must be different from normal inference",
)
class PipelineLatentTesterMixin:
"""
-61
View File
@@ -1,61 +0,0 @@
import argparse
import inspect
import sys
from pathlib import Path
from typing import List, Type
root_dir = Path(__file__).parent.parent.absolute()
sys.path.insert(0, str(root_dir))
parser = argparse.ArgumentParser()
parser.add_argument("--type", type=str, default=None)
args = parser.parse_args()
def get_test_methods_from_class(cls: Type) -> List[str]:
"""
Get all test method names from a given class.
Only returns methods that start with 'test_'.
"""
test_methods = []
for name, obj in inspect.getmembers(cls):
if name.startswith("test_") and inspect.isfunction(obj):
test_methods.append(name)
return sorted(test_methods)
def generate_pytest_pattern(test_methods: List[str]) -> str:
"""Generate pytest pattern string for the -k flag."""
return " or ".join(test_methods)
def generate_pattern_for_mixin(mixin_class: Type) -> str:
"""
Generate pytest pattern for a specific mixin class.
"""
if mixin_cls is None:
return ""
test_methods = get_test_methods_from_class(mixin_class)
return generate_pytest_pattern(test_methods)
if __name__ == "__main__":
mixin_cls = None
if args.type == "pipeline":
from tests.pipelines.test_pipelines_common import PipelineTesterMixin
mixin_cls = PipelineTesterMixin
elif args.type == "models":
from tests.models.test_modeling_common import ModelTesterMixin
mixin_cls = ModelTesterMixin
elif args.type == "lora":
from tests.lora.utils import PeftLoraLoaderMixinTests
mixin_cls = PeftLoraLoaderMixinTests
pattern = generate_pattern_for_mixin(mixin_cls)
print(pattern)