Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 07798babac |
@@ -9,33 +9,12 @@ permissions:
|
||||
pull-requests: write
|
||||
|
||||
jobs:
|
||||
check-permissions:
|
||||
run-style-bot:
|
||||
if: >
|
||||
contains(github.event.comment.body, '@bot /style') &&
|
||||
github.event.issue.pull_request != null
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
is_authorized: ${{ steps.check_user_permission.outputs.has_permission }}
|
||||
steps:
|
||||
- name: Check user permission
|
||||
id: check_user_permission
|
||||
uses: actions/github-script@v6
|
||||
with:
|
||||
script: |
|
||||
const comment_user = context.payload.comment.user.login;
|
||||
const { data: permission } = await github.rest.repos.getCollaboratorPermissionLevel({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
username: comment_user
|
||||
});
|
||||
const authorized = permission.permission === 'admin';
|
||||
console.log(`User ${comment_user} has permission level: ${permission.permission}, authorized: ${authorized} (only admins allowed)`);
|
||||
core.setOutput('has_permission', authorized);
|
||||
|
||||
run-style-bot:
|
||||
needs: check-permissions
|
||||
if: needs.check-permissions.outputs.is_authorized == 'true'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Extract PR details
|
||||
id: pr_info
|
||||
@@ -74,9 +53,9 @@ jobs:
|
||||
HEADREF: ${{ steps.pr_info.outputs.headRef }}
|
||||
PRNUMBER: ${{ steps.pr_info.outputs.prNumber }}
|
||||
run: |
|
||||
echo "PR number: $PRNUMBER"
|
||||
echo "Head Ref: $HEADREF"
|
||||
echo "Head Repo Full Name: $HEADREPOFULLNAME"
|
||||
echo "PR number: ${{ env.PRNUMBER }}"
|
||||
echo "Head Ref: ${{ env.HEADREF }}"
|
||||
echo "Head Repo Full Name: ${{ env.HEADREPOFULLNAME }}"
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
@@ -85,38 +64,18 @@ jobs:
|
||||
run: |
|
||||
pip install .[quality]
|
||||
|
||||
- name: Download necessary files from main branch of Diffusers
|
||||
- name: Download Makefile from main branch
|
||||
run: |
|
||||
curl -o main_Makefile https://raw.githubusercontent.com/huggingface/diffusers/main/Makefile
|
||||
curl -o main_setup.py https://raw.githubusercontent.com/huggingface/diffusers/refs/heads/main/setup.py
|
||||
curl -o main_check_doc_toc.py https://raw.githubusercontent.com/huggingface/diffusers/refs/heads/main/utils/check_doc_toc.py
|
||||
|
||||
- name: Compare the files and raise error if needed
|
||||
- name: Compare Makefiles
|
||||
run: |
|
||||
diff_failed=0
|
||||
|
||||
if ! diff -q main_Makefile Makefile; then
|
||||
echo "Error: The Makefile has changed. Please ensure it matches the main branch."
|
||||
diff_failed=1
|
||||
fi
|
||||
|
||||
if ! diff -q main_setup.py setup.py; then
|
||||
echo "Error: The setup.py has changed. Please ensure it matches the main branch."
|
||||
diff_failed=1
|
||||
fi
|
||||
|
||||
if ! diff -q main_check_doc_toc.py utils/check_doc_toc.py; then
|
||||
echo "Error: The utils/check_doc_toc.py has changed. Please ensure it matches the main branch."
|
||||
diff_failed=1
|
||||
fi
|
||||
|
||||
if [ $diff_failed -eq 1 ]; then
|
||||
echo "❌ Error happened as we detected changes in the files that should not be changed ❌"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "No changes in the files. Proceeding..."
|
||||
rm -rf main_Makefile main_setup.py main_check_doc_toc.py
|
||||
echo "No changes in Makefile. Proceeding..."
|
||||
rm -rf main_Makefile
|
||||
|
||||
- name: Run make style and make quality
|
||||
run: |
|
||||
@@ -130,20 +89,20 @@ jobs:
|
||||
PRNUMBER: ${{ steps.pr_info.outputs.prNumber }}
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: |
|
||||
echo "HEADREPOFULLNAME: $HEADREPOFULLNAME, HEADREF: $HEADREF"
|
||||
echo "HEADREPOFULLNAME: ${{ env.HEADREPOFULLNAME }}, HEADREF: ${{ env.HEADREF }}"
|
||||
# Configure git with the Actions bot user
|
||||
git config user.name "github-actions[bot]"
|
||||
git config user.email "github-actions[bot]@users.noreply.github.com"
|
||||
|
||||
# Make sure your 'origin' remote is set to the contributor's fork
|
||||
git remote set-url origin "https://x-access-token:${GITHUB_TOKEN}@github.com/$HEADREPOFULLNAME.git"
|
||||
git remote set-url origin "https://x-access-token:${GITHUB_TOKEN}@github.com/${{ env.HEADREPOFULLNAME }}.git"
|
||||
|
||||
# If there are changes after running style/quality, commit them
|
||||
if [ -n "$(git status --porcelain)" ]; then
|
||||
git add .
|
||||
git commit -m "Apply style fixes"
|
||||
# Push to the original contributor's forked branch
|
||||
git push origin HEAD:$HEADREF
|
||||
git push origin HEAD:${{ env.HEADREF }}
|
||||
echo "changes_pushed=true" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "No changes to commit."
|
||||
|
||||
@@ -1,250 +0,0 @@
|
||||
name: Fast GPU Tests on PR
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches: main
|
||||
paths:
|
||||
- "src/diffusers/models/modeling_utils.py"
|
||||
- "src/diffusers/models/model_loading_utils.py"
|
||||
- "src/diffusers/pipelines/pipeline_utils.py"
|
||||
- "src/diffusers/pipeline_loading_utils.py"
|
||||
- "src/diffusers/loaders/lora_base.py"
|
||||
- "src/diffusers/loaders/lora_pipeline.py"
|
||||
- "src/diffusers/loaders/peft.py"
|
||||
- "tests/pipelines/test_pipelines_common.py"
|
||||
- "tests/models/test_modeling_common.py"
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
DIFFUSERS_IS_CI: yes
|
||||
OMP_NUM_THREADS: 8
|
||||
MKL_NUM_THREADS: 8
|
||||
HF_HUB_ENABLE_HF_TRANSFER: 1
|
||||
PYTEST_TIMEOUT: 600
|
||||
PIPELINE_USAGE_CUTOFF: 1000000000 # set high cutoff so that only always-test pipelines run
|
||||
|
||||
jobs:
|
||||
setup_torch_cuda_pipeline_matrix:
|
||||
name: Setup Torch Pipelines CUDA Slow Tests Matrix
|
||||
runs-on:
|
||||
group: aws-general-8-plus
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cpu
|
||||
outputs:
|
||||
pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 2
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
- name: Environment
|
||||
run: |
|
||||
python utils/print_env.py
|
||||
- name: Fetch Pipeline Matrix
|
||||
id: fetch_pipeline_matrix
|
||||
run: |
|
||||
matrix=$(python utils/fetch_torch_cuda_pipeline_test_matrix.py)
|
||||
echo $matrix
|
||||
echo "pipeline_test_matrix=$matrix" >> $GITHUB_OUTPUT
|
||||
- name: Pipeline Tests Artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: test-pipelines.json
|
||||
path: reports
|
||||
|
||||
torch_pipelines_cuda_tests:
|
||||
name: Torch Pipelines CUDA Tests
|
||||
needs: setup_torch_cuda_pipeline_matrix
|
||||
strategy:
|
||||
fail-fast: false
|
||||
max-parallel: 8
|
||||
matrix:
|
||||
module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
|
||||
runs-on:
|
||||
group: aws-g4dn-2xlarge
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
options: --shm-size "16gb" --ipc host --gpus 0
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
|
||||
pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
python utils/print_env.py
|
||||
- name: Extract tests
|
||||
id: extract_tests
|
||||
run: |
|
||||
pattern=$(python utils/extract_tests_from_mixin.py --type pipeline)
|
||||
echo "$pattern" > /tmp/test_pattern.txt
|
||||
echo "pattern_file=/tmp/test_pattern.txt" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: PyTorch CUDA checkpoint tests on Ubuntu
|
||||
env:
|
||||
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
|
||||
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
if [ "${{ matrix.module }}" = "ip_adapters" ]; then
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
|
||||
tests/pipelines/${{ matrix.module }}
|
||||
else
|
||||
pattern=$(cat ${{ steps.extract_tests.outputs.pattern_file }})
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "not Flax and not Onnx and $pattern" \
|
||||
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
|
||||
tests/pipelines/${{ matrix.module }}
|
||||
fi
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: |
|
||||
cat reports/tests_pipeline_${{ matrix.module }}_cuda_stats.txt
|
||||
cat reports/tests_pipeline_${{ matrix.module }}_cuda_failures_short.txt
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: pipeline_${{ matrix.module }}_test_reports
|
||||
path: reports
|
||||
|
||||
torch_cuda_tests:
|
||||
name: Torch CUDA Tests
|
||||
runs-on:
|
||||
group: aws-g4dn-2xlarge
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
options: --shm-size "16gb" --ipc host --gpus 0
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
strategy:
|
||||
fail-fast: false
|
||||
max-parallel: 2
|
||||
matrix:
|
||||
module: [models, schedulers, lora, others]
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
python -m uv pip install peft@git+https://github.com/huggingface/peft.git
|
||||
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
|
||||
pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
python utils/print_env.py
|
||||
|
||||
- name: Extract tests
|
||||
id: extract_tests
|
||||
run: |
|
||||
pattern=$(python utils/extract_tests_from_mixin.py --type ${{ matrix.module }})
|
||||
echo "$pattern" > /tmp/test_pattern.txt
|
||||
echo "pattern_file=/tmp/test_pattern.txt" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Run PyTorch CUDA tests
|
||||
env:
|
||||
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
|
||||
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
pattern=$(cat ${{ steps.extract_tests.outputs.pattern_file }})
|
||||
if [ -z "$pattern" ]; then
|
||||
python -m pytest -n 1 -sv --max-worker-restart=0 --dist=loadfile -k "not Flax and not Onnx" tests/${{ matrix.module }} \
|
||||
--make-reports=tests_torch_cuda_${{ matrix.module }}
|
||||
else
|
||||
python -m pytest -n 1 -sv --max-worker-restart=0 --dist=loadfile -k "not Flax and not Onnx and $pattern" tests/${{ matrix.module }} \
|
||||
--make-reports=tests_torch_cuda_${{ matrix.module }}
|
||||
fi
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: |
|
||||
cat reports/tests_torch_cuda_${{ matrix.module }}_stats.txt
|
||||
cat reports/tests_torch_cuda_${{ matrix.module }}_failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: torch_cuda_test_reports_${{ matrix.module }}
|
||||
path: reports
|
||||
|
||||
run_examples_tests:
|
||||
name: Examples PyTorch CUDA tests on Ubuntu
|
||||
pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps
|
||||
runs-on:
|
||||
group: aws-g4dn-2xlarge
|
||||
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test,training]
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python utils/print_env.py
|
||||
|
||||
- name: Run example tests on GPU
|
||||
env:
|
||||
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install timm
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: |
|
||||
cat reports/examples_torch_cuda_stats.txt
|
||||
cat reports/examples_torch_cuda_failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: examples_test_reports
|
||||
path: reports
|
||||
|
||||
@@ -1,6 +1,13 @@
|
||||
name: Fast GPU Tests on main
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches: main
|
||||
paths:
|
||||
- "src/diffusers/models/modeling_utils.py"
|
||||
- "src/diffusers/models/model_loading_utils.py"
|
||||
- "src/diffusers/pipelines/pipeline_utils.py"
|
||||
- "src/diffusers/pipeline_loading_utils.py"
|
||||
workflow_dispatch:
|
||||
push:
|
||||
branches:
|
||||
@@ -160,6 +167,7 @@ jobs:
|
||||
path: reports
|
||||
|
||||
flax_tpu_tests:
|
||||
if: ${{ github.event_name != 'pull_request' }}
|
||||
name: Flax TPU Tests
|
||||
runs-on:
|
||||
group: gcp-ct5lp-hightpu-8t
|
||||
@@ -208,6 +216,7 @@ jobs:
|
||||
path: reports
|
||||
|
||||
onnx_cuda_tests:
|
||||
if: ${{ github.event_name != 'pull_request' }}
|
||||
name: ONNX CUDA Tests
|
||||
runs-on:
|
||||
group: aws-g4dn-2xlarge
|
||||
@@ -256,6 +265,7 @@ jobs:
|
||||
path: reports
|
||||
|
||||
run_torch_compile_tests:
|
||||
if: ${{ github.event_name != 'pull_request' }}
|
||||
name: PyTorch Compile CUDA tests
|
||||
|
||||
runs-on:
|
||||
@@ -299,6 +309,7 @@ jobs:
|
||||
path: reports
|
||||
|
||||
run_xformers_tests:
|
||||
if: ${{ github.event_name != 'pull_request' }}
|
||||
name: PyTorch xformers CUDA tests
|
||||
|
||||
runs-on:
|
||||
|
||||
@@ -543,10 +543,6 @@
|
||||
title: Overview
|
||||
- local: api/schedulers/cm_stochastic_iterative
|
||||
title: CMStochasticIterativeScheduler
|
||||
- local: api/schedulers/ddim_cogvideox
|
||||
title: CogVideoXDDIMScheduler
|
||||
- local: api/schedulers/multistep_dpm_solver_cogvideox
|
||||
title: CogVideoXDPMScheduler
|
||||
- local: api/schedulers/consistency_decoder
|
||||
title: ConsistencyDecoderScheduler
|
||||
- local: api/schedulers/cosine_dpm
|
||||
|
||||
@@ -359,74 +359,8 @@ image.save('flux_ip_adapter_output.jpg')
|
||||
<figcaption class="mt-2 text-sm text-center text-gray-500">IP-Adapter examples with prompt "wearing sunglasses"</figcaption>
|
||||
</div>
|
||||
|
||||
## Optimize
|
||||
|
||||
Flux is a very large model and requires ~50GB of RAM/VRAM to load all the modeling components. Enable some of the optimizations below to lower the memory requirements.
|
||||
|
||||
### Group offloading
|
||||
|
||||
[Group offloading](../../optimization/memory#group-offloading) lowers VRAM usage by offloading groups of internal layers rather than the whole model or weights. You need to use [`~hooks.apply_group_offloading`] on all the model components of a pipeline. The `offload_type` parameter allows you to toggle between block and leaf-level offloading. Setting it to `leaf_level` offloads the lowest leaf-level parameters to the CPU instead of offloading at the module-level.
|
||||
|
||||
On CUDA devices that support asynchronous data streaming, set `use_stream=True` to overlap data transfer and computation to accelerate inference.
|
||||
|
||||
> [!TIP]
|
||||
> It is possible to mix block and leaf-level offloading for different components in a pipeline.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import FluxPipeline
|
||||
from diffusers.hooks import apply_group_offloading
|
||||
|
||||
model_id = "black-forest-labs/FLUX.1-dev"
|
||||
dtype = torch.bfloat16
|
||||
pipe = FluxPipeline.from_pretrained(
|
||||
model_id,
|
||||
torch_dtype=dtype,
|
||||
)
|
||||
|
||||
apply_group_offloading(
|
||||
pipe.transformer,
|
||||
offload_type="leaf_level",
|
||||
offload_device=torch.device("cpu"),
|
||||
onload_device=torch.device("cuda"),
|
||||
use_stream=True,
|
||||
)
|
||||
apply_group_offloading(
|
||||
pipe.text_encoder,
|
||||
offload_device=torch.device("cpu"),
|
||||
onload_device=torch.device("cuda"),
|
||||
offload_type="leaf_level",
|
||||
use_stream=True,
|
||||
)
|
||||
apply_group_offloading(
|
||||
pipe.text_encoder_2,
|
||||
offload_device=torch.device("cpu"),
|
||||
onload_device=torch.device("cuda"),
|
||||
offload_type="leaf_level",
|
||||
use_stream=True,
|
||||
)
|
||||
apply_group_offloading(
|
||||
pipe.vae,
|
||||
offload_device=torch.device("cpu"),
|
||||
onload_device=torch.device("cuda"),
|
||||
offload_type="leaf_level",
|
||||
use_stream=True,
|
||||
)
|
||||
|
||||
prompt="A cat wearing sunglasses and working as a lifeguard at pool."
|
||||
|
||||
generator = torch.Generator().manual_seed(181201)
|
||||
image = pipe(
|
||||
prompt,
|
||||
width=576,
|
||||
height=1024,
|
||||
num_inference_steps=30,
|
||||
generator=generator
|
||||
).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
### Running FP16 inference
|
||||
## Running FP16 inference
|
||||
|
||||
Flux can generate high-quality images with FP16 (e.g., to accelerate inference on Turing/Volta GPUs) but produces different outputs compared to FP32/BF16. The issue is that some activations in the text encoders have to be clipped when running in FP16, which affects the overall image. Forcing text encoders to run with FP32 inference thus removes this output difference. See [here](https://github.com/huggingface/diffusers/pull/9097#issuecomment-2272292516) for details.
|
||||
|
||||
@@ -455,7 +389,7 @@ out = pipe(
|
||||
out.save("image.png")
|
||||
```
|
||||
|
||||
### Quantization
|
||||
## Quantization
|
||||
|
||||
Quantization helps reduce the memory requirements of very large models by storing model weights in a lower precision data type. However, quantization may have a varying impact on image quality depending on the model.
|
||||
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
<!--
|
||||
Copyright 2023-2025 Marigold Team, ETH Zürich. All rights reserved.
|
||||
Copyright 2024-2025 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2024 Marigold authors and The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -12,120 +10,67 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Marigold Computer Vision
|
||||
# Marigold Pipelines for Computer Vision Tasks
|
||||
|
||||

|
||||
|
||||
Marigold was proposed in
|
||||
[Repurposing Diffusion-Based Image Generators for Monocular Depth Estimation](https://huggingface.co/papers/2312.02145),
|
||||
a CVPR 2024 Oral paper by
|
||||
[Bingxin Ke](http://www.kebingxin.com/),
|
||||
[Anton Obukhov](https://www.obukhov.ai/),
|
||||
[Shengyu Huang](https://shengyuh.github.io/),
|
||||
[Nando Metzger](https://nandometzger.github.io/),
|
||||
[Rodrigo Caye Daudt](https://rcdaudt.github.io/), and
|
||||
[Konrad Schindler](https://scholar.google.com/citations?user=FZuNgqIAAAAJ&hl=en).
|
||||
The core idea is to **repurpose the generative prior of Text-to-Image Latent Diffusion Models (LDMs) for traditional
|
||||
computer vision tasks**.
|
||||
This approach was explored by fine-tuning Stable Diffusion for **Monocular Depth Estimation**, as demonstrated in the
|
||||
teaser above.
|
||||
Marigold was proposed in [Repurposing Diffusion-Based Image Generators for Monocular Depth Estimation](https://huggingface.co/papers/2312.02145), a CVPR 2024 Oral paper by [Bingxin Ke](http://www.kebingxin.com/), [Anton Obukhov](https://www.obukhov.ai/), [Shengyu Huang](https://shengyuh.github.io/), [Nando Metzger](https://nandometzger.github.io/), [Rodrigo Caye Daudt](https://rcdaudt.github.io/), and [Konrad Schindler](https://scholar.google.com/citations?user=FZuNgqIAAAAJ&hl=en).
|
||||
The idea is to repurpose the rich generative prior of Text-to-Image Latent Diffusion Models (LDMs) for traditional computer vision tasks.
|
||||
Initially, this idea was explored to fine-tune Stable Diffusion for Monocular Depth Estimation, as shown in the teaser above.
|
||||
Later,
|
||||
- [Tianfu Wang](https://tianfwang.github.io/) trained the first Latent Consistency Model (LCM) of Marigold, which unlocked fast single-step inference;
|
||||
- [Kevin Qu](https://www.linkedin.com/in/kevin-qu-b3417621b/?locale=en_US) extended the approach to Surface Normals Estimation;
|
||||
- [Anton Obukhov](https://www.obukhov.ai/) contributed the pipelines and documentation into diffusers (enabled and supported by [YiYi Xu](https://yiyixuxu.github.io/) and [Sayak Paul](https://sayak.dev/)).
|
||||
|
||||
Marigold was later extended in the follow-up paper,
|
||||
[Marigold: Affordable Adaptation of Diffusion-Based Image Generators for Image Analysis](https://huggingface.co/papers/2505.09358),
|
||||
authored by
|
||||
[Bingxin Ke](http://www.kebingxin.com/),
|
||||
[Kevin Qu](https://www.linkedin.com/in/kevin-qu-b3417621b/?locale=en_US),
|
||||
[Tianfu Wang](https://tianfwang.github.io/),
|
||||
[Nando Metzger](https://nandometzger.github.io/),
|
||||
[Shengyu Huang](https://shengyuh.github.io/),
|
||||
[Bo Li](https://www.linkedin.com/in/bobboli0202/),
|
||||
[Anton Obukhov](https://www.obukhov.ai/), and
|
||||
[Konrad Schindler](https://scholar.google.com/citations?user=FZuNgqIAAAAJ&hl=en).
|
||||
This work expanded Marigold to support new modalities such as **Surface Normals** and **Intrinsic Image Decomposition**
|
||||
(IID), introduced a training protocol for **Latent Consistency Models** (LCM), and demonstrated **High-Resolution** (HR)
|
||||
processing capability.
|
||||
The abstract from the paper is:
|
||||
|
||||
<Tip>
|
||||
|
||||
The early Marigold models (`v1-0` and earlier) were optimized for best results with at least 10 inference steps.
|
||||
LCM models were later developed to enable high-quality inference in just 1 to 4 steps.
|
||||
Marigold models `v1-1` and later use the DDIM scheduler to achieve optimal
|
||||
results in as few as 1 to 4 steps.
|
||||
|
||||
</Tip>
|
||||
*Monocular depth estimation is a fundamental computer vision task. Recovering 3D depth from a single image is geometrically ill-posed and requires scene understanding, so it is not surprising that the rise of deep learning has led to a breakthrough. The impressive progress of monocular depth estimators has mirrored the growth in model capacity, from relatively modest CNNs to large Transformer architectures. Still, monocular depth estimators tend to struggle when presented with images with unfamiliar content and layout, since their knowledge of the visual world is restricted by the data seen during training, and challenged by zero-shot generalization to new domains. This motivates us to explore whether the extensive priors captured in recent generative diffusion models can enable better, more generalizable depth estimation. We introduce Marigold, a method for affine-invariant monocular depth estimation that is derived from Stable Diffusion and retains its rich prior knowledge. The estimator can be fine-tuned in a couple of days on a single GPU using only synthetic training data. It delivers state-of-the-art performance across a wide range of datasets, including over 20% performance gains in specific cases. Project page: https://marigoldmonodepth.github.io.*
|
||||
|
||||
## Available Pipelines
|
||||
|
||||
Each pipeline is tailored for a specific computer vision task, processing an input RGB image and generating a
|
||||
corresponding prediction.
|
||||
Currently, the following computer vision tasks are implemented:
|
||||
Each pipeline supports one Computer Vision task, which takes an RGB image as input and produces a *prediction* of the modality of interest, such as a depth map of the input image.
|
||||
Currently, the following tasks are implemented:
|
||||
|
||||
| Pipeline | Predicted Modalities | Demos |
|
||||
|---------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------:|
|
||||
| [MarigoldDepthPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py) | [Depth](https://en.wikipedia.org/wiki/Depth_map), [Disparity](https://en.wikipedia.org/wiki/Binocular_disparity) | [Fast Demo (LCM)](https://huggingface.co/spaces/prs-eth/marigold-lcm), [Slow Original Demo (DDIM)](https://huggingface.co/spaces/prs-eth/marigold) |
|
||||
| [MarigoldNormalsPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py) | [Surface normals](https://en.wikipedia.org/wiki/Normal_mapping) | [Fast Demo (LCM)](https://huggingface.co/spaces/prs-eth/marigold-normals-lcm) |
|
||||
|
||||
| Pipeline | Recommended Model Checkpoints | Spaces (Interactive Apps) | Predicted Modalities |
|
||||
|---------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------:|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| [MarigoldDepthPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py) | [prs-eth/marigold-depth-v1-1](https://huggingface.co/prs-eth/marigold-depth-v1-1) | [Depth Estimation](https://huggingface.co/spaces/prs-eth/marigold) | [Depth](https://en.wikipedia.org/wiki/Depth_map), [Disparity](https://en.wikipedia.org/wiki/Binocular_disparity) |
|
||||
| [MarigoldNormalsPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py) | [prs-eth/marigold-normals-v1-1](https://huggingface.co/prs-eth/marigold-normals-v1-1) | [Surface Normals Estimation](https://huggingface.co/spaces/prs-eth/marigold-normals) | [Surface normals](https://en.wikipedia.org/wiki/Normal_mapping) |
|
||||
| [MarigoldIntrinsicsPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_intrinsics.py) | [prs-eth/marigold-iid-appearance-v1-1](https://huggingface.co/prs-eth/marigold-iid-appearance-v1-1),<br>[prs-eth/marigold-iid-lighting-v1-1](https://huggingface.co/prs-eth/marigold-iid-lighting-v1-1) | [Intrinsic Image Decomposition](https://huggingface.co/spaces/prs-eth/marigold-iid) | [Albedo](https://en.wikipedia.org/wiki/Albedo), [Materials](https://www.n.aiq3d.com/wiki/roughnessmetalnessao-map), [Lighting](https://en.wikipedia.org/wiki/Diffuse_reflection) |
|
||||
|
||||
## Available Checkpoints
|
||||
|
||||
All original checkpoints are available under the [PRS-ETH](https://huggingface.co/prs-eth/) organization on Hugging Face.
|
||||
They are designed for use with diffusers pipelines and the [original codebase](https://github.com/prs-eth/marigold), which can also be used to train
|
||||
new model checkpoints.
|
||||
The following is a summary of the recommended checkpoints, all of which produce reliable results with 1 to 4 steps.
|
||||
|
||||
| Checkpoint | Modality | Comment |
|
||||
|-----------------------------------------------------------------------------------------------------|--------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| [prs-eth/marigold-depth-v1-1](https://huggingface.co/prs-eth/marigold-depth-v1-1) | Depth | Affine-invariant depth prediction assigns each pixel a value between 0 (near plane) and 1 (far plane), with both planes determined by the model during inference. |
|
||||
| [prs-eth/marigold-normals-v1-1](https://huggingface.co/prs-eth/marigold-normals-v1-1) | Normals | The surface normals predictions are unit-length 3D vectors in the screen space camera, with values in the range from -1 to 1. |
|
||||
| [prs-eth/marigold-iid-appearance-v1-1](https://huggingface.co/prs-eth/marigold-iid-appearance-v1-1) | Intrinsics | InteriorVerse decomposition is comprised of Albedo and two BRDF material properties: Roughness and Metallicity. |
|
||||
| [prs-eth/marigold-iid-lighting-v1-1](https://huggingface.co/prs-eth/marigold-iid-lighting-v1-1) | Intrinsics | HyperSim decomposition of an image  \\(I\\)  is comprised of Albedo  \\(A\\), Diffuse shading  \\(S\\), and Non-diffuse residual  \\(R\\):  \\(I = A*S+R\\). |
|
||||
The original checkpoints can be found under the [PRS-ETH](https://huggingface.co/prs-eth/) Hugging Face organization.
|
||||
|
||||
<Tip>
|
||||
|
||||
Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff
|
||||
between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to
|
||||
efficiently load the same components into multiple pipelines.
|
||||
Also, to learn more about reducing the memory usage of this pipeline, refer to the "Reduce memory usage" section
|
||||
[here](../../using-diffusers/svd#reduce-memory-usage).
|
||||
Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. Also, to learn more about reducing the memory usage of this pipeline, refer to the "Reduce memory usage" section [here](../../using-diffusers/svd#reduce-memory-usage).
|
||||
|
||||
</Tip>
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Marigold pipelines were designed and tested with the scheduler embedded in the model checkpoint.
|
||||
The optimal number of inference steps varies by scheduler, with no universal value that works best across all cases.
|
||||
To accommodate this, the `num_inference_steps` parameter in the pipeline's `__call__` method defaults to `None` (see the
|
||||
API reference).
|
||||
Unless set explicitly, it inherits the value from the `default_denoising_steps` field in the checkpoint configuration
|
||||
file (`model_index.json`).
|
||||
This ensures high-quality predictions when invoking the pipeline with only the `image` argument.
|
||||
Marigold pipelines were designed and tested only with `DDIMScheduler` and `LCMScheduler`.
|
||||
Depending on the scheduler, the number of inference steps required to get reliable predictions varies, and there is no universal value that works best across schedulers.
|
||||
Because of that, the default value of `num_inference_steps` in the `__call__` method of the pipeline is set to `None` (see the API reference).
|
||||
Unless set explicitly, its value will be taken from the checkpoint configuration `model_index.json`.
|
||||
This is done to ensure high-quality predictions when calling the pipeline with just the `image` argument.
|
||||
|
||||
</Tip>
|
||||
|
||||
See also Marigold [usage examples](../../using-diffusers/marigold_usage).
|
||||
|
||||
## Marigold Depth Prediction API
|
||||
See also Marigold [usage examples](marigold_usage).
|
||||
|
||||
## MarigoldDepthPipeline
|
||||
[[autodoc]] MarigoldDepthPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## MarigoldNormalsPipeline
|
||||
[[autodoc]] MarigoldNormalsPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## MarigoldDepthOutput
|
||||
[[autodoc]] pipelines.marigold.pipeline_marigold_depth.MarigoldDepthOutput
|
||||
|
||||
[[autodoc]] pipelines.marigold.marigold_image_processing.MarigoldImageProcessor.visualize_depth
|
||||
|
||||
## Marigold Normals Estimation API
|
||||
[[autodoc]] MarigoldNormalsPipeline
|
||||
- __call__
|
||||
|
||||
[[autodoc]] pipelines.marigold.pipeline_marigold_normals.MarigoldNormalsOutput
|
||||
|
||||
[[autodoc]] pipelines.marigold.marigold_image_processing.MarigoldImageProcessor.visualize_normals
|
||||
|
||||
## Marigold Intrinsic Image Decomposition API
|
||||
|
||||
[[autodoc]] MarigoldIntrinsicsPipeline
|
||||
- __call__
|
||||
|
||||
[[autodoc]] pipelines.marigold.pipeline_marigold_intrinsics.MarigoldIntrinsicsOutput
|
||||
|
||||
[[autodoc]] pipelines.marigold.marigold_image_processing.MarigoldImageProcessor.visualize_intrinsics
|
||||
## MarigoldNormalsOutput
|
||||
[[autodoc]] pipelines.marigold.pipeline_marigold_normals.MarigoldNormalsOutput
|
||||
@@ -65,7 +65,7 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
|
||||
| [Latte](latte) | text2image |
|
||||
| [LEDITS++](ledits_pp) | image editing |
|
||||
| [Lumina-T2X](lumina) | text2image |
|
||||
| [Marigold](marigold) | depth-estimation, normals-estimation, intrinsic-decomposition |
|
||||
| [Marigold](marigold) | depth |
|
||||
| [MultiDiffusion](panorama) | text2image |
|
||||
| [MusicLDM](musicldm) | text2audio |
|
||||
| [PAG](pag) | text2image |
|
||||
|
||||
@@ -1,19 +0,0 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# CogVideoXDDIMScheduler
|
||||
|
||||
`CogVideoXDDIMScheduler` is based on [Denoising Diffusion Implicit Models](https://huggingface.co/papers/2010.02502), specifically for CogVideoX models.
|
||||
|
||||
## CogVideoXDDIMScheduler
|
||||
|
||||
[[autodoc]] CogVideoXDDIMScheduler
|
||||
@@ -1,19 +0,0 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# CogVideoXDPMScheduler
|
||||
|
||||
`CogVideoXDPMScheduler` is based on [DPM-Solver: A Fast ODE Solver for Diffusion Probabilistic Model Sampling in Around 10 Steps](https://huggingface.co/papers/2206.00927) and [DPM-Solver++: Fast Solver for Guided Sampling of Diffusion Probabilistic Models](https://huggingface.co/papers/2211.01095), specifically for CogVideoX models.
|
||||
|
||||
## CogVideoXDPMScheduler
|
||||
|
||||
[[autodoc]] CogVideoXDPMScheduler
|
||||
@@ -1,6 +1,4 @@
|
||||
<!--
|
||||
Copyright 2023-2025 Marigold Team, ETH Zürich. All rights reserved.
|
||||
Copyright 2024-2025 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2024 Marigold authors and The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -12,38 +10,31 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Marigold Computer Vision
|
||||
# Marigold Pipelines for Computer Vision Tasks
|
||||
|
||||
**Marigold** is a diffusion-based [method](https://huggingface.co/papers/2312.02145) and a collection of [pipelines](../api/pipelines/marigold) designed for
|
||||
dense computer vision tasks, including **monocular depth prediction**, **surface normals estimation**, and **intrinsic
|
||||
image decomposition**.
|
||||
[Marigold](../api/pipelines/marigold) is a novel diffusion-based dense prediction approach, and a set of pipelines for various computer vision tasks, such as monocular depth estimation.
|
||||
|
||||
This guide will walk you through using Marigold to generate fast and high-quality predictions for images and videos.
|
||||
This guide will show you how to use Marigold to obtain fast and high-quality predictions for images and videos.
|
||||
|
||||
Each pipeline is tailored for a specific computer vision task, processing an input RGB image and generating a
|
||||
corresponding prediction.
|
||||
Currently, the following computer vision tasks are implemented:
|
||||
Each pipeline supports one Computer Vision task, which takes an RGB image as input and produces a *prediction* of the modality of interest, such as a depth map of the input image.
|
||||
Currently, the following tasks are implemented:
|
||||
|
||||
| Pipeline | Recommended Model Checkpoints | Spaces (Interactive Apps) | Predicted Modalities |
|
||||
|---------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------:|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| [MarigoldDepthPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py) | [prs-eth/marigold-depth-v1-1](https://huggingface.co/prs-eth/marigold-depth-v1-1) | [Depth Estimation](https://huggingface.co/spaces/prs-eth/marigold) | [Depth](https://en.wikipedia.org/wiki/Depth_map), [Disparity](https://en.wikipedia.org/wiki/Binocular_disparity) |
|
||||
| [MarigoldNormalsPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py) | [prs-eth/marigold-normals-v1-1](https://huggingface.co/prs-eth/marigold-normals-v1-1) | [Surface Normals Estimation](https://huggingface.co/spaces/prs-eth/marigold-normals) | [Surface normals](https://en.wikipedia.org/wiki/Normal_mapping) |
|
||||
| [MarigoldIntrinsicsPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_intrinsics.py) | [prs-eth/marigold-iid-appearance-v1-1](https://huggingface.co/prs-eth/marigold-iid-appearance-v1-1),<br>[prs-eth/marigold-iid-lighting-v1-1](https://huggingface.co/prs-eth/marigold-iid-lighting-v1-1) | [Intrinsic Image Decomposition](https://huggingface.co/spaces/prs-eth/marigold-iid) | [Albedo](https://en.wikipedia.org/wiki/Albedo), [Materials](https://www.n.aiq3d.com/wiki/roughnessmetalnessao-map), [Lighting](https://en.wikipedia.org/wiki/Diffuse_reflection) |
|
||||
| Pipeline | Predicted Modalities | Demos |
|
||||
|---------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------:|
|
||||
| [MarigoldDepthPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py) | [Depth](https://en.wikipedia.org/wiki/Depth_map), [Disparity](https://en.wikipedia.org/wiki/Binocular_disparity) | [Fast Demo (LCM)](https://huggingface.co/spaces/prs-eth/marigold-lcm), [Slow Original Demo (DDIM)](https://huggingface.co/spaces/prs-eth/marigold) |
|
||||
| [MarigoldNormalsPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py) | [Surface normals](https://en.wikipedia.org/wiki/Normal_mapping) | [Fast Demo (LCM)](https://huggingface.co/spaces/prs-eth/marigold-normals-lcm) |
|
||||
|
||||
All original checkpoints are available under the [PRS-ETH](https://huggingface.co/prs-eth/) organization on Hugging Face.
|
||||
They are designed for use with diffusers pipelines and the [original codebase](https://github.com/prs-eth/marigold), which can also be used to train
|
||||
new model checkpoints.
|
||||
The following is a summary of the recommended checkpoints, all of which produce reliable results with 1 to 4 steps.
|
||||
The original checkpoints can be found under the [PRS-ETH](https://huggingface.co/prs-eth/) Hugging Face organization.
|
||||
These checkpoints are meant to work with diffusers pipelines and the [original codebase](https://github.com/prs-eth/marigold).
|
||||
The original code can also be used to train new checkpoints.
|
||||
|
||||
| Checkpoint | Modality | Comment |
|
||||
|-----------------------------------------------------------------------------------------------------|--------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| [prs-eth/marigold-depth-v1-1](https://huggingface.co/prs-eth/marigold-depth-v1-1) | Depth | Affine-invariant depth prediction assigns each pixel a value between 0 (near plane) and 1 (far plane), with both planes determined by the model during inference. |
|
||||
| [prs-eth/marigold-normals-v0-1](https://huggingface.co/prs-eth/marigold-normals-v0-1) | Normals | The surface normals predictions are unit-length 3D vectors in the screen space camera, with values in the range from -1 to 1. |
|
||||
| [prs-eth/marigold-iid-appearance-v1-1](https://huggingface.co/prs-eth/marigold-iid-appearance-v1-1) | Intrinsics | InteriorVerse decomposition is comprised of Albedo and two BRDF material properties: Roughness and Metallicity. |
|
||||
| [prs-eth/marigold-iid-lighting-v1-1](https://huggingface.co/prs-eth/marigold-iid-lighting-v1-1) | Intrinsics | HyperSim decomposition of an image \\(I\\) is comprised of Albedo \\(A\\), Diffuse shading \\(S\\), and Non-diffuse residual \\(R\\): \\(I = A*S+R\\). |
|
||||
|
||||
The examples below are mostly given for depth prediction, but they can be universally applied to other supported
|
||||
modalities.
|
||||
| Checkpoint | Modality | Comment |
|
||||
|-----------------------------------------------------------------------------------------------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| [prs-eth/marigold-v1-0](https://huggingface.co/prs-eth/marigold-v1-0) | Depth | The first Marigold Depth checkpoint, which predicts *affine-invariant depth* maps. The performance of this checkpoint in benchmarks was studied in the original [paper](https://huggingface.co/papers/2312.02145). Designed to be used with the `DDIMScheduler` at inference, it requires at least 10 steps to get reliable predictions. Affine-invariant depth prediction has a range of values in each pixel between 0 (near plane) and 1 (far plane); both planes are chosen by the model as part of the inference process. See the `MarigoldImageProcessor` reference for visualization utilities. |
|
||||
| [prs-eth/marigold-depth-lcm-v1-0](https://huggingface.co/prs-eth/marigold-depth-lcm-v1-0) | Depth | The fast Marigold Depth checkpoint, fine-tuned from `prs-eth/marigold-v1-0`. Designed to be used with the `LCMScheduler` at inference, it requires as little as 1 step to get reliable predictions. The prediction reliability saturates at 4 steps and declines after that. |
|
||||
| [prs-eth/marigold-normals-v0-1](https://huggingface.co/prs-eth/marigold-normals-v0-1) | Normals | A preview checkpoint for the Marigold Normals pipeline. Designed to be used with the `DDIMScheduler` at inference, it requires at least 10 steps to get reliable predictions. The surface normals predictions are unit-length 3D vectors with values in the range from -1 to 1. *This checkpoint will be phased out after the release of `v1-0` version.* |
|
||||
| [prs-eth/marigold-normals-lcm-v0-1](https://huggingface.co/prs-eth/marigold-normals-lcm-v0-1) | Normals | The fast Marigold Normals checkpoint, fine-tuned from `prs-eth/marigold-normals-v0-1`. Designed to be used with the `LCMScheduler` at inference, it requires as little as 1 step to get reliable predictions. The prediction reliability saturates at 4 steps and declines after that. *This checkpoint will be phased out after the release of `v1-0` version.* |
|
||||
The examples below are mostly given for depth prediction, but they can be universally applied with other supported modalities.
|
||||
We showcase the predictions using the same input image of Albert Einstein generated by Midjourney.
|
||||
This makes it easier to compare visualizations of the predictions across various modalities and checkpoints.
|
||||
|
||||
@@ -56,21 +47,19 @@ This makes it easier to compare visualizations of the predictions across various
|
||||
</div>
|
||||
</div>
|
||||
|
||||
## Depth Prediction
|
||||
### Depth Prediction Quick Start
|
||||
|
||||
To get a depth prediction, load the `prs-eth/marigold-depth-v1-1` checkpoint into [`MarigoldDepthPipeline`],
|
||||
put the image through the pipeline, and save the predictions:
|
||||
To get the first depth prediction, load the `prs-eth/marigold-depth-lcm-v1-0` checkpoint into the `MarigoldDepthPipeline` pipeline, put the image through the pipeline, and save the predictions:
|
||||
|
||||
```python
|
||||
import diffusers
|
||||
import torch
|
||||
|
||||
pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
|
||||
"prs-eth/marigold-depth-v1-1", variant="fp16", torch_dtype=torch.float16
|
||||
"prs-eth/marigold-depth-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
|
||||
|
||||
depth = pipe(image)
|
||||
|
||||
vis = pipe.image_processor.visualize_depth(depth.prediction)
|
||||
@@ -80,13 +69,10 @@ depth_16bit = pipe.image_processor.export_depth_to_16bit_png(depth.prediction)
|
||||
depth_16bit[0].save("einstein_depth_16bit.png")
|
||||
```
|
||||
|
||||
The [`~pipelines.marigold.marigold_image_processing.MarigoldImageProcessor.visualize_depth`] function applies one of
|
||||
[matplotlib's colormaps](https://matplotlib.org/stable/users/explain/colors/colormaps.html) (`Spectral` by default) to map the predicted pixel values from a single-channel `[0, 1]`
|
||||
depth range into an RGB image.
|
||||
With the `Spectral` colormap, pixels with near depth are painted red, and far pixels are blue.
|
||||
The visualization function for depth [`~pipelines.marigold.marigold_image_processing.MarigoldImageProcessor.visualize_depth`] applies one of [matplotlib's colormaps](https://matplotlib.org/stable/users/explain/colors/colormaps.html) (`Spectral` by default) to map the predicted pixel values from a single-channel `[0, 1]` depth range into an RGB image.
|
||||
With the `Spectral` colormap, pixels with near depth are painted red, and far pixels are assigned blue color.
|
||||
The 16-bit PNG file stores the single channel values mapped linearly from the `[0, 1]` range into `[0, 65535]`.
|
||||
Below are the raw and the visualized predictions. The darker and closer areas (mustache) are easier to distinguish in
|
||||
the visualization.
|
||||
Below are the raw and the visualized predictions; as can be seen, dark areas (mustache) are easier to distinguish in the visualization:
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div style="flex: 1 1 50%; max-width: 50%;">
|
||||
@@ -103,33 +89,28 @@ the visualization.
|
||||
</div>
|
||||
</div>
|
||||
|
||||
## Surface Normals Estimation
|
||||
### Surface Normals Prediction Quick Start
|
||||
|
||||
Load the `prs-eth/marigold-normals-v1-1` checkpoint into [`MarigoldNormalsPipeline`], put the image through the
|
||||
pipeline, and save the predictions:
|
||||
Load the `prs-eth/marigold-normals-lcm-v0-1` checkpoint into the `MarigoldNormalsPipeline` pipeline, put the image through the pipeline, and save the predictions:
|
||||
|
||||
```python
|
||||
import diffusers
|
||||
import torch
|
||||
|
||||
pipe = diffusers.MarigoldNormalsPipeline.from_pretrained(
|
||||
"prs-eth/marigold-normals-v1-1", variant="fp16", torch_dtype=torch.float16
|
||||
"prs-eth/marigold-normals-lcm-v0-1", variant="fp16", torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
|
||||
|
||||
normals = pipe(image)
|
||||
|
||||
vis = pipe.image_processor.visualize_normals(normals.prediction)
|
||||
vis[0].save("einstein_normals.png")
|
||||
```
|
||||
|
||||
The [`~pipelines.marigold.marigold_image_processing.MarigoldImageProcessor.visualize_normals`] maps the three-dimensional
|
||||
prediction with pixel values in the range `[-1, 1]` into an RGB image.
|
||||
The visualization function supports flipping surface normals axes to make the visualization compatible with other
|
||||
choices of the frame of reference.
|
||||
Conceptually, each pixel is painted according to the surface normal vector in the frame of reference, where `X` axis
|
||||
points right, `Y` axis points up, and `Z` axis points at the viewer.
|
||||
The visualization function for normals [`~pipelines.marigold.marigold_image_processing.MarigoldImageProcessor.visualize_normals`] maps the three-dimensional prediction with pixel values in the range `[-1, 1]` into an RGB image.
|
||||
The visualization function supports flipping surface normals axes to make the visualization compatible with other choices of the frame of reference.
|
||||
Conceptually, each pixel is painted according to the surface normal vector in the frame of reference, where `X` axis points right, `Y` axis points up, and `Z` axis points at the viewer.
|
||||
Below is the visualized prediction:
|
||||
|
||||
<div class="flex gap-4" style="justify-content: center; width: 100%;">
|
||||
@@ -141,121 +122,25 @@ Below is the visualized prediction:
|
||||
</div>
|
||||
</div>
|
||||
|
||||
In this example, the nose tip almost certainly has a point on the surface, at which the surface normal vector points
|
||||
straight at the viewer, meaning that its coordinates are `[0, 0, 1]`.
|
||||
In this example, the nose tip almost certainly has a point on the surface, at which the surface normal vector points straight at the viewer, meaning that its coordinates are `[0, 0, 1]`.
|
||||
This vector maps to the RGB `[128, 128, 255]`, which corresponds to the violet-blue color.
|
||||
Similarly, a surface normal on the cheek in the right part of the image has a large `X` component, which increases the
|
||||
red hue.
|
||||
Similarly, a surface normal on the cheek in the right part of the image has a large `X` component, which increases the red hue.
|
||||
Points on the shoulders pointing up with a large `Y` promote green color.
|
||||
|
||||
## Intrinsic Image Decomposition
|
||||
### Speeding up inference
|
||||
|
||||
Marigold provides two models for Intrinsic Image Decomposition (IID): "Appearance" and "Lighting".
|
||||
Each model produces Albedo maps, derived from InteriorVerse and Hypersim annotations, respectively.
|
||||
|
||||
- The "Appearance" model also estimates Material properties: Roughness and Metallicity.
|
||||
- The "Lighting" model generates Diffuse Shading and Non-diffuse Residual.
|
||||
|
||||
Here is the sample code saving predictions made by the "Appearance" model:
|
||||
|
||||
```python
|
||||
import diffusers
|
||||
import torch
|
||||
|
||||
pipe = diffusers.MarigoldIntrinsicsPipeline.from_pretrained(
|
||||
"prs-eth/marigold-iid-appearance-v1-1", variant="fp16", torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
|
||||
|
||||
intrinsics = pipe(image)
|
||||
|
||||
vis = pipe.image_processor.visualize_intrinsics(intrinsics.prediction, pipe.target_properties)
|
||||
vis[0]["albedo"].save("einstein_albedo.png")
|
||||
vis[0]["roughness"].save("einstein_roughness.png")
|
||||
vis[0]["metallicity"].save("einstein_metallicity.png")
|
||||
```
|
||||
|
||||
Another example demonstrating the predictions made by the "Lighting" model:
|
||||
|
||||
```python
|
||||
import diffusers
|
||||
import torch
|
||||
|
||||
pipe = diffusers.MarigoldIntrinsicsPipeline.from_pretrained(
|
||||
"prs-eth/marigold-iid-lighting-v1-1", variant="fp16", torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
|
||||
|
||||
intrinsics = pipe(image)
|
||||
|
||||
vis = pipe.image_processor.visualize_intrinsics(intrinsics.prediction, pipe.target_properties)
|
||||
vis[0]["albedo"].save("einstein_albedo.png")
|
||||
vis[0]["shading"].save("einstein_shading.png")
|
||||
vis[0]["residual"].save("einstein_residual.png")
|
||||
```
|
||||
|
||||
Both models share the same pipeline while supporting different decomposition types.
|
||||
The exact decomposition parameterization (e.g., sRGB vs. linear space) is stored in the
|
||||
`pipe.target_properties` dictionary, which is passed into the
|
||||
[`~pipelines.marigold.marigold_image_processing.MarigoldImageProcessor.visualize_intrinsics`] function.
|
||||
|
||||
Below are some examples showcasing the predicted decomposition outputs.
|
||||
All modalities can be inspected in the
|
||||
[Intrinsic Image Decomposition](https://huggingface.co/spaces/prs-eth/marigold-iid) Space.
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div style="flex: 1 1 50%; max-width: 50%;">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/8c7986eaaab5eb9604eb88336311f46a7b0ff5ab/marigold/marigold_einstein_albedo.png"/>
|
||||
<figcaption class="mt-1 text-center text-sm text-gray-500">
|
||||
Predicted albedo ("Appearance" model)
|
||||
</figcaption>
|
||||
</div>
|
||||
<div style="flex: 1 1 50%; max-width: 50%;">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/8c7986eaaab5eb9604eb88336311f46a7b0ff5ab/marigold/marigold_einstein_diffuse.png"/>
|
||||
<figcaption class="mt-1 text-center text-sm text-gray-500">
|
||||
Predicted diffuse shading ("Lighting" model)
|
||||
</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
## Speeding up inference
|
||||
|
||||
The above quick start snippets are already optimized for quality and speed, loading the checkpoint, utilizing the
|
||||
`fp16` variant of weights and computation, and performing the default number (4) of denoising diffusion steps.
|
||||
The first step to accelerate inference, at the expense of prediction quality, is to reduce the denoising diffusion
|
||||
steps to the minimum:
|
||||
The above quick start snippets are already optimized for speed: they load the LCM checkpoint, use the `fp16` variant of weights and computation, and perform just one denoising diffusion step.
|
||||
The `pipe(image)` call completes in 280ms on RTX 3090 GPU.
|
||||
Internally, the input image is encoded with the Stable Diffusion VAE encoder, then the U-Net performs one denoising step, and finally, the prediction latent is decoded with the VAE decoder into pixel space.
|
||||
In this case, two out of three module calls are dedicated to converting between pixel and latent space of LDM.
|
||||
Because Marigold's latent space is compatible with the base Stable Diffusion, it is possible to speed up the pipeline call by more than 3x (85ms on RTX 3090) by using a [lightweight replacement of the SD VAE](../api/models/autoencoder_tiny):
|
||||
|
||||
```diff
|
||||
import diffusers
|
||||
import torch
|
||||
|
||||
pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
|
||||
"prs-eth/marigold-depth-v1-1", variant="fp16", torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
|
||||
|
||||
- depth = pipe(image)
|
||||
+ depth = pipe(image, num_inference_steps=1)
|
||||
```
|
||||
|
||||
With this change, the `pipe` call completes in 280ms on RTX 3090 GPU.
|
||||
Internally, the input image is first encoded using the Stable Diffusion VAE encoder, followed by a single denoising
|
||||
step performed by the U-Net.
|
||||
Finally, the prediction latent is decoded with the VAE decoder into pixel space.
|
||||
In this setup, two out of three module calls are dedicated to converting between the pixel and latent spaces of the LDM.
|
||||
Since Marigold's latent space is compatible with Stable Diffusion 2.0, inference can be accelerated by more than 3x,
|
||||
reducing the call time to 85ms on an RTX 3090, by using a [lightweight replacement of the SD VAE](../api/models/autoencoder_tiny).
|
||||
Note that using a lightweight VAE may slightly reduce the visual quality of the predictions.
|
||||
|
||||
```diff
|
||||
import diffusers
|
||||
import torch
|
||||
|
||||
pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
|
||||
"prs-eth/marigold-depth-v1-1", variant="fp16", torch_dtype=torch.float16
|
||||
"prs-eth/marigold-depth-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
+ pipe.vae = diffusers.AutoencoderTiny.from_pretrained(
|
||||
@@ -263,77 +148,78 @@ Note that using a lightweight VAE may slightly reduce the visual quality of the
|
||||
+ ).cuda()
|
||||
|
||||
image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
|
||||
|
||||
depth = pipe(image, num_inference_steps=1)
|
||||
depth = pipe(image)
|
||||
```
|
||||
|
||||
So far, we have optimized the number of diffusion steps and model components. Self-attention operations account for a
|
||||
significant portion of computations.
|
||||
Speeding them up can be achieved by using a more efficient attention processor:
|
||||
As suggested in [Optimizations](../optimization/torch2.0#torch.compile), adding `torch.compile` may squeeze extra performance depending on the target hardware:
|
||||
|
||||
```diff
|
||||
import diffusers
|
||||
import torch
|
||||
+ from diffusers.models.attention_processor import AttnProcessor2_0
|
||||
|
||||
pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
|
||||
"prs-eth/marigold-depth-v1-1", variant="fp16", torch_dtype=torch.float16
|
||||
"prs-eth/marigold-depth-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
+ pipe.vae.set_attn_processor(AttnProcessor2_0())
|
||||
+ pipe.unet.set_attn_processor(AttnProcessor2_0())
|
||||
|
||||
image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
|
||||
|
||||
depth = pipe(image, num_inference_steps=1)
|
||||
```
|
||||
|
||||
Finally, as suggested in [Optimizations](../optimization/torch2.0#torch.compile), enabling `torch.compile` can further enhance performance depending on
|
||||
the target hardware.
|
||||
However, compilation incurs a significant overhead during the first pipeline invocation, making it beneficial only when
|
||||
the same pipeline instance is called repeatedly, such as within a loop.
|
||||
|
||||
```diff
|
||||
import diffusers
|
||||
import torch
|
||||
from diffusers.models.attention_processor import AttnProcessor2_0
|
||||
|
||||
pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
|
||||
"prs-eth/marigold-depth-v1-1", variant="fp16", torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
pipe.vae.set_attn_processor(AttnProcessor2_0())
|
||||
pipe.unet.set_attn_processor(AttnProcessor2_0())
|
||||
|
||||
+ pipe.vae = torch.compile(pipe.vae, mode="reduce-overhead", fullgraph=True)
|
||||
+ pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
|
||||
|
||||
image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
|
||||
|
||||
depth = pipe(image, num_inference_steps=1)
|
||||
depth = pipe(image)
|
||||
```
|
||||
|
||||
## Qualitative Comparison with Depth Anything
|
||||
|
||||
With the above speed optimizations, Marigold delivers more detailed predictions and runs faster than [Depth Anything](https://huggingface.co/docs/transformers/main/en/model_doc/depth_anything) with its largest checkpoint [LiheYoung/depth-anything-large-hf](https://huggingface.co/LiheYoung/depth-anything-large-hf):
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div style="flex: 1 1 50%; max-width: 50%;">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/marigold/marigold_einstein_lcm_depth.png"/>
|
||||
<figcaption class="mt-1 text-center text-sm text-gray-500">
|
||||
Marigold LCM fp16 with Tiny AutoEncoder
|
||||
</figcaption>
|
||||
</div>
|
||||
<div style="flex: 1 1 50%; max-width: 50%;">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/marigold/einstein_depthanything_large.png"/>
|
||||
<figcaption class="mt-1 text-center text-sm text-gray-500">
|
||||
Depth Anything Large
|
||||
</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
## Maximizing Precision and Ensembling
|
||||
|
||||
Marigold pipelines have a built-in ensembling mechanism combining multiple predictions from different random latents.
|
||||
This is a brute-force way of improving the precision of predictions, capitalizing on the generative nature of diffusion.
|
||||
The ensembling path is activated automatically when the `ensemble_size` argument is set greater than or equal to `3`.
|
||||
The ensembling path is activated automatically when the `ensemble_size` argument is set greater than `1`.
|
||||
When aiming for maximum precision, it makes sense to adjust `num_inference_steps` simultaneously with `ensemble_size`.
|
||||
The recommended values vary across checkpoints but primarily depend on the scheduler type.
|
||||
The effect of ensembling is particularly well-seen with surface normals:
|
||||
|
||||
```diff
|
||||
import diffusers
|
||||
```python
|
||||
import diffusers
|
||||
|
||||
pipe = diffusers.MarigoldNormalsPipeline.from_pretrained("prs-eth/marigold-normals-v1-1").to("cuda")
|
||||
model_path = "prs-eth/marigold-normals-v1-0"
|
||||
|
||||
image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
|
||||
model_paper_kwargs = {
|
||||
diffusers.schedulers.DDIMScheduler: {
|
||||
"num_inference_steps": 10,
|
||||
"ensemble_size": 10,
|
||||
},
|
||||
diffusers.schedulers.LCMScheduler: {
|
||||
"num_inference_steps": 4,
|
||||
"ensemble_size": 5,
|
||||
},
|
||||
}
|
||||
|
||||
- depth = pipe(image)
|
||||
+ depth = pipe(image, num_inference_steps=10, ensemble_size=5)
|
||||
image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
|
||||
|
||||
vis = pipe.image_processor.visualize_normals(depth.prediction)
|
||||
vis[0].save("einstein_normals.png")
|
||||
pipe = diffusers.MarigoldNormalsPipeline.from_pretrained(model_path).to("cuda")
|
||||
pipe_kwargs = model_paper_kwargs[type(pipe.scheduler)]
|
||||
|
||||
depth = pipe(image, **pipe_kwargs)
|
||||
|
||||
vis = pipe.image_processor.visualize_normals(depth.prediction)
|
||||
vis[0].save("einstein_normals.png")
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
@@ -351,16 +237,93 @@ The effect of ensembling is particularly well-seen with surface normals:
|
||||
</div>
|
||||
</div>
|
||||
|
||||
As can be seen, all areas with fine-grained structures, such as hair, got more conservative and on average more
|
||||
correct predictions.
|
||||
As can be seen, all areas with fine-grained structures, such as hair, got more conservative and on average more correct predictions.
|
||||
Such a result is more suitable for precision-sensitive downstream tasks, such as 3D reconstruction.
|
||||
|
||||
## Quantitative Evaluation
|
||||
|
||||
To evaluate Marigold quantitatively in standard leaderboards and benchmarks (such as NYU, KITTI, and other datasets), follow the evaluation protocol outlined in the paper: load the full precision fp32 model and use appropriate values for `num_inference_steps` and `ensemble_size`.
|
||||
Optionally seed randomness to ensure reproducibility. Maximizing `batch_size` will deliver maximum device utilization.
|
||||
|
||||
```python
|
||||
import diffusers
|
||||
import torch
|
||||
|
||||
device = "cuda"
|
||||
seed = 2024
|
||||
model_path = "prs-eth/marigold-v1-0"
|
||||
|
||||
model_paper_kwargs = {
|
||||
diffusers.schedulers.DDIMScheduler: {
|
||||
"num_inference_steps": 50,
|
||||
"ensemble_size": 10,
|
||||
},
|
||||
diffusers.schedulers.LCMScheduler: {
|
||||
"num_inference_steps": 4,
|
||||
"ensemble_size": 10,
|
||||
},
|
||||
}
|
||||
|
||||
image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
|
||||
|
||||
generator = torch.Generator(device=device).manual_seed(seed)
|
||||
pipe = diffusers.MarigoldDepthPipeline.from_pretrained(model_path).to(device)
|
||||
pipe_kwargs = model_paper_kwargs[type(pipe.scheduler)]
|
||||
|
||||
depth = pipe(image, generator=generator, **pipe_kwargs)
|
||||
|
||||
# evaluate metrics
|
||||
```
|
||||
|
||||
## Using Predictive Uncertainty
|
||||
|
||||
The ensembling mechanism built into Marigold pipelines combines multiple predictions obtained from different random latents.
|
||||
As a side effect, it can be used to quantify epistemic (model) uncertainty; simply specify `ensemble_size` greater than 1 and set `output_uncertainty=True`.
|
||||
The resulting uncertainty will be available in the `uncertainty` field of the output.
|
||||
It can be visualized as follows:
|
||||
|
||||
```python
|
||||
import diffusers
|
||||
import torch
|
||||
|
||||
pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
|
||||
"prs-eth/marigold-depth-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
|
||||
depth = pipe(
|
||||
image,
|
||||
ensemble_size=10, # any number greater than 1; higher values yield higher precision
|
||||
output_uncertainty=True,
|
||||
)
|
||||
|
||||
uncertainty = pipe.image_processor.visualize_uncertainty(depth.uncertainty)
|
||||
uncertainty[0].save("einstein_depth_uncertainty.png")
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div style="flex: 1 1 50%; max-width: 50%;">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/marigold/marigold_einstein_depth_uncertainty.png"/>
|
||||
<figcaption class="mt-1 text-center text-sm text-gray-500">
|
||||
Depth uncertainty
|
||||
</figcaption>
|
||||
</div>
|
||||
<div style="flex: 1 1 50%; max-width: 50%;">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/marigold/marigold_einstein_normals_uncertainty.png"/>
|
||||
<figcaption class="mt-1 text-center text-sm text-gray-500">
|
||||
Surface normals uncertainty
|
||||
</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
The interpretation of uncertainty is easy: higher values (white) correspond to pixels where the model struggles to make consistent predictions.
|
||||
Evidently, the depth model is the least confident around edges with discontinuity, where the object depth changes drastically.
|
||||
The surface normals model is the least confident in fine-grained structures, such as hair, and dark areas, such as the collar.
|
||||
|
||||
## Frame-by-frame Video Processing with Temporal Consistency
|
||||
|
||||
Due to Marigold's generative nature, each prediction is unique and defined by the random noise sampled for the latent
|
||||
initialization.
|
||||
This becomes an obvious drawback compared to traditional end-to-end dense regression networks, as exemplified in the
|
||||
following videos:
|
||||
Due to Marigold's generative nature, each prediction is unique and defined by the random noise sampled for the latent initialization.
|
||||
This becomes an obvious drawback compared to traditional end-to-end dense regression networks, as exemplified in the following videos:
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div style="flex: 1 1 50%; max-width: 50%;">
|
||||
@@ -373,32 +336,26 @@ following videos:
|
||||
</div>
|
||||
</div>
|
||||
|
||||
To address this issue, it is possible to pass the `latents` argument to the pipelines, which defines the starting point of
|
||||
diffusion.
|
||||
Empirically, we found that a convex combination of the very same starting point noise latent and the latent
|
||||
corresponding to the previous frame prediction gives sufficiently smooth results, as implemented in the snippet below:
|
||||
To address this issue, it is possible to pass the `latents` argument to the pipelines, which defines the starting point of diffusion.
|
||||
Empirically, we found that a convex combination of the very same starting point noise latent and the latent corresponding to the previous frame prediction gives sufficiently smooth results, as implemented in the snippet below:
|
||||
|
||||
```python
|
||||
import imageio
|
||||
import diffusers
|
||||
import torch
|
||||
from diffusers.models.attention_processor import AttnProcessor2_0
|
||||
from PIL import Image
|
||||
from tqdm import tqdm
|
||||
import diffusers
|
||||
import torch
|
||||
|
||||
device = "cuda"
|
||||
path_in = "https://huggingface.co/spaces/prs-eth/marigold-lcm/resolve/c7adb5427947d2680944f898cd91d386bf0d4924/files/video/obama.mp4"
|
||||
path_in = "obama.mp4"
|
||||
path_out = "obama_depth.gif"
|
||||
|
||||
pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
|
||||
"prs-eth/marigold-depth-v1-1", variant="fp16", torch_dtype=torch.float16
|
||||
"prs-eth/marigold-depth-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
|
||||
).to(device)
|
||||
pipe.vae = diffusers.AutoencoderTiny.from_pretrained(
|
||||
"madebyollin/taesd", torch_dtype=torch.float16
|
||||
).to(device)
|
||||
pipe.unet.set_attn_processor(AttnProcessor2_0())
|
||||
pipe.vae = torch.compile(pipe.vae, mode="reduce-overhead", fullgraph=True)
|
||||
pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
|
||||
pipe.set_progress_bar_config(disable=True)
|
||||
|
||||
with imageio.get_reader(path_in) as reader:
|
||||
@@ -416,11 +373,7 @@ with imageio.get_reader(path_in) as reader:
|
||||
latents = 0.9 * latents + 0.1 * last_frame_latent
|
||||
|
||||
depth = pipe(
|
||||
frame,
|
||||
num_inference_steps=1,
|
||||
match_input_resolution=False,
|
||||
latents=latents,
|
||||
output_latent=True,
|
||||
frame, match_input_resolution=False, latents=latents, output_latent=True
|
||||
)
|
||||
last_frame_latent = depth.latent
|
||||
out.append(pipe.image_processor.visualize_depth(depth.prediction)[0])
|
||||
@@ -429,8 +382,7 @@ with imageio.get_reader(path_in) as reader:
|
||||
```
|
||||
|
||||
Here, the diffusion process starts from the given computed latent.
|
||||
The pipeline sets `output_latent=True` to access `depth.latent` and computes its contribution to the next frame's latent
|
||||
initialization.
|
||||
The pipeline sets `output_latent=True` to access `depth.latent` and computes its contribution to the next frame's latent initialization.
|
||||
The result is much more stable now:
|
||||
|
||||
<div class="flex gap-4">
|
||||
@@ -462,7 +414,7 @@ image = diffusers.utils.load_image(
|
||||
)
|
||||
|
||||
pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
|
||||
"prs-eth/marigold-depth-v1-1", torch_dtype=torch.float16, variant="fp16"
|
||||
"prs-eth/marigold-depth-lcm-v1-0", torch_dtype=torch.float16, variant="fp16"
|
||||
).to(device)
|
||||
|
||||
depth_image = pipe(image, generator=generator).prediction
|
||||
@@ -511,95 +463,4 @@ controlnet_out[0].save("motorcycle_controlnet_out.png")
|
||||
</div>
|
||||
</div>
|
||||
|
||||
## Quantitative Evaluation
|
||||
|
||||
To evaluate Marigold quantitatively in standard leaderboards and benchmarks (such as NYU, KITTI, and other datasets),
|
||||
follow the evaluation protocol outlined in the paper: load the full precision fp32 model and use appropriate values
|
||||
for `num_inference_steps` and `ensemble_size`.
|
||||
Optionally seed randomness to ensure reproducibility.
|
||||
Maximizing `batch_size` will deliver maximum device utilization.
|
||||
|
||||
```python
|
||||
import diffusers
|
||||
import torch
|
||||
|
||||
device = "cuda"
|
||||
seed = 2024
|
||||
|
||||
generator = torch.Generator(device=device).manual_seed(seed)
|
||||
pipe = diffusers.MarigoldDepthPipeline.from_pretrained("prs-eth/marigold-depth-v1-1").to(device)
|
||||
|
||||
image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
|
||||
|
||||
depth = pipe(
|
||||
image,
|
||||
num_inference_steps=4, # set according to the evaluation protocol from the paper
|
||||
ensemble_size=10, # set according to the evaluation protocol from the paper
|
||||
generator=generator,
|
||||
)
|
||||
|
||||
# evaluate metrics
|
||||
```
|
||||
|
||||
## Using Predictive Uncertainty
|
||||
|
||||
The ensembling mechanism built into Marigold pipelines combines multiple predictions obtained from different random
|
||||
latents.
|
||||
As a side effect, it can be used to quantify epistemic (model) uncertainty; simply specify `ensemble_size` greater
|
||||
than or equal to 3 and set `output_uncertainty=True`.
|
||||
The resulting uncertainty will be available in the `uncertainty` field of the output.
|
||||
It can be visualized as follows:
|
||||
|
||||
```python
|
||||
import diffusers
|
||||
import torch
|
||||
|
||||
pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
|
||||
"prs-eth/marigold-depth-v1-1", variant="fp16", torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
|
||||
|
||||
depth = pipe(
|
||||
image,
|
||||
ensemble_size=10, # any number >= 3
|
||||
output_uncertainty=True,
|
||||
)
|
||||
|
||||
uncertainty = pipe.image_processor.visualize_uncertainty(depth.uncertainty)
|
||||
uncertainty[0].save("einstein_depth_uncertainty.png")
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div style="flex: 1 1 33%; max-width: 33%;">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/marigold/marigold_einstein_depth_uncertainty.png"/>
|
||||
<figcaption class="mt-1 text-center text-sm text-gray-500">
|
||||
Depth uncertainty
|
||||
</figcaption>
|
||||
</div>
|
||||
<div style="flex: 1 1 33%; max-width: 33%;">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/marigold/marigold_einstein_normals_uncertainty.png"/>
|
||||
<figcaption class="mt-1 text-center text-sm text-gray-500">
|
||||
Surface normals uncertainty
|
||||
</figcaption>
|
||||
</div>
|
||||
<div style="flex: 1 1 33%; max-width: 33%;">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/4f83035d84a24e5ec44fdda129b1d51eba12ce04/marigold/marigold_einstein_albedo_uncertainty.png"/>
|
||||
<figcaption class="mt-1 text-center text-sm text-gray-500">
|
||||
Albedo uncertainty
|
||||
</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
The interpretation of uncertainty is easy: higher values (white) correspond to pixels where the model struggles to
|
||||
make consistent predictions.
|
||||
- The depth model exhibits the most uncertainty around discontinuities, where object depth changes abruptly.
|
||||
- The surface normals model is least confident in fine-grained structures like hair and in dark regions such as the
|
||||
collar area.
|
||||
- Albedo uncertainty is represented as an RGB image, as it captures uncertainty independently for each color channel,
|
||||
unlike depth and surface normals. It is also higher in shaded regions and at discontinuities.
|
||||
|
||||
## Conclusion
|
||||
|
||||
We hope Marigold proves valuable for your downstream tasks, whether as part of a broader generative workflow or for
|
||||
perception-based applications like 3D reconstruction.
|
||||
Hopefully, you will find Marigold useful for solving your downstream tasks, be it a part of a broader generative workflow, or a perception task, such as 3D reconstruction.
|
||||
|
||||
@@ -215,7 +215,7 @@ image
|
||||
|
||||
Prompt weighting provides a way to emphasize or de-emphasize certain parts of a prompt, allowing for more control over the generated image. A prompt can include several concepts, which get turned into contextualized text embeddings. The embeddings are used by the model to condition its cross-attention layers to generate an image (read the Stable Diffusion [blog post](https://huggingface.co/blog/stable_diffusion) to learn more about how it works).
|
||||
|
||||
Prompt weighting works by increasing or decreasing the scale of the text embedding vector that corresponds to its concept in the prompt because you may not necessarily want the model to focus on all concepts equally. The easiest way to prepare the prompt embeddings is to use [Stable Diffusion Long Prompt Weighted Embedding](https://github.com/xhinker/sd_embed) (sd_embed). Once you have the prompt-weighted embeddings, you can pass them to any pipeline that has a [prompt_embeds](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline.__call__.prompt_embeds) (and optionally [negative_prompt_embeds](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline.__call__.negative_prompt_embeds)) parameter, such as [`StableDiffusionPipeline`], [`StableDiffusionControlNetPipeline`], and [`StableDiffusionXLPipeline`].
|
||||
Prompt weighting works by increasing or decreasing the scale of the text embedding vector that corresponds to its concept in the prompt because you may not necessarily want the model to focus on all concepts equally. The easiest way to prepare the prompt-weighted embeddings is to use [Compel](https://github.com/damian0815/compel), a text prompt-weighting and blending library. Once you have the prompt-weighted embeddings, you can pass them to any pipeline that has a [`prompt_embeds`](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline.__call__.prompt_embeds) (and optionally [`negative_prompt_embeds`](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline.__call__.negative_prompt_embeds)) parameter, such as [`StableDiffusionPipeline`], [`StableDiffusionControlNetPipeline`], and [`StableDiffusionXLPipeline`].
|
||||
|
||||
<Tip>
|
||||
|
||||
@@ -223,99 +223,136 @@ If your favorite pipeline doesn't have a `prompt_embeds` parameter, please open
|
||||
|
||||
</Tip>
|
||||
|
||||
This guide will show you how to weight your prompts with sd_embed.
|
||||
This guide will show you how to weight and blend your prompts with Compel in 🤗 Diffusers.
|
||||
|
||||
Before you begin, make sure you have the latest version of sd_embed installed:
|
||||
|
||||
```bash
|
||||
pip install git+https://github.com/xhinker/sd_embed.git@main
|
||||
```
|
||||
|
||||
For this example, let's use [`StableDiffusionXLPipeline`].
|
||||
Before you begin, make sure you have the latest version of Compel installed:
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionXLPipeline, UniPCMultistepScheduler
|
||||
# uncomment to install in Colab
|
||||
#!pip install compel --upgrade
|
||||
```
|
||||
|
||||
For this guide, let's generate an image with the prompt `"a red cat playing with a ball"` using the [`StableDiffusionPipeline`]:
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionPipeline, UniPCMultistepScheduler
|
||||
import torch
|
||||
|
||||
pipe = StableDiffusionXLPipeline.from_pretrained("Lykon/dreamshaper-xl-1-0", torch_dtype=torch.float16)
|
||||
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", use_safetensors=True)
|
||||
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
|
||||
pipe.to("cuda")
|
||||
```
|
||||
|
||||
To upweight or downweight a concept, surround the text with parentheses. More parentheses apply a heavier weight to the text. You can also append a numerical multiplier to the text to indicate how much you want to increase or decrease its weight by.
|
||||
prompt = "a red cat playing with a ball"
|
||||
|
||||
| format | multiplier |
|
||||
|---|---|
|
||||
| `(hippo)` | increase by 1.1x |
|
||||
| `((hippo))` | increase by 1.21x |
|
||||
| `(hippo:1.5)` | increase by 1.5x |
|
||||
| `(hippo:0.5)` | decrease by 4x |
|
||||
generator = torch.Generator(device="cpu").manual_seed(33)
|
||||
|
||||
Create a prompt and use a combination of parentheses and numerical multipliers to upweight various text.
|
||||
|
||||
```py
|
||||
from sd_embed.embedding_funcs import get_weighted_text_embeddings_sdxl
|
||||
|
||||
prompt = """A whimsical and creative image depicting a hybrid creature that is a mix of a waffle and a hippopotamus.
|
||||
This imaginative creature features the distinctive, bulky body of a hippo,
|
||||
but with a texture and appearance resembling a golden-brown, crispy waffle.
|
||||
The creature might have elements like waffle squares across its skin and a syrup-like sheen.
|
||||
It's set in a surreal environment that playfully combines a natural water habitat of a hippo with elements of a breakfast table setting,
|
||||
possibly including oversized utensils or plates in the background.
|
||||
The image should evoke a sense of playful absurdity and culinary fantasy.
|
||||
"""
|
||||
|
||||
neg_prompt = """\
|
||||
skin spots,acnes,skin blemishes,age spot,(ugly:1.2),(duplicate:1.2),(morbid:1.21),(mutilated:1.2),\
|
||||
(tranny:1.2),mutated hands,(poorly drawn hands:1.5),blurry,(bad anatomy:1.2),(bad proportions:1.3),\
|
||||
extra limbs,(disfigured:1.2),(missing arms:1.2),(extra legs:1.2),(fused fingers:1.5),\
|
||||
(too many fingers:1.5),(unclear eyes:1.2),lowers,bad hands,missing fingers,extra digit,\
|
||||
bad hands,missing fingers,(extra arms and legs),(worst quality:2),(low quality:2),\
|
||||
(normal quality:2),lowres,((monochrome)),((grayscale))
|
||||
"""
|
||||
```
|
||||
|
||||
Use the `get_weighted_text_embeddings_sdxl` function to generate the prompt embeddings and the negative prompt embeddings. It'll also generate the pooled and negative pooled prompt embeddings since you're using the SDXL model.
|
||||
|
||||
> [!TIP]
|
||||
> You can safely ignore the error message below about the token index length exceeding the model's maximum sequence length. All your tokens will be used in the embedding process.
|
||||
>
|
||||
> ```
|
||||
> Token indices sequence length is longer than the specified maximum sequence length for this model
|
||||
> ```
|
||||
|
||||
```py
|
||||
(
|
||||
prompt_embeds,
|
||||
prompt_neg_embeds,
|
||||
pooled_prompt_embeds,
|
||||
negative_pooled_prompt_embeds
|
||||
) = get_weighted_text_embeddings_sdxl(
|
||||
pipe,
|
||||
prompt=prompt,
|
||||
neg_prompt=neg_prompt
|
||||
)
|
||||
|
||||
image = pipe(
|
||||
prompt_embeds=prompt_embeds,
|
||||
negative_prompt_embeds=prompt_neg_embeds,
|
||||
pooled_prompt_embeds=pooled_prompt_embeds,
|
||||
negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
|
||||
num_inference_steps=30,
|
||||
height=1024,
|
||||
width=1024 + 512,
|
||||
guidance_scale=4.0,
|
||||
generator=torch.Generator("cuda").manual_seed(2)
|
||||
).images[0]
|
||||
image = pipe(prompt, generator=generator, num_inference_steps=20).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sd_embed_sdxl.png"/>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/compel/forest_0.png"/>
|
||||
</div>
|
||||
|
||||
> [!TIP]
|
||||
> Refer to the [sd_embed](https://github.com/xhinker/sd_embed) repository for additional details about long prompt weighting for FLUX.1, Stable Cascade, and Stable Diffusion 1.5.
|
||||
### Weighting
|
||||
|
||||
You'll notice there is no "ball" in the image! Let's use compel to upweight the concept of "ball" in the prompt. Create a [`Compel`](https://github.com/damian0815/compel/blob/main/doc/compel.md#compel-objects) object, and pass it a tokenizer and text encoder:
|
||||
|
||||
```py
|
||||
from compel import Compel
|
||||
|
||||
compel_proc = Compel(tokenizer=pipe.tokenizer, text_encoder=pipe.text_encoder)
|
||||
```
|
||||
|
||||
compel uses `+` or `-` to increase or decrease the weight of a word in the prompt. To increase the weight of "ball":
|
||||
|
||||
<Tip>
|
||||
|
||||
`+` corresponds to the value `1.1`, `++` corresponds to `1.1^2`, and so on. Similarly, `-` corresponds to `0.9` and `--` corresponds to `0.9^2`. Feel free to experiment with adding more `+` or `-` in your prompt!
|
||||
|
||||
</Tip>
|
||||
|
||||
```py
|
||||
prompt = "a red cat playing with a ball++"
|
||||
```
|
||||
|
||||
Pass the prompt to `compel_proc` to create the new prompt embeddings which are passed to the pipeline:
|
||||
|
||||
```py
|
||||
prompt_embeds = compel_proc(prompt)
|
||||
generator = torch.manual_seed(33)
|
||||
|
||||
image = pipe(prompt_embeds=prompt_embeds, generator=generator, num_inference_steps=20).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/compel/forest_1.png"/>
|
||||
</div>
|
||||
|
||||
To downweight parts of the prompt, use the `-` suffix:
|
||||
|
||||
```py
|
||||
prompt = "a red------- cat playing with a ball"
|
||||
prompt_embeds = compel_proc(prompt)
|
||||
|
||||
generator = torch.manual_seed(33)
|
||||
|
||||
image = pipe(prompt_embeds=prompt_embeds, generator=generator, num_inference_steps=20).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"/>
|
||||
</div>
|
||||
|
||||
You can even up or downweight multiple concepts in the same prompt:
|
||||
|
||||
```py
|
||||
prompt = "a red cat++ playing with a ball----"
|
||||
prompt_embeds = compel_proc(prompt)
|
||||
|
||||
generator = torch.manual_seed(33)
|
||||
|
||||
image = pipe(prompt_embeds=prompt_embeds, generator=generator, num_inference_steps=20).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-pos-neg.png"/>
|
||||
</div>
|
||||
|
||||
### Blending
|
||||
|
||||
You can also create a weighted *blend* of prompts by adding `.blend()` to a list of prompts and passing it some weights. Your blend may not always produce the result you expect because it breaks some assumptions about how the text encoder functions, so just have fun and experiment with it!
|
||||
|
||||
```py
|
||||
prompt_embeds = compel_proc('("a red cat playing with a ball", "jungle").blend(0.7, 0.8)')
|
||||
generator = torch.Generator(device="cuda").manual_seed(33)
|
||||
|
||||
image = pipe(prompt_embeds=prompt_embeds, generator=generator, num_inference_steps=20).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-blend.png"/>
|
||||
</div>
|
||||
|
||||
### Conjunction
|
||||
|
||||
A conjunction diffuses each prompt independently and concatenates their results by their weighted sum. Add `.and()` to the end of a list of prompts to create a conjunction:
|
||||
|
||||
```py
|
||||
prompt_embeds = compel_proc('["a red cat", "playing with a", "ball"].and()')
|
||||
generator = torch.Generator(device="cuda").manual_seed(55)
|
||||
|
||||
image = pipe(prompt_embeds=prompt_embeds, generator=generator, num_inference_steps=20).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-conj.png"/>
|
||||
</div>
|
||||
|
||||
### Textual inversion
|
||||
|
||||
@@ -326,63 +363,35 @@ Create a pipeline and use the [`~loaders.TextualInversionLoaderMixin.load_textua
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionPipeline
|
||||
from compel import Compel, DiffusersTextualInversionManager
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained(
|
||||
"stable-diffusion-v1-5/stable-diffusion-v1-5",
|
||||
torch_dtype=torch.float16,
|
||||
).to("cuda")
|
||||
"stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16,
|
||||
use_safetensors=True, variant="fp16").to("cuda")
|
||||
pipe.load_textual_inversion("sd-concepts-library/midjourney-style")
|
||||
```
|
||||
|
||||
Add the `<midjourney-style>` text to the prompt to trigger the textual inversion.
|
||||
Compel provides a `DiffusersTextualInversionManager` class to simplify prompt weighting with textual inversion. Instantiate `DiffusersTextualInversionManager` and pass it to the `Compel` class:
|
||||
|
||||
```py
|
||||
from sd_embed.embedding_funcs import get_weighted_text_embeddings_sd15
|
||||
|
||||
prompt = """<midjourney-style> A whimsical and creative image depicting a hybrid creature that is a mix of a waffle and a hippopotamus.
|
||||
This imaginative creature features the distinctive, bulky body of a hippo,
|
||||
but with a texture and appearance resembling a golden-brown, crispy waffle.
|
||||
The creature might have elements like waffle squares across its skin and a syrup-like sheen.
|
||||
It's set in a surreal environment that playfully combines a natural water habitat of a hippo with elements of a breakfast table setting,
|
||||
possibly including oversized utensils or plates in the background.
|
||||
The image should evoke a sense of playful absurdity and culinary fantasy.
|
||||
"""
|
||||
|
||||
neg_prompt = """\
|
||||
skin spots,acnes,skin blemishes,age spot,(ugly:1.2),(duplicate:1.2),(morbid:1.21),(mutilated:1.2),\
|
||||
(tranny:1.2),mutated hands,(poorly drawn hands:1.5),blurry,(bad anatomy:1.2),(bad proportions:1.3),\
|
||||
extra limbs,(disfigured:1.2),(missing arms:1.2),(extra legs:1.2),(fused fingers:1.5),\
|
||||
(too many fingers:1.5),(unclear eyes:1.2),lowers,bad hands,missing fingers,extra digit,\
|
||||
bad hands,missing fingers,(extra arms and legs),(worst quality:2),(low quality:2),\
|
||||
(normal quality:2),lowres,((monochrome)),((grayscale))
|
||||
"""
|
||||
textual_inversion_manager = DiffusersTextualInversionManager(pipe)
|
||||
compel_proc = Compel(
|
||||
tokenizer=pipe.tokenizer,
|
||||
text_encoder=pipe.text_encoder,
|
||||
textual_inversion_manager=textual_inversion_manager)
|
||||
```
|
||||
|
||||
Use the `get_weighted_text_embeddings_sd15` function to generate the prompt embeddings and the negative prompt embeddings.
|
||||
Incorporate the concept to condition a prompt using the `<concept>` syntax:
|
||||
|
||||
```py
|
||||
(
|
||||
prompt_embeds,
|
||||
prompt_neg_embeds,
|
||||
) = get_weighted_text_embeddings_sd15(
|
||||
pipe,
|
||||
prompt=prompt,
|
||||
neg_prompt=neg_prompt
|
||||
)
|
||||
prompt_embeds = compel_proc('("A red cat++ playing with a ball <midjourney-style>")')
|
||||
|
||||
image = pipe(
|
||||
prompt_embeds=prompt_embeds,
|
||||
negative_prompt_embeds=prompt_neg_embeds,
|
||||
height=768,
|
||||
width=896,
|
||||
guidance_scale=4.0,
|
||||
generator=torch.Generator("cuda").manual_seed(2)
|
||||
).images[0]
|
||||
image = pipe(prompt_embeds=prompt_embeds).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sd_embed_textual_inversion.png"/>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-text-inversion.png"/>
|
||||
</div>
|
||||
|
||||
### DreamBooth
|
||||
@@ -392,44 +401,70 @@ image
|
||||
```py
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline, UniPCMultistepScheduler
|
||||
from compel import Compel
|
||||
|
||||
pipe = DiffusionPipeline.from_pretrained("sd-dreambooth-library/dndcoverart-v1", torch_dtype=torch.float16).to("cuda")
|
||||
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
|
||||
```
|
||||
|
||||
Depending on the model you use, you'll need to incorporate the model's unique identifier into your prompt. For example, the `dndcoverart-v1` model uses the identifier `dndcoverart`:
|
||||
Create a `Compel` class with a tokenizer and text encoder, and pass your prompt to it. Depending on the model you use, you'll need to incorporate the model's unique identifier into your prompt. For example, the `dndcoverart-v1` model uses the identifier `dndcoverart`:
|
||||
|
||||
```py
|
||||
from sd_embed.embedding_funcs import get_weighted_text_embeddings_sd15
|
||||
|
||||
prompt = """dndcoverart of A whimsical and creative image depicting a hybrid creature that is a mix of a waffle and a hippopotamus.
|
||||
This imaginative creature features the distinctive, bulky body of a hippo,
|
||||
but with a texture and appearance resembling a golden-brown, crispy waffle.
|
||||
The creature might have elements like waffle squares across its skin and a syrup-like sheen.
|
||||
It's set in a surreal environment that playfully combines a natural water habitat of a hippo with elements of a breakfast table setting,
|
||||
possibly including oversized utensils or plates in the background.
|
||||
The image should evoke a sense of playful absurdity and culinary fantasy.
|
||||
"""
|
||||
|
||||
neg_prompt = """\
|
||||
skin spots,acnes,skin blemishes,age spot,(ugly:1.2),(duplicate:1.2),(morbid:1.21),(mutilated:1.2),\
|
||||
(tranny:1.2),mutated hands,(poorly drawn hands:1.5),blurry,(bad anatomy:1.2),(bad proportions:1.3),\
|
||||
extra limbs,(disfigured:1.2),(missing arms:1.2),(extra legs:1.2),(fused fingers:1.5),\
|
||||
(too many fingers:1.5),(unclear eyes:1.2),lowers,bad hands,missing fingers,extra digit,\
|
||||
bad hands,missing fingers,(extra arms and legs),(worst quality:2),(low quality:2),\
|
||||
(normal quality:2),lowres,((monochrome)),((grayscale))
|
||||
"""
|
||||
|
||||
(
|
||||
prompt_embeds
|
||||
, prompt_neg_embeds
|
||||
) = get_weighted_text_embeddings_sd15(
|
||||
pipe
|
||||
, prompt = prompt
|
||||
, neg_prompt = neg_prompt
|
||||
)
|
||||
compel_proc = Compel(tokenizer=pipe.tokenizer, text_encoder=pipe.text_encoder)
|
||||
prompt_embeds = compel_proc('("magazine cover of a dndcoverart dragon, high quality, intricate details, larry elmore art style").and()')
|
||||
image = pipe(prompt_embeds=prompt_embeds).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sd_embed_dreambooth.png"/>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-dreambooth.png"/>
|
||||
</div>
|
||||
|
||||
### Stable Diffusion XL
|
||||
|
||||
Stable Diffusion XL (SDXL) has two tokenizers and text encoders so its usage is a bit different. To address this, you should pass both tokenizers and encoders to the `Compel` class:
|
||||
|
||||
```py
|
||||
from compel import Compel, ReturnedEmbeddingsType
|
||||
from diffusers import DiffusionPipeline
|
||||
from diffusers.utils import make_image_grid
|
||||
import torch
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
variant="fp16",
|
||||
use_safetensors=True,
|
||||
torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
compel = Compel(
|
||||
tokenizer=[pipeline.tokenizer, pipeline.tokenizer_2] ,
|
||||
text_encoder=[pipeline.text_encoder, pipeline.text_encoder_2],
|
||||
returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,
|
||||
requires_pooled=[False, True]
|
||||
)
|
||||
```
|
||||
|
||||
This time, let's upweight "ball" by a factor of 1.5 for the first prompt, and downweight "ball" by 0.6 for the second prompt. The [`StableDiffusionXLPipeline`] also requires [`pooled_prompt_embeds`](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLInpaintPipeline.__call__.pooled_prompt_embeds) (and optionally [`negative_pooled_prompt_embeds`](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLInpaintPipeline.__call__.negative_pooled_prompt_embeds)) so you should pass those to the pipeline along with the conditioning tensors:
|
||||
|
||||
```py
|
||||
# apply weights
|
||||
prompt = ["a red cat playing with a (ball)1.5", "a red cat playing with a (ball)0.6"]
|
||||
conditioning, pooled = compel(prompt)
|
||||
|
||||
# generate image
|
||||
generator = [torch.Generator().manual_seed(33) for _ in range(len(prompt))]
|
||||
images = pipeline(prompt_embeds=conditioning, pooled_prompt_embeds=pooled, generator=generator, num_inference_steps=30).images
|
||||
make_image_grid(images, rows=1, cols=2)
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/compel/sdxl_ball1.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">"a red cat playing with a (ball)1.5"</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/compel/sdxl_ball2.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">"a red cat playing with a (ball)0.6"</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -92,13 +92,9 @@ class CheckpointMergerPipeline(DiffusionPipeline):
|
||||
token = kwargs.pop("token", None)
|
||||
variant = kwargs.pop("variant", None)
|
||||
revision = kwargs.pop("revision", None)
|
||||
torch_dtype = kwargs.pop("torch_dtype", torch.float32)
|
||||
torch_dtype = kwargs.pop("torch_dtype", None)
|
||||
device_map = kwargs.pop("device_map", None)
|
||||
|
||||
if not isinstance(torch_dtype, torch.dtype):
|
||||
torch_dtype = torch.float32
|
||||
print(f"Passed `torch_dtype` {torch_dtype} is not a `torch.dtype`. Defaulting to `torch.float32`.")
|
||||
|
||||
alpha = kwargs.pop("alpha", 0.5)
|
||||
interp = kwargs.pop("interp", None)
|
||||
|
||||
|
||||
@@ -203,7 +203,7 @@ def log_validation(
|
||||
|
||||
pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, **scheduler_args)
|
||||
|
||||
pipeline = pipeline.to(accelerator.device)
|
||||
pipeline = pipeline.to(accelerator.device, dtype=torch_dtype)
|
||||
pipeline.set_progress_bar_config(disable=True)
|
||||
|
||||
# run inference
|
||||
@@ -213,7 +213,7 @@ def log_validation(
|
||||
if torch.backends.mps.is_available() or "playground" in args.pretrained_model_name_or_path:
|
||||
autocast_ctx = nullcontext()
|
||||
else:
|
||||
autocast_ctx = torch.autocast(accelerator.device.type) if not is_final_validation else nullcontext()
|
||||
autocast_ctx = torch.autocast(accelerator.device.type)
|
||||
|
||||
with autocast_ctx:
|
||||
images = [pipeline(**pipeline_args, generator=generator).images[0] for _ in range(args.num_validation_images)]
|
||||
|
||||
@@ -345,7 +345,6 @@ else:
|
||||
"Lumina2Text2ImgPipeline",
|
||||
"LuminaText2ImgPipeline",
|
||||
"MarigoldDepthPipeline",
|
||||
"MarigoldIntrinsicsPipeline",
|
||||
"MarigoldNormalsPipeline",
|
||||
"MochiPipeline",
|
||||
"MusicLDMPipeline",
|
||||
@@ -846,7 +845,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
Lumina2Text2ImgPipeline,
|
||||
LuminaText2ImgPipeline,
|
||||
MarigoldDepthPipeline,
|
||||
MarigoldIntrinsicsPipeline,
|
||||
MarigoldNormalsPipeline,
|
||||
MochiPipeline,
|
||||
MusicLDMPipeline,
|
||||
@@ -867,7 +865,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
StableCascadeCombinedPipeline,
|
||||
StableCascadeDecoderPipeline,
|
||||
StableCascadePriorPipeline,
|
||||
StableDiffusion3ControlNetInpaintingPipeline,
|
||||
StableDiffusion3ControlNetPipeline,
|
||||
StableDiffusion3Img2ImgPipeline,
|
||||
StableDiffusion3InpaintPipeline,
|
||||
|
||||
@@ -0,0 +1,65 @@
|
||||
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
|
||||
from ..utils import get_logger
|
||||
from ._common import _BATCHED_INPUT_IDENTIFIERS
|
||||
from .hooks import HookRegistry, ModelHook
|
||||
|
||||
|
||||
logger = get_logger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
_CFG_PARALLEL = "cfg_parallel"
|
||||
|
||||
|
||||
class CFGParallelHook(ModelHook):
    r"""
    Example hook that splits a batched forward pass across exactly two distributed processes.

    On every call, each keyword argument whose name is listed in `_BATCHED_INPUT_IDENTIFIERS` is chunked along
    dim 0 and only the chunk belonging to the current rank is passed to the original forward. The first element
    of the result is then all-gathered and concatenated along dim 0 in rank order, so every rank returns the
    full-batch tensor.
    """

    def initialize_hook(self, module):
        """
        Verify that `torch.distributed` is initialized before the hook is used.

        Returns:
            The `module` that was passed in, unchanged.

        Raises:
            RuntimeError: If the default process group has not been initialized.
        """
        if not dist.is_initialized():
            raise RuntimeError("Distributed environment not initialized.")
        return module

    def new_forward(self, module: torch.nn.Module, *args, **kwargs):
        """
        Run the original forward on this rank's share of the batch and gather the results.

        Args:
            module: The hooked module. It is not called directly; the original forward is reached through
                `self.fn_ref.original_forward`.
            *args: Forwarded untouched. Positional inputs are never split, hence the warning below.
            **kwargs: Keyword inputs. Non-`None` entries named in `_BATCHED_INPUT_IDENTIFIERS` are replaced by
                this rank's chunk along dim 0.

        Returns:
            A plain tuple when the caller passed a falsy `return_dict` or the original forward returned a plain
            tuple; otherwise an instance of the original output class. In both cases the first element is the
            gathered full-batch tensor and the remaining elements are passed through as returned on this rank.
        """
        if len(args) > 0:
            logger.warning(
                "CFGParallelHook is an example hook that does not work with batched positional arguments. Please use with caution."
            )

        world_size = dist.get_world_size()
        rank = dist.get_rank()

        assert world_size == 2, "This is an example hook designed to only work with 2 processes."

        # Snapshot the keys because entries are reassigned while iterating.
        for key in list(kwargs.keys()):
            if key not in _BATCHED_INPUT_IDENTIFIERS or kwargs[key] is None:
                continue
            kwargs[key] = torch.chunk(kwargs[key], world_size, dim=0)[rank].contiguous()

        output = self.fn_ref.original_forward(*args, **kwargs)

        # Only the first output is gathered; it is assumed to be batched along dim 0 — TODO confirm for each model.
        sample = output[0]
        sample_list = [torch.empty_like(sample) for _ in range(world_size)]
        dist.all_gather(sample_list, sample)
        sample = torch.cat(sample_list, dim=0).contiguous()

        # Pick the return container from what the wrapped forward actually produced. Defaulting a missing
        # `return_dict` to False would turn an output object into a tuple whenever the caller relies on the
        # wrapped module's own default.
        tuple_requested = "return_dict" in kwargs and not kwargs["return_dict"]
        if tuple_requested or type(output) is tuple:
            return (sample, *output[1:])
        return output.__class__(sample, *output[1:])
|
||||
|
||||
|
||||
def apply_cfg_parallel(module: torch.nn.Module) -> None:
    """
    Register a fresh `CFGParallelHook` on `module` under the `_CFG_PARALLEL` hook name.

    The module's `HookRegistry` is looked up (or created) via `HookRegistry.check_if_exists_or_initialize`
    and the hook is attached to it.

    Args:
        module: The module whose forward pass should be wrapped by the hook.
    """
    HookRegistry.check_if_exists_or_initialize(module).register_hook(CFGParallelHook(), _CFG_PARALLEL)
|
||||
@@ -0,0 +1,26 @@
|
||||
from ..models.attention_processor import Attention, MochiAttention
|
||||
|
||||
|
||||
# Attention module classes shared across the hooks package (imported from `..models.attention_processor`).
_ATTENTION_CLASSES = (Attention, MochiAttention)
|
||||
|
||||
_SPATIAL_TRANSFORMER_BLOCK_IDENTIFIERS = ("blocks", "transformer_blocks", "single_transformer_blocks", "layers")
|
||||
_TEMPORAL_TRANSFORMER_BLOCK_IDENTIFIERS = ("temporal_transformer_blocks",)
|
||||
_CROSS_TRANSFORMER_BLOCK_IDENTIFIERS = ("blocks", "transformer_blocks", "layers")
|
||||
|
||||
_ALL_TRANSFORMER_BLOCK_IDENTIFIERS = tuple(
|
||||
{
|
||||
*_SPATIAL_TRANSFORMER_BLOCK_IDENTIFIERS,
|
||||
*_TEMPORAL_TRANSFORMER_BLOCK_IDENTIFIERS,
|
||||
*_CROSS_TRANSFORMER_BLOCK_IDENTIFIERS,
|
||||
}
|
||||
)
|
||||
|
||||
_BATCHED_INPUT_IDENTIFIERS = (
|
||||
"hidden_states",
|
||||
"encoder_hidden_states",
|
||||
"pooled_projections",
|
||||
"timestep",
|
||||
"attention_mask",
|
||||
"encoder_attention_mask",
|
||||
"guidance",
|
||||
)
|
||||
@@ -20,19 +20,18 @@ import torch
|
||||
|
||||
from ..models.attention_processor import Attention, MochiAttention
|
||||
from ..utils import logging
|
||||
from ._common import (
|
||||
_ATTENTION_CLASSES,
|
||||
_CROSS_TRANSFORMER_BLOCK_IDENTIFIERS,
|
||||
_SPATIAL_TRANSFORMER_BLOCK_IDENTIFIERS,
|
||||
_TEMPORAL_TRANSFORMER_BLOCK_IDENTIFIERS,
|
||||
)
|
||||
from .hooks import HookRegistry, ModelHook
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
|
||||
_ATTENTION_CLASSES = (Attention, MochiAttention)
|
||||
|
||||
_SPATIAL_ATTENTION_BLOCK_IDENTIFIERS = ("blocks", "transformer_blocks", "single_transformer_blocks")
|
||||
_TEMPORAL_ATTENTION_BLOCK_IDENTIFIERS = ("temporal_transformer_blocks",)
|
||||
_CROSS_ATTENTION_BLOCK_IDENTIFIERS = ("blocks", "transformer_blocks")
|
||||
|
||||
|
||||
@dataclass
|
||||
class PyramidAttentionBroadcastConfig:
|
||||
r"""
|
||||
@@ -76,9 +75,9 @@ class PyramidAttentionBroadcastConfig:
|
||||
temporal_attention_timestep_skip_range: Tuple[int, int] = (100, 800)
|
||||
cross_attention_timestep_skip_range: Tuple[int, int] = (100, 800)
|
||||
|
||||
spatial_attention_block_identifiers: Tuple[str, ...] = _SPATIAL_ATTENTION_BLOCK_IDENTIFIERS
|
||||
temporal_attention_block_identifiers: Tuple[str, ...] = _TEMPORAL_ATTENTION_BLOCK_IDENTIFIERS
|
||||
cross_attention_block_identifiers: Tuple[str, ...] = _CROSS_ATTENTION_BLOCK_IDENTIFIERS
|
||||
spatial_attention_block_identifiers: Tuple[str, ...] = _SPATIAL_TRANSFORMER_BLOCK_IDENTIFIERS
|
||||
temporal_attention_block_identifiers: Tuple[str, ...] = _TEMPORAL_TRANSFORMER_BLOCK_IDENTIFIERS
|
||||
cross_attention_block_identifiers: Tuple[str, ...] = _CROSS_TRANSFORMER_BLOCK_IDENTIFIERS
|
||||
|
||||
current_timestep_callback: Callable[[], int] = None
|
||||
|
||||
|
||||
@@ -23,9 +23,7 @@ from safetensors import safe_open
|
||||
from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, load_state_dict
|
||||
from ..utils import (
|
||||
USE_PEFT_BACKEND,
|
||||
_get_detailed_type,
|
||||
_get_model_file,
|
||||
_is_valid_type,
|
||||
is_accelerate_available,
|
||||
is_torch_version,
|
||||
is_transformers_available,
|
||||
@@ -579,36 +577,29 @@ class FluxIPAdapterMixin:
|
||||
pipeline.set_ip_adapter_scale(ip_strengths)
|
||||
```
|
||||
"""
|
||||
|
||||
scale_type = Union[int, float]
|
||||
num_ip_adapters = self.transformer.encoder_hid_proj.num_ip_adapters
|
||||
num_layers = self.transformer.config.num_layers
|
||||
|
||||
# Single value for all layers of all IP-Adapters
|
||||
if isinstance(scale, scale_type):
|
||||
scale = [scale for _ in range(num_ip_adapters)]
|
||||
# List of per-layer scales for a single IP-Adapter
|
||||
elif _is_valid_type(scale, List[scale_type]) and num_ip_adapters == 1:
|
||||
transformer = self.transformer
|
||||
if not isinstance(scale, list):
|
||||
scale = [[scale] * transformer.config.num_layers]
|
||||
elif isinstance(scale, list) and isinstance(scale[0], int) or isinstance(scale[0], float):
|
||||
if len(scale) != transformer.config.num_layers:
|
||||
raise ValueError(f"Expected list of {transformer.config.num_layers} scales, got {len(scale)}.")
|
||||
scale = [scale]
|
||||
# Invalid scale type
|
||||
elif not _is_valid_type(scale, List[Union[scale_type, List[scale_type]]]):
|
||||
raise TypeError(f"Unexpected type {_get_detailed_type(scale)} for scale.")
|
||||
|
||||
if len(scale) != num_ip_adapters:
|
||||
raise ValueError(f"Cannot assign {len(scale)} scales to {num_ip_adapters} IP-Adapters.")
|
||||
scale_configs = scale
|
||||
|
||||
if any(len(s) != num_layers for s in scale if isinstance(s, list)):
|
||||
invalid_scale_sizes = {len(s) for s in scale if isinstance(s, list)} - {num_layers}
|
||||
raise ValueError(
|
||||
f"Expected list of {num_layers} scales, got {', '.join(str(x) for x in invalid_scale_sizes)}."
|
||||
)
|
||||
|
||||
# Scalars are transformed to lists with length num_layers
|
||||
scale_configs = [[s] * num_layers if isinstance(s, scale_type) else s for s in scale]
|
||||
|
||||
# Set scales. zip over scale_configs prevents going into single transformer layers
|
||||
for attn_processor, *scale in zip(self.transformer.attn_processors.values(), *scale_configs):
|
||||
attn_processor.scale = scale
|
||||
key_id = 0
|
||||
for attn_name, attn_processor in transformer.attn_processors.items():
|
||||
if isinstance(attn_processor, (FluxIPAdapterJointAttnProcessor2_0)):
|
||||
if len(scale_configs) != len(attn_processor.scale):
|
||||
raise ValueError(
|
||||
f"Cannot assign {len(scale_configs)} scale_configs to "
|
||||
f"{len(attn_processor.scale)} IP-Adapter."
|
||||
)
|
||||
elif len(scale_configs) == 1:
|
||||
scale_configs = scale_configs * len(attn_processor.scale)
|
||||
for i, scale_config in enumerate(scale_configs):
|
||||
attn_processor.scale[i] = scale_config[key_id]
|
||||
key_id += 1
|
||||
|
||||
def unload_ip_adapter(self):
|
||||
"""
|
||||
|
||||
@@ -63,9 +63,6 @@ def _maybe_adjust_config(config):
|
||||
method removes the ambiguity by following what is described here:
|
||||
https://github.com/huggingface/diffusers/pull/9985#issuecomment-2493840028.
|
||||
"""
|
||||
# Track keys that have been explicitly removed to prevent re-adding them.
|
||||
deleted_keys = set()
|
||||
|
||||
rank_pattern = config["rank_pattern"].copy()
|
||||
target_modules = config["target_modules"]
|
||||
original_r = config["r"]
|
||||
@@ -83,22 +80,21 @@ def _maybe_adjust_config(config):
|
||||
ambiguous_key = key
|
||||
|
||||
if exact_matches and substring_matches:
|
||||
# if ambiguous, update the rank associated with the ambiguous key (`proj_out`, for example)
|
||||
# if ambiguous we update the rank associated with the ambiguous key (`proj_out`, for example)
|
||||
config["r"] = key_rank
|
||||
# remove the ambiguous key from `rank_pattern` and record it as deleted
|
||||
# remove the ambiguous key from `rank_pattern` and update its rank to `r`, instead
|
||||
del config["rank_pattern"][key]
|
||||
deleted_keys.add(key)
|
||||
# For substring matches, add them with the original rank only if they haven't been assigned already
|
||||
for mod in substring_matches:
|
||||
if mod not in config["rank_pattern"] and mod not in deleted_keys:
|
||||
# avoid overwriting if the module already has a specific rank
|
||||
if mod not in config["rank_pattern"]:
|
||||
config["rank_pattern"][mod] = original_r
|
||||
|
||||
# Update the rest of the target modules with the original rank if not already set and not deleted
|
||||
# update the rest of the keys with the `original_r`
|
||||
for mod in target_modules:
|
||||
if mod != ambiguous_key and mod not in config["rank_pattern"] and mod not in deleted_keys:
|
||||
if mod != ambiguous_key and mod not in config["rank_pattern"]:
|
||||
config["rank_pattern"][mod] = original_r
|
||||
|
||||
# Handle alphas to deal with cases like:
|
||||
# handle alphas to deal with cases like
|
||||
# https://github.com/huggingface/diffusers/pull/9999#issuecomment-2516180777
|
||||
has_different_ranks = len(config["rank_pattern"]) > 1 and list(config["rank_pattern"])[0] != config["r"]
|
||||
if has_different_ranks:
|
||||
@@ -191,11 +187,6 @@ class PeftAdapterMixin:
|
||||
from peft import LoraConfig, inject_adapter_in_model, set_peft_model_state_dict
|
||||
from peft.tuners.tuners_utils import BaseTunerLayer
|
||||
|
||||
try:
|
||||
from peft.utils.constants import FULLY_QUALIFIED_PATTERN_KEY_PREFIX
|
||||
except ImportError:
|
||||
FULLY_QUALIFIED_PATTERN_KEY_PREFIX = None
|
||||
|
||||
cache_dir = kwargs.pop("cache_dir", None)
|
||||
force_download = kwargs.pop("force_download", False)
|
||||
proxies = kwargs.pop("proxies", None)
|
||||
@@ -260,22 +251,14 @@ class PeftAdapterMixin:
|
||||
# Cannot figure out rank from lora layers that don't have at least 2 dimensions.
|
||||
# Bias layers in LoRA only have a single dimension
|
||||
if "lora_B" in key and val.ndim > 1:
|
||||
# Support to handle cases where layer patterns are treated as full layer names
|
||||
# was added later in PEFT. So, we handle it accordingly.
|
||||
# TODO: when we fix the minimal PEFT version for Diffusers,
|
||||
# we should remove `_maybe_adjust_config()`.
|
||||
if FULLY_QUALIFIED_PATTERN_KEY_PREFIX:
|
||||
rank[f"{FULLY_QUALIFIED_PATTERN_KEY_PREFIX}{key}"] = val.shape[1]
|
||||
else:
|
||||
rank[key] = val.shape[1]
|
||||
rank[key] = val.shape[1]
|
||||
|
||||
if network_alphas is not None and len(network_alphas) >= 1:
|
||||
alpha_keys = [k for k in network_alphas.keys() if k.startswith(f"{prefix}.")]
|
||||
network_alphas = {k.replace(f"{prefix}.", ""): v for k, v in network_alphas.items() if k in alpha_keys}
|
||||
|
||||
lora_config_kwargs = get_peft_kwargs(rank, network_alpha_dict=network_alphas, peft_state_dict=state_dict)
|
||||
if not FULLY_QUALIFIED_PATTERN_KEY_PREFIX:
|
||||
lora_config_kwargs = _maybe_adjust_config(lora_config_kwargs)
|
||||
lora_config_kwargs = _maybe_adjust_config(lora_config_kwargs)
|
||||
|
||||
if "use_dora" in lora_config_kwargs:
|
||||
if lora_config_kwargs["use_dora"]:
|
||||
|
||||
@@ -360,17 +360,11 @@ class FromSingleFileMixin:
|
||||
cache_dir = kwargs.pop("cache_dir", None)
|
||||
local_files_only = kwargs.pop("local_files_only", False)
|
||||
revision = kwargs.pop("revision", None)
|
||||
torch_dtype = kwargs.pop("torch_dtype", torch.float32)
|
||||
torch_dtype = kwargs.pop("torch_dtype", None)
|
||||
disable_mmap = kwargs.pop("disable_mmap", False)
|
||||
|
||||
is_legacy_loading = False
|
||||
|
||||
if not isinstance(torch_dtype, torch.dtype):
|
||||
torch_dtype = torch.float32
|
||||
logger.warning(
|
||||
f"Passed `torch_dtype` {torch_dtype} is not a `torch.dtype`. Defaulting to `torch.float32`."
|
||||
)
|
||||
|
||||
# We shouldn't allow configuring individual models components through a Pipeline creation method
|
||||
# These model kwargs should be deprecated
|
||||
scaling_factor = kwargs.get("scaling_factor", None)
|
||||
|
||||
@@ -240,17 +240,11 @@ class FromOriginalModelMixin:
|
||||
subfolder = kwargs.pop("subfolder", None)
|
||||
revision = kwargs.pop("revision", None)
|
||||
config_revision = kwargs.pop("config_revision", None)
|
||||
torch_dtype = kwargs.pop("torch_dtype", torch.float32)
|
||||
torch_dtype = kwargs.pop("torch_dtype", None)
|
||||
quantization_config = kwargs.pop("quantization_config", None)
|
||||
device = kwargs.pop("device", None)
|
||||
disable_mmap = kwargs.pop("disable_mmap", False)
|
||||
|
||||
if not isinstance(torch_dtype, torch.dtype):
|
||||
torch_dtype = torch.float32
|
||||
logger.warning(
|
||||
f"Passed `torch_dtype` {torch_dtype} is not a `torch.dtype`. Defaulting to `torch.float32`."
|
||||
)
|
||||
|
||||
if isinstance(pretrained_model_link_or_path_or_dict, dict):
|
||||
checkpoint = pretrained_model_link_or_path_or_dict
|
||||
else:
|
||||
|
||||
@@ -213,9 +213,7 @@ class Attention(nn.Module):
|
||||
self.norm_q = LpNorm(p=2, dim=-1, eps=eps)
|
||||
self.norm_k = LpNorm(p=2, dim=-1, eps=eps)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"unknown qk_norm: {qk_norm}. Should be one of None, 'layer_norm', 'fp32_layer_norm', 'layer_norm_across_heads', 'rms_norm', 'rms_norm_across_heads', 'l2'."
|
||||
)
|
||||
raise ValueError(f"unknown qk_norm: {qk_norm}. Should be None,'layer_norm','fp32_layer_norm','rms_norm'")
|
||||
|
||||
if cross_attention_norm is None:
|
||||
self.norm_cross = None
|
||||
@@ -1410,7 +1408,7 @@ class JointAttnProcessor2_0:
|
||||
|
||||
def __init__(self):
|
||||
if not hasattr(F, "scaled_dot_product_attention"):
|
||||
raise ImportError("JointAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
|
||||
raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
@@ -2780,8 +2778,9 @@ class FluxIPAdapterJointAttnProcessor2_0(torch.nn.Module):
|
||||
|
||||
# IP-adapter
|
||||
ip_query = hidden_states_query_proj
|
||||
ip_attn_output = torch.zeros_like(hidden_states)
|
||||
|
||||
ip_attn_output = None
|
||||
# for ip-adapter
|
||||
# TODO: support for multiple adapters
|
||||
for current_ip_hidden_states, scale, to_k_ip, to_v_ip in zip(
|
||||
ip_hidden_states, self.scale, self.to_k_ip, self.to_v_ip
|
||||
):
|
||||
@@ -2792,14 +2791,12 @@ class FluxIPAdapterJointAttnProcessor2_0(torch.nn.Module):
|
||||
ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
||||
# the output of sdp = (batch, num_heads, seq_len, head_dim)
|
||||
# TODO: add support for attn.scale when we move to Torch 2.1
|
||||
current_ip_hidden_states = F.scaled_dot_product_attention(
|
||||
ip_attn_output = F.scaled_dot_product_attention(
|
||||
ip_query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False
|
||||
)
|
||||
current_ip_hidden_states = current_ip_hidden_states.transpose(1, 2).reshape(
|
||||
batch_size, -1, attn.heads * head_dim
|
||||
)
|
||||
current_ip_hidden_states = current_ip_hidden_states.to(ip_query.dtype)
|
||||
ip_attn_output += scale * current_ip_hidden_states
|
||||
ip_attn_output = ip_attn_output.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
|
||||
ip_attn_output = scale * ip_attn_output
|
||||
ip_attn_output = ip_attn_output.to(ip_query.dtype)
|
||||
|
||||
return hidden_states, encoder_hidden_states, ip_attn_output
|
||||
else:
|
||||
|
||||
@@ -40,48 +40,6 @@ class SD3ControlNetOutput(BaseOutput):
|
||||
|
||||
|
||||
class SD3ControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
|
||||
r"""
|
||||
ControlNet model for [Stable Diffusion 3](https://huggingface.co/papers/2403.03206).
|
||||
|
||||
Parameters:
|
||||
sample_size (`int`, defaults to `128`):
|
||||
The width/height of the latents. This is fixed during training since it is used to learn a number of
|
||||
position embeddings.
|
||||
patch_size (`int`, defaults to `2`):
|
||||
Patch size to turn the input data into small patches.
|
||||
in_channels (`int`, defaults to `16`):
|
||||
The number of latent channels in the input.
|
||||
num_layers (`int`, defaults to `18`):
|
||||
The number of layers of transformer blocks to use.
|
||||
attention_head_dim (`int`, defaults to `64`):
|
||||
The number of channels in each head.
|
||||
num_attention_heads (`int`, defaults to `18`):
|
||||
The number of heads to use for multi-head attention.
|
||||
joint_attention_dim (`int`, defaults to `4096`):
|
||||
The embedding dimension to use for joint text-image attention.
|
||||
caption_projection_dim (`int`, defaults to `1152`):
|
||||
The embedding dimension of caption embeddings.
|
||||
pooled_projection_dim (`int`, defaults to `2048`):
|
||||
The embedding dimension of pooled text projections.
|
||||
out_channels (`int`, defaults to `16`):
|
||||
The number of latent channels in the output.
|
||||
pos_embed_max_size (`int`, defaults to `96`):
|
||||
The maximum latent height/width of positional embeddings.
|
||||
extra_conditioning_channels (`int`, defaults to `0`):
|
||||
The number of extra channels to use for conditioning for patch embedding.
|
||||
dual_attention_layers (`Tuple[int, ...]`, defaults to `()`):
|
||||
The number of dual-stream transformer blocks to use.
|
||||
qk_norm (`str`, *optional*, defaults to `None`):
|
||||
The normalization to use for query and key in the attention layer. If `None`, no normalization is used.
|
||||
pos_embed_type (`str`, defaults to `"sincos"`):
|
||||
The type of positional embedding to use. Choose between `"sincos"` and `None`.
|
||||
use_pos_embed (`bool`, defaults to `True`):
|
||||
Whether to use positional embeddings.
|
||||
force_zeros_for_pooled_projection (`bool`, defaults to `True`):
|
||||
Whether to force zeros for pooled projection embeddings. This is handled in the pipelines by reading the
|
||||
config value of the ControlNet model.
|
||||
"""
|
||||
|
||||
_supports_gradient_checkpointing = True
|
||||
|
||||
@register_to_config
|
||||
@@ -135,7 +93,7 @@ class SD3ControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginal
|
||||
JointTransformerBlock(
|
||||
dim=self.inner_dim,
|
||||
num_attention_heads=num_attention_heads,
|
||||
attention_head_dim=attention_head_dim,
|
||||
attention_head_dim=self.config.attention_head_dim,
|
||||
context_pre_only=False,
|
||||
qk_norm=qk_norm,
|
||||
use_dual_attention=True if i in dual_attention_layers else False,
|
||||
@@ -150,7 +108,7 @@ class SD3ControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginal
|
||||
SD3SingleTransformerBlock(
|
||||
dim=self.inner_dim,
|
||||
num_attention_heads=num_attention_heads,
|
||||
attention_head_dim=attention_head_dim,
|
||||
attention_head_dim=self.config.attention_head_dim,
|
||||
)
|
||||
for _ in range(num_layers)
|
||||
]
|
||||
@@ -339,28 +297,28 @@ class SD3ControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginal
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
hidden_states: torch.FloatTensor,
|
||||
controlnet_cond: torch.Tensor,
|
||||
conditioning_scale: float = 1.0,
|
||||
encoder_hidden_states: torch.Tensor = None,
|
||||
pooled_projections: torch.Tensor = None,
|
||||
encoder_hidden_states: torch.FloatTensor = None,
|
||||
pooled_projections: torch.FloatTensor = None,
|
||||
timestep: torch.LongTensor = None,
|
||||
joint_attention_kwargs: Optional[Dict[str, Any]] = None,
|
||||
return_dict: bool = True,
|
||||
) -> Union[torch.Tensor, Transformer2DModelOutput]:
|
||||
) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
|
||||
"""
|
||||
The [`SD3Transformer2DModel`] forward method.
|
||||
|
||||
Args:
|
||||
hidden_states (`torch.Tensor` of shape `(batch size, channel, height, width)`):
|
||||
hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
|
||||
Input `hidden_states`.
|
||||
controlnet_cond (`torch.Tensor`):
|
||||
The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
conditioning_scale (`float`, defaults to `1.0`):
|
||||
The scale factor for ControlNet outputs.
|
||||
encoder_hidden_states (`torch.Tensor` of shape `(batch size, sequence_len, embed_dims)`):
|
||||
encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`):
|
||||
Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
|
||||
pooled_projections (`torch.Tensor` of shape `(batch_size, projection_dim)`): Embeddings projected
|
||||
pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`): Embeddings projected
|
||||
from the embeddings of input conditions.
|
||||
timestep ( `torch.LongTensor`):
|
||||
Used to indicate denoising step.
|
||||
@@ -479,11 +437,11 @@ class SD3MultiControlNetModel(ModelMixin):
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
hidden_states: torch.FloatTensor,
|
||||
controlnet_cond: List[torch.tensor],
|
||||
conditioning_scale: List[float],
|
||||
pooled_projections: torch.Tensor,
|
||||
encoder_hidden_states: torch.Tensor = None,
|
||||
pooled_projections: torch.FloatTensor,
|
||||
encoder_hidden_states: torch.FloatTensor = None,
|
||||
timestep: torch.LongTensor = None,
|
||||
joint_attention_kwargs: Optional[Dict[str, Any]] = None,
|
||||
return_dict: bool = True,
|
||||
|
||||
@@ -605,13 +605,12 @@ class ControlNetUnionModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
|
||||
controlnet_cond: List[torch.Tensor],
|
||||
control_type: torch.Tensor,
|
||||
control_type_idx: List[int],
|
||||
conditioning_scale: Union[float, List[float]] = 1.0,
|
||||
conditioning_scale: float = 1.0,
|
||||
class_labels: Optional[torch.Tensor] = None,
|
||||
timestep_cond: Optional[torch.Tensor] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
|
||||
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
|
||||
from_multi: bool = False,
|
||||
guess_mode: bool = False,
|
||||
return_dict: bool = True,
|
||||
) -> Union[ControlNetOutput, Tuple[Tuple[torch.Tensor, ...], torch.Tensor]]:
|
||||
@@ -648,8 +647,6 @@ class ControlNetUnionModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
|
||||
Additional conditions for the Stable Diffusion XL UNet.
|
||||
cross_attention_kwargs (`dict[str]`, *optional*, defaults to `None`):
|
||||
A kwargs dictionary that if specified is passed along to the `AttnProcessor`.
|
||||
from_multi (`bool`, defaults to `False`):
|
||||
Use standard scaling when called from `MultiControlNetUnionModel`.
|
||||
guess_mode (`bool`, defaults to `False`):
|
||||
In this mode, the ControlNet encoder tries its best to recognize the input content of the input even if
|
||||
you remove all prompts. A `guidance_scale` between 3.0 and 5.0 is recommended.
|
||||
@@ -661,9 +658,6 @@ class ControlNetUnionModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
|
||||
If `return_dict` is `True`, a [`~models.controlnet.ControlNetOutput`] is returned, otherwise a tuple is
|
||||
returned where the first element is the sample tensor.
|
||||
"""
|
||||
if isinstance(conditioning_scale, float):
|
||||
conditioning_scale = [conditioning_scale] * len(controlnet_cond)
|
||||
|
||||
# check channel order
|
||||
channel_order = self.config.controlnet_conditioning_channel_order
|
||||
|
||||
@@ -748,16 +742,12 @@ class ControlNetUnionModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
|
||||
inputs = []
|
||||
condition_list = []
|
||||
|
||||
for cond, control_idx, scale in zip(controlnet_cond, control_type_idx, conditioning_scale):
|
||||
for cond, control_idx in zip(controlnet_cond, control_type_idx):
|
||||
condition = self.controlnet_cond_embedding(cond)
|
||||
feat_seq = torch.mean(condition, dim=(2, 3))
|
||||
feat_seq = feat_seq + self.task_embedding[control_idx]
|
||||
if from_multi:
|
||||
inputs.append(feat_seq.unsqueeze(1))
|
||||
condition_list.append(condition)
|
||||
else:
|
||||
inputs.append(feat_seq.unsqueeze(1) * scale)
|
||||
condition_list.append(condition * scale)
|
||||
inputs.append(feat_seq.unsqueeze(1))
|
||||
condition_list.append(condition)
|
||||
|
||||
condition = sample
|
||||
feat_seq = torch.mean(condition, dim=(2, 3))
|
||||
@@ -769,13 +759,10 @@ class ControlNetUnionModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
|
||||
x = layer(x)
|
||||
|
||||
controlnet_cond_fuser = sample * 0.0
|
||||
for (idx, condition), scale in zip(enumerate(condition_list[:-1]), conditioning_scale):
|
||||
for idx, condition in enumerate(condition_list[:-1]):
|
||||
alpha = self.spatial_ch_projs(x[:, idx])
|
||||
alpha = alpha.unsqueeze(-1).unsqueeze(-1)
|
||||
if from_multi:
|
||||
controlnet_cond_fuser += condition + alpha
|
||||
else:
|
||||
controlnet_cond_fuser += condition + alpha * scale
|
||||
controlnet_cond_fuser += condition + alpha
|
||||
|
||||
sample = sample + controlnet_cond_fuser
|
||||
|
||||
@@ -819,13 +806,12 @@ class ControlNetUnionModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
|
||||
# 6. scaling
|
||||
if guess_mode and not self.config.global_pool_conditions:
|
||||
scales = torch.logspace(-1, 0, len(down_block_res_samples) + 1, device=sample.device) # 0.1 to 1.0
|
||||
if from_multi:
|
||||
scales = scales * conditioning_scale[0]
|
||||
scales = scales * conditioning_scale
|
||||
down_block_res_samples = [sample * scale for sample, scale in zip(down_block_res_samples, scales)]
|
||||
mid_block_res_sample = mid_block_res_sample * scales[-1] # last one
|
||||
elif from_multi:
|
||||
down_block_res_samples = [sample * conditioning_scale[0] for sample in down_block_res_samples]
|
||||
mid_block_res_sample = mid_block_res_sample * conditioning_scale[0]
|
||||
else:
|
||||
down_block_res_samples = [sample * conditioning_scale for sample in down_block_res_samples]
|
||||
mid_block_res_sample = mid_block_res_sample * conditioning_scale
|
||||
|
||||
if self.config.global_pool_conditions:
|
||||
down_block_res_samples = [
|
||||
|
||||
@@ -47,12 +47,9 @@ class MultiControlNetUnionModel(ModelMixin):
|
||||
guess_mode: bool = False,
|
||||
return_dict: bool = True,
|
||||
) -> Union[ControlNetOutput, Tuple]:
|
||||
down_block_res_samples, mid_block_res_sample = None, None
|
||||
for i, (image, ctype, ctype_idx, scale, controlnet) in enumerate(
|
||||
zip(controlnet_cond, control_type, control_type_idx, conditioning_scale, self.nets)
|
||||
):
|
||||
if scale == 0.0:
|
||||
continue
|
||||
down_samples, mid_sample = controlnet(
|
||||
sample=sample,
|
||||
timestep=timestep,
|
||||
@@ -66,13 +63,12 @@ class MultiControlNetUnionModel(ModelMixin):
|
||||
attention_mask=attention_mask,
|
||||
added_cond_kwargs=added_cond_kwargs,
|
||||
cross_attention_kwargs=cross_attention_kwargs,
|
||||
from_multi=True,
|
||||
guess_mode=guess_mode,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
# merge samples
|
||||
if down_block_res_samples is None and mid_block_res_sample is None:
|
||||
if i == 0:
|
||||
down_block_res_samples, mid_block_res_sample = down_samples, mid_sample
|
||||
else:
|
||||
down_block_res_samples = [
|
||||
|
||||
@@ -2583,11 +2583,6 @@ class MultiIPAdapterImageProjection(nn.Module):
|
||||
super().__init__()
|
||||
self.image_projection_layers = nn.ModuleList(IPAdapterImageProjectionLayers)
|
||||
|
||||
@property
|
||||
def num_ip_adapters(self) -> int:
|
||||
"""Number of IP-Adapters loaded."""
|
||||
return len(self.image_projection_layers)
|
||||
|
||||
def forward(self, image_embeds: List[torch.Tensor]):
|
||||
projected_image_embeds = []
|
||||
|
||||
|
||||
@@ -866,7 +866,7 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
|
||||
local_files_only = kwargs.pop("local_files_only", None)
|
||||
token = kwargs.pop("token", None)
|
||||
revision = kwargs.pop("revision", None)
|
||||
torch_dtype = kwargs.pop("torch_dtype", torch.float32)
|
||||
torch_dtype = kwargs.pop("torch_dtype", None)
|
||||
subfolder = kwargs.pop("subfolder", None)
|
||||
device_map = kwargs.pop("device_map", None)
|
||||
max_memory = kwargs.pop("max_memory", None)
|
||||
@@ -879,12 +879,6 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
|
||||
dduf_entries: Optional[Dict[str, DDUFEntry]] = kwargs.pop("dduf_entries", None)
|
||||
disable_mmap = kwargs.pop("disable_mmap", False)
|
||||
|
||||
if not isinstance(torch_dtype, torch.dtype):
|
||||
torch_dtype = torch.float32
|
||||
logger.warning(
|
||||
f"Passed `torch_dtype` {torch_dtype} is not a `torch.dtype`. Defaulting to `torch.float32`."
|
||||
)
|
||||
|
||||
allow_pickle = False
|
||||
if use_safetensors is None:
|
||||
use_safetensors = True
|
||||
|
||||
@@ -18,6 +18,7 @@ from typing import Any, Dict, Optional, Tuple, Union
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from ...configuration_utils import ConfigMixin, register_to_config
|
||||
from ...loaders import FluxTransformer2DLoadersMixin, FromOriginalModelMixin, PeftAdapterMixin
|
||||
@@ -31,7 +32,7 @@ from ...models.attention_processor import (
|
||||
)
|
||||
from ...models.modeling_utils import ModelMixin
|
||||
from ...models.normalization import AdaLayerNormContinuous, AdaLayerNormZero, AdaLayerNormZeroSingle
|
||||
from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
|
||||
from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
|
||||
from ...utils.import_utils import is_torch_npu_available
|
||||
from ...utils.torch_utils import maybe_allow_in_graph
|
||||
from ..cache_utils import CacheMixin
|
||||
@@ -44,7 +45,20 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
@maybe_allow_in_graph
|
||||
class FluxSingleTransformerBlock(nn.Module):
|
||||
def __init__(self, dim: int, num_attention_heads: int, attention_head_dim: int, mlp_ratio: float = 4.0):
|
||||
r"""
|
||||
A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3.
|
||||
|
||||
Reference: https://arxiv.org/abs/2403.03206
|
||||
|
||||
Parameters:
|
||||
dim (`int`): The number of channels in the input and output.
|
||||
num_attention_heads (`int`): The number of heads to use for multi-head attention.
|
||||
attention_head_dim (`int`): The number of channels in each head.
|
||||
context_pre_only (`bool`): Boolean to determine if we should add some blocks associated with the
|
||||
processing of `context` conditions.
|
||||
"""
|
||||
|
||||
def __init__(self, dim, num_attention_heads, attention_head_dim, mlp_ratio=4.0):
|
||||
super().__init__()
|
||||
self.mlp_hidden_dim = int(dim * mlp_ratio)
|
||||
|
||||
@@ -54,15 +68,9 @@ class FluxSingleTransformerBlock(nn.Module):
|
||||
self.proj_out = nn.Linear(dim + self.mlp_hidden_dim, dim)
|
||||
|
||||
if is_torch_npu_available():
|
||||
deprecation_message = (
|
||||
"Defaulting to FluxAttnProcessor2_0_NPU for NPU devices will be removed. Attention processors "
|
||||
"should be set explicitly using the `set_attn_processor` method."
|
||||
)
|
||||
deprecate("npu_processor", "0.34.0", deprecation_message)
|
||||
processor = FluxAttnProcessor2_0_NPU()
|
||||
else:
|
||||
processor = FluxAttnProcessor2_0()
|
||||
|
||||
self.attn = Attention(
|
||||
query_dim=dim,
|
||||
cross_attention_dim=None,
|
||||
@@ -105,14 +113,39 @@ class FluxSingleTransformerBlock(nn.Module):
|
||||
|
||||
@maybe_allow_in_graph
|
||||
class FluxTransformerBlock(nn.Module):
|
||||
r"""
|
||||
A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3.
|
||||
|
||||
Reference: https://arxiv.org/abs/2403.03206
|
||||
|
||||
Args:
|
||||
dim (`int`):
|
||||
The embedding dimension of the block.
|
||||
num_attention_heads (`int`):
|
||||
The number of attention heads to use.
|
||||
attention_head_dim (`int`):
|
||||
The number of dimensions to use for each attention head.
|
||||
qk_norm (`str`, defaults to `"rms_norm"`):
|
||||
The normalization to use for the query and key tensors.
|
||||
eps (`float`, defaults to `1e-6`):
|
||||
The epsilon value to use for the normalization.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, dim: int, num_attention_heads: int, attention_head_dim: int, qk_norm: str = "rms_norm", eps: float = 1e-6
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
self.norm1 = AdaLayerNormZero(dim)
|
||||
|
||||
self.norm1_context = AdaLayerNormZero(dim)
|
||||
|
||||
if hasattr(F, "scaled_dot_product_attention"):
|
||||
processor = FluxAttnProcessor2_0()
|
||||
else:
|
||||
raise ValueError(
|
||||
"The current PyTorch version does not support the `scaled_dot_product_attention` function."
|
||||
)
|
||||
self.attn = Attention(
|
||||
query_dim=dim,
|
||||
cross_attention_dim=None,
|
||||
@@ -122,7 +155,7 @@ class FluxTransformerBlock(nn.Module):
|
||||
out_dim=dim,
|
||||
context_pre_only=False,
|
||||
bias=True,
|
||||
processor=FluxAttnProcessor2_0(),
|
||||
processor=processor,
|
||||
qk_norm=qk_norm,
|
||||
eps=eps,
|
||||
)
|
||||
@@ -133,6 +166,10 @@ class FluxTransformerBlock(nn.Module):
|
||||
self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
|
||||
self.ff_context = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
|
||||
|
||||
# let chunk size default to None
|
||||
self._chunk_size = None
|
||||
self._chunk_dim = 0
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
|
||||
@@ -15,6 +15,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from ...configuration_utils import ConfigMixin, register_to_config
|
||||
from ...loaders import FromOriginalModelMixin, PeftAdapterMixin, SD3Transformer2DLoadersMixin
|
||||
@@ -38,6 +39,17 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
@maybe_allow_in_graph
|
||||
class SD3SingleTransformerBlock(nn.Module):
|
||||
r"""
|
||||
A Single Transformer block as part of the MMDiT architecture, used in Stable Diffusion 3 ControlNet.
|
||||
|
||||
Reference: https://arxiv.org/abs/2403.03206
|
||||
|
||||
Parameters:
|
||||
dim (`int`): The number of channels in the input and output.
|
||||
num_attention_heads (`int`): The number of heads to use for multi-head attention.
|
||||
attention_head_dim (`int`): The number of channels in each head.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
dim: int,
|
||||
@@ -47,13 +59,21 @@ class SD3SingleTransformerBlock(nn.Module):
|
||||
super().__init__()
|
||||
|
||||
self.norm1 = AdaLayerNormZero(dim)
|
||||
|
||||
if hasattr(F, "scaled_dot_product_attention"):
|
||||
processor = JointAttnProcessor2_0()
|
||||
else:
|
||||
raise ValueError(
|
||||
"The current PyTorch version does not support the `scaled_dot_product_attention` function."
|
||||
)
|
||||
|
||||
self.attn = Attention(
|
||||
query_dim=dim,
|
||||
dim_head=attention_head_dim,
|
||||
heads=num_attention_heads,
|
||||
out_dim=dim,
|
||||
bias=True,
|
||||
processor=JointAttnProcessor2_0(),
|
||||
processor=processor,
|
||||
eps=1e-6,
|
||||
)
|
||||
|
||||
@@ -61,17 +81,23 @@ class SD3SingleTransformerBlock(nn.Module):
|
||||
self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor, temb: torch.Tensor):
|
||||
# 1. Attention
|
||||
norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
|
||||
attn_output = self.attn(hidden_states=norm_hidden_states, encoder_hidden_states=None)
|
||||
# Attention.
|
||||
attn_output = self.attn(
|
||||
hidden_states=norm_hidden_states,
|
||||
encoder_hidden_states=None,
|
||||
)
|
||||
|
||||
# Process attention outputs for the `hidden_states`.
|
||||
attn_output = gate_msa.unsqueeze(1) * attn_output
|
||||
hidden_states = hidden_states + attn_output
|
||||
|
||||
# 2. Feed Forward
|
||||
norm_hidden_states = self.norm2(hidden_states)
|
||||
norm_hidden_states = norm_hidden_states * (1 + scale_mlp.unsqueeze(1)) + shift_mlp.unsqueeze(1)
|
||||
norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
|
||||
|
||||
ff_output = self.ff(norm_hidden_states)
|
||||
ff_output = gate_mlp.unsqueeze(1) * ff_output
|
||||
|
||||
hidden_states = hidden_states + ff_output
|
||||
|
||||
return hidden_states
|
||||
@@ -81,40 +107,26 @@ class SD3Transformer2DModel(
|
||||
ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, SD3Transformer2DLoadersMixin
|
||||
):
|
||||
"""
|
||||
The Transformer model introduced in [Stable Diffusion 3](https://huggingface.co/papers/2403.03206).
|
||||
The Transformer model introduced in Stable Diffusion 3.
|
||||
|
||||
Reference: https://arxiv.org/abs/2403.03206
|
||||
|
||||
Parameters:
|
||||
sample_size (`int`, defaults to `128`):
|
||||
The width/height of the latents. This is fixed during training since it is used to learn a number of
|
||||
position embeddings.
|
||||
patch_size (`int`, defaults to `2`):
|
||||
Patch size to turn the input data into small patches.
|
||||
in_channels (`int`, defaults to `16`):
|
||||
The number of latent channels in the input.
|
||||
num_layers (`int`, defaults to `18`):
|
||||
The number of layers of transformer blocks to use.
|
||||
attention_head_dim (`int`, defaults to `64`):
|
||||
The number of channels in each head.
|
||||
num_attention_heads (`int`, defaults to `18`):
|
||||
The number of heads to use for multi-head attention.
|
||||
joint_attention_dim (`int`, defaults to `4096`):
|
||||
The embedding dimension to use for joint text-image attention.
|
||||
caption_projection_dim (`int`, defaults to `1152`):
|
||||
The embedding dimension of caption embeddings.
|
||||
pooled_projection_dim (`int`, defaults to `2048`):
|
||||
The embedding dimension of pooled text projections.
|
||||
out_channels (`int`, defaults to `16`):
|
||||
The number of latent channels in the output.
|
||||
pos_embed_max_size (`int`, defaults to `96`):
|
||||
The maximum latent height/width of positional embeddings.
|
||||
dual_attention_layers (`Tuple[int, ...]`, defaults to `()`):
|
||||
The number of dual-stream transformer blocks to use.
|
||||
qk_norm (`str`, *optional*, defaults to `None`):
|
||||
The normalization to use for query and key in the attention layer. If `None`, no normalization is used.
|
||||
sample_size (`int`): The width of the latent images. This is fixed during training since
|
||||
it is used to learn a number of position embeddings.
|
||||
patch_size (`int`): Patch size to turn the input data into small patches.
|
||||
in_channels (`int`, *optional*, defaults to 16): The number of channels in the input.
|
||||
num_layers (`int`, *optional*, defaults to 18): The number of layers of Transformer blocks to use.
|
||||
attention_head_dim (`int`, *optional*, defaults to 64): The number of channels in each head.
|
||||
num_attention_heads (`int`, *optional*, defaults to 18): The number of heads to use for multi-head attention.
|
||||
cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
|
||||
caption_projection_dim (`int`): Number of dimensions to use when projecting the `encoder_hidden_states`.
|
||||
pooled_projection_dim (`int`): Number of dimensions to use when projecting the `pooled_projections`.
|
||||
out_channels (`int`, defaults to 16): Number of output channels.
|
||||
|
||||
"""
|
||||
|
||||
_supports_gradient_checkpointing = True
|
||||
_no_split_modules = ["JointTransformerBlock"]
|
||||
_skip_layerwise_casting_patterns = ["pos_embed", "norm"]
|
||||
|
||||
@register_to_config
|
||||
@@ -137,33 +149,36 @@ class SD3Transformer2DModel(
|
||||
qk_norm: Optional[str] = None,
|
||||
):
|
||||
super().__init__()
|
||||
self.out_channels = out_channels if out_channels is not None else in_channels
|
||||
self.inner_dim = num_attention_heads * attention_head_dim
|
||||
default_out_channels = in_channels
|
||||
self.out_channels = out_channels if out_channels is not None else default_out_channels
|
||||
self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim
|
||||
|
||||
self.pos_embed = PatchEmbed(
|
||||
height=sample_size,
|
||||
width=sample_size,
|
||||
patch_size=patch_size,
|
||||
in_channels=in_channels,
|
||||
height=self.config.sample_size,
|
||||
width=self.config.sample_size,
|
||||
patch_size=self.config.patch_size,
|
||||
in_channels=self.config.in_channels,
|
||||
embed_dim=self.inner_dim,
|
||||
pos_embed_max_size=pos_embed_max_size, # hard-code for now.
|
||||
)
|
||||
self.time_text_embed = CombinedTimestepTextProjEmbeddings(
|
||||
embedding_dim=self.inner_dim, pooled_projection_dim=pooled_projection_dim
|
||||
embedding_dim=self.inner_dim, pooled_projection_dim=self.config.pooled_projection_dim
|
||||
)
|
||||
self.context_embedder = nn.Linear(joint_attention_dim, caption_projection_dim)
|
||||
self.context_embedder = nn.Linear(self.config.joint_attention_dim, self.config.caption_projection_dim)
|
||||
|
||||
# `attention_head_dim` is doubled to account for the mixing.
|
||||
# It needs to crafted when we get the actual checkpoints.
|
||||
self.transformer_blocks = nn.ModuleList(
|
||||
[
|
||||
JointTransformerBlock(
|
||||
dim=self.inner_dim,
|
||||
num_attention_heads=num_attention_heads,
|
||||
attention_head_dim=attention_head_dim,
|
||||
num_attention_heads=self.config.num_attention_heads,
|
||||
attention_head_dim=self.config.attention_head_dim,
|
||||
context_pre_only=i == num_layers - 1,
|
||||
qk_norm=qk_norm,
|
||||
use_dual_attention=True if i in dual_attention_layers else False,
|
||||
)
|
||||
for i in range(num_layers)
|
||||
for i in range(self.config.num_layers)
|
||||
]
|
||||
)
|
||||
|
||||
@@ -316,24 +331,24 @@ class SD3Transformer2DModel(
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
encoder_hidden_states: torch.Tensor = None,
|
||||
pooled_projections: torch.Tensor = None,
|
||||
hidden_states: torch.FloatTensor,
|
||||
encoder_hidden_states: torch.FloatTensor = None,
|
||||
pooled_projections: torch.FloatTensor = None,
|
||||
timestep: torch.LongTensor = None,
|
||||
block_controlnet_hidden_states: List = None,
|
||||
joint_attention_kwargs: Optional[Dict[str, Any]] = None,
|
||||
return_dict: bool = True,
|
||||
skip_layers: Optional[List[int]] = None,
|
||||
) -> Union[torch.Tensor, Transformer2DModelOutput]:
|
||||
) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
|
||||
"""
|
||||
The [`SD3Transformer2DModel`] forward method.
|
||||
|
||||
Args:
|
||||
hidden_states (`torch.Tensor` of shape `(batch size, channel, height, width)`):
|
||||
hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
|
||||
Input `hidden_states`.
|
||||
encoder_hidden_states (`torch.Tensor` of shape `(batch size, sequence_len, embed_dims)`):
|
||||
encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`):
|
||||
Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
|
||||
pooled_projections (`torch.Tensor` of shape `(batch_size, projection_dim)`):
|
||||
pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`):
|
||||
Embeddings projected from the embeddings of input conditions.
|
||||
timestep (`torch.LongTensor`):
|
||||
Used to indicate denoising step.
|
||||
|
||||
@@ -261,7 +261,6 @@ else:
|
||||
_import_structure["marigold"].extend(
|
||||
[
|
||||
"MarigoldDepthPipeline",
|
||||
"MarigoldIntrinsicsPipeline",
|
||||
"MarigoldNormalsPipeline",
|
||||
]
|
||||
)
|
||||
@@ -604,7 +603,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
from .lumina2 import Lumina2Text2ImgPipeline
|
||||
from .marigold import (
|
||||
MarigoldDepthPipeline,
|
||||
MarigoldIntrinsicsPipeline,
|
||||
MarigoldNormalsPipeline,
|
||||
)
|
||||
from .mochi import MochiPipeline
|
||||
|
||||
@@ -34,10 +34,6 @@ from .controlnet import (
|
||||
StableDiffusionXLControlNetUnionInpaintPipeline,
|
||||
StableDiffusionXLControlNetUnionPipeline,
|
||||
)
|
||||
from .controlnet_sd3 import (
|
||||
StableDiffusion3ControlNetInpaintingPipeline,
|
||||
StableDiffusion3ControlNetPipeline,
|
||||
)
|
||||
from .deepfloyd_if import IFImg2ImgPipeline, IFInpaintingPipeline, IFPipeline
|
||||
from .flux import (
|
||||
FluxControlImg2ImgPipeline,
|
||||
@@ -124,7 +120,6 @@ AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
|
||||
("stable-diffusion-controlnet", StableDiffusionControlNetPipeline),
|
||||
("stable-diffusion-xl-controlnet", StableDiffusionXLControlNetPipeline),
|
||||
("stable-diffusion-xl-controlnet-union", StableDiffusionXLControlNetUnionPipeline),
|
||||
("stable-diffusion-3-controlnet", StableDiffusion3ControlNetPipeline),
|
||||
("wuerstchen", WuerstchenCombinedPipeline),
|
||||
("cascade", StableCascadeCombinedPipeline),
|
||||
("lcm", LatentConsistencyModelPipeline),
|
||||
@@ -183,7 +178,6 @@ AUTO_INPAINT_PIPELINES_MAPPING = OrderedDict(
|
||||
("stable-diffusion-controlnet-pag", StableDiffusionControlNetPAGInpaintPipeline),
|
||||
("stable-diffusion-xl-controlnet", StableDiffusionXLControlNetInpaintPipeline),
|
||||
("stable-diffusion-xl-controlnet-union", StableDiffusionXLControlNetUnionInpaintPipeline),
|
||||
("stable-diffusion-3-controlnet", StableDiffusion3ControlNetInpaintingPipeline),
|
||||
("stable-diffusion-xl-pag", StableDiffusionXLPAGInpaintPipeline),
|
||||
("flux", FluxInpaintPipeline),
|
||||
("flux-controlnet", FluxControlNetInpaintPipeline),
|
||||
|
||||
@@ -207,7 +207,7 @@ class StableDiffusionControlNetPipeline(
|
||||
model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
|
||||
_optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
|
||||
_exclude_from_cpu_offload = ["safety_checker"]
|
||||
_callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "image"]
|
||||
_callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -1323,7 +1323,6 @@ class StableDiffusionControlNetPipeline(
|
||||
latents = callback_outputs.pop("latents", latents)
|
||||
prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
|
||||
negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
|
||||
image = callback_outputs.pop("image", image)
|
||||
|
||||
# call the callback, if provided
|
||||
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
|
||||
|
||||
@@ -185,7 +185,7 @@ class StableDiffusionControlNetImg2ImgPipeline(
|
||||
model_cpu_offload_seq = "text_encoder->unet->vae"
|
||||
_optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
|
||||
_exclude_from_cpu_offload = ["safety_checker"]
|
||||
_callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "control_image"]
|
||||
_callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -1294,7 +1294,6 @@ class StableDiffusionControlNetImg2ImgPipeline(
|
||||
latents = callback_outputs.pop("latents", latents)
|
||||
prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
|
||||
negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
|
||||
control_image = callback_outputs.pop("control_image", control_image)
|
||||
|
||||
# call the callback, if provided
|
||||
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
|
||||
|
||||
@@ -184,7 +184,7 @@ class StableDiffusionControlNetInpaintPipeline(
|
||||
model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
|
||||
_optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
|
||||
_exclude_from_cpu_offload = ["safety_checker"]
|
||||
_callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "control_image"]
|
||||
_callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -1476,7 +1476,6 @@ class StableDiffusionControlNetInpaintPipeline(
|
||||
latents = callback_outputs.pop("latents", latents)
|
||||
prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
|
||||
negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
|
||||
control_image = callback_outputs.pop("control_image", control_image)
|
||||
|
||||
# call the callback, if provided
|
||||
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
|
||||
|
||||
@@ -237,7 +237,6 @@ class StableDiffusionXLControlNetInpaintPipeline(
|
||||
"add_neg_time_ids",
|
||||
"mask",
|
||||
"masked_image_latents",
|
||||
"control_image",
|
||||
]
|
||||
|
||||
def __init__(
|
||||
@@ -744,7 +743,7 @@ class StableDiffusionXLControlNetInpaintPipeline(
|
||||
if padding_mask_crop is not None:
|
||||
if not isinstance(image, PIL.Image.Image):
|
||||
raise ValueError(
|
||||
f"The image should be a PIL image when inpainting mask crop, but is of type {type(image)}."
|
||||
f"The image should be a PIL image when inpainting mask crop, but is of type" f" {type(image)}."
|
||||
)
|
||||
if not isinstance(mask_image, PIL.Image.Image):
|
||||
raise ValueError(
|
||||
@@ -752,7 +751,7 @@ class StableDiffusionXLControlNetInpaintPipeline(
|
||||
f" {type(mask_image)}."
|
||||
)
|
||||
if output_type != "pil":
|
||||
raise ValueError(f"The output type should be PIL when inpainting mask crop, but is {output_type}.")
|
||||
raise ValueError(f"The output type should be PIL when inpainting mask crop, but is" f" {output_type}.")
|
||||
|
||||
if prompt_embeds is not None and pooled_prompt_embeds is None:
|
||||
raise ValueError(
|
||||
@@ -1645,7 +1644,7 @@ class StableDiffusionXLControlNetInpaintPipeline(
|
||||
f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
|
||||
f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
|
||||
f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
|
||||
f" = {num_channels_latents + num_channels_masked_image + num_channels_mask}. Please verify the config of"
|
||||
f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
|
||||
" `pipeline.unet` or your `mask_image` or `image` input."
|
||||
)
|
||||
elif num_channels_unet != 4:
|
||||
@@ -1836,7 +1835,6 @@ class StableDiffusionXLControlNetInpaintPipeline(
|
||||
latents = callback_outputs.pop("latents", latents)
|
||||
prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
|
||||
negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
|
||||
control_image = callback_outputs.pop("control_image", control_image)
|
||||
|
||||
# call the callback, if provided
|
||||
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
|
||||
|
||||
@@ -242,7 +242,6 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
|
||||
"add_time_ids",
|
||||
"negative_pooled_prompt_embeds",
|
||||
"add_neg_time_ids",
|
||||
"control_image",
|
||||
]
|
||||
|
||||
def __init__(
|
||||
@@ -1615,7 +1614,6 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
|
||||
)
|
||||
add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
|
||||
add_neg_time_ids = callback_outputs.pop("add_neg_time_ids", add_neg_time_ids)
|
||||
control_image = callback_outputs.pop("control_image", control_image)
|
||||
|
||||
# call the callback, if provided
|
||||
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
|
||||
|
||||
@@ -219,7 +219,6 @@ class StableDiffusionXLControlNetUnionInpaintPipeline(
|
||||
"add_time_ids",
|
||||
"mask",
|
||||
"masked_image_latents",
|
||||
"control_image",
|
||||
]
|
||||
|
||||
def __init__(
|
||||
@@ -727,7 +726,7 @@ class StableDiffusionXLControlNetUnionInpaintPipeline(
|
||||
if padding_mask_crop is not None:
|
||||
if not isinstance(image, PIL.Image.Image):
|
||||
raise ValueError(
|
||||
f"The image should be a PIL image when inpainting mask crop, but is of type {type(image)}."
|
||||
f"The image should be a PIL image when inpainting mask crop, but is of type" f" {type(image)}."
|
||||
)
|
||||
if not isinstance(mask_image, PIL.Image.Image):
|
||||
raise ValueError(
|
||||
@@ -735,7 +734,7 @@ class StableDiffusionXLControlNetUnionInpaintPipeline(
|
||||
f" {type(mask_image)}."
|
||||
)
|
||||
if output_type != "pil":
|
||||
raise ValueError(f"The output type should be PIL when inpainting mask crop, but is {output_type}.")
|
||||
raise ValueError(f"The output type should be PIL when inpainting mask crop, but is" f" {output_type}.")
|
||||
|
||||
if prompt_embeds is not None and pooled_prompt_embeds is None:
|
||||
raise ValueError(
|
||||
@@ -1744,7 +1743,6 @@ class StableDiffusionXLControlNetUnionInpaintPipeline(
|
||||
latents = callback_outputs.pop("latents", latents)
|
||||
prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
|
||||
negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
|
||||
control_image = callback_outputs.pop("control_image", control_image)
|
||||
|
||||
# call the callback, if provided
|
||||
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
|
||||
|
||||
@@ -757,9 +757,15 @@ class StableDiffusionXLControlNetUnionPipeline(
|
||||
for images_ in image:
|
||||
for image_ in images_:
|
||||
self.check_image(image_, prompt, prompt_embeds)
|
||||
else:
|
||||
assert False
|
||||
|
||||
# Check `controlnet_conditioning_scale`
|
||||
if isinstance(controlnet, MultiControlNetUnionModel):
|
||||
# TODO Update for https://github.com/huggingface/diffusers/pull/10723
|
||||
if isinstance(controlnet, ControlNetUnionModel):
|
||||
if not isinstance(controlnet_conditioning_scale, float):
|
||||
raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
|
||||
elif isinstance(controlnet, MultiControlNetUnionModel):
|
||||
if isinstance(controlnet_conditioning_scale, list):
|
||||
if any(isinstance(i, list) for i in controlnet_conditioning_scale):
|
||||
raise ValueError("A single batch of multiple conditionings is not supported at the moment.")
|
||||
@@ -770,6 +776,8 @@ class StableDiffusionXLControlNetUnionPipeline(
|
||||
"For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
|
||||
" the same length as the number of controlnets"
|
||||
)
|
||||
else:
|
||||
assert False
|
||||
|
||||
if len(control_guidance_start) != len(control_guidance_end):
|
||||
raise ValueError(
|
||||
@@ -800,6 +808,8 @@ class StableDiffusionXLControlNetUnionPipeline(
|
||||
for _control_mode, _controlnet in zip(control_mode, self.controlnet.nets):
|
||||
if max(_control_mode) >= _controlnet.config.num_control_type:
|
||||
raise ValueError(f"control_mode: must be lower than {_controlnet.config.num_control_type}.")
|
||||
else:
|
||||
assert False
|
||||
|
||||
# Equal number of `image` and `control_mode` elements
|
||||
if isinstance(controlnet, ControlNetUnionModel):
|
||||
@@ -813,6 +823,8 @@ class StableDiffusionXLControlNetUnionPipeline(
|
||||
|
||||
elif sum(len(x) for x in image) != sum(len(x) for x in control_mode):
|
||||
raise ValueError("Expected len(control_image) == len(control_mode)")
|
||||
else:
|
||||
assert False
|
||||
|
||||
if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
|
||||
raise ValueError(
|
||||
@@ -1189,6 +1201,18 @@ class StableDiffusionXLControlNetUnionPipeline(
|
||||
|
||||
controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
|
||||
|
||||
# align format for control guidance
|
||||
if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
|
||||
control_guidance_start = len(control_guidance_end) * [control_guidance_start]
|
||||
elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
|
||||
control_guidance_end = len(control_guidance_start) * [control_guidance_end]
|
||||
elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
|
||||
mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetUnionModel) else 1
|
||||
control_guidance_start, control_guidance_end = (
|
||||
mult * [control_guidance_start],
|
||||
mult * [control_guidance_end],
|
||||
)
|
||||
|
||||
if not isinstance(control_image, list):
|
||||
control_image = [control_image]
|
||||
else:
|
||||
@@ -1197,25 +1221,8 @@ class StableDiffusionXLControlNetUnionPipeline(
|
||||
if not isinstance(control_mode, list):
|
||||
control_mode = [control_mode]
|
||||
|
||||
if isinstance(controlnet, MultiControlNetUnionModel):
|
||||
control_image = [[item] for item in control_image]
|
||||
control_mode = [[item] for item in control_mode]
|
||||
|
||||
# align format for control guidance
|
||||
if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
|
||||
control_guidance_start = len(control_guidance_end) * [control_guidance_start]
|
||||
elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
|
||||
control_guidance_end = len(control_guidance_start) * [control_guidance_end]
|
||||
elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
|
||||
mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetUnionModel) else len(control_mode)
|
||||
control_guidance_start, control_guidance_end = (
|
||||
mult * [control_guidance_start],
|
||||
mult * [control_guidance_end],
|
||||
)
|
||||
|
||||
if isinstance(controlnet_conditioning_scale, float):
|
||||
mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetUnionModel) else len(control_mode)
|
||||
controlnet_conditioning_scale = [controlnet_conditioning_scale] * mult
|
||||
if isinstance(controlnet, MultiControlNetUnionModel) and isinstance(controlnet_conditioning_scale, float):
|
||||
controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
|
||||
|
||||
# 1. Check inputs
|
||||
self.check_inputs(
|
||||
@@ -1350,6 +1357,9 @@ class StableDiffusionXLControlNetUnionPipeline(
|
||||
control_image = control_images
|
||||
height, width = control_image[0][0].shape[-2:]
|
||||
|
||||
else:
|
||||
assert False
|
||||
|
||||
# 5. Prepare timesteps
|
||||
timesteps, num_inference_steps = retrieve_timesteps(
|
||||
self.scheduler, num_inference_steps, device, timesteps, sigmas
|
||||
@@ -1387,7 +1397,7 @@ class StableDiffusionXLControlNetUnionPipeline(
|
||||
1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
|
||||
for s, e in zip(control_guidance_start, control_guidance_end)
|
||||
]
|
||||
controlnet_keep.append(keeps)
|
||||
controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetUnionModel) else keeps)
|
||||
|
||||
# 7.2 Prepare added time ids & embeddings
|
||||
original_size = original_size or (height, width)
|
||||
|
||||
@@ -252,7 +252,12 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
|
||||
"feature_extractor",
|
||||
"image_encoder",
|
||||
]
|
||||
_callback_tensor_inputs = ["latents", "prompt_embeds", "add_text_embeds", "add_time_ids", "control_image"]
|
||||
_callback_tensor_inputs = [
|
||||
"latents",
|
||||
"prompt_embeds",
|
||||
"add_text_embeds",
|
||||
"add_time_ids",
|
||||
]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -1557,7 +1562,6 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
|
||||
prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
|
||||
add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
|
||||
add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
|
||||
control_image = callback_outputs.pop("control_image", control_image)
|
||||
|
||||
# call the callback, if provided
|
||||
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
|
||||
|
||||
@@ -405,28 +405,23 @@ class FluxPipeline(
|
||||
if not isinstance(ip_adapter_image, list):
|
||||
ip_adapter_image = [ip_adapter_image]
|
||||
|
||||
if len(ip_adapter_image) != self.transformer.encoder_hid_proj.num_ip_adapters:
|
||||
if len(ip_adapter_image) != len(self.transformer.encoder_hid_proj.image_projection_layers):
|
||||
raise ValueError(
|
||||
f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {self.transformer.encoder_hid_proj.num_ip_adapters} IP Adapters."
|
||||
f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.transformer.encoder_hid_proj.image_projection_layers)} IP Adapters."
|
||||
)
|
||||
|
||||
for single_ip_adapter_image in ip_adapter_image:
|
||||
for single_ip_adapter_image, image_proj_layer in zip(
|
||||
ip_adapter_image, self.transformer.encoder_hid_proj.image_projection_layers
|
||||
):
|
||||
single_image_embeds = self.encode_image(single_ip_adapter_image, device, 1)
|
||||
|
||||
image_embeds.append(single_image_embeds[None, :])
|
||||
else:
|
||||
if not isinstance(ip_adapter_image_embeds, list):
|
||||
ip_adapter_image_embeds = [ip_adapter_image_embeds]
|
||||
|
||||
if len(ip_adapter_image_embeds) != self.transformer.encoder_hid_proj.num_ip_adapters:
|
||||
raise ValueError(
|
||||
f"`ip_adapter_image_embeds` must have same length as the number of IP Adapters. Got {len(ip_adapter_image_embeds)} image embeds and {self.transformer.encoder_hid_proj.num_ip_adapters} IP Adapters."
|
||||
)
|
||||
|
||||
for single_image_embeds in ip_adapter_image_embeds:
|
||||
image_embeds.append(single_image_embeds)
|
||||
|
||||
ip_adapter_image_embeds = []
|
||||
for single_image_embeds in image_embeds:
|
||||
for i, single_image_embeds in enumerate(image_embeds):
|
||||
single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
|
||||
single_image_embeds = single_image_embeds.to(device=device)
|
||||
ip_adapter_image_embeds.append(single_image_embeds)
|
||||
@@ -877,13 +872,10 @@ class FluxPipeline(
|
||||
negative_ip_adapter_image is None and negative_ip_adapter_image_embeds is None
|
||||
):
|
||||
negative_ip_adapter_image = np.zeros((width, height, 3), dtype=np.uint8)
|
||||
negative_ip_adapter_image = [negative_ip_adapter_image] * self.transformer.encoder_hid_proj.num_ip_adapters
|
||||
|
||||
elif (ip_adapter_image is None and ip_adapter_image_embeds is None) and (
|
||||
negative_ip_adapter_image is not None or negative_ip_adapter_image_embeds is not None
|
||||
):
|
||||
ip_adapter_image = np.zeros((width, height, 3), dtype=np.uint8)
|
||||
ip_adapter_image = [ip_adapter_image] * self.transformer.encoder_hid_proj.num_ip_adapters
|
||||
|
||||
if self.joint_attention_kwargs is None:
|
||||
self._joint_attention_kwargs = {}
|
||||
|
||||
@@ -23,7 +23,6 @@ except OptionalDependencyNotAvailable:
|
||||
else:
|
||||
_import_structure["marigold_image_processing"] = ["MarigoldImageProcessor"]
|
||||
_import_structure["pipeline_marigold_depth"] = ["MarigoldDepthOutput", "MarigoldDepthPipeline"]
|
||||
_import_structure["pipeline_marigold_intrinsics"] = ["MarigoldIntrinsicsOutput", "MarigoldIntrinsicsPipeline"]
|
||||
_import_structure["pipeline_marigold_normals"] = ["MarigoldNormalsOutput", "MarigoldNormalsPipeline"]
|
||||
|
||||
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
@@ -36,7 +35,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
else:
|
||||
from .marigold_image_processing import MarigoldImageProcessor
|
||||
from .pipeline_marigold_depth import MarigoldDepthOutput, MarigoldDepthPipeline
|
||||
from .pipeline_marigold_intrinsics import MarigoldIntrinsicsOutput, MarigoldIntrinsicsPipeline
|
||||
from .pipeline_marigold_normals import MarigoldNormalsOutput, MarigoldNormalsPipeline
|
||||
|
||||
else:
|
||||
|
||||
@@ -1,22 +1,4 @@
|
||||
# Copyright 2023-2025 Marigold Team, ETH Zürich. All rights reserved.
|
||||
# Copyright 2024-2025 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# --------------------------------------------------------------------------
|
||||
# More information and citation instructions are available on the
|
||||
# Marigold project website: https://marigoldcomputervision.github.io
|
||||
# --------------------------------------------------------------------------
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import PIL
|
||||
@@ -397,7 +379,7 @@ class MarigoldImageProcessor(ConfigMixin):
|
||||
val_min: float = 0.0,
|
||||
val_max: float = 1.0,
|
||||
color_map: str = "Spectral",
|
||||
) -> List[PIL.Image.Image]:
|
||||
) -> Union[PIL.Image.Image, List[PIL.Image.Image]]:
|
||||
"""
|
||||
Visualizes depth maps, such as predictions of the `MarigoldDepthPipeline`.
|
||||
|
||||
@@ -409,7 +391,7 @@ class MarigoldImageProcessor(ConfigMixin):
|
||||
color_map (`str`, *optional*, defaults to `"Spectral"`): Color map used to convert a single-channel
|
||||
depth prediction into colored representation.
|
||||
|
||||
Returns: `List[PIL.Image.Image]` with depth maps visualization.
|
||||
Returns: `PIL.Image.Image` or `List[PIL.Image.Image]` with depth maps visualization.
|
||||
"""
|
||||
if val_max <= val_min:
|
||||
raise ValueError(f"Invalid values range: [{val_min}, {val_max}].")
|
||||
@@ -454,7 +436,7 @@ class MarigoldImageProcessor(ConfigMixin):
|
||||
depth: Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]],
|
||||
val_min: float = 0.0,
|
||||
val_max: float = 1.0,
|
||||
) -> List[PIL.Image.Image]:
|
||||
) -> Union[PIL.Image.Image, List[PIL.Image.Image]]:
|
||||
def export_depth_to_16bit_png_one(img, idx=None):
|
||||
prefix = "Depth" + (f"[{idx}]" if idx else "")
|
||||
if not isinstance(img, np.ndarray) and not torch.is_tensor(img):
|
||||
@@ -496,7 +478,7 @@ class MarigoldImageProcessor(ConfigMixin):
|
||||
flip_x: bool = False,
|
||||
flip_y: bool = False,
|
||||
flip_z: bool = False,
|
||||
) -> List[PIL.Image.Image]:
|
||||
) -> Union[PIL.Image.Image, List[PIL.Image.Image]]:
|
||||
"""
|
||||
Visualizes surface normals, such as predictions of the `MarigoldNormalsPipeline`.
|
||||
|
||||
@@ -510,7 +492,7 @@ class MarigoldImageProcessor(ConfigMixin):
|
||||
flip_z (`bool`, *optional*, defaults to `False`): Flips the Z axis of the normals frame of reference.
|
||||
Default direction is facing the observer.
|
||||
|
||||
Returns: `List[PIL.Image.Image]` with surface normals visualization.
|
||||
Returns: `PIL.Image.Image` or `List[PIL.Image.Image]` with surface normals visualization.
|
||||
"""
|
||||
flip_vec = None
|
||||
if any((flip_x, flip_y, flip_z)):
|
||||
@@ -546,99 +528,6 @@ class MarigoldImageProcessor(ConfigMixin):
|
||||
else:
|
||||
raise ValueError(f"Unexpected input type: {type(normals)}")
|
||||
|
||||
@staticmethod
|
||||
def visualize_intrinsics(
|
||||
prediction: Union[
|
||||
np.ndarray,
|
||||
torch.Tensor,
|
||||
List[np.ndarray],
|
||||
List[torch.Tensor],
|
||||
],
|
||||
target_properties: Dict[str, Any],
|
||||
color_map: Union[str, Dict[str, str]] = "binary",
|
||||
) -> List[Dict[str, PIL.Image.Image]]:
|
||||
"""
|
||||
Visualizes intrinsic image decomposition, such as predictions of the `MarigoldIntrinsicsPipeline`.
|
||||
|
||||
Args:
|
||||
prediction (`Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]]`):
|
||||
Intrinsic image decomposition.
|
||||
target_properties (`Dict[str, Any]`):
|
||||
Decomposition properties. Expected entries: `target_names: List[str]` and a dictionary with keys
|
||||
`prediction_space: str`, `sub_target_names: List[Union[str, Null]]` (must have 3 entries, null for
|
||||
missing modalities), `up_to_scale: bool`, one for each target and sub-target.
|
||||
color_map (`Union[str, Dict[str, str]]`, *optional*, defaults to `"binary"`):
|
||||
Color map used to convert a single-channel predictions into colored representations. When a dictionary
|
||||
is passed, each modality can be colored with its own color map.
|
||||
|
||||
Returns: `List[Dict[str, PIL.Image.Image]]` with intrinsic image decomposition visualization.
|
||||
"""
|
||||
if "target_names" not in target_properties:
|
||||
raise ValueError("Missing `target_names` in target_properties")
|
||||
if not isinstance(color_map, str) and not (
|
||||
isinstance(color_map, dict)
|
||||
and all(isinstance(k, str) and isinstance(v, str) for k, v in color_map.items())
|
||||
):
|
||||
raise ValueError("`color_map` must be a string or a dictionary of strings")
|
||||
n_targets = len(target_properties["target_names"])
|
||||
|
||||
def visualize_targets_one(images, idx=None):
|
||||
# images: [T, 3, H, W]
|
||||
out = {}
|
||||
for target_name, img in zip(target_properties["target_names"], images):
|
||||
img = img.permute(1, 2, 0) # [H, W, 3]
|
||||
prediction_space = target_properties[target_name].get("prediction_space", "srgb")
|
||||
if prediction_space == "stack":
|
||||
sub_target_names = target_properties[target_name]["sub_target_names"]
|
||||
if len(sub_target_names) != 3 or any(
|
||||
not (isinstance(s, str) or s is None) for s in sub_target_names
|
||||
):
|
||||
raise ValueError(f"Unexpected target sub-names {sub_target_names} in {target_name}")
|
||||
for i, sub_target_name in enumerate(sub_target_names):
|
||||
if sub_target_name is None:
|
||||
continue
|
||||
sub_img = img[:, :, i]
|
||||
sub_prediction_space = target_properties[sub_target_name].get("prediction_space", "srgb")
|
||||
if sub_prediction_space == "linear":
|
||||
sub_up_to_scale = target_properties[sub_target_name].get("up_to_scale", False)
|
||||
if sub_up_to_scale:
|
||||
sub_img = sub_img / max(sub_img.max().item(), 1e-6)
|
||||
sub_img = sub_img ** (1 / 2.2)
|
||||
cmap_name = (
|
||||
color_map if isinstance(color_map, str) else color_map.get(sub_target_name, "binary")
|
||||
)
|
||||
sub_img = MarigoldImageProcessor.colormap(sub_img, cmap=cmap_name, bytes=True)
|
||||
sub_img = PIL.Image.fromarray(sub_img.cpu().numpy())
|
||||
out[sub_target_name] = sub_img
|
||||
elif prediction_space == "linear":
|
||||
up_to_scale = target_properties[target_name].get("up_to_scale", False)
|
||||
if up_to_scale:
|
||||
img = img / max(img.max().item(), 1e-6)
|
||||
img = img ** (1 / 2.2)
|
||||
elif prediction_space == "srgb":
|
||||
pass
|
||||
img = (img * 255).to(dtype=torch.uint8, device="cpu").numpy()
|
||||
img = PIL.Image.fromarray(img)
|
||||
out[target_name] = img
|
||||
return out
|
||||
|
||||
if prediction is None or isinstance(prediction, list) and any(o is None for o in prediction):
|
||||
raise ValueError("Input prediction is `None`")
|
||||
if isinstance(prediction, (np.ndarray, torch.Tensor)):
|
||||
prediction = MarigoldImageProcessor.expand_tensor_or_array(prediction)
|
||||
if isinstance(prediction, np.ndarray):
|
||||
prediction = MarigoldImageProcessor.numpy_to_pt(prediction) # [N*T,3,H,W]
|
||||
if not (prediction.ndim == 4 and prediction.shape[1] == 3 and prediction.shape[0] % n_targets == 0):
|
||||
raise ValueError(f"Unexpected input shape={prediction.shape}, expecting [N*T,3,H,W].")
|
||||
N_T, _, H, W = prediction.shape
|
||||
N = N_T // n_targets
|
||||
prediction = prediction.reshape(N, n_targets, 3, H, W)
|
||||
return [visualize_targets_one(img, idx) for idx, img in enumerate(prediction)]
|
||||
elif isinstance(prediction, list):
|
||||
return [visualize_targets_one(img, idx) for idx, img in enumerate(prediction)]
|
||||
else:
|
||||
raise ValueError(f"Unexpected input type: {type(prediction)}")
|
||||
|
||||
@staticmethod
|
||||
def visualize_uncertainty(
|
||||
uncertainty: Union[
|
||||
@@ -648,10 +537,9 @@ class MarigoldImageProcessor(ConfigMixin):
|
||||
List[torch.Tensor],
|
||||
],
|
||||
saturation_percentile=95,
|
||||
) -> List[PIL.Image.Image]:
|
||||
) -> Union[PIL.Image.Image, List[PIL.Image.Image]]:
|
||||
"""
|
||||
Visualizes dense uncertainties, such as produced by `MarigoldDepthPipeline`, `MarigoldNormalsPipeline`, or
|
||||
`MarigoldIntrinsicsPipeline`.
|
||||
Visualizes dense uncertainties, such as produced by `MarigoldDepthPipeline` or `MarigoldNormalsPipeline`.
|
||||
|
||||
Args:
|
||||
uncertainty (`Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]]`):
|
||||
@@ -659,15 +547,14 @@ class MarigoldImageProcessor(ConfigMixin):
|
||||
saturation_percentile (`int`, *optional*, defaults to `95`):
|
||||
Specifies the percentile uncertainty value visualized with maximum intensity.
|
||||
|
||||
Returns: `List[PIL.Image.Image]` with uncertainty visualization.
|
||||
Returns: `PIL.Image.Image` or `List[PIL.Image.Image]` with uncertainty visualization.
|
||||
"""
|
||||
|
||||
def visualize_uncertainty_one(img, idx=None):
|
||||
prefix = "Uncertainty" + (f"[{idx}]" if idx else "")
|
||||
if img.min() < 0:
|
||||
raise ValueError(f"{prefix}: unexpected data range, min={img.min()}.")
|
||||
img = img.permute(1, 2, 0) # [H,W,C]
|
||||
img = img.squeeze(2).cpu().numpy() # [H,W] or [H,W,3]
|
||||
raise ValueError(f"{prefix}: unexected data range, min={img.min()}.")
|
||||
img = img.squeeze(0).cpu().numpy()
|
||||
saturation_value = np.percentile(img, saturation_percentile)
|
||||
img = np.clip(img * 255 / saturation_value, 0, 255)
|
||||
img = img.astype(np.uint8)
|
||||
@@ -679,9 +566,9 @@ class MarigoldImageProcessor(ConfigMixin):
|
||||
if isinstance(uncertainty, (np.ndarray, torch.Tensor)):
|
||||
uncertainty = MarigoldImageProcessor.expand_tensor_or_array(uncertainty)
|
||||
if isinstance(uncertainty, np.ndarray):
|
||||
uncertainty = MarigoldImageProcessor.numpy_to_pt(uncertainty) # [N,C,H,W]
|
||||
if not (uncertainty.ndim == 4 and uncertainty.shape[1] in (1, 3)):
|
||||
raise ValueError(f"Unexpected input shape={uncertainty.shape}, expecting [N,C,H,W] with C in (1,3).")
|
||||
uncertainty = MarigoldImageProcessor.numpy_to_pt(uncertainty) # [N,1,H,W]
|
||||
if not (uncertainty.ndim == 4 and uncertainty.shape[1] == 1):
|
||||
raise ValueError(f"Unexpected input shape={uncertainty.shape}, expecting [N,1,H,W].")
|
||||
return [visualize_uncertainty_one(img, idx) for idx, img in enumerate(uncertainty)]
|
||||
elif isinstance(uncertainty, list):
|
||||
return [visualize_uncertainty_one(img, idx) for idx, img in enumerate(uncertainty)]
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# Copyright 2023-2025 Marigold Team, ETH Zürich. All rights reserved.
|
||||
# Copyright 2024-2025 The HuggingFace Team. All rights reserved.
|
||||
# Copyright 2024 Marigold authors, PRS ETH Zurich. All rights reserved.
|
||||
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
@@ -14,7 +14,7 @@
|
||||
# limitations under the License.
|
||||
# --------------------------------------------------------------------------
|
||||
# More information and citation instructions are available on the
|
||||
# Marigold project website: https://marigoldcomputervision.github.io
|
||||
# Marigold project website: https://marigoldmonodepth.github.io
|
||||
# --------------------------------------------------------------------------
|
||||
from dataclasses import dataclass
|
||||
from functools import partial
|
||||
@@ -64,7 +64,7 @@ Examples:
|
||||
>>> import torch
|
||||
|
||||
>>> pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
|
||||
... "prs-eth/marigold-depth-v1-1", variant="fp16", torch_dtype=torch.float16
|
||||
... "prs-eth/marigold-depth-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
|
||||
... ).to("cuda")
|
||||
|
||||
>>> image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
|
||||
@@ -86,12 +86,11 @@ class MarigoldDepthOutput(BaseOutput):
|
||||
|
||||
Args:
|
||||
prediction (`np.ndarray`, `torch.Tensor`):
|
||||
Predicted depth maps with values in the range [0, 1]. The shape is $numimages \times 1 \times height \times
|
||||
width$ for `torch.Tensor` or $numimages \times height \times width \times 1$ for `np.ndarray`.
|
||||
Predicted depth maps with values in the range [0, 1]. The shape is always $numimages \times 1 \times height
|
||||
\times width$, regardless of whether the images were passed as a 4D array or a list.
|
||||
uncertainty (`None`, `np.ndarray`, `torch.Tensor`):
|
||||
Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is $numimages
|
||||
\times 1 \times height \times width$ for `torch.Tensor` or $numimages \times height \times width \times 1$
|
||||
for `np.ndarray`.
|
||||
\times 1 \times height \times width$.
|
||||
latent (`None`, `torch.Tensor`):
|
||||
Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
|
||||
The shape is $numimages * numensemble \times 4 \times latentheight \times latentwidth$.
|
||||
@@ -209,11 +208,6 @@ class MarigoldDepthPipeline(DiffusionPipeline):
|
||||
output_type: str,
|
||||
output_uncertainty: bool,
|
||||
) -> int:
|
||||
actual_vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
|
||||
if actual_vae_scale_factor != self.vae_scale_factor:
|
||||
raise ValueError(
|
||||
f"`vae_scale_factor` computed at initialization ({self.vae_scale_factor}) differs from the actual one ({actual_vae_scale_factor})."
|
||||
)
|
||||
if num_inference_steps is None:
|
||||
raise ValueError("`num_inference_steps` is not specified and could not be resolved from the model config.")
|
||||
if num_inference_steps < 1:
|
||||
@@ -326,7 +320,6 @@ class MarigoldDepthPipeline(DiffusionPipeline):
|
||||
|
||||
return num_images
|
||||
|
||||
@torch.compiler.disable
|
||||
def progress_bar(self, iterable=None, total=None, desc=None, leave=True):
|
||||
if not hasattr(self, "_progress_bar_config"):
|
||||
self._progress_bar_config = {}
|
||||
@@ -377,9 +370,11 @@ class MarigoldDepthPipeline(DiffusionPipeline):
|
||||
same width and height.
|
||||
num_inference_steps (`int`, *optional*, defaults to `None`):
|
||||
Number of denoising diffusion steps during inference. The default value `None` results in automatic
|
||||
selection.
|
||||
selection. The number of steps should be at least 10 with the full Marigold models, and between 1 and 4
|
||||
for Marigold-LCM models.
|
||||
ensemble_size (`int`, defaults to `1`):
|
||||
Number of ensemble predictions. Higher values result in measurable improvements and visual degradation.
|
||||
Number of ensemble predictions. Recommended values are 5 and higher for better precision, or 1 for
|
||||
faster inference.
|
||||
processing_resolution (`int`, *optional*, defaults to `None`):
|
||||
Effective processing resolution. When set to `0`, matches the larger input image dimension. This
|
||||
produces crisper predictions, but may also lead to the overall loss of global context. The default
|
||||
@@ -491,7 +486,9 @@ class MarigoldDepthPipeline(DiffusionPipeline):
|
||||
# `pred_latent` variable. The variable `image_latent` is of the same shape: it contains each input image encoded
|
||||
# into latent space and replicated `E` times. The latents can be either generated (see `generator` to ensure
|
||||
# reproducibility), or passed explicitly via the `latents` argument. The latter can be set outside the pipeline
|
||||
# code. This behavior can be achieved by setting the `output_latent` argument to `True`. The latent space
|
||||
# code. For example, in the Marigold-LCM video processing demo, the latents initialization of a frame is taken
|
||||
# as a convex combination of the latents output of the pipeline for the previous frame and a newly-sampled
|
||||
# noise. This behavior can be achieved by setting the `output_latent` argument to `True`. The latent space
|
||||
# dimensions are `(h, w)`. Encoding into latent space happens in batches of size `batch_size`.
|
||||
# Model invocation: self.vae.encoder.
|
||||
image_latent, pred_latent = self.prepare_latents(
|
||||
@@ -736,7 +733,6 @@ class MarigoldDepthPipeline(DiffusionPipeline):
|
||||
param = init_s.cpu().numpy()
|
||||
else:
|
||||
raise ValueError("Unrecognized alignment.")
|
||||
param = param.astype(np.float64)
|
||||
|
||||
return param
|
||||
|
||||
@@ -779,7 +775,7 @@ class MarigoldDepthPipeline(DiffusionPipeline):
|
||||
|
||||
if regularizer_strength > 0:
|
||||
prediction, _ = ensemble(depth_aligned, return_uncertainty=False)
|
||||
err_near = prediction.min().abs().item()
|
||||
err_near = (0.0 - prediction.min()).abs().item()
|
||||
err_far = (1.0 - prediction.max()).abs().item()
|
||||
cost += (err_near + err_far) * regularizer_strength
|
||||
|
||||
|
||||
@@ -1,721 +0,0 @@
|
||||
# Copyright 2023-2025 Marigold Team, ETH Zürich. All rights reserved.
|
||||
# Copyright 2024-2025 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# --------------------------------------------------------------------------
|
||||
# More information and citation instructions are available on the
|
||||
# Marigold project website: https://marigoldcomputervision.github.io
|
||||
# --------------------------------------------------------------------------
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from PIL import Image
|
||||
from tqdm.auto import tqdm
|
||||
from transformers import CLIPTextModel, CLIPTokenizer
|
||||
|
||||
from ...image_processor import PipelineImageInput
|
||||
from ...models import (
|
||||
AutoencoderKL,
|
||||
UNet2DConditionModel,
|
||||
)
|
||||
from ...schedulers import (
|
||||
DDIMScheduler,
|
||||
LCMScheduler,
|
||||
)
|
||||
from ...utils import (
|
||||
BaseOutput,
|
||||
is_torch_xla_available,
|
||||
logging,
|
||||
replace_example_docstring,
|
||||
)
|
||||
from ...utils.torch_utils import randn_tensor
|
||||
from ..pipeline_utils import DiffusionPipeline
|
||||
from .marigold_image_processing import MarigoldImageProcessor
|
||||
|
||||
|
||||
if is_torch_xla_available():
|
||||
import torch_xla.core.xla_model as xm
|
||||
|
||||
XLA_AVAILABLE = True
|
||||
else:
|
||||
XLA_AVAILABLE = False
|
||||
|
||||
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
|
||||
EXAMPLE_DOC_STRING = """
|
||||
Examples:
|
||||
```py
|
||||
>>> import diffusers
|
||||
>>> import torch
|
||||
|
||||
>>> pipe = diffusers.MarigoldIntrinsicsPipeline.from_pretrained(
|
||||
... "prs-eth/marigold-iid-appearance-v1-1", variant="fp16", torch_dtype=torch.float16
|
||||
... ).to("cuda")
|
||||
|
||||
>>> image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
|
||||
>>> intrinsics = pipe(image)
|
||||
|
||||
>>> vis = pipe.image_processor.visualize_intrinsics(intrinsics.prediction, pipe.target_properties)
|
||||
>>> vis[0]["albedo"].save("einstein_albedo.png")
|
||||
>>> vis[0]["roughness"].save("einstein_roughness.png")
|
||||
>>> vis[0]["metallicity"].save("einstein_metallicity.png")
|
||||
```
|
||||
```py
|
||||
>>> import diffusers
|
||||
>>> import torch
|
||||
|
||||
>>> pipe = diffusers.MarigoldIntrinsicsPipeline.from_pretrained(
|
||||
... "prs-eth/marigold-iid-lighting-v1-1", variant="fp16", torch_dtype=torch.float16
|
||||
... ).to("cuda")
|
||||
|
||||
>>> image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
|
||||
>>> intrinsics = pipe(image)
|
||||
|
||||
>>> vis = pipe.image_processor.visualize_intrinsics(intrinsics.prediction, pipe.target_properties)
|
||||
>>> vis[0]["albedo"].save("einstein_albedo.png")
|
||||
>>> vis[0]["shading"].save("einstein_shading.png")
|
||||
>>> vis[0]["residual"].save("einstein_residual.png")
|
||||
```
|
||||
"""
|
||||
|
||||
|
||||
@dataclass
|
||||
class MarigoldIntrinsicsOutput(BaseOutput):
|
||||
"""
|
||||
Output class for Marigold Intrinsic Image Decomposition pipeline.
|
||||
|
||||
Args:
|
||||
prediction (`np.ndarray`, `torch.Tensor`):
|
||||
Predicted image intrinsics with values in the range [0, 1]. The shape is $(numimages * numtargets) \times 3
|
||||
\times height \times width$ for `torch.Tensor` or $(numimages * numtargets) \times height \times width
|
||||
\times 3$ for `np.ndarray`, where `numtargets` corresponds to the number of predicted target modalities of
|
||||
the intrinsic image decomposition.
|
||||
uncertainty (`None`, `np.ndarray`, `torch.Tensor`):
|
||||
Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is $(numimages *
|
||||
numtargets) \times 3 \times height \times width$ for `torch.Tensor` or $(numimages * numtargets) \times
|
||||
height \times width \times 3$ for `np.ndarray`.
|
||||
latent (`None`, `torch.Tensor`):
|
||||
Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
|
||||
The shape is $(numimages * numensemble) \times (numtargets * 4) \times latentheight \times latentwidth$.
|
||||
"""
|
||||
|
||||
prediction: Union[np.ndarray, torch.Tensor]
|
||||
uncertainty: Union[None, np.ndarray, torch.Tensor]
|
||||
latent: Union[None, torch.Tensor]
|
||||
|
||||
|
||||
class MarigoldIntrinsicsPipeline(DiffusionPipeline):
|
||||
"""
|
||||
Pipeline for Intrinsic Image Decomposition (IID) using the Marigold method:
|
||||
https://marigoldcomputervision.github.io.
|
||||
|
||||
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
|
||||
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
|
||||
|
||||
Args:
|
||||
unet (`UNet2DConditionModel`):
|
||||
Conditional U-Net to denoise the targets latent, conditioned on image latent.
|
||||
vae (`AutoencoderKL`):
|
||||
Variational Auto-Encoder (VAE) Model to encode and decode images and predictions to and from latent
|
||||
representations.
|
||||
scheduler (`DDIMScheduler` or `LCMScheduler`):
|
||||
A scheduler to be used in combination with `unet` to denoise the encoded image latents.
|
||||
text_encoder (`CLIPTextModel`):
|
||||
Text-encoder, for empty text embedding.
|
||||
tokenizer (`CLIPTokenizer`):
|
||||
CLIP tokenizer.
|
||||
prediction_type (`str`, *optional*):
|
||||
Type of predictions made by the model.
|
||||
target_properties (`Dict[str, Any]`, *optional*):
|
||||
Properties of the predicted modalities, such as `target_names`, a `List[str]` used to define the number,
|
||||
order and names of the predicted modalities, and any other metadata that may be required to interpret the
|
||||
predictions.
|
||||
default_denoising_steps (`int`, *optional*):
|
||||
The minimum number of denoising diffusion steps that are required to produce a prediction of reasonable
|
||||
quality with the given model. This value must be set in the model config. When the pipeline is called
|
||||
without explicitly setting `num_inference_steps`, the default value is used. This is required to ensure
|
||||
reasonable results with various model flavors compatible with the pipeline, such as those relying on very
|
||||
short denoising schedules (`LCMScheduler`) and those with full diffusion schedules (`DDIMScheduler`).
|
||||
default_processing_resolution (`int`, *optional*):
|
||||
The recommended value of the `processing_resolution` parameter of the pipeline. This value must be set in
|
||||
the model config. When the pipeline is called without explicitly setting `processing_resolution`, the
|
||||
default value is used. This is required to ensure reasonable results with various model flavors trained
|
||||
with varying optimal processing resolution values.
|
||||
"""
|
||||
|
||||
model_cpu_offload_seq = "text_encoder->unet->vae"
|
||||
supported_prediction_types = ("intrinsics",)
|
||||
|
||||
def __init__(
    self,
    unet: UNet2DConditionModel,
    vae: AutoencoderKL,
    scheduler: Union[DDIMScheduler, LCMScheduler],
    text_encoder: CLIPTextModel,
    tokenizer: CLIPTokenizer,
    prediction_type: Optional[str] = None,
    target_properties: Optional[Dict[str, Any]] = None,
    default_denoising_steps: Optional[int] = None,
    default_processing_resolution: Optional[int] = None,
):
    """
    Wire up the pipeline components and remember the model-specific defaults.

    The five model components are registered as pipeline modules; the remaining four arguments are stored in
    the pipeline config and mirrored as plain instance attributes. An unrecognized `prediction_type` only
    triggers a warning and does not abort construction.
    """
    super().__init__()

    is_supported_type = prediction_type in self.supported_prediction_types
    if not is_supported_type:
        logger.warning(
            f"Potentially unsupported `prediction_type='{prediction_type}'`; values supported by the pipeline: "
            f"{self.supported_prediction_types}."
        )

    modules = dict(
        unet=unet,
        vae=vae,
        scheduler=scheduler,
        text_encoder=text_encoder,
        tokenizer=tokenizer,
    )
    self.register_modules(**modules)

    config_entries = dict(
        prediction_type=prediction_type,
        target_properties=target_properties,
        default_denoising_steps=default_denoising_steps,
        default_processing_resolution=default_processing_resolution,
    )
    self.register_to_config(**config_entries)

    # Derive the latent downscaling factor from the VAE depth; fall back to 8 when no VAE is attached.
    if getattr(self, "vae", None):
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
    else:
        self.vae_scale_factor = 8

    self.target_properties = target_properties
    self.default_denoising_steps = default_denoising_steps
    self.default_processing_resolution = default_processing_resolution

    # Filled in on the first pipeline call and reused afterwards.
    self.empty_text_embedding = None

    self.image_processor = MarigoldImageProcessor(vae_scale_factor=self.vae_scale_factor)
|
||||
|
||||
@property
def n_targets(self):
    """Number of predicted targets: UNet output channels divided (integer division) by VAE latent channels."""
    unet_out_channels = self.unet.config.out_channels
    latent_channels = self.vae.config.latent_channels
    return unet_out_channels // latent_channels
|
||||
|
||||
def check_inputs(
    self,
    image: PipelineImageInput,
    num_inference_steps: int,
    ensemble_size: int,
    processing_resolution: int,
    resample_method_input: str,
    resample_method_output: str,
    batch_size: int,
    ensembling_kwargs: Optional[Dict[str, Any]],
    latents: Optional[torch.Tensor],
    generator: Optional[Union[torch.Generator, List[torch.Generator]]],
    output_type: str,
    output_uncertainty: bool,
) -> int:
    """
    Validate the arguments of a pipeline call and count the input images.

    The checks run in a fixed order, so the first violated constraint determines the raised error. Scalar
    arguments are validated first, then the images (type, rank, and matching width/height), then the optional
    `latents` tensor (rank and expected shape), and finally the optional `generator`.

    Returns:
        `int`: The total number of input images. A four-dimensional array or tensor contributes its leading
        dimension; every other supported input contributes one.

    Raises:
        `ValueError`: If any argument violates the constraints checked below.
    """
    # Guard against the VAE having changed since `vae_scale_factor` was computed at initialization.
    actual_vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
    if actual_vae_scale_factor != self.vae_scale_factor:
        raise ValueError(
            f"`vae_scale_factor` computed at initialization ({self.vae_scale_factor}) differs from the actual one ({actual_vae_scale_factor})."
        )
    if num_inference_steps is None:
        raise ValueError("`num_inference_steps` is not specified and could not be resolved from the model config.")
    if num_inference_steps < 1:
        raise ValueError("`num_inference_steps` must be positive.")
    if ensemble_size < 1:
        raise ValueError("`ensemble_size` must be positive.")
    # An ensemble of two is allowed, but only warned about.
    if ensemble_size == 2:
        logger.warning(
            "`ensemble_size` == 2 results are similar to no ensembling (1); "
            "consider increasing the value to at least 3."
        )
    if ensemble_size == 1 and output_uncertainty:
        raise ValueError(
            "Computing uncertainty by setting `output_uncertainty=True` also requires setting `ensemble_size` "
            "greater than 1."
        )
    if processing_resolution is None:
        raise ValueError(
            "`processing_resolution` is not specified and could not be resolved from the model config."
        )
    if processing_resolution < 0:
        raise ValueError(
            "`processing_resolution` must be non-negative: 0 for native resolution, or any positive value for "
            "downsampled processing."
        )
    if processing_resolution % self.vae_scale_factor != 0:
        raise ValueError(f"`processing_resolution` must be a multiple of {self.vae_scale_factor}.")
    if resample_method_input not in ("nearest", "nearest-exact", "bilinear", "bicubic", "area"):
        raise ValueError(
            "`resample_method_input` takes string values compatible with PIL library: "
            "nearest, nearest-exact, bilinear, bicubic, area."
        )
    if resample_method_output not in ("nearest", "nearest-exact", "bilinear", "bicubic", "area"):
        raise ValueError(
            "`resample_method_output` takes string values compatible with PIL library: "
            "nearest, nearest-exact, bilinear, bicubic, area."
        )
    if batch_size < 1:
        raise ValueError("`batch_size` must be positive.")
    if output_type not in ["pt", "np"]:
        raise ValueError("`output_type` must be one of `pt` or `np`.")
    # Explicit latents and a random generator are mutually exclusive.
    if latents is not None and generator is not None:
        raise ValueError("`latents` and `generator` cannot be used together.")
    if ensembling_kwargs is not None:
        if not isinstance(ensembling_kwargs, dict):
            raise ValueError("`ensembling_kwargs` must be a dictionary.")
        if "reduction" in ensembling_kwargs and ensembling_kwargs["reduction"] not in ("median", "mean"):
            raise ValueError("`ensembling_kwargs['reduction']` can be either `'median'` or `'mean'`.")

    # image checks
    num_images = 0
    W, H = None, None
    # A single image is treated as a one-element list.
    if not isinstance(image, list):
        image = [image]
    for i, img in enumerate(image):
        if isinstance(img, np.ndarray) or torch.is_tensor(img):
            if img.ndim not in (2, 3, 4):
                raise ValueError(f"`image[{i}]` has unsupported dimensions or shape: {img.shape}.")
            # The last two dimensions are read as (height, width).
            H_i, W_i = img.shape[-2:]
            N_i = 1
            # A 4D input is a batch; its leading dimension is the number of images.
            if img.ndim == 4:
                N_i = img.shape[0]
        elif isinstance(img, Image.Image):
            W_i, H_i = img.size
            N_i = 1
        else:
            raise ValueError(f"Unsupported `image[{i}]` type: {type(img)}.")
        # The first image fixes the reference size; every later image must match it.
        if W is None:
            W, H = W_i, H_i
        elif (W, H) != (W_i, H_i):
            raise ValueError(
                f"Input `image[{i}]` has incompatible dimensions {(W_i, H_i)} with the previous images {(W, H)}"
            )
        num_images += N_i

    # latents checks
    if latents is not None:
        if not torch.is_tensor(latents):
            raise ValueError("`latents` must be a torch.Tensor.")
        if latents.dim() != 4:
            raise ValueError(f"`latents` has unsupported dimensions or shape: {latents.shape}.")

        # Compute the processed image size: the longer side is scaled to `processing_resolution`.
        if processing_resolution > 0:
            max_orig = max(H, W)
            new_H = H * processing_resolution // max_orig
            new_W = W * processing_resolution // max_orig
            if new_H == 0 or new_W == 0:
                raise ValueError(f"Extreme aspect ratio of the input image: [{W} x {H}]")
            W, H = new_W, new_H
        # Latent spatial size: ceil-division of the processed size by the VAE scale factor.
        w = (W + self.vae_scale_factor - 1) // self.vae_scale_factor
        h = (H + self.vae_scale_factor - 1) // self.vae_scale_factor
        shape_expected = (num_images * ensemble_size, self.unet.config.out_channels, h, w)

        if latents.shape != shape_expected:
            raise ValueError(f"`latents` has unexpected shape={latents.shape} expected={shape_expected}.")

    # generator checks
    if generator is not None:
        if isinstance(generator, list):
            # One generator per ensemble member per image, all on the same device type.
            if len(generator) != num_images * ensemble_size:
                raise ValueError(
                    "The number of generators must match the total number of ensemble members for all input images."
                )
            if not all(g.device.type == generator[0].device.type for g in generator):
                raise ValueError("`generator` device placement is not consistent in the list.")
        elif not isinstance(generator, torch.Generator):
            raise ValueError(f"Unsupported generator type: {type(generator)}.")

    return num_images
|
||||
|
||||
@torch.compiler.disable
|
||||
def progress_bar(self, iterable=None, total=None, desc=None, leave=True):
|
||||
if not hasattr(self, "_progress_bar_config"):
|
||||
self._progress_bar_config = {}
|
||||
elif not isinstance(self._progress_bar_config, dict):
|
||||
raise ValueError(
|
||||
f"`self._progress_bar_config` should be of type `dict`, but is {type(self._progress_bar_config)}."
|
||||
)
|
||||
|
||||
progress_bar_config = dict(**self._progress_bar_config)
|
||||
progress_bar_config["desc"] = progress_bar_config.get("desc", desc)
|
||||
progress_bar_config["leave"] = progress_bar_config.get("leave", leave)
|
||||
if iterable is not None:
|
||||
return tqdm(iterable, **progress_bar_config)
|
||||
elif total is not None:
|
||||
return tqdm(total=total, **progress_bar_config)
|
||||
else:
|
||||
raise ValueError("Either `total` or `iterable` has to be defined.")
|
||||
|
||||
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
    self,
    image: PipelineImageInput,
    num_inference_steps: Optional[int] = None,
    ensemble_size: int = 1,
    processing_resolution: Optional[int] = None,
    match_input_resolution: bool = True,
    resample_method_input: str = "bilinear",
    resample_method_output: str = "bilinear",
    batch_size: int = 1,
    ensembling_kwargs: Optional[Dict[str, Any]] = None,
    latents: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
    output_type: str = "np",
    output_uncertainty: bool = False,
    output_latent: bool = False,
    return_dict: bool = True,
):
    """
    Function invoked when calling the pipeline.

    Args:
        image (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`,
            `List[torch.Tensor]`): An input image or images used as an input for the intrinsic decomposition task.
            For arrays and tensors, the expected value range is between `[0, 1]`. Passing a batch of images is
            possible by providing a four-dimensional array or a tensor. Additionally, a list of images of two- or
            three-dimensional arrays or tensors can be passed. In the latter case, all list elements must have the
            same width and height.
        num_inference_steps (`int`, *optional*, defaults to `None`):
            Number of denoising diffusion steps during inference. The default value `None` results in automatic
            selection.
        ensemble_size (`int`, defaults to `1`):
            Number of ensemble predictions. Higher values result in measurable improvements and visual degradation.
        processing_resolution (`int`, *optional*, defaults to `None`):
            Effective processing resolution. When set to `0`, matches the larger input image dimension. This
            produces crisper predictions, but may also lead to the overall loss of global context. The default
            value `None` resolves to the optimal value from the model config.
        match_input_resolution (`bool`, *optional*, defaults to `True`):
            When enabled, the output prediction is resized to match the input dimensions. When disabled, the longer
            side of the output will be equal to `processing_resolution`.
        resample_method_input (`str`, *optional*, defaults to `"bilinear"`):
            Resampling method used to resize input images to `processing_resolution`. The accepted values are:
            `"nearest"`, `"nearest-exact"`, `"bilinear"`, `"bicubic"`, or `"area"`.
        resample_method_output (`str`, *optional*, defaults to `"bilinear"`):
            Resampling method used to resize output predictions to match the input resolution. The accepted values
            are `"nearest"`, `"nearest-exact"`, `"bilinear"`, `"bicubic"`, or `"area"`.
        batch_size (`int`, *optional*, defaults to `1`):
            Batch size; only matters when setting `ensemble_size` or passing a tensor of images.
        ensembling_kwargs (`dict`, *optional*, defaults to `None`):
            Extra dictionary with arguments for precise ensembling control. The following options are available:
            - reduction (`str`, *optional*, defaults to `"median"`): Defines the ensembling function applied in
              every pixel location, can be either `"median"` or `"mean"`.
        latents (`torch.Tensor`, *optional*, defaults to `None`):
            Latent noise tensors to replace the random initialization. These can be taken from the previous
            function call's output. Cannot be combined with `generator`.
        generator (`torch.Generator`, or `List[torch.Generator]`, *optional*, defaults to `None`):
            Random number generator object to ensure reproducibility.
        output_type (`str`, *optional*, defaults to `"np"`):
            Preferred format of the output's `prediction` and the optional `uncertainty` fields. The accepted
            values are: `"np"` (numpy array) or `"pt"` (torch tensor).
        output_uncertainty (`bool`, *optional*, defaults to `False`):
            When enabled, the output's `uncertainty` field contains the predictive uncertainty map. This requires
            the `ensemble_size` argument to be set to a value greater than 1.
        output_latent (`bool`, *optional*, defaults to `False`):
            When enabled, the output's `latent` field contains the latent codes corresponding to the predictions
            within the ensemble. These codes can be saved, modified, and used for subsequent calls with the
            `latents` argument.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether or not to return a [`~pipelines.marigold.MarigoldIntrinsicsOutput`] instead of a plain tuple.

    Examples:

    Returns:
        [`~pipelines.marigold.MarigoldIntrinsicsOutput`] or `tuple`:
            If `return_dict` is `True`, [`~pipelines.marigold.MarigoldIntrinsicsOutput`] is returned, otherwise a
            `tuple` is returned where the first element is the prediction, the second element is the uncertainty
            (or `None`), and the third is the latent (or `None`).
    """

    # 0. Resolving variables.
    device = self._execution_device
    dtype = self.dtype

    # Model-specific optimal default values leading to fast and reasonable results.
    if num_inference_steps is None:
        num_inference_steps = self.default_denoising_steps
    if processing_resolution is None:
        processing_resolution = self.default_processing_resolution

    # 1. Check inputs.
    num_images = self.check_inputs(
        image,
        num_inference_steps,
        ensemble_size,
        processing_resolution,
        resample_method_input,
        resample_method_output,
        batch_size,
        ensembling_kwargs,
        latents,
        generator,
        output_type,
        output_uncertainty,
    )

    # 2. Prepare empty text conditioning. The embedding is computed on the first call only and cached on the
    # pipeline instance for subsequent calls.
    # Model invocation: self.tokenizer, self.text_encoder.
    if self.empty_text_embedding is None:
        prompt = ""
        text_inputs = self.tokenizer(
            prompt,
            padding="do_not_pad",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )
        text_input_ids = text_inputs.input_ids.to(device)
        self.empty_text_embedding = self.text_encoder(text_input_ids)[0]  # [1,2,1024]

    # 3. Preprocess input images. This function loads input image or images of compatible dimensions `(H, W)`,
    # optionally downsamples them to the `processing_resolution` `(PH, PW)`, where
    # `max(PH, PW) == processing_resolution`, and pads the dimensions to `(PPH, PPW)` such that these values are
    # divisible by the latent space downscaling factor (typically 8 in Stable Diffusion). The default value `None`
    # of `processing_resolution` resolves to the optimal value from the model config. It is a recommended mode of
    # operation and leads to the most reasonable results. Using the native image resolution or any other processing
    # resolution can lead to loss of either fine details or global context in the output predictions.
    image, padding, original_resolution = self.image_processor.preprocess(
        image, processing_resolution, resample_method_input, device, dtype
    )  # [N,3,PPH,PPW]

    # 4. Encode input image into latent space. At this step, each of the `N` input images is represented with `E`
    # ensemble members. Each ensemble member is an independent diffused prediction, just initialized independently.
    # Latents of each such predictions across all input images and all ensemble members are represented in the
    # `pred_latent` variable. The variable `image_latent` contains each input image encoded into latent space and
    # replicated `E` times. The variable `pred_latent` contains latents initialization, where the latent space is
    # replicated `T` times relative to the single latent space of `image_latent`, where `T` is the number of the
    # predicted targets. The latents can be either generated (see `generator` to ensure reproducibility), or passed
    # explicitly via the `latents` argument. The latter can be set outside the pipeline code. This behavior can be
    # achieved by setting the `output_latent` argument to `True`. The latent space dimensions are `(h, w)`. Encoding
    # into latent space happens in batches of size `batch_size`.
    # Model invocation: self.vae.encoder.
    image_latent, pred_latent = self.prepare_latents(
        image, latents, generator, ensemble_size, batch_size
    )  # [N*E,4,h,w], [N*E,T*4,h,w]

    del image

    batch_empty_text_embedding = self.empty_text_embedding.to(device=device, dtype=dtype).repeat(
        batch_size, 1, 1
    )  # [B,2,1024]

    # 5. Process the denoising loop. All `N * E` latents are processed sequentially in batches of size `batch_size`.
    # The unet model takes concatenated latent spaces of the input image and the predicted modality as an input, and
    # outputs noise for the predicted modality's latent space. The number of denoising diffusion steps is defined by
    # `num_inference_steps`. It is either set directly, or resolves to the optimal value specific to the loaded
    # model.
    # Model invocation: self.unet.
    pred_latents = []

    for i in self.progress_bar(
        range(0, num_images * ensemble_size, batch_size), leave=True, desc="Marigold predictions..."
    ):
        batch_image_latent = image_latent[i : i + batch_size]  # [B,4,h,w]
        batch_pred_latent = pred_latent[i : i + batch_size]  # [B,T*4,h,w]
        # The last batch may be smaller than `batch_size`; trim the text conditioning to match.
        effective_batch_size = batch_image_latent.shape[0]
        text = batch_empty_text_embedding[:effective_batch_size]  # [B,2,1024]

        # The timestep schedule is re-initialized for every batch.
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        for t in self.progress_bar(self.scheduler.timesteps, leave=False, desc="Diffusion steps..."):
            batch_latent = torch.cat([batch_image_latent, batch_pred_latent], dim=1)  # [B,(1+T)*4,h,w]
            noise = self.unet(batch_latent, t, encoder_hidden_states=text, return_dict=False)[0]  # [B,T*4,h,w]
            batch_pred_latent = self.scheduler.step(
                noise, t, batch_pred_latent, generator=generator
            ).prev_sample  # [B,T*4,h,w]

            if XLA_AVAILABLE:
                xm.mark_step()

        pred_latents.append(batch_pred_latent)

    pred_latent = torch.cat(pred_latents, dim=0)  # [N*E,T*4,h,w]

    # Drop references to the intermediate tensors of the denoising loop.
    del (
        pred_latents,
        image_latent,
        batch_empty_text_embedding,
        batch_image_latent,
        batch_pred_latent,
        text,
        batch_latent,
        noise,
    )

    # 6. Decode predictions from latent into pixel space. The resulting `N * E` predictions have shape `(PPH, PPW)`,
    # which requires slight postprocessing. Decoding into pixel space happens in batches of size `batch_size`.
    # Model invocation: self.vae.decoder.
    pred_latent_for_decoding = pred_latent.reshape(
        num_images * ensemble_size * self.n_targets, self.vae.config.latent_channels, *pred_latent.shape[2:]
    )  # [N*E*T,4,h,w]
    prediction = torch.cat(
        [
            self.decode_prediction(pred_latent_for_decoding[i : i + batch_size])
            for i in range(0, pred_latent_for_decoding.shape[0], batch_size)
        ],
        dim=0,
    )  # [N*E*T,3,PPH,PPW]

    del pred_latent_for_decoding
    # The latents are only kept for the output when explicitly requested.
    if not output_latent:
        pred_latent = None

    # 7. Remove padding. The output shape is (PH, PW).
    prediction = self.image_processor.unpad_image(prediction, padding)  # [N*E*T,3,PH,PW]

    # 8. Ensemble and compute uncertainty (when `output_uncertainty` is set). This code treats each of the `N*T`
    # groups of `E` ensemble predictions independently. For each group it computes an ensembled prediction of shape
    # `(PH, PW)` and an optional uncertainty map of the same dimensions. After computing this pair of outputs for
    # each group independently, it stacks them respectively into batches of `N*T` almost final predictions and
    # uncertainty maps.
    uncertainty = None
    if ensemble_size > 1:
        prediction = prediction.reshape(
            num_images, ensemble_size, self.n_targets, *prediction.shape[1:]
        )  # [N,E,T,3,PH,PW]
        prediction = [
            self.ensemble_intrinsics(prediction[i], output_uncertainty, **(ensembling_kwargs or {}))
            for i in range(num_images)
        ]  # [ [[T,3,PH,PW], [T,3,PH,PW]], ... ]
        prediction, uncertainty = zip(*prediction)  # [[T,3,PH,PW], ... ], [[T,3,PH,PW], ... ]
        prediction = torch.cat(prediction, dim=0)  # [N*T,3,PH,PW]
        if output_uncertainty:
            uncertainty = torch.cat(uncertainty, dim=0)  # [N*T,3,PH,PW]
        else:
            uncertainty = None

    # 9. If `match_input_resolution` is set, the output prediction and the uncertainty are upsampled to match the
    # input resolution `(H, W)`. This step may introduce upsampling artifacts, and therefore can be disabled.
    # Depending on the downstream use-case, upsampling can be also chosen based on the tolerated artifacts by
    # setting the `resample_method_output` parameter (e.g., to `"nearest"`).
    if match_input_resolution:
        prediction = self.image_processor.resize_antialias(
            prediction, original_resolution, resample_method_output, is_aa=False
        )  # [N*T,3,H,W]
        if uncertainty is not None and output_uncertainty:
            uncertainty = self.image_processor.resize_antialias(
                uncertainty, original_resolution, resample_method_output, is_aa=False
            )  # [N*T,3,H,W]

    # 10. Prepare the final outputs.
    if output_type == "np":
        prediction = self.image_processor.pt_to_numpy(prediction)  # [N*T,H,W,3]
        if uncertainty is not None and output_uncertainty:
            uncertainty = self.image_processor.pt_to_numpy(uncertainty)  # [N*T,H,W,3]

    # 11. Offload all models
    self.maybe_free_model_hooks()

    if not return_dict:
        return (prediction, uncertainty, pred_latent)

    return MarigoldIntrinsicsOutput(
        prediction=prediction,
        uncertainty=uncertainty,
        latent=pred_latent,
    )
|
||||
|
||||
def prepare_latents(
    self,
    image: torch.Tensor,
    latents: Optional[torch.Tensor],
    generator: Optional[torch.Generator],
    ensemble_size: int,
    batch_size: int,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Encode the input images with the VAE and provide the initial prediction latents.

    The images are encoded in chunks of `batch_size`, multiplied by the VAE scaling factor, and each encoded
    image is repeated `ensemble_size` times along the batch dimension. When `latents` is `None`, the prediction
    latents are sampled with `randn_tensor` using `n_targets` times the channel count of the image latents;
    otherwise `latents` is returned unchanged.

    Returns:
        A tuple `(image_latent, pred_latent)`.
    """

    def retrieve_latents(encoder_output):
        # Prefer the mode of the latent distribution; otherwise use ready-made latents.
        if hasattr(encoder_output, "latent_dist"):
            return encoder_output.latent_dist.mode()
        if hasattr(encoder_output, "latents"):
            return encoder_output.latents
        raise AttributeError("Could not access latents of provided encoder_output")

    encoded_chunks = []
    for start in range(0, image.shape[0], batch_size):
        chunk = image[start : start + batch_size]
        encoded_chunks.append(retrieve_latents(self.vae.encode(chunk)))
    image_latent = torch.cat(encoded_chunks, dim=0)  # [N,4,h,w]

    scaled_latent = image_latent * self.vae.config.scaling_factor
    image_latent = scaled_latent.repeat_interleave(ensemble_size, dim=0)  # [N*E,4,h,w]
    num_members, channels, latent_h, latent_w = image_latent.shape

    if latents is not None:
        return image_latent, latents

    pred_latent = randn_tensor(
        (num_members, self.n_targets * channels, latent_h, latent_w),
        generator=generator,
        device=image_latent.device,
        dtype=image_latent.dtype,
    )  # [N*E,T*4,h,w]
    return image_latent, pred_latent
|
||||
|
||||
def decode_prediction(self, pred_latent: torch.Tensor) -> torch.Tensor:
    """
    Decode a batch of prediction latents into pixel space and map the result to `[0, 1]`.

    The latents are divided by the VAE scaling factor before decoding; the decoded values are clipped to
    `[-1, 1]` and then shifted and scaled to `[0, 1]`.

    Raises:
        `ValueError`: If `pred_latent` is not 4D or its channel count differs from the VAE latent channels.
    """
    latent_channels = self.vae.config.latent_channels
    shape_ok = pred_latent.dim() == 4 and pred_latent.shape[1] == latent_channels
    if not shape_ok:
        raise ValueError(
            f"Expecting 4D tensor of shape [B,{latent_channels},H,W]; got {pred_latent.shape}."
        )

    unscaled_latent = pred_latent / self.vae.config.scaling_factor
    decoded = self.vae.decode(unscaled_latent, return_dict=False)[0]  # [B,3,H,W]

    clipped = torch.clip(decoded, -1.0, 1.0)  # [B,3,H,W]
    return (clipped + 1.0) / 2.0  # [B,3,H,W]
|
||||
|
||||
@staticmethod
def ensemble_intrinsics(
    targets: torch.Tensor,
    output_uncertainty: bool = False,
    reduction: str = "median",
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
    """
    Ensembles the intrinsic decomposition represented by the `targets` tensor with expected shape `(B, T, 3, H,
    W)`, where B is the number of ensemble members for a given prediction of size `(H x W)`, and T is the number of
    predicted targets.

    Args:
        targets (`torch.Tensor`):
            Input ensemble of intrinsic image decomposition maps; a 5D tensor whose third dimension is 3.
        output_uncertainty (`bool`, *optional*, defaults to `False`):
            Whether to output uncertainty map.
        reduction (`str`, *optional*, defaults to `"median"`):
            Reduction method used to ensemble the predictions. The accepted values are: `"median"` and
            `"mean"`. With `"mean"`, the uncertainty is the standard deviation across ensemble members; with
            `"median"`, it is the median absolute deviation from the median.

    Returns:
        A tensor of ensembled intrinsic decomposition maps with shape `(T, 3, H, W)` and optionally a tensor of
        uncertainties of shape `(T, 3, H, W)` (`None` when `output_uncertainty` is `False`).

    Raises:
        `ValueError`: If `targets` is not a 5D tensor with 3 channels in its third dimension, or if `reduction`
        is not one of the accepted values.
    """
    if targets.dim() != 5 or targets.shape[2] != 3:
        raise ValueError(f"Expecting 5D tensor of shape [B,T,3,H,W]; got {targets.shape}.")
    if reduction not in ("median", "mean"):
        raise ValueError(f"Unrecognized reduction method: {reduction}.")

    uncertainty = None
    if reduction == "mean":
        prediction = torch.mean(targets, dim=0)  # [T,3,H,W]
        if output_uncertainty:
            uncertainty = torch.std(targets, dim=0)  # [T,3,H,W]
    else:
        # `reduction` was validated above, so this branch is the "median" case.
        # Keep the ensemble dimension so the median broadcasts against `targets` below.
        prediction = torch.median(targets, dim=0, keepdim=True).values  # [1,T,3,H,W]
        if output_uncertainty:
            uncertainty = torch.abs(targets - prediction)  # [B,T,3,H,W]
            uncertainty = torch.median(uncertainty, dim=0).values  # [T,3,H,W]
        prediction = prediction.squeeze(0)  # [T,3,H,W]
    return prediction, uncertainty
|
||||
@@ -1,5 +1,5 @@
|
||||
# Copyright 2023-2025 Marigold Team, ETH Zürich. All rights reserved.
|
||||
# Copyright 2024-2025 The HuggingFace Team. All rights reserved.
|
||||
# Copyright 2024 Marigold authors, PRS ETH Zurich. All rights reserved.
|
||||
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
@@ -14,7 +14,7 @@
|
||||
# limitations under the License.
|
||||
# --------------------------------------------------------------------------
|
||||
# More information and citation instructions are available on the
|
||||
# Marigold project website: https://marigoldcomputervision.github.io
|
||||
# Marigold project website: https://marigoldmonodepth.github.io
|
||||
# --------------------------------------------------------------------------
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
@@ -62,7 +62,7 @@ Examples:
|
||||
>>> import torch
|
||||
|
||||
>>> pipe = diffusers.MarigoldNormalsPipeline.from_pretrained(
|
||||
... "prs-eth/marigold-normals-v1-1", variant="fp16", torch_dtype=torch.float16
|
||||
... "prs-eth/marigold-normals-lcm-v0-1", variant="fp16", torch_dtype=torch.float16
|
||||
... ).to("cuda")
|
||||
|
||||
>>> image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
|
||||
@@ -81,12 +81,11 @@ class MarigoldNormalsOutput(BaseOutput):
|
||||
|
||||
Args:
|
||||
prediction (`np.ndarray`, `torch.Tensor`):
|
||||
Predicted normals with values in the range [-1, 1]. The shape is $numimages \times 3 \times height \times
|
||||
width$ for `torch.Tensor` or $numimages \times height \times width \times 3$ for `np.ndarray`.
|
||||
Predicted normals with values in the range [-1, 1]. The shape is always $numimages \times 3 \times height
|
||||
\times width$, regardless of whether the images were passed as a 4D array or a list.
|
||||
uncertainty (`None`, `np.ndarray`, `torch.Tensor`):
|
||||
Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is $numimages
|
||||
\times 1 \times height \times width$ for `torch.Tensor` or $numimages \times height \times width \times 1$
|
||||
for `np.ndarray`.
|
||||
\times 1 \times height \times width$.
|
||||
latent (`None`, `torch.Tensor`):
|
||||
Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
|
||||
The shape is $numimages * numensemble \times 4 \times latentheight \times latentwidth$.
|
||||
@@ -165,7 +164,6 @@ class MarigoldNormalsPipeline(DiffusionPipeline):
|
||||
tokenizer=tokenizer,
|
||||
)
|
||||
self.register_to_config(
|
||||
prediction_type=prediction_type,
|
||||
use_full_z_range=use_full_z_range,
|
||||
default_denoising_steps=default_denoising_steps,
|
||||
default_processing_resolution=default_processing_resolution,
|
||||
@@ -196,11 +194,6 @@ class MarigoldNormalsPipeline(DiffusionPipeline):
|
||||
output_type: str,
|
||||
output_uncertainty: bool,
|
||||
) -> int:
|
||||
actual_vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
|
||||
if actual_vae_scale_factor != self.vae_scale_factor:
|
||||
raise ValueError(
|
||||
f"`vae_scale_factor` computed at initialization ({self.vae_scale_factor}) differs from the actual one ({actual_vae_scale_factor})."
|
||||
)
|
||||
if num_inference_steps is None:
|
||||
raise ValueError("`num_inference_steps` is not specified and could not be resolved from the model config.")
|
||||
if num_inference_steps < 1:
|
||||
@@ -311,7 +304,6 @@ class MarigoldNormalsPipeline(DiffusionPipeline):
|
||||
|
||||
return num_images
|
||||
|
||||
@torch.compiler.disable
|
||||
def progress_bar(self, iterable=None, total=None, desc=None, leave=True):
|
||||
if not hasattr(self, "_progress_bar_config"):
|
||||
self._progress_bar_config = {}
|
||||
@@ -362,9 +354,11 @@ class MarigoldNormalsPipeline(DiffusionPipeline):
|
||||
same width and height.
|
||||
num_inference_steps (`int`, *optional*, defaults to `None`):
|
||||
Number of denoising diffusion steps during inference. The default value `None` results in automatic
|
||||
selection.
|
||||
selection. The number of steps should be at least 10 with the full Marigold models, and between 1 and 4
|
||||
for Marigold-LCM models.
|
||||
ensemble_size (`int`, defaults to `1`):
|
||||
Number of ensemble predictions. Higher values result in measurable improvements and visual degradation.
|
||||
Number of ensemble predictions. Recommended values are 5 and higher for better precision, or 1 for
|
||||
faster inference.
|
||||
processing_resolution (`int`, *optional*, defaults to `None`):
|
||||
Effective processing resolution. When set to `0`, matches the larger input image dimension. This
|
||||
produces crisper predictions, but may also lead to the overall loss of global context. The default
|
||||
@@ -400,7 +394,7 @@ class MarigoldNormalsPipeline(DiffusionPipeline):
|
||||
within the ensemble. These codes can be saved, modified, and used for subsequent calls with the
|
||||
`latents` argument.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to return a [`~pipelines.marigold.MarigoldNormalsOutput`] instead of a plain tuple.
|
||||
Whether or not to return a [`~pipelines.marigold.MarigoldDepthOutput`] instead of a plain tuple.
|
||||
|
||||
Examples:
|
||||
|
||||
@@ -468,7 +462,9 @@ class MarigoldNormalsPipeline(DiffusionPipeline):
|
||||
# `pred_latent` variable. The variable `image_latent` is of the same shape: it contains each input image encoded
|
||||
# into latent space and replicated `E` times. The latents can be either generated (see `generator` to ensure
|
||||
# reproducibility), or passed explicitly via the `latents` argument. The latter can be set outside the pipeline
|
||||
# code. This behavior can be achieved by setting the `output_latent` argument to `True`. The latent space
|
||||
# code. For example, in the Marigold-LCM video processing demo, the latents initialization of a frame is taken
|
||||
# as a convex combination of the latents output of the pipeline for the previous frame and a newly-sampled
|
||||
# noise. This behavior can be achieved by setting the `output_latent` argument to `True`. The latent space
|
||||
# dimensions are `(h, w)`. Encoding into latent space happens in batches of size `batch_size`.
|
||||
# Model invocation: self.vae.encoder.
|
||||
image_latent, pred_latent = self.prepare_latents(
|
||||
|
||||
@@ -17,7 +17,7 @@ import os
|
||||
import re
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Dict, List, Optional, Union
|
||||
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union, get_args, get_origin
|
||||
|
||||
import requests
|
||||
import torch
|
||||
@@ -1059,3 +1059,76 @@ def _maybe_raise_error_for_incorrect_transformers(config_dict):
|
||||
break
|
||||
if has_transformers_component and not is_transformers_version(">", "4.47.1"):
|
||||
raise ValueError("Please upgrade your `transformers` installation to the latest version to use DDUF.")
|
||||
|
||||
|
||||
def _is_valid_type(obj: Any, class_or_tuple: Union[Type, Tuple[Type, ...]]) -> bool:
|
||||
"""
|
||||
Checks if an object is an instance of any of the provided types. For collections, it checks if every element is of
|
||||
the correct type as well.
|
||||
"""
|
||||
if not isinstance(class_or_tuple, tuple):
|
||||
class_or_tuple = (class_or_tuple,)
|
||||
|
||||
# Unpack unions
|
||||
unpacked_class_or_tuple = []
|
||||
for t in class_or_tuple:
|
||||
if get_origin(t) is Union:
|
||||
unpacked_class_or_tuple.extend(get_args(t))
|
||||
else:
|
||||
unpacked_class_or_tuple.append(t)
|
||||
class_or_tuple = tuple(unpacked_class_or_tuple)
|
||||
|
||||
if Any in class_or_tuple:
|
||||
return True
|
||||
|
||||
obj_type = type(obj)
|
||||
# Classes with obj's type
|
||||
class_or_tuple = {t for t in class_or_tuple if isinstance(obj, get_origin(t) or t)}
|
||||
|
||||
# Singular types (e.g. int, ControlNet, ...)
|
||||
# Untyped collections (e.g. List, but not List[int])
|
||||
elem_class_or_tuple = {get_args(t) for t in class_or_tuple}
|
||||
if () in elem_class_or_tuple:
|
||||
return True
|
||||
# Typed lists or sets
|
||||
elif obj_type in (list, set):
|
||||
return any(all(_is_valid_type(x, t) for x in obj) for t in elem_class_or_tuple)
|
||||
# Typed tuples
|
||||
elif obj_type is tuple:
|
||||
return any(
|
||||
# Tuples with any length and single type (e.g. Tuple[int, ...])
|
||||
(len(t) == 2 and t[-1] is Ellipsis and all(_is_valid_type(x, t[0]) for x in obj))
|
||||
or
|
||||
# Tuples with fixed length and any types (e.g. Tuple[int, str])
|
||||
(len(obj) == len(t) and all(_is_valid_type(x, tt) for x, tt in zip(obj, t)))
|
||||
for t in elem_class_or_tuple
|
||||
)
|
||||
# Typed dicts
|
||||
elif obj_type is dict:
|
||||
return any(
|
||||
all(_is_valid_type(k, kt) and _is_valid_type(v, vt) for k, v in obj.items())
|
||||
for kt, vt in elem_class_or_tuple
|
||||
)
|
||||
|
||||
else:
|
||||
return False
|
||||
|
||||
|
||||
def _get_detailed_type(obj: Any) -> Type:
|
||||
"""
|
||||
Gets a detailed type for an object, including nested types for collections.
|
||||
"""
|
||||
obj_type = type(obj)
|
||||
|
||||
if obj_type in (list, set):
|
||||
obj_origin_type = List if obj_type is list else Set
|
||||
elems_type = Union[tuple({_get_detailed_type(x) for x in obj})]
|
||||
return obj_origin_type[elems_type]
|
||||
elif obj_type is tuple:
|
||||
return Tuple[tuple(_get_detailed_type(x) for x in obj)]
|
||||
elif obj_type is dict:
|
||||
keys_type = Union[tuple({_get_detailed_type(k) for k in obj.keys()})]
|
||||
values_type = Union[tuple({_get_detailed_type(k) for k in obj.values()})]
|
||||
return Dict[keys_type, values_type]
|
||||
else:
|
||||
return obj_type
|
||||
|
||||
@@ -54,8 +54,6 @@ from ..utils import (
|
||||
DEPRECATED_REVISION_ARGS,
|
||||
BaseOutput,
|
||||
PushToHubMixin,
|
||||
_get_detailed_type,
|
||||
_is_valid_type,
|
||||
is_accelerate_available,
|
||||
is_accelerate_version,
|
||||
is_torch_npu_available,
|
||||
@@ -80,10 +78,12 @@ from .pipeline_loading_utils import (
|
||||
_fetch_class_library_tuple,
|
||||
_get_custom_components_and_folders,
|
||||
_get_custom_pipeline_class,
|
||||
_get_detailed_type,
|
||||
_get_final_device_map,
|
||||
_get_ignore_patterns,
|
||||
_get_pipeline_class,
|
||||
_identify_model_variants,
|
||||
_is_valid_type,
|
||||
_maybe_raise_error_for_incorrect_transformers,
|
||||
_maybe_raise_warning_for_inpainting,
|
||||
_resolve_custom_pipeline_and_cls,
|
||||
@@ -685,7 +685,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
|
||||
token = kwargs.pop("token", None)
|
||||
revision = kwargs.pop("revision", None)
|
||||
from_flax = kwargs.pop("from_flax", False)
|
||||
torch_dtype = kwargs.pop("torch_dtype", torch.float32)
|
||||
torch_dtype = kwargs.pop("torch_dtype", None)
|
||||
custom_pipeline = kwargs.pop("custom_pipeline", None)
|
||||
custom_revision = kwargs.pop("custom_revision", None)
|
||||
provider = kwargs.pop("provider", None)
|
||||
@@ -702,12 +702,6 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
|
||||
use_onnx = kwargs.pop("use_onnx", None)
|
||||
load_connected_pipeline = kwargs.pop("load_connected_pipeline", False)
|
||||
|
||||
if not isinstance(torch_dtype, torch.dtype):
|
||||
torch_dtype = torch.float32
|
||||
logger.warning(
|
||||
f"Passed `torch_dtype` {torch_dtype} is not a `torch.dtype`. Defaulting to `torch.float32`."
|
||||
)
|
||||
|
||||
if low_cpu_mem_usage and not is_accelerate_available():
|
||||
low_cpu_mem_usage = False
|
||||
logger.warning(
|
||||
@@ -1832,7 +1826,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
|
||||
"""
|
||||
|
||||
original_config = dict(pipeline.config)
|
||||
torch_dtype = kwargs.pop("torch_dtype", torch.float32)
|
||||
torch_dtype = kwargs.pop("torch_dtype", None)
|
||||
|
||||
# derive the pipeline class to instantiate
|
||||
custom_pipeline = kwargs.pop("custom_pipeline", None)
|
||||
|
||||
@@ -123,7 +123,6 @@ from .state_dict_utils import (
|
||||
convert_state_dict_to_peft,
|
||||
convert_unet_state_dict_to_peft,
|
||||
)
|
||||
from .typing_utils import _get_detailed_type, _is_valid_type
|
||||
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
@@ -1217,21 +1217,6 @@ class MarigoldDepthPipeline(metaclass=DummyObject):
|
||||
requires_backends(cls, ["torch", "transformers"])
|
||||
|
||||
|
||||
class MarigoldIntrinsicsPipeline(metaclass=DummyObject):
|
||||
_backends = ["torch", "transformers"]
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch", "transformers"])
|
||||
|
||||
@classmethod
|
||||
def from_config(cls, *args, **kwargs):
|
||||
requires_backends(cls, ["torch", "transformers"])
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, *args, **kwargs):
|
||||
requires_backends(cls, ["torch", "transformers"])
|
||||
|
||||
|
||||
class MarigoldNormalsPipeline(metaclass=DummyObject):
|
||||
_backends = ["torch", "transformers"]
|
||||
|
||||
@@ -1532,21 +1517,6 @@ class StableCascadePriorPipeline(metaclass=DummyObject):
|
||||
requires_backends(cls, ["torch", "transformers"])
|
||||
|
||||
|
||||
class StableDiffusion3ControlNetInpaintingPipeline(metaclass=DummyObject):
|
||||
_backends = ["torch", "transformers"]
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch", "transformers"])
|
||||
|
||||
@classmethod
|
||||
def from_config(cls, *args, **kwargs):
|
||||
requires_backends(cls, ["torch", "transformers"])
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, *args, **kwargs):
|
||||
requires_backends(cls, ["torch", "transformers"])
|
||||
|
||||
|
||||
class StableDiffusion3ControlNetPipeline(metaclass=DummyObject):
|
||||
_backends = ["torch", "transformers"]
|
||||
|
||||
|
||||
@@ -1,91 +0,0 @@
|
||||
# Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
Typing utilities: Utilities related to type checking and validation
|
||||
"""
|
||||
|
||||
from typing import Any, Dict, List, Set, Tuple, Type, Union, get_args, get_origin
|
||||
|
||||
|
||||
def _is_valid_type(obj: Any, class_or_tuple: Union[Type, Tuple[Type, ...]]) -> bool:
|
||||
"""
|
||||
Checks if an object is an instance of any of the provided types. For collections, it checks if every element is of
|
||||
the correct type as well.
|
||||
"""
|
||||
if not isinstance(class_or_tuple, tuple):
|
||||
class_or_tuple = (class_or_tuple,)
|
||||
|
||||
# Unpack unions
|
||||
unpacked_class_or_tuple = []
|
||||
for t in class_or_tuple:
|
||||
if get_origin(t) is Union:
|
||||
unpacked_class_or_tuple.extend(get_args(t))
|
||||
else:
|
||||
unpacked_class_or_tuple.append(t)
|
||||
class_or_tuple = tuple(unpacked_class_or_tuple)
|
||||
|
||||
if Any in class_or_tuple:
|
||||
return True
|
||||
|
||||
obj_type = type(obj)
|
||||
# Classes with obj's type
|
||||
class_or_tuple = {t for t in class_or_tuple if isinstance(obj, get_origin(t) or t)}
|
||||
|
||||
# Singular types (e.g. int, ControlNet, ...)
|
||||
# Untyped collections (e.g. List, but not List[int])
|
||||
elem_class_or_tuple = {get_args(t) for t in class_or_tuple}
|
||||
if () in elem_class_or_tuple:
|
||||
return True
|
||||
# Typed lists or sets
|
||||
elif obj_type in (list, set):
|
||||
return any(all(_is_valid_type(x, t) for x in obj) for t in elem_class_or_tuple)
|
||||
# Typed tuples
|
||||
elif obj_type is tuple:
|
||||
return any(
|
||||
# Tuples with any length and single type (e.g. Tuple[int, ...])
|
||||
(len(t) == 2 and t[-1] is Ellipsis and all(_is_valid_type(x, t[0]) for x in obj))
|
||||
or
|
||||
# Tuples with fixed length and any types (e.g. Tuple[int, str])
|
||||
(len(obj) == len(t) and all(_is_valid_type(x, tt) for x, tt in zip(obj, t)))
|
||||
for t in elem_class_or_tuple
|
||||
)
|
||||
# Typed dicts
|
||||
elif obj_type is dict:
|
||||
return any(
|
||||
all(_is_valid_type(k, kt) and _is_valid_type(v, vt) for k, v in obj.items())
|
||||
for kt, vt in elem_class_or_tuple
|
||||
)
|
||||
|
||||
else:
|
||||
return False
|
||||
|
||||
|
||||
def _get_detailed_type(obj: Any) -> Type:
|
||||
"""
|
||||
Gets a detailed type for an object, including nested types for collections.
|
||||
"""
|
||||
obj_type = type(obj)
|
||||
|
||||
if obj_type in (list, set):
|
||||
obj_origin_type = List if obj_type is list else Set
|
||||
elems_type = Union[tuple({_get_detailed_type(x) for x in obj})]
|
||||
return obj_origin_type[elems_type]
|
||||
elif obj_type is tuple:
|
||||
return Tuple[tuple(_get_detailed_type(x) for x in obj)]
|
||||
elif obj_type is dict:
|
||||
keys_type = Union[tuple({_get_detailed_type(k) for k in obj.keys()})]
|
||||
values_type = Union[tuple({_get_detailed_type(k) for k in obj.values()})]
|
||||
return Dict[keys_type, values_type]
|
||||
else:
|
||||
return obj_type
|
||||
@@ -15,8 +15,6 @@
|
||||
import sys
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import AutoTokenizer, GemmaForCausalLM
|
||||
|
||||
@@ -26,12 +24,12 @@ from diffusers import (
|
||||
Lumina2Text2ImgPipeline,
|
||||
Lumina2Transformer2DModel,
|
||||
)
|
||||
from diffusers.utils.testing_utils import floats_tensor, is_torch_version, require_peft_backend, skip_mps, torch_device
|
||||
from diffusers.utils.testing_utils import floats_tensor, require_peft_backend
|
||||
|
||||
|
||||
sys.path.append(".")
|
||||
|
||||
from utils import PeftLoraLoaderMixinTests, check_if_lora_correctly_set # noqa: E402
|
||||
from utils import PeftLoraLoaderMixinTests # noqa: E402
|
||||
|
||||
|
||||
@require_peft_backend
|
||||
@@ -132,41 +130,3 @@ class Lumina2LoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):
|
||||
@unittest.skip("Text encoder LoRA is not supported in Lumina2.")
|
||||
def test_simple_inference_with_text_lora_save_load(self):
|
||||
pass
|
||||
|
||||
@skip_mps
|
||||
@pytest.mark.xfail(
|
||||
condition=torch.device(torch_device).type == "cpu" and is_torch_version(">=", "2.5"),
|
||||
reason="Test currently fails on CPU and PyTorch 2.5.1 but not on PyTorch 2.4.1.",
|
||||
strict=False,
|
||||
)
|
||||
def test_lora_fuse_nan(self):
|
||||
for scheduler_cls in self.scheduler_classes:
|
||||
components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls)
|
||||
pipe = self.pipeline_class(**components)
|
||||
pipe = pipe.to(torch_device)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
_, _, inputs = self.get_dummy_inputs(with_generator=False)
|
||||
|
||||
if "text_encoder" in self.pipeline_class._lora_loadable_modules:
|
||||
pipe.text_encoder.add_adapter(text_lora_config, "adapter-1")
|
||||
self.assertTrue(
|
||||
check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder"
|
||||
)
|
||||
|
||||
denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
|
||||
denoiser.add_adapter(denoiser_lora_config, "adapter-1")
|
||||
self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.")
|
||||
|
||||
# corrupt one LoRA weight with `inf` values
|
||||
with torch.no_grad():
|
||||
pipe.transformer.layers[0].attn.to_q.lora_A["adapter-1"].weight += float("inf")
|
||||
|
||||
# with `safe_fusing=True` we should see an Error
|
||||
with self.assertRaises(ValueError):
|
||||
pipe.fuse_lora(components=self.pipeline_class._lora_loadable_modules, safe_fusing=True)
|
||||
|
||||
# without we should not see an error, but every image will be black
|
||||
pipe.fuse_lora(components=self.pipeline_class._lora_loadable_modules, safe_fusing=False)
|
||||
out = pipe(**inputs)[0]
|
||||
|
||||
self.assertTrue(np.isnan(out).all())
|
||||
|
||||
@@ -1169,16 +1169,17 @@ class ModelTesterMixin:
|
||||
base_output = model(**inputs_dict)
|
||||
|
||||
model_size = compute_module_sizes(model)[""]
|
||||
max_size = int(self.model_split_percents[0] * model_size)
|
||||
# Force disk offload by setting very small CPU memory
|
||||
max_memory = {0: max_size, "cpu": int(0.1 * max_size)}
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
model.cpu().save_pretrained(tmp_dir, safe_serialization=False)
|
||||
|
||||
with self.assertRaises(ValueError):
|
||||
max_size = int(self.model_split_percents[0] * model_size)
|
||||
max_memory = {0: max_size, "cpu": max_size}
|
||||
# This errors out because it's missing an offload folder
|
||||
new_model = self.model_class.from_pretrained(tmp_dir, device_map="auto", max_memory=max_memory)
|
||||
|
||||
max_size = int(self.model_split_percents[0] * model_size)
|
||||
max_memory = {0: max_size, "cpu": max_size}
|
||||
new_model = self.model_class.from_pretrained(
|
||||
tmp_dir, device_map="auto", max_memory=max_memory, offload_folder=tmp_dir
|
||||
)
|
||||
|
||||
@@ -30,7 +30,6 @@ class OmniGenTransformerTests(ModelTesterMixin, unittest.TestCase):
|
||||
model_class = OmniGenTransformer2DModel
|
||||
main_input_name = "hidden_states"
|
||||
uses_custom_attn_processor = True
|
||||
model_split_percents = [0.1, 0.1, 0.1]
|
||||
|
||||
@property
|
||||
def dummy_input(self):
|
||||
@@ -74,9 +73,9 @@ class OmniGenTransformerTests(ModelTesterMixin, unittest.TestCase):
|
||||
"num_attention_heads": 4,
|
||||
"num_key_value_heads": 4,
|
||||
"intermediate_size": 32,
|
||||
"num_layers": 20,
|
||||
"num_layers": 1,
|
||||
"pad_token_id": 0,
|
||||
"vocab_size": 1000,
|
||||
"vocab_size": 100,
|
||||
"in_channels": 4,
|
||||
"time_step_dim": 4,
|
||||
"rope_scaling": {"long_factor": list(range(1, 3)), "short_factor": list(range(1, 3))},
|
||||
|
||||
@@ -33,7 +33,6 @@ enable_full_determinism()
|
||||
class SD3TransformerTests(ModelTesterMixin, unittest.TestCase):
|
||||
model_class = SD3Transformer2DModel
|
||||
main_input_name = "hidden_states"
|
||||
model_split_percents = [0.8, 0.8, 0.9]
|
||||
|
||||
@property
|
||||
def dummy_input(self):
|
||||
@@ -68,7 +67,7 @@ class SD3TransformerTests(ModelTesterMixin, unittest.TestCase):
|
||||
"sample_size": 32,
|
||||
"patch_size": 1,
|
||||
"in_channels": 4,
|
||||
"num_layers": 4,
|
||||
"num_layers": 1,
|
||||
"attention_head_dim": 8,
|
||||
"num_attention_heads": 4,
|
||||
"caption_projection_dim": 32,
|
||||
@@ -108,7 +107,6 @@ class SD3TransformerTests(ModelTesterMixin, unittest.TestCase):
|
||||
class SD35TransformerTests(ModelTesterMixin, unittest.TestCase):
|
||||
model_class = SD3Transformer2DModel
|
||||
main_input_name = "hidden_states"
|
||||
model_split_percents = [0.8, 0.8, 0.9]
|
||||
|
||||
@property
|
||||
def dummy_input(self):
|
||||
@@ -143,7 +141,7 @@ class SD35TransformerTests(ModelTesterMixin, unittest.TestCase):
|
||||
"sample_size": 32,
|
||||
"patch_size": 1,
|
||||
"in_channels": 4,
|
||||
"num_layers": 4,
|
||||
"num_layers": 2,
|
||||
"attention_head_dim": 8,
|
||||
"num_attention_heads": 4,
|
||||
"caption_projection_dim": 32,
|
||||
|
||||
@@ -89,9 +89,7 @@ class KolorsPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
sample_size=128,
|
||||
)
|
||||
torch.manual_seed(0)
|
||||
text_encoder = ChatGLMModel.from_pretrained(
|
||||
"hf-internal-testing/tiny-random-chatglm3-6b", torch_dtype=torch.bfloat16
|
||||
)
|
||||
text_encoder = ChatGLMModel.from_pretrained("hf-internal-testing/tiny-random-chatglm3-6b")
|
||||
tokenizer = ChatGLMTokenizer.from_pretrained("hf-internal-testing/tiny-random-chatglm3-6b")
|
||||
|
||||
components = {
|
||||
|
||||
@@ -93,9 +93,7 @@ class KolorsPipelineImg2ImgFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
sample_size=128,
|
||||
)
|
||||
torch.manual_seed(0)
|
||||
text_encoder = ChatGLMModel.from_pretrained(
|
||||
"hf-internal-testing/tiny-random-chatglm3-6b", torch_dtype=torch.bfloat16
|
||||
)
|
||||
text_encoder = ChatGLMModel.from_pretrained("hf-internal-testing/tiny-random-chatglm3-6b")
|
||||
tokenizer = ChatGLMTokenizer.from_pretrained("hf-internal-testing/tiny-random-chatglm3-6b")
|
||||
|
||||
components = {
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# Copyright 2023-2025 Marigold Team, ETH Zürich. All rights reserved.
|
||||
# Copyright 2024-2025 The HuggingFace Team. All rights reserved.
|
||||
# Copyright 2024 Marigold authors, PRS ETH Zurich. All rights reserved.
|
||||
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
@@ -14,7 +14,7 @@
|
||||
# limitations under the License.
|
||||
# --------------------------------------------------------------------------
|
||||
# More information and citation instructions are available on the
|
||||
# Marigold project website: https://marigoldcomputervision.github.io
|
||||
# Marigold project website: https://marigoldmonodepth.github.io
|
||||
# --------------------------------------------------------------------------
|
||||
import gc
|
||||
import random
|
||||
|
||||
@@ -1,571 +0,0 @@
|
||||
# Copyright 2023-2025 Marigold Team, ETH Zürich. All rights reserved.
|
||||
# Copyright 2024-2025 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# --------------------------------------------------------------------------
|
||||
# More information and citation instructions are available on the
|
||||
# Marigold project website: https://marigoldcomputervision.github.io
|
||||
# --------------------------------------------------------------------------
|
||||
import gc
|
||||
import random
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
|
||||
|
||||
import diffusers
|
||||
from diffusers import (
|
||||
AutoencoderKL,
|
||||
AutoencoderTiny,
|
||||
DDIMScheduler,
|
||||
MarigoldIntrinsicsPipeline,
|
||||
UNet2DConditionModel,
|
||||
)
|
||||
from diffusers.utils.testing_utils import (
|
||||
enable_full_determinism,
|
||||
floats_tensor,
|
||||
load_image,
|
||||
require_torch_gpu,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
|
||||
from ..test_pipelines_common import PipelineTesterMixin, to_np
|
||||
|
||||
|
||||
enable_full_determinism()
|
||||
|
||||
|
||||
class MarigoldIntrinsicsPipelineTesterMixin(PipelineTesterMixin):
|
||||
def _test_inference_batch_single_identical(
|
||||
self,
|
||||
batch_size=2,
|
||||
expected_max_diff=1e-4,
|
||||
additional_params_copy_to_batched_inputs=["num_inference_steps"],
|
||||
):
|
||||
components = self.get_dummy_components()
|
||||
pipe = self.pipeline_class(**components)
|
||||
for components in pipe.components.values():
|
||||
if hasattr(components, "set_default_attn_processor"):
|
||||
components.set_default_attn_processor()
|
||||
|
||||
pipe.to(torch_device)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
inputs = self.get_dummy_inputs(torch_device)
|
||||
# Reset generator in case it is has been used in self.get_dummy_inputs
|
||||
inputs["generator"] = self.get_generator(0)
|
||||
|
||||
logger = diffusers.logging.get_logger(pipe.__module__)
|
||||
logger.setLevel(level=diffusers.logging.FATAL)
|
||||
|
||||
# batchify inputs
|
||||
batched_inputs = {}
|
||||
batched_inputs.update(inputs)
|
||||
|
||||
for name in self.batch_params:
|
||||
if name not in inputs:
|
||||
continue
|
||||
|
||||
value = inputs[name]
|
||||
if name == "prompt":
|
||||
len_prompt = len(value)
|
||||
batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
|
||||
batched_inputs[name][-1] = 100 * "very long"
|
||||
|
||||
else:
|
||||
batched_inputs[name] = batch_size * [value]
|
||||
|
||||
if "generator" in inputs:
|
||||
batched_inputs["generator"] = [self.get_generator(i) for i in range(batch_size)]
|
||||
|
||||
if "batch_size" in inputs:
|
||||
batched_inputs["batch_size"] = batch_size
|
||||
|
||||
for arg in additional_params_copy_to_batched_inputs:
|
||||
batched_inputs[arg] = inputs[arg]
|
||||
|
||||
output = pipe(**inputs)
|
||||
output_batch = pipe(**batched_inputs)
|
||||
|
||||
assert output_batch[0].shape[0] == batch_size * output[0].shape[0] # only changed here
|
||||
|
||||
max_diff = np.abs(to_np(output_batch[0][0]) - to_np(output[0][0])).max()
|
||||
assert max_diff < expected_max_diff
|
||||
|
||||
def _test_inference_batch_consistent(
|
||||
self, batch_sizes=[2], additional_params_copy_to_batched_inputs=["num_inference_steps"], batch_generator=True
|
||||
):
|
||||
components = self.get_dummy_components()
|
||||
pipe = self.pipeline_class(**components)
|
||||
pipe.to(torch_device)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
inputs = self.get_dummy_inputs(torch_device)
|
||||
inputs["generator"] = self.get_generator(0)
|
||||
|
||||
logger = diffusers.logging.get_logger(pipe.__module__)
|
||||
logger.setLevel(level=diffusers.logging.FATAL)
|
||||
|
||||
# prepare batched inputs
|
||||
batched_inputs = []
|
||||
for batch_size in batch_sizes:
|
||||
batched_input = {}
|
||||
batched_input.update(inputs)
|
||||
|
||||
for name in self.batch_params:
|
||||
if name not in inputs:
|
||||
continue
|
||||
|
||||
value = inputs[name]
|
||||
if name == "prompt":
|
||||
len_prompt = len(value)
|
||||
# make unequal batch sizes
|
||||
batched_input[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
|
||||
|
||||
# make last batch super long
|
||||
batched_input[name][-1] = 100 * "very long"
|
||||
|
||||
else:
|
||||
batched_input[name] = batch_size * [value]
|
||||
|
||||
if batch_generator and "generator" in inputs:
|
||||
batched_input["generator"] = [self.get_generator(i) for i in range(batch_size)]
|
||||
|
||||
if "batch_size" in inputs:
|
||||
batched_input["batch_size"] = batch_size
|
||||
|
||||
batched_inputs.append(batched_input)
|
||||
|
||||
logger.setLevel(level=diffusers.logging.WARNING)
|
||||
for batch_size, batched_input in zip(batch_sizes, batched_inputs):
|
||||
output = pipe(**batched_input)
|
||||
assert len(output[0]) == batch_size * pipe.n_targets # only changed here
|
||||
|
||||
|
||||
class MarigoldIntrinsicsPipelineFastTests(MarigoldIntrinsicsPipelineTesterMixin, unittest.TestCase):
|
||||
pipeline_class = MarigoldIntrinsicsPipeline
|
||||
params = frozenset(["image"])
|
||||
batch_params = frozenset(["image"])
|
||||
image_params = frozenset(["image"])
|
||||
image_latents_params = frozenset(["latents"])
|
||||
callback_cfg_params = frozenset([])
|
||||
test_xformers_attention = False
|
||||
required_optional_params = frozenset(
|
||||
[
|
||||
"num_inference_steps",
|
||||
"generator",
|
||||
"output_type",
|
||||
]
|
||||
)
|
||||
|
||||
def get_dummy_components(self, time_cond_proj_dim=None):
|
||||
torch.manual_seed(0)
|
||||
unet = UNet2DConditionModel(
|
||||
block_out_channels=(32, 64),
|
||||
layers_per_block=2,
|
||||
time_cond_proj_dim=time_cond_proj_dim,
|
||||
sample_size=32,
|
||||
in_channels=12,
|
||||
out_channels=8,
|
||||
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
|
||||
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
|
||||
cross_attention_dim=32,
|
||||
)
|
||||
torch.manual_seed(0)
|
||||
scheduler = DDIMScheduler(
|
||||
beta_start=0.00085,
|
||||
beta_end=0.012,
|
||||
prediction_type="v_prediction",
|
||||
set_alpha_to_one=False,
|
||||
steps_offset=1,
|
||||
beta_schedule="scaled_linear",
|
||||
clip_sample=False,
|
||||
thresholding=False,
|
||||
)
|
||||
torch.manual_seed(0)
|
||||
vae = AutoencoderKL(
|
||||
block_out_channels=[32, 64],
|
||||
in_channels=3,
|
||||
out_channels=3,
|
||||
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
|
||||
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
|
||||
latent_channels=4,
|
||||
)
|
||||
torch.manual_seed(0)
|
||||
text_encoder_config = CLIPTextConfig(
|
||||
bos_token_id=0,
|
||||
eos_token_id=2,
|
||||
hidden_size=32,
|
||||
intermediate_size=37,
|
||||
layer_norm_eps=1e-05,
|
||||
num_attention_heads=4,
|
||||
num_hidden_layers=5,
|
||||
pad_token_id=1,
|
||||
vocab_size=1000,
|
||||
)
|
||||
text_encoder = CLIPTextModel(text_encoder_config)
|
||||
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
|
||||
components = {
|
||||
"unet": unet,
|
||||
"scheduler": scheduler,
|
||||
"vae": vae,
|
||||
"text_encoder": text_encoder,
|
||||
"tokenizer": tokenizer,
|
||||
"prediction_type": "intrinsics",
|
||||
}
|
||||
return components
|
||||
|
||||
def get_dummy_tiny_autoencoder(self):
|
||||
return AutoencoderTiny(in_channels=3, out_channels=3, latent_channels=4)
|
||||
|
||||
def get_dummy_inputs(self, device, seed=0):
|
||||
image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
|
||||
image = image / 2 + 0.5
|
||||
if str(device).startswith("mps"):
|
||||
generator = torch.manual_seed(seed)
|
||||
else:
|
||||
generator = torch.Generator(device=device).manual_seed(seed)
|
||||
inputs = {
|
||||
"image": image,
|
||||
"num_inference_steps": 1,
|
||||
"processing_resolution": 0,
|
||||
"generator": generator,
|
||||
"output_type": "np",
|
||||
}
|
||||
return inputs
|
||||
|
||||
def _test_marigold_intrinsics(
|
||||
self,
|
||||
generator_seed: int = 0,
|
||||
expected_slice: np.ndarray = None,
|
||||
atol: float = 1e-4,
|
||||
**pipe_kwargs,
|
||||
):
|
||||
device = "cpu"
|
||||
components = self.get_dummy_components()
|
||||
|
||||
pipe = self.pipeline_class(**components)
|
||||
pipe.to(device)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
pipe_inputs = self.get_dummy_inputs(device, seed=generator_seed)
|
||||
pipe_inputs.update(**pipe_kwargs)
|
||||
|
||||
prediction = pipe(**pipe_inputs).prediction
|
||||
|
||||
prediction_slice = prediction[0, -3:, -3:, -1].flatten()
|
||||
|
||||
if pipe_inputs.get("match_input_resolution", True):
|
||||
self.assertEqual(prediction.shape, (2, 32, 32, 3), "Unexpected output resolution")
|
||||
else:
|
||||
self.assertTrue(prediction.shape[0] == 2 and prediction.shape[3] == 3, "Unexpected output dimensions")
|
||||
self.assertEqual(
|
||||
max(prediction.shape[1:3]),
|
||||
pipe_inputs.get("processing_resolution", 768),
|
||||
"Unexpected output resolution",
|
||||
)
|
||||
|
||||
np.set_printoptions(precision=5, suppress=True)
|
||||
msg = f"{prediction_slice}"
|
||||
self.assertTrue(np.allclose(prediction_slice, expected_slice, atol=atol), msg)
|
||||
# self.assertTrue(np.allclose(prediction_slice, expected_slice, atol=atol))
|
||||
|
||||
def test_marigold_depth_dummy_defaults(self):
|
||||
self._test_marigold_intrinsics(
|
||||
expected_slice=np.array([0.6423, 0.40664, 0.41185, 0.65832, 0.63935, 0.43971, 0.51786, 0.55216, 0.47683]),
|
||||
)
|
||||
|
||||
def test_marigold_depth_dummy_G0_S1_P32_E1_B1_M1(self):
|
||||
self._test_marigold_intrinsics(
|
||||
generator_seed=0,
|
||||
expected_slice=np.array([0.6423, 0.40664, 0.41185, 0.65832, 0.63935, 0.43971, 0.51786, 0.55216, 0.47683]),
|
||||
num_inference_steps=1,
|
||||
processing_resolution=32,
|
||||
ensemble_size=1,
|
||||
batch_size=1,
|
||||
match_input_resolution=True,
|
||||
)
|
||||
|
||||
def test_marigold_depth_dummy_G0_S1_P16_E1_B1_M1(self):
|
||||
self._test_marigold_intrinsics(
|
||||
generator_seed=0,
|
||||
expected_slice=np.array([0.53132, 0.44487, 0.40164, 0.5326, 0.49073, 0.46979, 0.53324, 0.51366, 0.50387]),
|
||||
num_inference_steps=1,
|
||||
processing_resolution=16,
|
||||
ensemble_size=1,
|
||||
batch_size=1,
|
||||
match_input_resolution=True,
|
||||
)
|
||||
|
||||
def test_marigold_depth_dummy_G2024_S1_P32_E1_B1_M1(self):
|
||||
self._test_marigold_intrinsics(
|
||||
generator_seed=2024,
|
||||
expected_slice=np.array([0.40257, 0.39468, 0.51373, 0.4161, 0.40162, 0.58535, 0.43581, 0.47834, 0.48951]),
|
||||
num_inference_steps=1,
|
||||
processing_resolution=32,
|
||||
ensemble_size=1,
|
||||
batch_size=1,
|
||||
match_input_resolution=True,
|
||||
)
|
||||
|
||||
def test_marigold_depth_dummy_G0_S2_P32_E1_B1_M1(self):
|
||||
self._test_marigold_intrinsics(
|
||||
generator_seed=0,
|
||||
expected_slice=np.array([0.49636, 0.4518, 0.42722, 0.59044, 0.6362, 0.39011, 0.53522, 0.55153, 0.48699]),
|
||||
num_inference_steps=2,
|
||||
processing_resolution=32,
|
||||
ensemble_size=1,
|
||||
batch_size=1,
|
||||
match_input_resolution=True,
|
||||
)
|
||||
|
||||
def test_marigold_depth_dummy_G0_S1_P64_E1_B1_M1(self):
|
||||
self._test_marigold_intrinsics(
|
||||
generator_seed=0,
|
||||
expected_slice=np.array([0.55547, 0.43511, 0.4887, 0.56399, 0.63867, 0.56337, 0.47889, 0.52925, 0.49235]),
|
||||
num_inference_steps=1,
|
||||
processing_resolution=64,
|
||||
ensemble_size=1,
|
||||
batch_size=1,
|
||||
match_input_resolution=True,
|
||||
)
|
||||
|
||||
def test_marigold_depth_dummy_G0_S1_P32_E3_B1_M1(self):
|
||||
self._test_marigold_intrinsics(
|
||||
generator_seed=0,
|
||||
expected_slice=np.array([0.57249, 0.49824, 0.54438, 0.57733, 0.52404, 0.5255, 0.56493, 0.56336, 0.48579]),
|
||||
num_inference_steps=1,
|
||||
processing_resolution=32,
|
||||
ensemble_size=3,
|
||||
ensembling_kwargs={"reduction": "mean"},
|
||||
batch_size=1,
|
||||
match_input_resolution=True,
|
||||
)
|
||||
|
||||
def test_marigold_depth_dummy_G0_S1_P32_E4_B2_M1(self):
|
||||
self._test_marigold_intrinsics(
|
||||
generator_seed=0,
|
||||
expected_slice=np.array([0.6294, 0.5575, 0.53414, 0.61077, 0.57156, 0.53974, 0.52956, 0.55467, 0.48751]),
|
||||
num_inference_steps=1,
|
||||
processing_resolution=32,
|
||||
ensemble_size=4,
|
||||
ensembling_kwargs={"reduction": "mean"},
|
||||
batch_size=2,
|
||||
match_input_resolution=True,
|
||||
)
|
||||
|
||||
def test_marigold_depth_dummy_G0_S1_P16_E1_B1_M0(self):
|
||||
self._test_marigold_intrinsics(
|
||||
generator_seed=0,
|
||||
expected_slice=np.array([0.63511, 0.68137, 0.48783, 0.46689, 0.58505, 0.36757, 0.58465, 0.54302, 0.50387]),
|
||||
num_inference_steps=1,
|
||||
processing_resolution=16,
|
||||
ensemble_size=1,
|
||||
batch_size=1,
|
||||
match_input_resolution=False,
|
||||
)
|
||||
|
||||
def test_marigold_depth_dummy_no_num_inference_steps(self):
|
||||
with self.assertRaises(ValueError) as e:
|
||||
self._test_marigold_intrinsics(
|
||||
num_inference_steps=None,
|
||||
expected_slice=np.array([0.0]),
|
||||
)
|
||||
self.assertIn("num_inference_steps", str(e))
|
||||
|
||||
def test_marigold_depth_dummy_no_processing_resolution(self):
|
||||
with self.assertRaises(ValueError) as e:
|
||||
self._test_marigold_intrinsics(
|
||||
processing_resolution=None,
|
||||
expected_slice=np.array([0.0]),
|
||||
)
|
||||
self.assertIn("processing_resolution", str(e))
|
||||
|
||||
|
||||
@slow
@require_torch_gpu
class MarigoldIntrinsicsPipelineIntegrationTests(unittest.TestCase):
    """
    Integration tests that load a pretrained ``MarigoldIntrinsicsPipeline`` and compare a 3x3 corner
    slice of its prediction against stored reference values.

    Test names encode the configuration: dtype (f32/f16), device, generator seed (G), inference steps (S),
    processing resolution (P), ensemble size (E), batch size (B) and match_input_resolution (M).
    """

    def setUp(self):
        # Collect garbage and empty the CUDA cache before each test.
        super().setUp()
        gc.collect()
        torch.cuda.empty_cache()

    def tearDown(self):
        # Same cleanup after each test.
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def _test_marigold_intrinsics(
        self,
        is_fp16: bool = True,
        device: str = "cuda",
        generator_seed: int = 0,
        expected_slice: np.ndarray = None,
        model_id: str = "prs-eth/marigold-iid-appearance-v1-1",
        image_url: str = "https://marigoldmonodepth.github.io/images/einstein.jpg",
        atol: float = 1e-4,
        **pipe_kwargs,
    ):
        """
        Run the pipeline on one image and check the output shape and a reference slice.

        Args:
            is_fp16: When True, load the "fp16" variant with ``torch.float16`` weights.
            device: Device for the random generator; "cuda" additionally enables model CPU offload.
            generator_seed: Seed for the ``torch.Generator`` passed to the pipeline.
            expected_slice: Reference values compared against ``prediction[0, -3:, -3:, -1]`` (flattened).
            model_id: Checkpoint identifier handed to ``from_pretrained``.
            image_url: Location of the input image, loaded with ``load_image``.
            atol: Absolute tolerance used by ``np.allclose``.
            **pipe_kwargs: Forwarded unchanged to the pipeline call.
        """
        load_options = {"variant": "fp16", "torch_dtype": torch.float16} if is_fp16 else {}

        pipe = MarigoldIntrinsicsPipeline.from_pretrained(model_id, **load_options)
        if device == "cuda":
            pipe.enable_model_cpu_offload()
        pipe.set_progress_bar_config(disable=None)

        rng = torch.Generator(device=device).manual_seed(generator_seed)

        input_image = load_image(image_url)
        width, height = input_image.size

        prediction = pipe(input_image, generator=rng, **pipe_kwargs).prediction
        observed_slice = prediction[0, -3:, -3:, -1].flatten()

        if not pipe_kwargs.get("match_input_resolution", True):
            # Output is not resized to the input: only the leading/trailing dims and the longest side are checked.
            self.assertTrue(prediction.shape[0] == 2 and prediction.shape[3] == 3, "Unexpected output dimensions")
            longest_side = max(prediction.shape[1:3])
            self.assertEqual(
                longest_side,
                pipe_kwargs.get("processing_resolution", 768),
                "Unexpected output resolution",
            )
        else:
            self.assertEqual(prediction.shape, (2, height, width, 3), "Unexpected output resolution")

        # On failure, report the observed slice so the reference values can be compared or refreshed.
        self.assertTrue(np.allclose(observed_slice, expected_slice, atol=atol), f"{observed_slice}")

    def test_marigold_intrinsics_einstein_f32_cpu_G0_S1_P32_E1_B1_M1(self):
        reference = np.array([0.9162, 0.9162, 0.9162, 0.9162, 0.9162, 0.9162, 0.9162, 0.9162, 0.9162])
        self._test_marigold_intrinsics(
            is_fp16=False,
            device="cpu",
            generator_seed=0,
            expected_slice=reference,
            num_inference_steps=1,
            processing_resolution=32,
            ensemble_size=1,
            batch_size=1,
            match_input_resolution=True,
        )

    def test_marigold_intrinsics_einstein_f32_cuda_G0_S1_P768_E1_B1_M1(self):
        reference = np.array([0.62127, 0.61906, 0.61687, 0.61946, 0.61903, 0.61961, 0.61808, 0.62099, 0.62894])
        self._test_marigold_intrinsics(
            is_fp16=False,
            device="cuda",
            generator_seed=0,
            expected_slice=reference,
            num_inference_steps=1,
            processing_resolution=768,
            ensemble_size=1,
            batch_size=1,
            match_input_resolution=True,
        )

    def test_marigold_intrinsics_einstein_f16_cuda_G0_S1_P768_E1_B1_M1(self):
        reference = np.array([0.62109, 0.61914, 0.61719, 0.61963, 0.61914, 0.61963, 0.61816, 0.62109, 0.62891])
        self._test_marigold_intrinsics(
            is_fp16=True,
            device="cuda",
            generator_seed=0,
            expected_slice=reference,
            num_inference_steps=1,
            processing_resolution=768,
            ensemble_size=1,
            batch_size=1,
            match_input_resolution=True,
        )

    def test_marigold_intrinsics_einstein_f16_cuda_G2024_S1_P768_E1_B1_M1(self):
        reference = np.array([0.64111, 0.63916, 0.63623, 0.63965, 0.63916, 0.63965, 0.6377, 0.64062, 0.64941])
        self._test_marigold_intrinsics(
            is_fp16=True,
            device="cuda",
            generator_seed=2024,
            expected_slice=reference,
            num_inference_steps=1,
            processing_resolution=768,
            ensemble_size=1,
            batch_size=1,
            match_input_resolution=True,
        )

    def test_marigold_intrinsics_einstein_f16_cuda_G0_S2_P768_E1_B1_M1(self):
        reference = np.array([0.60254, 0.60059, 0.59961, 0.60156, 0.60107, 0.60205, 0.60254, 0.60449, 0.61133])
        self._test_marigold_intrinsics(
            is_fp16=True,
            device="cuda",
            generator_seed=0,
            expected_slice=reference,
            num_inference_steps=2,
            processing_resolution=768,
            ensemble_size=1,
            batch_size=1,
            match_input_resolution=True,
        )

    def test_marigold_intrinsics_einstein_f16_cuda_G0_S1_P512_E1_B1_M1(self):
        reference = np.array([0.64551, 0.64453, 0.64404, 0.64502, 0.64844, 0.65039, 0.64502, 0.65039, 0.65332])
        self._test_marigold_intrinsics(
            is_fp16=True,
            device="cuda",
            generator_seed=0,
            expected_slice=reference,
            num_inference_steps=1,
            processing_resolution=512,
            ensemble_size=1,
            batch_size=1,
            match_input_resolution=True,
        )

    def test_marigold_intrinsics_einstein_f16_cuda_G0_S1_P768_E3_B1_M1(self):
        reference = np.array([0.61572, 0.61377, 0.61182, 0.61426, 0.61377, 0.61426, 0.61279, 0.61572, 0.62354])
        self._test_marigold_intrinsics(
            is_fp16=True,
            device="cuda",
            generator_seed=0,
            expected_slice=reference,
            num_inference_steps=1,
            processing_resolution=768,
            ensemble_size=3,
            ensembling_kwargs={"reduction": "mean"},
            batch_size=1,
            match_input_resolution=True,
        )

    def test_marigold_intrinsics_einstein_f16_cuda_G0_S1_P768_E4_B2_M1(self):
        reference = np.array([0.61914, 0.6167, 0.61475, 0.61719, 0.61719, 0.61768, 0.61572, 0.61914, 0.62695])
        self._test_marigold_intrinsics(
            is_fp16=True,
            device="cuda",
            generator_seed=0,
            expected_slice=reference,
            num_inference_steps=1,
            processing_resolution=768,
            ensemble_size=4,
            ensembling_kwargs={"reduction": "mean"},
            batch_size=2,
            match_input_resolution=True,
        )

    def test_marigold_intrinsics_einstein_f16_cuda_G0_S1_P512_E1_B1_M0(self):
        reference = np.array([0.65332, 0.64697, 0.64648, 0.64844, 0.64697, 0.64111, 0.64941, 0.64209, 0.65332])
        self._test_marigold_intrinsics(
            is_fp16=True,
            device="cuda",
            generator_seed=0,
            expected_slice=reference,
            num_inference_steps=1,
            processing_resolution=512,
            ensemble_size=1,
            batch_size=1,
            match_input_resolution=False,
        )
|
||||
@@ -1,5 +1,5 @@
|
||||
# Copyright 2023-2025 Marigold Team, ETH Zürich. All rights reserved.
|
||||
# Copyright 2024-2025 The HuggingFace Team. All rights reserved.
|
||||
# Copyright 2024 Marigold authors, PRS ETH Zurich. All rights reserved.
|
||||
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
@@ -14,7 +14,7 @@
|
||||
# limitations under the License.
|
||||
# --------------------------------------------------------------------------
|
||||
# More information and citation instructions are available on the
|
||||
# Marigold project website: https://marigoldcomputervision.github.io
|
||||
# Marigold project website: https://marigoldmonodepth.github.io
|
||||
# --------------------------------------------------------------------------
|
||||
import gc
|
||||
import random
|
||||
|
||||
@@ -98,9 +98,7 @@ class KolorsPAGPipelineFastTests(
|
||||
sample_size=128,
|
||||
)
|
||||
torch.manual_seed(0)
|
||||
text_encoder = ChatGLMModel.from_pretrained(
|
||||
"hf-internal-testing/tiny-random-chatglm3-6b", torch_dtype=torch.bfloat16
|
||||
)
|
||||
text_encoder = ChatGLMModel.from_pretrained("hf-internal-testing/tiny-random-chatglm3-6b")
|
||||
tokenizer = ChatGLMTokenizer.from_pretrained("hf-internal-testing/tiny-random-chatglm3-6b")
|
||||
|
||||
components = {
|
||||
|
||||
@@ -527,9 +527,7 @@ class FluxIPAdapterTesterMixin:
|
||||
|
||||
The following scenarios are tested:
|
||||
- Single IP-Adapter with scale=0 should produce same output as no IP-Adapter.
|
||||
- Multi IP-Adapter with scale=0 should produce same output as no IP-Adapter.
|
||||
- Single IP-Adapter with scale!=0 should produce different output compared to no IP-Adapter.
|
||||
- Multi IP-Adapter with scale!=0 should produce different output compared to no IP-Adapter.
|
||||
"""
|
||||
# Raising the tolerance for this test when it's run on a CPU because we
|
||||
# compare against static slices and that can be shaky (with a very, very low probability).
|
||||
@@ -547,7 +545,6 @@ class FluxIPAdapterTesterMixin:
|
||||
else:
|
||||
output_without_adapter = expected_pipe_slice
|
||||
|
||||
# 1. Single IP-Adapter test cases
|
||||
adapter_state_dict = create_flux_ip_adapter_state_dict(pipe.transformer)
|
||||
pipe.transformer._load_ip_adapter_weights(adapter_state_dict)
|
||||
|
||||
@@ -581,44 +578,6 @@ class FluxIPAdapterTesterMixin:
|
||||
max_diff_with_adapter_scale, 1e-2, "Output with ip-adapter must be different from normal inference"
|
||||
)
|
||||
|
||||
# 2. Multi IP-Adapter test cases
|
||||
adapter_state_dict_1 = create_flux_ip_adapter_state_dict(pipe.transformer)
|
||||
adapter_state_dict_2 = create_flux_ip_adapter_state_dict(pipe.transformer)
|
||||
pipe.transformer._load_ip_adapter_weights([adapter_state_dict_1, adapter_state_dict_2])
|
||||
|
||||
# forward pass with multi ip adapter, but scale=0 which should have no effect
|
||||
inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
|
||||
inputs["ip_adapter_image_embeds"] = [self._get_dummy_image_embeds(image_embed_dim)] * 2
|
||||
inputs["negative_ip_adapter_image_embeds"] = [self._get_dummy_image_embeds(image_embed_dim)] * 2
|
||||
pipe.set_ip_adapter_scale([0.0, 0.0])
|
||||
output_without_multi_adapter_scale = pipe(**inputs)[0]
|
||||
if expected_pipe_slice is not None:
|
||||
output_without_multi_adapter_scale = output_without_multi_adapter_scale[0, -3:, -3:, -1].flatten()
|
||||
|
||||
# forward pass with multi ip adapter, but with scale of adapter weights
|
||||
inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
|
||||
inputs["ip_adapter_image_embeds"] = [self._get_dummy_image_embeds(image_embed_dim)] * 2
|
||||
inputs["negative_ip_adapter_image_embeds"] = [self._get_dummy_image_embeds(image_embed_dim)] * 2
|
||||
pipe.set_ip_adapter_scale([42.0, 42.0])
|
||||
output_with_multi_adapter_scale = pipe(**inputs)[0]
|
||||
if expected_pipe_slice is not None:
|
||||
output_with_multi_adapter_scale = output_with_multi_adapter_scale[0, -3:, -3:, -1].flatten()
|
||||
|
||||
max_diff_without_multi_adapter_scale = np.abs(
|
||||
output_without_multi_adapter_scale - output_without_adapter
|
||||
).max()
|
||||
max_diff_with_multi_adapter_scale = np.abs(output_with_multi_adapter_scale - output_without_adapter).max()
|
||||
self.assertLess(
|
||||
max_diff_without_multi_adapter_scale,
|
||||
expected_max_diff,
|
||||
"Output without multi-ip-adapter must be same as normal inference",
|
||||
)
|
||||
self.assertGreater(
|
||||
max_diff_with_multi_adapter_scale,
|
||||
1e-2,
|
||||
"Output with multi-ip-adapter scale must be different from normal inference",
|
||||
)
|
||||
|
||||
|
||||
class PipelineLatentTesterMixin:
|
||||
"""
|
||||
|
||||
@@ -1,61 +0,0 @@
|
||||
import argparse
|
||||
import inspect
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import List, Type
|
||||
|
||||
|
||||
# Directory two levels above this script, prepended to sys.path.
# Presumably the repository root, so that the `tests.*` imports in the main section resolve.
root_dir = Path(__file__).parent.parent.absolute()
sys.path.insert(0, str(root_dir))

# Command line: a single optional `--type` string; it stays None when omitted.
# Note: arguments are parsed at import time, not only when the file runs as a script.
parser = argparse.ArgumentParser()
parser.add_argument("--type", default=None, type=str)
args = parser.parse_args()
|
||||
|
||||
|
||||
def get_test_methods_from_class(cls: Type) -> List[str]:
|
||||
"""
|
||||
Get all test method names from a given class.
|
||||
Only returns methods that start with 'test_'.
|
||||
"""
|
||||
test_methods = []
|
||||
for name, obj in inspect.getmembers(cls):
|
||||
if name.startswith("test_") and inspect.isfunction(obj):
|
||||
test_methods.append(name)
|
||||
return sorted(test_methods)
|
||||
|
||||
|
||||
def generate_pytest_pattern(test_methods: List[str]) -> str:
    """Join test names with " or " to form an expression for pytest's -k flag."""
    separator = " or "
    return separator.join(test_methods)
|
||||
|
||||
|
||||
def generate_pattern_for_mixin(mixin_class: Type) -> str:
    """
    Generate the pytest ``-k`` pattern for a specific mixin class.

    Args:
        mixin_class: The mixin class whose ``test_`` methods should be selected,
            or None when no mixin was chosen.

    Returns:
        The mixin's test method names joined by " or ", or an empty string when
        ``mixin_class`` is None.
    """
    # Check the parameter itself. The previous version read the module-level `mixin_cls`,
    # which only exists when the file runs as a script and ignores the value actually passed in.
    if mixin_class is None:
        return ""
    test_methods = get_test_methods_from_class(mixin_class)
    return generate_pytest_pattern(test_methods)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Resolve the requested mixin lazily so only the selected test module is imported.
    # The module-level name `mixin_cls` is kept as-is; it stays None for an unknown or missing --type.
    requested_type = args.type
    mixin_cls = None

    if requested_type == "pipeline":
        from tests.pipelines.test_pipelines_common import PipelineTesterMixin

        mixin_cls = PipelineTesterMixin
    elif requested_type == "models":
        from tests.models.test_modeling_common import ModelTesterMixin

        mixin_cls = ModelTesterMixin
    elif requested_type == "lora":
        from tests.lora.utils import PeftLoraLoaderMixinTests

        mixin_cls = PeftLoraLoaderMixinTests

    # Print the pattern for the chosen mixin.
    pattern = generate_pattern_for_mixin(mixin_cls)
    print(pattern)
|
||||
Reference in New Issue
Block a user