Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 02104ecb9f | |||
| 38dacfc6cd |
@@ -238,13 +238,12 @@ jobs:
|
||||
|
||||
run_flax_tpu_tests:
|
||||
name: Nightly Flax TPU Tests
|
||||
runs-on:
|
||||
group: gcp-ct5lp-hightpu-8t
|
||||
runs-on: docker-tpu
|
||||
if: github.event_name == 'schedule'
|
||||
|
||||
container:
|
||||
image: diffusers/diffusers-flax-tpu
|
||||
options: --shm-size "16gb" --ipc host --privileged ${{ vars.V5_LITEPOD_8_ENV}} -v /mnt/hf_cache:/mnt/hf_cache
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --privileged
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
@@ -348,64 +347,6 @@ jobs:
|
||||
pip install slack_sdk tabulate
|
||||
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
run_nightly_quantization_tests:
|
||||
name: Torch quantization nightly tests
|
||||
strategy:
|
||||
fail-fast: false
|
||||
max-parallel: 2
|
||||
matrix:
|
||||
config:
|
||||
- backend: "bitsandbytes"
|
||||
test_location: "bnb"
|
||||
runs-on:
|
||||
group: aws-g6e-xlarge-plus
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
options: --shm-size "20gb" --ipc host --gpus 0
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 2
|
||||
- name: NVIDIA-SMI
|
||||
run: nvidia-smi
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
python -m uv pip install -U ${{ matrix.config.backend }}
|
||||
python -m uv pip install pytest-reportlog
|
||||
- name: Environment
|
||||
run: |
|
||||
python utils/print_env.py
|
||||
- name: ${{ matrix.config.backend }} quantization tests on GPU
|
||||
env:
|
||||
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
|
||||
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
BIG_GPU_MEMORY: 40
|
||||
run: |
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
--make-reports=tests_${{ matrix.config.backend }}_torch_cuda \
|
||||
--report-log=tests_${{ matrix.config.backend }}_torch_cuda.log \
|
||||
tests/quantization/${{ matrix.config.test_location }}
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: |
|
||||
cat reports/tests_${{ matrix.config.backend }}_torch_cuda_stats.txt
|
||||
cat reports/tests_${{ matrix.config.backend }}_torch_cuda_failures_short.txt
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: torch_cuda_${{ matrix.config.backend }}_reports
|
||||
path: reports
|
||||
- name: Generate Report and Notify Channel
|
||||
if: always()
|
||||
run: |
|
||||
pip install slack_sdk tabulate
|
||||
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
# M1 runner currently not well supported
|
||||
# TODO: (Dhruv) add these back when we setup better testing for Apple Silicon
|
||||
# run_nightly_tests_apple_m1:
|
||||
@@ -520,4 +461,4 @@ jobs:
|
||||
# if: always()
|
||||
# run: |
|
||||
# pip install slack_sdk tabulate
|
||||
# python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
|
||||
# python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
|
||||
@@ -0,0 +1,134 @@
|
||||
name: Fast tests for PRs - PEFT backend
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- "src/diffusers/**.py"
|
||||
- "tests/**.py"
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
DIFFUSERS_IS_CI: yes
|
||||
OMP_NUM_THREADS: 4
|
||||
MKL_NUM_THREADS: 4
|
||||
PYTEST_TIMEOUT: 60
|
||||
|
||||
jobs:
|
||||
check_code_quality:
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.8"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install .[quality]
|
||||
- name: Check quality
|
||||
run: make quality
|
||||
- name: Check if failure
|
||||
if: ${{ failure() }}
|
||||
run: |
|
||||
echo "Quality check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make style && make quality'" >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
check_repository_consistency:
|
||||
needs: check_code_quality
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.8"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install .[quality]
|
||||
- name: Check repo consistency
|
||||
run: |
|
||||
python utils/check_copies.py
|
||||
python utils/check_dummies.py
|
||||
make deps_table_check_updated
|
||||
- name: Check if failure
|
||||
if: ${{ failure() }}
|
||||
run: |
|
||||
echo "Repo consistency check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make fix-copies'" >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
run_fast_tests:
|
||||
needs: [check_code_quality, check_repository_consistency]
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
lib-versions: ["main", "latest"]
|
||||
|
||||
|
||||
name: LoRA - ${{ matrix.lib-versions }}
|
||||
|
||||
runs-on:
|
||||
group: aws-general-8-plus
|
||||
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cpu
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
# TODO (sayakpaul, DN6): revisit `--no-deps`
|
||||
if [ "${{ matrix.lib-versions }}" == "main" ]; then
|
||||
python -m pip install -U peft@git+https://github.com/huggingface/peft.git --no-deps
|
||||
python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps
|
||||
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps
|
||||
else
|
||||
python -m uv pip install -U peft --no-deps
|
||||
python -m uv pip install -U transformers accelerate --no-deps
|
||||
fi
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python utils/print_env.py
|
||||
|
||||
- name: Run fast PyTorch LoRA CPU tests with PEFT backend
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v \
|
||||
--make-reports=tests_${{ matrix.lib-versions }} \
|
||||
tests/lora/
|
||||
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v \
|
||||
--make-reports=tests_models_lora_${{ matrix.lib-versions }} \
|
||||
tests/models/ -k "lora"
|
||||
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: |
|
||||
cat reports/tests_${{ matrix.lib-versions }}_failures_short.txt
|
||||
cat reports/tests_models_lora_${{ matrix.lib-versions }}_failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: pr_${{ matrix.lib-versions }}_test_reports
|
||||
path: reports
|
||||
@@ -234,67 +234,3 @@ jobs:
|
||||
with:
|
||||
name: pr_${{ matrix.config.report }}_test_reports
|
||||
path: reports
|
||||
|
||||
run_lora_tests:
|
||||
needs: [check_code_quality, check_repository_consistency]
|
||||
strategy:
|
||||
fail-fast: false
|
||||
|
||||
name: LoRA tests with PEFT main
|
||||
|
||||
runs-on:
|
||||
group: aws-general-8-plus
|
||||
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cpu
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
# TODO (sayakpaul, DN6): revisit `--no-deps`
|
||||
python -m pip install -U peft@git+https://github.com/huggingface/peft.git --no-deps
|
||||
python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps
|
||||
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python utils/print_env.py
|
||||
|
||||
- name: Run fast PyTorch LoRA tests with PEFT
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v \
|
||||
--make-reports=tests_peft_main \
|
||||
tests/lora/
|
||||
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v \
|
||||
--make-reports=tests_models_lora_peft_main \
|
||||
tests/models/ -k "lora"
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: |
|
||||
cat reports/tests_lora_failures_short.txt
|
||||
cat reports/tests_models_lora_failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: pr_main_test_reports
|
||||
path: reports
|
||||
|
||||
|
||||
@@ -161,11 +161,11 @@ jobs:
|
||||
|
||||
flax_tpu_tests:
|
||||
name: Flax TPU Tests
|
||||
runs-on:
|
||||
group: gcp-ct5lp-hightpu-8t
|
||||
runs-on: docker-tpu
|
||||
container:
|
||||
image: diffusers/diffusers-flax-tpu
|
||||
options: --shm-size "16gb" --ipc host --privileged ${{ vars.V5_LITEPOD_8_ENV}} -v /mnt/hf_cache:/mnt/hf_cache defaults:
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --privileged
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
steps:
|
||||
|
||||
@@ -112,9 +112,9 @@ Check out the [Quickstart](https://huggingface.co/docs/diffusers/quicktour) to l
|
||||
| **Documentation** | **What can I learn?** |
|
||||
|---------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| [Tutorial](https://huggingface.co/docs/diffusers/tutorials/tutorial_overview) | A basic crash course for learning how to use the library's most important features like using models and schedulers to build your own diffusion system, and training your own diffusion model. |
|
||||
| [Loading](https://huggingface.co/docs/diffusers/using-diffusers/loading) | Guides for how to load and configure all the components (pipelines, models, and schedulers) of the library, as well as how to use different schedulers. |
|
||||
| [Pipelines for inference](https://huggingface.co/docs/diffusers/using-diffusers/overview_techniques) | Guides for how to use pipelines for different inference tasks, batched generation, controlling generated outputs and randomness, and how to contribute a pipeline to the library. |
|
||||
| [Optimization](https://huggingface.co/docs/diffusers/optimization/fp16) | Guides for how to optimize your diffusion model to run faster and consume less memory. |
|
||||
| [Loading](https://huggingface.co/docs/diffusers/using-diffusers/loading_overview) | Guides for how to load and configure all the components (pipelines, models, and schedulers) of the library, as well as how to use different schedulers. |
|
||||
| [Pipelines for inference](https://huggingface.co/docs/diffusers/using-diffusers/pipeline_overview) | Guides for how to use pipelines for different inference tasks, batched generation, controlling generated outputs and randomness, and how to contribute a pipeline to the library. |
|
||||
| [Optimization](https://huggingface.co/docs/diffusers/optimization/opt_overview) | Guides for how to optimize your diffusion model to run faster and consume less memory. |
|
||||
| [Training](https://huggingface.co/docs/diffusers/training/overview) | Guides for how to train a diffusion model for different tasks with different training techniques. |
|
||||
## Contribution
|
||||
|
||||
|
||||
@@ -252,8 +252,6 @@
|
||||
title: SD3ControlNetModel
|
||||
- local: api/models/controlnet_sparsectrl
|
||||
title: SparseControlNetModel
|
||||
- local: api/models/controlnet_union
|
||||
title: ControlNetUnionModel
|
||||
title: ControlNets
|
||||
- sections:
|
||||
- local: api/models/allegro_transformer3d
|
||||
@@ -274,8 +272,6 @@
|
||||
title: LatteTransformer3DModel
|
||||
- local: api/models/lumina_nextdit2d
|
||||
title: LuminaNextDiT2DModel
|
||||
- local: api/models/ltx_video_transformer3d
|
||||
title: LTXVideoTransformer3DModel
|
||||
- local: api/models/mochi_transformer3d
|
||||
title: MochiTransformer3DModel
|
||||
- local: api/models/pixart_transformer2d
|
||||
@@ -284,8 +280,6 @@
|
||||
title: PriorTransformer
|
||||
- local: api/models/sd3_transformer2d
|
||||
title: SD3Transformer2DModel
|
||||
- local: api/models/sana_transformer2d
|
||||
title: SanaTransformer2DModel
|
||||
- local: api/models/stable_audio_transformer
|
||||
title: StableAudioDiTModel
|
||||
- local: api/models/transformer2d
|
||||
@@ -316,14 +310,10 @@
|
||||
title: AutoencoderKLAllegro
|
||||
- local: api/models/autoencoderkl_cogvideox
|
||||
title: AutoencoderKLCogVideoX
|
||||
- local: api/models/autoencoderkl_ltx_video
|
||||
title: AutoencoderKLLTXVideo
|
||||
- local: api/models/autoencoderkl_mochi
|
||||
title: AutoencoderKLMochi
|
||||
- local: api/models/asymmetricautoencoderkl
|
||||
title: AsymmetricAutoencoderKL
|
||||
- local: api/models/autoencoder_dc
|
||||
title: AutoencoderDC
|
||||
- local: api/models/consistency_decoder_vae
|
||||
title: ConsistencyDecoderVAE
|
||||
- local: api/models/autoencoder_oobleck
|
||||
@@ -376,8 +366,6 @@
|
||||
title: ControlNet-XS
|
||||
- local: api/pipelines/controlnetxs_sdxl
|
||||
title: ControlNet-XS with Stable Diffusion XL
|
||||
- local: api/pipelines/controlnet_union
|
||||
title: ControlNetUnion
|
||||
- local: api/pipelines/dance_diffusion
|
||||
title: Dance Diffusion
|
||||
- local: api/pipelines/ddim
|
||||
@@ -414,8 +402,6 @@
|
||||
title: Latte
|
||||
- local: api/pipelines/ledits_pp
|
||||
title: LEDITS++
|
||||
- local: api/pipelines/ltx_video
|
||||
title: LTX
|
||||
- local: api/pipelines/lumina
|
||||
title: Lumina-T2X
|
||||
- local: api/pipelines/marigold
|
||||
@@ -436,8 +422,6 @@
|
||||
title: PixArt-α
|
||||
- local: api/pipelines/pixart_sigma
|
||||
title: PixArt-Σ
|
||||
- local: api/pipelines/sana
|
||||
title: Sana
|
||||
- local: api/pipelines/self_attention_guidance
|
||||
title: Self-Attention Guidance
|
||||
- local: api/pipelines/semantic_stable_diffusion
|
||||
|
||||
@@ -17,9 +17,6 @@ LoRA is a fast and lightweight training method that inserts and trains a signifi
|
||||
- [`StableDiffusionLoraLoaderMixin`] provides functions for loading and unloading, fusing and unfusing, enabling and disabling, and more functions for managing LoRA weights. This class can be used with any model.
|
||||
- [`StableDiffusionXLLoraLoaderMixin`] is a [Stable Diffusion (SDXL)](../../api/pipelines/stable_diffusion/stable_diffusion_xl) version of the [`StableDiffusionLoraLoaderMixin`] class for loading and saving LoRA weights. It can only be used with the SDXL model.
|
||||
- [`SD3LoraLoaderMixin`] provides similar functions for [Stable Diffusion 3](https://huggingface.co/blog/sd3).
|
||||
- [`FluxLoraLoaderMixin`] provides similar functions for [Flux](https://huggingface.co/docs/diffusers/main/en/api/pipelines/flux).
|
||||
- [`CogVideoXLoraLoaderMixin`] provides similar functions for [CogVideoX](https://huggingface.co/docs/diffusers/main/en/api/pipelines/cogvideox).
|
||||
- [`Mochi1LoraLoaderMixin`] provides similar functions for [Mochi](https://huggingface.co/docs/diffusers/main/en/api/pipelines/mochi).
|
||||
- [`AmusedLoraLoaderMixin`] is for the [`AmusedPipeline`].
|
||||
- [`LoraBaseMixin`] provides a base class with several utility methods to fuse, unfuse, unload, LoRAs and more.
|
||||
|
||||
@@ -41,18 +38,6 @@ To learn more about how to load LoRA weights, see the [LoRA](../../using-diffuse
|
||||
|
||||
[[autodoc]] loaders.lora_pipeline.SD3LoraLoaderMixin
|
||||
|
||||
## FluxLoraLoaderMixin
|
||||
|
||||
[[autodoc]] loaders.lora_pipeline.FluxLoraLoaderMixin
|
||||
|
||||
## CogVideoXLoraLoaderMixin
|
||||
|
||||
[[autodoc]] loaders.lora_pipeline.CogVideoXLoraLoaderMixin
|
||||
|
||||
## Mochi1LoraLoaderMixin
|
||||
|
||||
[[autodoc]] loaders.lora_pipeline.Mochi1LoraLoaderMixin
|
||||
|
||||
## AmusedLoraLoaderMixin
|
||||
|
||||
[[autodoc]] loaders.lora_pipeline.AmusedLoraLoaderMixin
|
||||
|
||||
@@ -1,70 +0,0 @@
|
||||
<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License. -->
|
||||
|
||||
# AutoencoderDC
|
||||
|
||||
The 2D Autoencoder model used in [SANA](https://huggingface.co/papers/2410.10629) and introduced in [DCAE](https://huggingface.co/papers/2410.10733) by authors Junyu Chen\*, Han Cai\*, Junsong Chen, Enze Xie, Shang Yang, Haotian Tang, Muyang Li, Yao Lu, Song Han from MIT HAN Lab.
|
||||
|
||||
The abstract from the paper is:
|
||||
|
||||
*We present Deep Compression Autoencoder (DC-AE), a new family of autoencoder models for accelerating high-resolution diffusion models. Existing autoencoder models have demonstrated impressive results at a moderate spatial compression ratio (e.g., 8x), but fail to maintain satisfactory reconstruction accuracy for high spatial compression ratios (e.g., 64x). We address this challenge by introducing two key techniques: (1) Residual Autoencoding, where we design our models to learn residuals based on the space-to-channel transformed features to alleviate the optimization difficulty of high spatial-compression autoencoders; (2) Decoupled High-Resolution Adaptation, an efficient decoupled three-phases training strategy for mitigating the generalization penalty of high spatial-compression autoencoders. With these designs, we improve the autoencoder's spatial compression ratio up to 128 while maintaining the reconstruction quality. Applying our DC-AE to latent diffusion models, we achieve significant speedup without accuracy drop. For example, on ImageNet 512x512, our DC-AE provides 19.1x inference speedup and 17.9x training speedup on H100 GPU for UViT-H while achieving a better FID, compared with the widely used SD-VAE-f8 autoencoder. Our code is available at [this https URL](https://github.com/mit-han-lab/efficientvit).*
|
||||
|
||||
The following DCAE models are released and supported in Diffusers.
|
||||
|
||||
| Diffusers format | Original format |
|
||||
|:----------------:|:---------------:|
|
||||
| [`mit-han-lab/dc-ae-f32c32-sana-1.0-diffusers`](https://huggingface.co/mit-han-lab/dc-ae-f32c32-sana-1.0-diffusers) | [`mit-han-lab/dc-ae-f32c32-sana-1.0`](https://huggingface.co/mit-han-lab/dc-ae-f32c32-sana-1.0)
|
||||
| [`mit-han-lab/dc-ae-f32c32-in-1.0-diffusers`](https://huggingface.co/mit-han-lab/dc-ae-f32c32-in-1.0-diffusers) | [`mit-han-lab/dc-ae-f32c32-in-1.0`](https://huggingface.co/mit-han-lab/dc-ae-f32c32-in-1.0)
|
||||
| [`mit-han-lab/dc-ae-f32c32-mix-1.0-diffusers`](https://huggingface.co/mit-han-lab/dc-ae-f32c32-mix-1.0-diffusers) | [`mit-han-lab/dc-ae-f32c32-mix-1.0`](https://huggingface.co/mit-han-lab/dc-ae-f32c32-mix-1.0)
|
||||
| [`mit-han-lab/dc-ae-f64c128-in-1.0-diffusers`](https://huggingface.co/mit-han-lab/dc-ae-f64c128-in-1.0-diffusers) | [`mit-han-lab/dc-ae-f64c128-in-1.0`](https://huggingface.co/mit-han-lab/dc-ae-f64c128-in-1.0)
|
||||
| [`mit-han-lab/dc-ae-f64c128-mix-1.0-diffusers`](https://huggingface.co/mit-han-lab/dc-ae-f64c128-mix-1.0-diffusers) | [`mit-han-lab/dc-ae-f64c128-mix-1.0`](https://huggingface.co/mit-han-lab/dc-ae-f64c128-mix-1.0)
|
||||
| [`mit-han-lab/dc-ae-f128c512-in-1.0-diffusers`](https://huggingface.co/mit-han-lab/dc-ae-f128c512-in-1.0-diffusers) | [`mit-han-lab/dc-ae-f128c512-in-1.0`](https://huggingface.co/mit-han-lab/dc-ae-f128c512-in-1.0)
|
||||
| [`mit-han-lab/dc-ae-f128c512-mix-1.0-diffusers`](https://huggingface.co/mit-han-lab/dc-ae-f128c512-mix-1.0-diffusers) | [`mit-han-lab/dc-ae-f128c512-mix-1.0`](https://huggingface.co/mit-han-lab/dc-ae-f128c512-mix-1.0)
|
||||
|
||||
Load a model in Diffusers format with [`~ModelMixin.from_pretrained`].
|
||||
|
||||
```python
|
||||
from diffusers import AutoencoderDC
|
||||
|
||||
ae = AutoencoderDC.from_pretrained("mit-han-lab/dc-ae-f32c32-sana-1.0-diffusers", torch_dtype=torch.float32).to("cuda")
|
||||
```
|
||||
|
||||
## Load a model in Diffusers via `from_single_file`
|
||||
|
||||
```python
|
||||
from difusers import AutoencoderDC
|
||||
|
||||
ckpt_path = "https://huggingface.co/mit-han-lab/dc-ae-f32c32-sana-1.0/blob/main/model.safetensors"
|
||||
model = AutoencoderDC.from_single_file(ckpt_path)
|
||||
|
||||
```
|
||||
|
||||
The `AutoencoderDC` model has `in` and `mix` single file checkpoint variants that have matching checkpoint keys, but use different scaling factors. It is not possible for Diffusers to automatically infer the correct config file to use with the model based on just the checkpoint and will default to configuring the model using the `mix` variant config file. To override the automatically determined config, please use the `config` argument when using single file loading with `in` variant checkpoints.
|
||||
|
||||
```python
|
||||
from diffusers import AutoencoderDC
|
||||
|
||||
ckpt_path = "https://huggingface.co/mit-han-lab/dc-ae-f128c512-in-1.0/blob/main/model.safetensors"
|
||||
model = AutoencoderDC.from_single_file(ckpt_path, config="mit-han-lab/dc-ae-f128c512-in-1.0-diffusers")
|
||||
```
|
||||
|
||||
|
||||
## AutoencoderDC
|
||||
|
||||
[[autodoc]] AutoencoderDC
|
||||
- encode
|
||||
- decode
|
||||
- all
|
||||
|
||||
## DecoderOutput
|
||||
|
||||
[[autodoc]] models.autoencoders.vae.DecoderOutput
|
||||
|
||||
@@ -1,37 +0,0 @@
|
||||
<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License. -->
|
||||
|
||||
# AutoencoderKLLTXVideo
|
||||
|
||||
The 3D variational autoencoder (VAE) model with KL loss used in [LTX](https://huggingface.co/Lightricks/LTX-Video) was introduced by Lightricks.
|
||||
|
||||
The model can be loaded with the following code snippet.
|
||||
|
||||
```python
|
||||
from diffusers import AutoencoderKLLTXVideo
|
||||
|
||||
vae = AutoencoderKLLTXVideo.from_pretrained("TODO/TODO", subfolder="vae", torch_dtype=torch.float32).to("cuda")
|
||||
```
|
||||
|
||||
## AutoencoderKLLTXVideo
|
||||
|
||||
[[autodoc]] AutoencoderKLLTXVideo
|
||||
- decode
|
||||
- encode
|
||||
- all
|
||||
|
||||
## AutoencoderKLOutput
|
||||
|
||||
[[autodoc]] models.autoencoders.autoencoder_kl.AutoencoderKLOutput
|
||||
|
||||
## DecoderOutput
|
||||
|
||||
[[autodoc]] models.autoencoders.vae.DecoderOutput
|
||||
@@ -1,35 +0,0 @@
|
||||
<!--Copyright 2024 The HuggingFace Team and The InstantX Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# ControlNetUnionModel
|
||||
|
||||
ControlNetUnionModel is an implementation of ControlNet for Stable Diffusion XL.
|
||||
|
||||
The ControlNet model was introduced in [ControlNetPlus](https://github.com/xinsir6/ControlNetPlus) by xinsir6. It supports multiple conditioning inputs without increasing computation.
|
||||
|
||||
*We design a new architecture that can support 10+ control types in condition text-to-image generation and can generate high resolution images visually comparable with midjourney. The network is based on the original ControlNet architecture, we propose two new modules to: 1 Extend the original ControlNet to support different image conditions using the same network parameter. 2 Support multiple conditions input without increasing computation offload, which is especially important for designers who want to edit image in detail, different conditions use the same condition encoder, without adding extra computations or parameters.*
|
||||
|
||||
## Loading
|
||||
|
||||
By default the [`ControlNetUnionModel`] should be loaded with [`~ModelMixin.from_pretrained`].
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionXLControlNetUnionPipeline, ControlNetUnionModel
|
||||
|
||||
controlnet = ControlNetUnionModel.from_pretrained("xinsir/controlnet-union-sdxl-1.0")
|
||||
pipe = StableDiffusionXLControlNetUnionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet)
|
||||
```
|
||||
|
||||
## ControlNetUnionModel
|
||||
|
||||
[[autodoc]] ControlNetUnionModel
|
||||
|
||||
@@ -1,30 +0,0 @@
|
||||
<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License. -->
|
||||
|
||||
# LTXVideoTransformer3DModel
|
||||
|
||||
A Diffusion Transformer model for 3D data from [LTX](https://huggingface.co/Lightricks/LTX-Video) was introduced by Lightricks.
|
||||
|
||||
The model can be loaded with the following code snippet.
|
||||
|
||||
```python
|
||||
from diffusers import LTXVideoTransformer3DModel
|
||||
|
||||
transformer = LTXVideoTransformer3DModel.from_pretrained("TODO/TODO", subfolder="transformer", torch_dtype=torch.bfloat16).to("cuda")
|
||||
```
|
||||
|
||||
## LTXVideoTransformer3DModel
|
||||
|
||||
[[autodoc]] LTXVideoTransformer3DModel
|
||||
|
||||
## Transformer2DModelOutput
|
||||
|
||||
[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
|
||||
@@ -1,34 +0,0 @@
|
||||
<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License. -->
|
||||
|
||||
# SanaTransformer2DModel
|
||||
|
||||
A Diffusion Transformer model for 2D data from [SANA: Efficient High-Resolution Image Synthesis with Linear Diffusion Transformers](https://huggingface.co/papers/2410.10629) was introduced from NVIDIA and MIT HAN Lab, by Enze Xie, Junsong Chen, Junyu Chen, Han Cai, Haotian Tang, Yujun Lin, Zhekai Zhang, Muyang Li, Ligeng Zhu, Yao Lu, Song Han.
|
||||
|
||||
The abstract from the paper is:
|
||||
|
||||
*We introduce Sana, a text-to-image framework that can efficiently generate images up to 4096×4096 resolution. Sana can synthesize high-resolution, high-quality images with strong text-image alignment at a remarkably fast speed, deployable on laptop GPU. Core designs include: (1) Deep compression autoencoder: unlike traditional AEs, which compress images only 8×, we trained an AE that can compress images 32×, effectively reducing the number of latent tokens. (2) Linear DiT: we replace all vanilla attention in DiT with linear attention, which is more efficient at high resolutions without sacrificing quality. (3) Decoder-only text encoder: we replaced T5 with modern decoder-only small LLM as the text encoder and designed complex human instruction with in-context learning to enhance the image-text alignment. (4) Efficient training and sampling: we propose Flow-DPM-Solver to reduce sampling steps, with efficient caption labeling and selection to accelerate convergence. As a result, Sana-0.6B is very competitive with modern giant diffusion model (e.g. Flux-12B), being 20 times smaller and 100+ times faster in measured throughput. Moreover, Sana-0.6B can be deployed on a 16GB laptop GPU, taking less than 1 second to generate a 1024×1024 resolution image. Sana enables content creation at low cost. Code and model will be publicly released.*
|
||||
|
||||
The model can be loaded with the following code snippet.
|
||||
|
||||
```python
|
||||
from diffusers import SanaTransformer2DModel
|
||||
|
||||
transformer = SanaTransformer2DModel.from_pretrained("Efficient-Large-Model/Sana_1600M_1024px_diffusers", subfolder="transformer", torch_dtype=torch.float16)
|
||||
```
|
||||
|
||||
## SanaTransformer2DModel
|
||||
|
||||
[[autodoc]] SanaTransformer2DModel
|
||||
|
||||
## Transformer2DModelOutput
|
||||
|
||||
[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
|
||||
@@ -30,17 +30,15 @@ Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.m
|
||||
This pipeline was contributed by [zRzRzRzRzRzRzR](https://github.com/zRzRzRzRzRzRzR). The original codebase can be found [here](https://huggingface.co/THUDM). The original weights can be found under [hf.co/THUDM](https://huggingface.co/THUDM).
|
||||
|
||||
There are three official CogVideoX checkpoints for text-to-video and video-to-video.
|
||||
|
||||
| checkpoints | recommended inference dtype |
|
||||
|:---:|:---:|
|
||||
|---|---|
|
||||
| [`THUDM/CogVideoX-2b`](https://huggingface.co/THUDM/CogVideoX-2b) | torch.float16 |
|
||||
| [`THUDM/CogVideoX-5b`](https://huggingface.co/THUDM/CogVideoX-5b) | torch.bfloat16 |
|
||||
| [`THUDM/CogVideoX1.5-5b`](https://huggingface.co/THUDM/CogVideoX1.5-5b) | torch.bfloat16 |
|
||||
|
||||
There are two official CogVideoX checkpoints available for image-to-video.
|
||||
|
||||
| checkpoints | recommended inference dtype |
|
||||
|:---:|:---:|
|
||||
|---|---|
|
||||
| [`THUDM/CogVideoX-5b-I2V`](https://huggingface.co/THUDM/CogVideoX-5b-I2V) | torch.bfloat16 |
|
||||
| [`THUDM/CogVideoX-1.5-5b-I2V`](https://huggingface.co/THUDM/CogVideoX-1.5-5b-I2V) | torch.bfloat16 |
|
||||
|
||||
@@ -50,9 +48,8 @@ For the CogVideoX 1.5 series:
|
||||
- Both T2V and I2V models support generation with 81 and 161 frames and work best at this value. Exporting videos at 16 FPS is recommended.
|
||||
|
||||
There are two official CogVideoX checkpoints that support pose controllable generation (by the [Alibaba-PAI](https://huggingface.co/alibaba-pai) team).
|
||||
|
||||
| checkpoints | recommended inference dtype |
|
||||
|:---:|:---:|
|
||||
|---|---|
|
||||
| [`alibaba-pai/CogVideoX-Fun-V1.1-2b-Pose`](https://huggingface.co/alibaba-pai/CogVideoX-Fun-V1.1-2b-Pose) | torch.bfloat16 |
|
||||
| [`alibaba-pai/CogVideoX-Fun-V1.1-5b-Pose`](https://huggingface.co/alibaba-pai/CogVideoX-Fun-V1.1-5b-Pose) | torch.bfloat16 |
|
||||
|
||||
|
||||
@@ -1,35 +0,0 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# ControlNetUnion
|
||||
|
||||
ControlNetUnionModel is an implementation of ControlNet for Stable Diffusion XL.
|
||||
|
||||
The ControlNet model was introduced in [ControlNetPlus](https://github.com/xinsir6/ControlNetPlus) by xinsir6. It supports multiple conditioning inputs without increasing computation.
|
||||
|
||||
*We design a new architecture that can support 10+ control types in condition text-to-image generation and can generate high resolution images visually comparable with midjourney. The network is based on the original ControlNet architecture, we propose two new modules to: 1 Extend the original ControlNet to support different image conditions using the same network parameter. 2 Support multiple conditions input without increasing computation offload, which is especially important for designers who want to edit image in detail, different conditions use the same condition encoder, without adding extra computations or parameters.*
|
||||
|
||||
|
||||
## StableDiffusionXLControlNetUnionPipeline
|
||||
[[autodoc]] StableDiffusionXLControlNetUnionPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## StableDiffusionXLControlNetUnionImg2ImgPipeline
|
||||
[[autodoc]] StableDiffusionXLControlNetUnionImg2ImgPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## StableDiffusionXLControlNetUnionInpaintPipeline
|
||||
[[autodoc]] StableDiffusionXLControlNetUnionInpaintPipeline
|
||||
- all
|
||||
- __call__
|
||||
@@ -22,20 +22,12 @@ Flux can be quite expensive to run on consumer hardware devices. However, you ca
|
||||
|
||||
</Tip>
|
||||
|
||||
Flux comes in the following variants:
|
||||
Flux comes in two variants:
|
||||
|
||||
| model type | model id |
|
||||
|:----------:|:--------:|
|
||||
| Timestep-distilled | [`black-forest-labs/FLUX.1-schnell`](https://huggingface.co/black-forest-labs/FLUX.1-schnell) |
|
||||
| Guidance-distilled | [`black-forest-labs/FLUX.1-dev`](https://huggingface.co/black-forest-labs/FLUX.1-dev) |
|
||||
| Fill Inpainting/Outpainting (Guidance-distilled) | [`black-forest-labs/FLUX.1-Fill-dev`](https://huggingface.co/black-forest-labs/FLUX.1-Fill-dev) |
|
||||
| Canny Control (Guidance-distilled) | [`black-forest-labs/FLUX.1-Canny-dev`](https://huggingface.co/black-forest-labs/FLUX.1-Canny-dev) |
|
||||
| Depth Control (Guidance-distilled) | [`black-forest-labs/FLUX.1-Depth-dev`](https://huggingface.co/black-forest-labs/FLUX.1-Depth-dev) |
|
||||
| Canny Control (LoRA) | [`black-forest-labs/FLUX.1-Canny-dev-lora`](https://huggingface.co/black-forest-labs/FLUX.1-Canny-dev-lora) |
|
||||
| Depth Control (LoRA) | [`black-forest-labs/FLUX.1-Depth-dev-lora`](https://huggingface.co/black-forest-labs/FLUX.1-Depth-dev-lora) |
|
||||
| Redux (Adapter) | [`black-forest-labs/FLUX.1-Redux-dev`](https://huggingface.co/black-forest-labs/FLUX.1-Redux-dev) |
|
||||
* Timestep-distilled (`black-forest-labs/FLUX.1-schnell`)
|
||||
* Guidance-distilled (`black-forest-labs/FLUX.1-dev`)
|
||||
|
||||
All checkpoints have different usage which we detail below.
|
||||
Both checkpoints have slightly difference usage which we detail below.
|
||||
|
||||
### Timestep-distilled
|
||||
|
||||
@@ -85,191 +77,7 @@ out = pipe(
|
||||
out.save("image.png")
|
||||
```
|
||||
|
||||
### Fill Inpainting/Outpainting
|
||||
|
||||
* Flux Fill pipeline does not require `strength` as an input like regular inpainting pipelines.
|
||||
* It supports both inpainting and outpainting.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import FluxFillPipeline
|
||||
from diffusers.utils import load_image
|
||||
|
||||
image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/cup.png")
|
||||
mask = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/cup_mask.png")
|
||||
|
||||
repo_id = "black-forest-labs/FLUX.1-Fill-dev"
|
||||
pipe = FluxFillPipeline.from_pretrained(repo_id, torch_dtype=torch.bfloat16).to("cuda")
|
||||
|
||||
image = pipe(
|
||||
prompt="a white paper cup",
|
||||
image=image,
|
||||
mask_image=mask,
|
||||
height=1632,
|
||||
width=1232,
|
||||
max_sequence_length=512,
|
||||
generator=torch.Generator("cpu").manual_seed(0)
|
||||
).images[0]
|
||||
image.save(f"output.png")
|
||||
```
|
||||
|
||||
### Canny Control
|
||||
|
||||
**Note:** `black-forest-labs/Flux.1-Canny-dev` is _not_ a [`ControlNetModel`] model. ControlNet models are a separate component from the UNet/Transformer whose residuals are added to the actual underlying model. Canny Control is an alternate architecture that achieves effectively the same results as a ControlNet model would, by using channel-wise concatenation with input control condition and ensuring the transformer learns structure control by following the condition as closely as possible.
|
||||
|
||||
```python
|
||||
# !pip install -U controlnet-aux
|
||||
import torch
|
||||
from controlnet_aux import CannyDetector
|
||||
from diffusers import FluxControlPipeline
|
||||
from diffusers.utils import load_image
|
||||
|
||||
pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-Canny-dev", torch_dtype=torch.bfloat16).to("cuda")
|
||||
|
||||
prompt = "A robot made of exotic candies and chocolates of different kinds. The background is filled with confetti and celebratory gifts."
|
||||
control_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/robot.png")
|
||||
|
||||
processor = CannyDetector()
|
||||
control_image = processor(control_image, low_threshold=50, high_threshold=200, detect_resolution=1024, image_resolution=1024)
|
||||
|
||||
image = pipe(
|
||||
prompt=prompt,
|
||||
control_image=control_image,
|
||||
height=1024,
|
||||
width=1024,
|
||||
num_inference_steps=50,
|
||||
guidance_scale=30.0,
|
||||
).images[0]
|
||||
image.save("output.png")
|
||||
```
|
||||
|
||||
Canny Control is also possible with a LoRA variant of this condition. The usage is as follows:
|
||||
|
||||
```python
|
||||
# !pip install -U controlnet-aux
|
||||
import torch
|
||||
from controlnet_aux import CannyDetector
|
||||
from diffusers import FluxControlPipeline
|
||||
from diffusers.utils import load_image
|
||||
|
||||
pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16).to("cuda")
|
||||
pipe.load_lora_weights("black-forest-labs/FLUX.1-Canny-dev-lora")
|
||||
|
||||
prompt = "A robot made of exotic candies and chocolates of different kinds. The background is filled with confetti and celebratory gifts."
|
||||
control_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/robot.png")
|
||||
|
||||
processor = CannyDetector()
|
||||
control_image = processor(control_image, low_threshold=50, high_threshold=200, detect_resolution=1024, image_resolution=1024)
|
||||
|
||||
image = pipe(
|
||||
prompt=prompt,
|
||||
control_image=control_image,
|
||||
height=1024,
|
||||
width=1024,
|
||||
num_inference_steps=50,
|
||||
guidance_scale=30.0,
|
||||
).images[0]
|
||||
image.save("output.png")
|
||||
```
|
||||
|
||||
### Depth Control
|
||||
|
||||
**Note:** `black-forest-labs/Flux.1-Depth-dev` is _not_ a ControlNet model. [`ControlNetModel`] models are a separate component from the UNet/Transformer whose residuals are added to the actual underlying model. Depth Control is an alternate architecture that achieves effectively the same results as a ControlNet model would, by using channel-wise concatenation with input control condition and ensuring the transformer learns structure control by following the condition as closely as possible.
|
||||
|
||||
```python
|
||||
# !pip install git+https://github.com/huggingface/image_gen_aux
|
||||
import torch
|
||||
from diffusers import FluxControlPipeline, FluxTransformer2DModel
|
||||
from diffusers.utils import load_image
|
||||
from image_gen_aux import DepthPreprocessor
|
||||
|
||||
pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-Depth-dev", torch_dtype=torch.bfloat16).to("cuda")
|
||||
|
||||
prompt = "A robot made of exotic candies and chocolates of different kinds. The background is filled with confetti and celebratory gifts."
|
||||
control_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/robot.png")
|
||||
|
||||
processor = DepthPreprocessor.from_pretrained("LiheYoung/depth-anything-large-hf")
|
||||
control_image = processor(control_image)[0].convert("RGB")
|
||||
|
||||
image = pipe(
|
||||
prompt=prompt,
|
||||
control_image=control_image,
|
||||
height=1024,
|
||||
width=1024,
|
||||
num_inference_steps=30,
|
||||
guidance_scale=10.0,
|
||||
generator=torch.Generator().manual_seed(42),
|
||||
).images[0]
|
||||
image.save("output.png")
|
||||
```
|
||||
|
||||
Depth Control is also possible with a LoRA variant of this condition. The usage is as follows:
|
||||
|
||||
```python
|
||||
# !pip install git+https://github.com/huggingface/image_gen_aux
|
||||
import torch
|
||||
from diffusers import FluxControlPipeline, FluxTransformer2DModel
|
||||
from diffusers.utils import load_image
|
||||
from image_gen_aux import DepthPreprocessor
|
||||
|
||||
pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16).to("cuda")
|
||||
pipe.load_lora_weights("black-forest-labs/FLUX.1-Depth-dev-lora")
|
||||
|
||||
prompt = "A robot made of exotic candies and chocolates of different kinds. The background is filled with confetti and celebratory gifts."
|
||||
control_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/robot.png")
|
||||
|
||||
processor = DepthPreprocessor.from_pretrained("LiheYoung/depth-anything-large-hf")
|
||||
control_image = processor(control_image)[0].convert("RGB")
|
||||
|
||||
image = pipe(
|
||||
prompt=prompt,
|
||||
control_image=control_image,
|
||||
height=1024,
|
||||
width=1024,
|
||||
num_inference_steps=30,
|
||||
guidance_scale=10.0,
|
||||
generator=torch.Generator().manual_seed(42),
|
||||
).images[0]
|
||||
image.save("output.png")
|
||||
```
|
||||
|
||||
### Redux
|
||||
|
||||
* Flux Redux pipeline is an adapter for FLUX.1 base models. It can be used with both flux-dev and flux-schnell, for image-to-image generation.
|
||||
* You can first use the `FluxPriorReduxPipeline` to get the `prompt_embeds` and `pooled_prompt_embeds`, and then feed them into the `FluxPipeline` for image-to-image generation.
|
||||
* When use `FluxPriorReduxPipeline` with a base pipeline, you can set `text_encoder=None` and `text_encoder_2=None` in the base pipeline, in order to save VRAM.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import FluxPriorReduxPipeline, FluxPipeline
|
||||
from diffusers.utils import load_image
|
||||
device = "cuda"
|
||||
dtype = torch.bfloat16
|
||||
|
||||
|
||||
repo_redux = "black-forest-labs/FLUX.1-Redux-dev"
|
||||
repo_base = "black-forest-labs/FLUX.1-dev"
|
||||
pipe_prior_redux = FluxPriorReduxPipeline.from_pretrained(repo_redux, torch_dtype=dtype).to(device)
|
||||
pipe = FluxPipeline.from_pretrained(
|
||||
repo_base,
|
||||
text_encoder=None,
|
||||
text_encoder_2=None,
|
||||
torch_dtype=torch.bfloat16
|
||||
).to(device)
|
||||
|
||||
image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/style_ziggy/img5.png")
|
||||
pipe_prior_output = pipe_prior_redux(image)
|
||||
images = pipe(
|
||||
guidance_scale=2.5,
|
||||
num_inference_steps=50,
|
||||
generator=torch.Generator("cpu").manual_seed(0),
|
||||
**pipe_prior_output,
|
||||
).images
|
||||
images[0].save("flux-redux.png")
|
||||
```
|
||||
|
||||
## Running FP16 inference
|
||||
|
||||
Flux can generate high-quality images with FP16 (i.e. to accelerate inference on Turing/Volta GPUs) but produces different outputs compared to FP32/BF16. The issue is that some activations in the text encoders have to be clipped when running in FP16, which affects the overall image. Forcing text encoders to run with FP32 inference thus removes this output difference. See [here](https://github.com/huggingface/diffusers/pull/9097#issuecomment-2272292516) for details.
|
||||
|
||||
FP16 inference code:
|
||||
@@ -380,27 +188,3 @@ image.save("flux-fp8-dev.png")
|
||||
[[autodoc]] FluxControlNetImg2ImgPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## FluxControlPipeline
|
||||
|
||||
[[autodoc]] FluxControlPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## FluxControlImg2ImgPipeline
|
||||
|
||||
[[autodoc]] FluxControlImg2ImgPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## FluxPriorReduxPipeline
|
||||
|
||||
[[autodoc]] FluxPriorReduxPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## FluxFillPipeline
|
||||
|
||||
[[autodoc]] FluxFillPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
@@ -1,68 +0,0 @@
|
||||
<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License. -->
|
||||
|
||||
# LTX
|
||||
|
||||
[LTX Video](https://huggingface.co/Lightricks/LTX-Video) is the first DiT-based video generation model capable of generating high-quality videos in real-time. It produces 24 FPS videos at a 768x512 resolution faster than they can be watched. Trained on a large-scale dataset of diverse videos, the model generates high-resolution videos with realistic and varied content. We provide a model for both text-to-video as well as image + text-to-video usecases.
|
||||
|
||||
<Tip>
|
||||
|
||||
Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
|
||||
|
||||
</Tip>
|
||||
|
||||
## Loading Single Files
|
||||
|
||||
Loading the original LTX Video checkpoints is also possible with [`~ModelMixin.from_single_file`].
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import AutoencoderKLLTXVideo, LTXImageToVideoPipeline, LTXVideoTransformer3DModel
|
||||
|
||||
single_file_url = "https://huggingface.co/Lightricks/LTX-Video/ltx-video-2b-v0.9.safetensors"
|
||||
transformer = LTXVideoTransformer3DModel.from_single_file(single_file_url, torch_dtype=torch.bfloat16)
|
||||
vae = AutoencoderKLLTXVideo.from_single_file(single_file_url, torch_dtype=torch.bfloat16)
|
||||
pipe = LTXImageToVideoPipeline.from_pretrained("Lightricks/LTX-Video", transformer=transformer, vae=vae, torch_dtype=torch.bfloat16)
|
||||
|
||||
# ... inference code ...
|
||||
```
|
||||
|
||||
Alternatively, the pipeline can be used to load the weights with [~FromSingleFileMixin.from_single_file`].
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import LTXImageToVideoPipeline
|
||||
from transformers import T5EncoderModel, T5Tokenizer
|
||||
|
||||
single_file_url = "https://huggingface.co/Lightricks/LTX-Video/ltx-video-2b-v0.9.safetensors"
|
||||
text_encoder = T5EncoderModel.from_pretrained("Lightricks/LTX-Video", subfolder="text_encoder", torch_dtype=torch.bfloat16)
|
||||
tokenizer = T5Tokenizer.from_pretrained("Lightricks/LTX-Video", subfolder="tokenizer", torch_dtype=torch.bfloat16)
|
||||
pipe = LTXImageToVideoPipeline.from_single_file(single_file_url, text_encoder=text_encoder, tokenizer=tokenizer, torch_dtype=torch.bfloat16)
|
||||
```
|
||||
|
||||
## LTXPipeline
|
||||
|
||||
[[autodoc]] LTXPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## LTXImageToVideoPipeline
|
||||
|
||||
[[autodoc]] LTXImageToVideoPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## LTXPipelineOutput
|
||||
|
||||
[[autodoc]] pipelines.ltx.pipeline_output.LTXPipelineOutput
|
||||
@@ -48,11 +48,6 @@ Since RegEx is supported as a way for matching layer identifiers, it is crucial
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## StableDiffusionPAGInpaintPipeline
|
||||
[[autodoc]] StableDiffusionPAGInpaintPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## StableDiffusionPAGPipeline
|
||||
[[autodoc]] StableDiffusionPAGPipeline
|
||||
- all
|
||||
@@ -101,10 +96,6 @@ Since RegEx is supported as a way for matching layer identifiers, it is crucial
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## StableDiffusion3PAGImg2ImgPipeline
|
||||
[[autodoc]] StableDiffusion3PAGImg2ImgPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## PixArtSigmaPAGPipeline
|
||||
[[autodoc]] PixArtSigmaPAGPipeline
|
||||
|
||||
@@ -1,65 +0,0 @@
|
||||
<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License. -->
|
||||
|
||||
# SanaPipeline
|
||||
|
||||
[SANA: Efficient High-Resolution Image Synthesis with Linear Diffusion Transformers](https://huggingface.co/papers/2410.10629) from NVIDIA and MIT HAN Lab, by Enze Xie, Junsong Chen, Junyu Chen, Han Cai, Haotian Tang, Yujun Lin, Zhekai Zhang, Muyang Li, Ligeng Zhu, Yao Lu, Song Han.
|
||||
|
||||
The abstract from the paper is:
|
||||
|
||||
*We introduce Sana, a text-to-image framework that can efficiently generate images up to 4096×4096 resolution. Sana can synthesize high-resolution, high-quality images with strong text-image alignment at a remarkably fast speed, deployable on laptop GPU. Core designs include: (1) Deep compression autoencoder: unlike traditional AEs, which compress images only 8×, we trained an AE that can compress images 32×, effectively reducing the number of latent tokens. (2) Linear DiT: we replace all vanilla attention in DiT with linear attention, which is more efficient at high resolutions without sacrificing quality. (3) Decoder-only text encoder: we replaced T5 with modern decoder-only small LLM as the text encoder and designed complex human instruction with in-context learning to enhance the image-text alignment. (4) Efficient training and sampling: we propose Flow-DPM-Solver to reduce sampling steps, with efficient caption labeling and selection to accelerate convergence. As a result, Sana-0.6B is very competitive with modern giant diffusion model (e.g. Flux-12B), being 20 times smaller and 100+ times faster in measured throughput. Moreover, Sana-0.6B can be deployed on a 16GB laptop GPU, taking less than 1 second to generate a 1024×1024 resolution image. Sana enables content creation at low cost. Code and model will be publicly released.*
|
||||
|
||||
<Tip>
|
||||
|
||||
Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
|
||||
|
||||
</Tip>
|
||||
|
||||
This pipeline was contributed by [lawrence-cj](https://github.com/lawrence-cj). The original codebase can be found [here](https://github.com/NVlabs/Sana). The original weights can be found under [hf.co/Efficient-Large-Model]https://huggingface.co/Efficient-Large-Model).
|
||||
|
||||
Available models:
|
||||
|
||||
| Model | Recommended dtype |
|
||||
|:-----:|:-----------------:|
|
||||
| [`Efficient-Large-Model/Sana_1600M_1024px_diffusers`](https://huggingface.co/Efficient-Large-Model/Sana_1600M_1024px_diffusers) | `torch.float16` |
|
||||
| [`Efficient-Large-Model/Sana_1600M_1024px_MultiLing_diffusers`](https://huggingface.co/Efficient-Large-Model/Sana_1600M_1024px_MultiLing_diffusers) | `torch.float16` |
|
||||
| [`Efficient-Large-Model/Sana_1600M_1024px_BF16_diffusers`](https://huggingface.co/Efficient-Large-Model/Sana_1600M_1024px_BF16_diffusers) | `torch.bfloat16` |
|
||||
| [`Efficient-Large-Model/Sana_1600M_512px_diffusers`](https://huggingface.co/Efficient-Large-Model/Sana_1600M_512px_diffusers) | `torch.float16` |
|
||||
| [`Efficient-Large-Model/Sana_1600M_512px_MultiLing_diffusers`](https://huggingface.co/Efficient-Large-Model/Sana_1600M_512px_MultiLing_diffusers) | `torch.float16` |
|
||||
| [`Efficient-Large-Model/Sana_600M_1024px_diffusers`](https://huggingface.co/Efficient-Large-Model/Sana_600M_1024px_diffusers) | `torch.float16` |
|
||||
| [`Efficient-Large-Model/Sana_600M_512px_diffusers`](https://huggingface.co/Efficient-Large-Model/Sana_600M_512px_diffusers) | `torch.float16` |
|
||||
|
||||
Refer to [this](https://huggingface.co/collections/Efficient-Large-Model/sana-673efba2a57ed99843f11f9e) collection for more information.
|
||||
|
||||
<Tip>
|
||||
|
||||
Make sure to pass the `variant` argument for downloaded checkpoints to use lower disk space. Set it to `"fp16"` for models with recommended dtype as `torch.float16`, and `"bf16"` for models with recommended dtype as `torch.bfloat16`. By default, `torch.float32` weights are downloaded, which use twice the amount of disk storage. Additionally, `torch.float32` weights can be downcasted on-the-fly by specifying the `torch_dtype` argument. Read about it in the [docs](https://huggingface.co/docs/diffusers/v0.31.0/en/api/pipelines/overview#diffusers.DiffusionPipeline.from_pretrained).
|
||||
|
||||
</Tip>
|
||||
|
||||
## SanaPipeline
|
||||
|
||||
[[autodoc]] SanaPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## SanaPAGPipeline
|
||||
|
||||
[[autodoc]] SanaPAGPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## SanaPipelineOutput
|
||||
|
||||
[[autodoc]] pipelines.sana.pipeline_output.SanaPipelineOutput
|
||||
@@ -181,7 +181,7 @@ Then we load the [v1-5 checkpoint](https://huggingface.co/stable-diffusion-v1-5/
|
||||
|
||||
```python
|
||||
model_ckpt_1_5 = "stable-diffusion-v1-5/stable-diffusion-v1-5"
|
||||
sd_pipeline_1_5 = StableDiffusionPipeline.from_pretrained(model_ckpt_1_5, torch_dtype=torch.float16).to("cuda")
|
||||
sd_pipeline_1_5 = StableDiffusionPipeline.from_pretrained(model_ckpt_1_5, torch_dtype=weight_dtype).to(device)
|
||||
|
||||
images_1_5 = sd_pipeline_1_5(prompts, num_images_per_prompt=1, generator=generator, output_type="np").images
|
||||
```
|
||||
@@ -280,7 +280,7 @@ from diffusers import StableDiffusionInstructPix2PixPipeline
|
||||
|
||||
instruct_pix2pix_pipeline = StableDiffusionInstructPix2PixPipeline.from_pretrained(
|
||||
"timbrooks/instruct-pix2pix", torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
).to(device)
|
||||
```
|
||||
|
||||
Now, we perform the edits:
|
||||
@@ -326,9 +326,9 @@ from transformers import (
|
||||
|
||||
clip_id = "openai/clip-vit-large-patch14"
|
||||
tokenizer = CLIPTokenizer.from_pretrained(clip_id)
|
||||
text_encoder = CLIPTextModelWithProjection.from_pretrained(clip_id).to("cuda")
|
||||
text_encoder = CLIPTextModelWithProjection.from_pretrained(clip_id).to(device)
|
||||
image_processor = CLIPImageProcessor.from_pretrained(clip_id)
|
||||
image_encoder = CLIPVisionModelWithProjection.from_pretrained(clip_id).to("cuda")
|
||||
image_encoder = CLIPVisionModelWithProjection.from_pretrained(clip_id).to(device)
|
||||
```
|
||||
|
||||
Notice that we are using a particular CLIP checkpoint, i.e., `openai/clip-vit-large-patch14`. This is because the Stable Diffusion pre-training was performed with this CLIP variant. For more details, refer to the [documentation](https://huggingface.co/docs/transformers/model_doc/clip).
|
||||
@@ -350,7 +350,7 @@ class DirectionalSimilarity(nn.Module):
|
||||
|
||||
def preprocess_image(self, image):
|
||||
image = self.image_processor(image, return_tensors="pt")["pixel_values"]
|
||||
return {"pixel_values": image.to("cuda")}
|
||||
return {"pixel_values": image.to(device)}
|
||||
|
||||
def tokenize_text(self, text):
|
||||
inputs = self.tokenizer(
|
||||
@@ -360,7 +360,7 @@ class DirectionalSimilarity(nn.Module):
|
||||
truncation=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
return {"input_ids": inputs.input_ids.to("cuda")}
|
||||
return {"input_ids": inputs.input_ids.to(device)}
|
||||
|
||||
def encode_image(self, image):
|
||||
preprocessed_image = self.preprocess_image(image)
|
||||
@@ -459,7 +459,6 @@ with ZipFile(local_filepath, "r") as zipper:
|
||||
```python
|
||||
from PIL import Image
|
||||
import os
|
||||
import numpy as np
|
||||
|
||||
dataset_path = "sample-imagenet-images"
|
||||
image_paths = sorted([os.path.join(dataset_path, x) for x in os.listdir(dataset_path)])
|
||||
@@ -478,7 +477,6 @@ Now that the images are loaded, let's apply some lightweight pre-processing on t
|
||||
|
||||
```python
|
||||
from torchvision.transforms import functional as F
|
||||
import torch
|
||||
|
||||
|
||||
def preprocess_image(image):
|
||||
@@ -500,10 +498,6 @@ dit_pipeline = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256", torch_dtype=
|
||||
dit_pipeline.scheduler = DPMSolverMultistepScheduler.from_config(dit_pipeline.scheduler.config)
|
||||
dit_pipeline = dit_pipeline.to("cuda")
|
||||
|
||||
seed = 0
|
||||
generator = torch.manual_seed(seed)
|
||||
|
||||
|
||||
words = [
|
||||
"cassette player",
|
||||
"chainsaw",
|
||||
|
||||
@@ -17,12 +17,6 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
4-bit quantization compresses a model even further, and it is commonly used with [QLoRA](https://hf.co/papers/2305.14314) to finetune quantized LLMs.
|
||||
|
||||
This guide demonstrates how quantization can enable running
|
||||
[FLUX.1-dev](https://huggingface.co/black-forest-labs/FLUX.1-dev)
|
||||
on less than 16GB of VRAM and even on a free Google
|
||||
Colab instance.
|
||||
|
||||

|
||||
|
||||
To use bitsandbytes, make sure you have the following libraries installed:
|
||||
|
||||
@@ -37,167 +31,70 @@ Now you can quantize a model by passing a [`BitsAndBytesConfig`] to [`~ModelMixi
|
||||
|
||||
Quantizing a model in 8-bit halves the memory-usage:
|
||||
|
||||
bitsandbytes is supported in both Transformers and Diffusers, so you can quantize both the
|
||||
[`FluxTransformer2DModel`] and [`~transformers.T5EncoderModel`].
|
||||
```py
|
||||
from diffusers import FluxTransformer2DModel, BitsAndBytesConfig
|
||||
|
||||
For Ada and higher-series GPUs. we recommend changing `torch_dtype` to `torch.bfloat16`.
|
||||
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
|
||||
|
||||
> [!TIP]
|
||||
> The [`CLIPTextModel`] and [`AutoencoderKL`] aren't quantized because they're already small in size and because [`AutoencoderKL`] only has a few `torch.nn.Linear` layers.
|
||||
model_8bit = FluxTransformer2DModel.from_pretrained(
|
||||
"black-forest-labs/FLUX.1-dev",
|
||||
subfolder="transformer",
|
||||
quantization_config=quantization_config
|
||||
)
|
||||
```
|
||||
|
||||
By default, all the other modules such as `torch.nn.LayerNorm` are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter if you want:
|
||||
|
||||
```py
|
||||
from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
|
||||
from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig
|
||||
from diffusers import FluxTransformer2DModel, BitsAndBytesConfig
|
||||
|
||||
from diffusers import FluxTransformer2DModel
|
||||
from transformers import T5EncoderModel
|
||||
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
|
||||
|
||||
quant_config = TransformersBitsAndBytesConfig(load_in_8bit=True,)
|
||||
|
||||
text_encoder_2_8bit = T5EncoderModel.from_pretrained(
|
||||
"black-forest-labs/FLUX.1-dev",
|
||||
subfolder="text_encoder_2",
|
||||
quantization_config=quant_config,
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
|
||||
quant_config = DiffusersBitsAndBytesConfig(load_in_8bit=True,)
|
||||
|
||||
transformer_8bit = FluxTransformer2DModel.from_pretrained(
|
||||
"black-forest-labs/FLUX.1-dev",
|
||||
model_8bit = FluxTransformer2DModel.from_pretrained(
|
||||
"black-forest-labs/FLUX.1-dev",
|
||||
subfolder="transformer",
|
||||
quantization_config=quant_config,
|
||||
torch_dtype=torch.float16,
|
||||
quantization_config=quantization_config,
|
||||
torch_dtype=torch.float32
|
||||
)
|
||||
model_8bit.transformer_blocks.layers[-1].norm2.weight.dtype
|
||||
```
|
||||
|
||||
By default, all the other modules such as `torch.nn.LayerNorm` are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter.
|
||||
|
||||
```diff
|
||||
transformer_8bit = FluxTransformer2DModel.from_pretrained(
|
||||
"black-forest-labs/FLUX.1-dev",
|
||||
subfolder="transformer",
|
||||
quantization_config=quant_config,
|
||||
+ torch_dtype=torch.float32,
|
||||
)
|
||||
```
|
||||
|
||||
Let's generate an image using our quantized models.
|
||||
|
||||
Setting `device_map="auto"` automatically fills all available space on the GPU(s) first, then the
|
||||
CPU, and finally, the hard drive (the absolute slowest option) if there is still not enough memory.
|
||||
|
||||
```py
|
||||
pipe = FluxPipeline.from_pretrained(
|
||||
"black-forest-labs/FLUX.1-dev",
|
||||
transformer=transformer_8bit,
|
||||
text_encoder_2=text_encoder_2_8bit,
|
||||
torch_dtype=torch.float16,
|
||||
device_map="auto",
|
||||
)
|
||||
|
||||
pipe_kwargs = {
|
||||
"prompt": "A cat holding a sign that says hello world",
|
||||
"height": 1024,
|
||||
"width": 1024,
|
||||
"guidance_scale": 3.5,
|
||||
"num_inference_steps": 50,
|
||||
"max_sequence_length": 512,
|
||||
}
|
||||
|
||||
image = pipe(**pipe_kwargs, generator=torch.manual_seed(0),).images[0]
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/quant-bnb/8bit.png"/>
|
||||
</div>
|
||||
|
||||
When there is enough memory, you can also directly move the pipeline to the GPU with `.to("cuda")` and apply [`~DiffusionPipeline.enable_model_cpu_offload`] to optimize GPU memory usage.
|
||||
|
||||
Once a model is quantized, you can push the model to the Hub with the [`~ModelMixin.push_to_hub`] method. The quantization `config.json` file is pushed first, followed by the quantized model weights. You can also save the serialized 8-bit models locally with [`~ModelMixin.save_pretrained`].
|
||||
Once a model is quantized, you can push the model to the Hub with the [`~ModelMixin.push_to_hub`] method. The quantization `config.json` file is pushed first, followed by the quantized model weights. You can also save the serialized 4-bit models locally with [`~ModelMixin.save_pretrained`].
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="4-bit">
|
||||
|
||||
Quantizing a model in 4-bit reduces your memory-usage by 4x:
|
||||
|
||||
bitsandbytes is supported in both Transformers and Diffusers, so you can can quantize both the
|
||||
[`FluxTransformer2DModel`] and [`~transformers.T5EncoderModel`].
|
||||
```py
|
||||
from diffusers import FluxTransformer2DModel, BitsAndBytesConfig
|
||||
|
||||
For Ada and higher-series GPUs. we recommend changing `torch_dtype` to `torch.bfloat16`.
|
||||
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
|
||||
|
||||
> [!TIP]
|
||||
> The [`CLIPTextModel`] and [`AutoencoderKL`] aren't quantized because they're already small in size and because [`AutoencoderKL`] only has a few `torch.nn.Linear` layers.
|
||||
model_4bit = FluxTransformer2DModel.from_pretrained(
|
||||
"black-forest-labs/FLUX.1-dev",
|
||||
subfolder="transformer",
|
||||
quantization_config=quantization_config
|
||||
)
|
||||
```
|
||||
|
||||
By default, all the other modules such as `torch.nn.LayerNorm` are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter if you want:
|
||||
|
||||
```py
|
||||
from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
|
||||
from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig
|
||||
from diffusers import FluxTransformer2DModel, BitsAndBytesConfig
|
||||
|
||||
from diffusers import FluxTransformer2DModel
|
||||
from transformers import T5EncoderModel
|
||||
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
|
||||
|
||||
quant_config = TransformersBitsAndBytesConfig(load_in_4bit=True,)
|
||||
|
||||
text_encoder_2_4bit = T5EncoderModel.from_pretrained(
|
||||
"black-forest-labs/FLUX.1-dev",
|
||||
subfolder="text_encoder_2",
|
||||
quantization_config=quant_config,
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
|
||||
quant_config = DiffusersBitsAndBytesConfig(load_in_4bit=True,)
|
||||
|
||||
transformer_4bit = FluxTransformer2DModel.from_pretrained(
|
||||
"black-forest-labs/FLUX.1-dev",
|
||||
model_4bit = FluxTransformer2DModel.from_pretrained(
|
||||
"black-forest-labs/FLUX.1-dev",
|
||||
subfolder="transformer",
|
||||
quantization_config=quant_config,
|
||||
torch_dtype=torch.float16,
|
||||
quantization_config=quantization_config,
|
||||
torch_dtype=torch.float32
|
||||
)
|
||||
model_4bit.transformer_blocks.layers[-1].norm2.weight.dtype
|
||||
```
|
||||
|
||||
By default, all the other modules such as `torch.nn.LayerNorm` are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter.
|
||||
|
||||
```diff
|
||||
transformer_4bit = FluxTransformer2DModel.from_pretrained(
|
||||
"black-forest-labs/FLUX.1-dev",
|
||||
subfolder="transformer",
|
||||
quantization_config=quant_config,
|
||||
+ torch_dtype=torch.float32,
|
||||
)
|
||||
```
|
||||
|
||||
Let's generate an image using our quantized models.
|
||||
|
||||
Setting `device_map="auto"` automatically fills all available space on the GPU(s) first, then the CPU, and finally, the hard drive (the absolute slowest option) if there is still not enough memory.
|
||||
|
||||
```py
|
||||
pipe = FluxPipeline.from_pretrained(
|
||||
"black-forest-labs/FLUX.1-dev",
|
||||
transformer=transformer_4bit,
|
||||
text_encoder_2=text_encoder_2_4bit,
|
||||
torch_dtype=torch.float16,
|
||||
device_map="auto",
|
||||
)
|
||||
|
||||
pipe_kwargs = {
|
||||
"prompt": "A cat holding a sign that says hello world",
|
||||
"height": 1024,
|
||||
"width": 1024,
|
||||
"guidance_scale": 3.5,
|
||||
"num_inference_steps": 50,
|
||||
"max_sequence_length": 512,
|
||||
}
|
||||
|
||||
image = pipe(**pipe_kwargs, generator=torch.manual_seed(0),).images[0]
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/quant-bnb/4bit.png"/>
|
||||
</div>
|
||||
|
||||
When there is enough memory, you can also directly move the pipeline to the GPU with `.to("cuda")` and apply [`~DiffusionPipeline.enable_model_cpu_offload`] to optimize GPU memory usage.
|
||||
|
||||
Once a model is quantized, you can push the model to the Hub with the [`~ModelMixin.push_to_hub`] method. The quantization `config.json` file is pushed first, followed by the quantized model weights. You can also save the serialized 4-bit models locally with [`~ModelMixin.save_pretrained`].
|
||||
Call [`~ModelMixin.push_to_hub`] after loading it in 4-bit precision. You can also save the serialized 4-bit models locally with [`~ModelMixin.save_pretrained`].
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
@@ -302,34 +199,17 @@ quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dty
|
||||
NF4 is a 4-bit data type from the [QLoRA](https://hf.co/papers/2305.14314) paper, adapted for weights initialized from a normal distribution. You should use NF4 for training 4-bit base models. This can be configured with the `bnb_4bit_quant_type` parameter in the [`BitsAndBytesConfig`]:
|
||||
|
||||
```py
|
||||
from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
|
||||
from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig
|
||||
from diffusers import BitsAndBytesConfig
|
||||
|
||||
from diffusers import FluxTransformer2DModel
|
||||
from transformers import T5EncoderModel
|
||||
|
||||
quant_config = TransformersBitsAndBytesConfig(
|
||||
nf4_config = BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_quant_type="nf4",
|
||||
)
|
||||
|
||||
text_encoder_2_4bit = T5EncoderModel.from_pretrained(
|
||||
"black-forest-labs/FLUX.1-dev",
|
||||
subfolder="text_encoder_2",
|
||||
quantization_config=quant_config,
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
|
||||
quant_config = DiffusersBitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_quant_type="nf4",
|
||||
)
|
||||
|
||||
transformer_4bit = FluxTransformer2DModel.from_pretrained(
|
||||
"black-forest-labs/FLUX.1-dev",
|
||||
model_nf4 = SD3Transformer2DModel.from_pretrained(
|
||||
"stabilityai/stable-diffusion-3-medium-diffusers",
|
||||
subfolder="transformer",
|
||||
quantization_config=quant_config,
|
||||
torch_dtype=torch.float16,
|
||||
quantization_config=nf4_config,
|
||||
)
|
||||
```
|
||||
|
||||
@@ -340,74 +220,38 @@ For inference, the `bnb_4bit_quant_type` does not have a huge impact on performa
|
||||
Nested quantization is a technique that can save additional memory at no additional performance cost. This feature performs a second quantization of the already quantized weights to save an additional 0.4 bits/parameter.
|
||||
|
||||
```py
|
||||
from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
|
||||
from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig
|
||||
from diffusers import BitsAndBytesConfig
|
||||
|
||||
from diffusers import FluxTransformer2DModel
|
||||
from transformers import T5EncoderModel
|
||||
|
||||
quant_config = TransformersBitsAndBytesConfig(
|
||||
double_quant_config = BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_use_double_quant=True,
|
||||
)
|
||||
|
||||
text_encoder_2_4bit = T5EncoderModel.from_pretrained(
|
||||
"black-forest-labs/FLUX.1-dev",
|
||||
subfolder="text_encoder_2",
|
||||
quantization_config=quant_config,
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
|
||||
quant_config = DiffusersBitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_use_double_quant=True,
|
||||
)
|
||||
|
||||
transformer_4bit = FluxTransformer2DModel.from_pretrained(
|
||||
"black-forest-labs/FLUX.1-dev",
|
||||
double_quant_model = SD3Transformer2DModel.from_pretrained(
|
||||
"stabilityai/stable-diffusion-3-medium-diffusers",
|
||||
subfolder="transformer",
|
||||
quantization_config=quant_config,
|
||||
torch_dtype=torch.float16,
|
||||
quantization_config=double_quant_config,
|
||||
)
|
||||
```
|
||||
|
||||
## Dequantizing `bitsandbytes` models
|
||||
|
||||
Once quantized, you can dequantize a model to its original precision, but this might result in a small loss of quality. Make sure you have enough GPU RAM to fit the dequantized model.
|
||||
Once quantized, you can dequantize the model to the original precision but this might result in a small quality loss of the model. Make sure you have enough GPU RAM to fit the dequantized model.
|
||||
|
||||
```python
|
||||
from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
|
||||
from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig
|
||||
from diffusers import BitsAndBytesConfig
|
||||
|
||||
from diffusers import FluxTransformer2DModel
|
||||
from transformers import T5EncoderModel
|
||||
|
||||
quant_config = TransformersBitsAndBytesConfig(
|
||||
double_quant_config = BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_use_double_quant=True,
|
||||
)
|
||||
|
||||
text_encoder_2_4bit = T5EncoderModel.from_pretrained(
|
||||
"black-forest-labs/FLUX.1-dev",
|
||||
subfolder="text_encoder_2",
|
||||
quantization_config=quant_config,
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
|
||||
quant_config = DiffusersBitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_use_double_quant=True,
|
||||
)
|
||||
|
||||
transformer_4bit = FluxTransformer2DModel.from_pretrained(
|
||||
"black-forest-labs/FLUX.1-dev",
|
||||
double_quant_model = SD3Transformer2DModel.from_pretrained(
|
||||
"stabilityai/stable-diffusion-3-medium-diffusers",
|
||||
subfolder="transformer",
|
||||
quantization_config=quant_config,
|
||||
torch_dtype=torch.float16,
|
||||
quantization_config=double_quant_config,
|
||||
)
|
||||
|
||||
text_encoder_2_4bit.dequantize()
|
||||
transformer_4bit.dequantize()
|
||||
model.dequantize()
|
||||
```
|
||||
|
||||
## Resources
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# Create a dataset for training
|
||||
|
||||
There are many datasets on the [Hub](https://huggingface.co/datasets?task_categories=task_categories:text-to-image&sort=downloads) to train a model on, but if you can't find one you're interested in or want to use your own, you can create a dataset with the 🤗 [Datasets](https://huggingface.co/docs/datasets) library. The dataset structure depends on the task you want to train your model on. The most basic dataset structure is a directory of images for tasks like unconditional image generation. Another dataset structure may be a directory of images and a text file containing their corresponding text captions for tasks like text-to-image generation.
|
||||
There are many datasets on the [Hub](https://huggingface.co/datasets?task_categories=task_categories:text-to-image&sort=downloads) to train a model on, but if you can't find one you're interested in or want to use your own, you can create a dataset with the 🤗 [Datasets](hf.co/docs/datasets) library. The dataset structure depends on the task you want to train your model on. The most basic dataset structure is a directory of images for tasks like unconditional image generation. Another dataset structure may be a directory of images and a text file containing their corresponding text captions for tasks like text-to-image generation.
|
||||
|
||||
This guide will show you two ways to create a dataset to finetune on:
|
||||
|
||||
@@ -87,4 +87,4 @@ accelerate launch --mixed_precision="fp16" train_text_to_image.py \
|
||||
|
||||
Now that you've created a dataset, you can plug it into the `train_data_dir` (if your dataset is local) or `dataset_name` (if your dataset is on the Hub) arguments of a training script.
|
||||
|
||||
For your next steps, feel free to try and use your dataset to train a model for [unconditional generation](unconditional_training) or [text-to-image generation](text2image)!
|
||||
For your next steps, feel free to try and use your dataset to train a model for [unconditional generation](unconditional_training) or [text-to-image generation](text2image)!
|
||||
@@ -75,7 +75,7 @@ For convenience, create a `TrainingConfig` class containing the training hyperpa
|
||||
|
||||
... push_to_hub = True # whether to upload the saved model to the HF Hub
|
||||
... hub_model_id = "<your-username>/<my-awesome-model>" # the name of the repository to create on the HF Hub
|
||||
... hub_private_repo = None
|
||||
... hub_private_repo = False
|
||||
... overwrite_output_dir = True # overwrite the old model when re-running the notebook
|
||||
... seed = 0
|
||||
|
||||
|
||||
@@ -134,16 +134,14 @@ The [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method loads L
|
||||
- the LoRA weights don't have separate identifiers for the UNet and text encoder
|
||||
- the LoRA weights have separate identifiers for the UNet and text encoder
|
||||
|
||||
To directly load (and save) a LoRA adapter at the *model-level*, use [`~PeftAdapterMixin.load_lora_adapter`], which builds and prepares the necessary model configuration for the adapter. Like [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`], [`PeftAdapterMixin.load_lora_adapter`] can load LoRAs for both the UNet and text encoder. For example, if you're loading a LoRA for the UNet, [`PeftAdapterMixin.load_lora_adapter`] ignores the keys for the text encoder.
|
||||
|
||||
Use the `weight_name` parameter to specify the specific weight file and the `prefix` parameter to filter for the appropriate state dicts (`"unet"` in this case) to load.
|
||||
But if you only need to load LoRA weights into the UNet, then you can use the [`~loaders.UNet2DConditionLoadersMixin.load_attn_procs`] method. Let's load the [jbilcke-hf/sdxl-cinematic-1](https://huggingface.co/jbilcke-hf/sdxl-cinematic-1) LoRA:
|
||||
|
||||
```py
|
||||
from diffusers import AutoPipelineForText2Image
|
||||
import torch
|
||||
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
|
||||
pipeline.unet.load_lora_adapter("jbilcke-hf/sdxl-cinematic-1", weight_name="pytorch_lora_weights.safetensors", prefix="unet")
|
||||
pipeline.unet.load_attn_procs("jbilcke-hf/sdxl-cinematic-1", weight_name="pytorch_lora_weights.safetensors")
|
||||
|
||||
# use cnmt in the prompt to trigger the LoRA
|
||||
prompt = "A cute cnmt eating a slice of pizza, stunning color scheme, masterpiece, illustration"
|
||||
@@ -155,8 +153,6 @@ image
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_attn_proc.png" />
|
||||
</div>
|
||||
|
||||
Save an adapter with [`~PeftAdapterMixin.save_lora_adapter`].
|
||||
|
||||
To unload the LoRA weights, use the [`~loaders.StableDiffusionLoraLoaderMixin.unload_lora_weights`] method to discard the LoRA weights and restore the model to its original weights:
|
||||
|
||||
```py
|
||||
|
||||
@@ -121,7 +121,7 @@ image = pipe(prompt=prompt, image=init_image, mask_image=mask_image, num_inferen
|
||||
|
||||
### 이미지 결과물을 정제하기
|
||||
|
||||
[base 모델 체크포인트](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)에서, StableDiffusion-XL 또한 고주파 품질을 향상시키는 이미지를 생성하기 위해 낮은 노이즈 단계 이미지를 제거하는데 특화된 [refiner 체크포인트](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0)를 포함하고 있습니다. 이 refiner 체크포인트는 이미지 품질을 향상시키기 위해 base 체크포인트를 실행한 후 "두 번째 단계" 파이프라인에 사용될 수 있습니다.
|
||||
[base 모델 체크포인트](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)에서, StableDiffusion-XL 또한 고주파 품질을 향상시키는 이미지를 생성하기 위해 낮은 노이즈 단계 이미지를 제거하는데 특화된 [refiner 체크포인트](huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0)를 포함하고 있습니다. 이 refiner 체크포인트는 이미지 품질을 향상시키기 위해 base 체크포인트를 실행한 후 "두 번째 단계" 파이프라인에 사용될 수 있습니다.
|
||||
|
||||
refiner를 사용할 때, 쉽게 사용할 수 있습니다
|
||||
- 1.) base 모델과 refiner을 사용하는데, 이는 *Denoisers의 앙상블*을 위한 첫 번째 제안된 [eDiff-I](https://research.nvidia.com/labs/dir/eDiff-I/)를 사용하거나
|
||||
@@ -215,7 +215,7 @@ image = refiner(
|
||||
|
||||
#### 2.) 노이즈가 완전히 제거된 기본 이미지에서 이미지 출력을 정제하기
|
||||
|
||||
일반적인 [`StableDiffusionImg2ImgPipeline`] 방식에서, 기본 모델에서 생성된 완전히 노이즈가 제거된 이미지는 [refiner checkpoint](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0)를 사용해 더 향상시킬 수 있습니다.
|
||||
일반적인 [`StableDiffusionImg2ImgPipeline`] 방식에서, 기본 모델에서 생성된 완전히 노이즈가 제거된 이미지는 [refiner checkpoint](huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0)를 사용해 더 향상시킬 수 있습니다.
|
||||
|
||||
이를 위해, 보통의 "base" text-to-image 파이프라인을 수행 후에 image-to-image 파이프라인으로써 refiner를 실행시킬 수 있습니다. base 모델의 출력을 잠재 공간에 남겨둘 수 있습니다.
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# 학습을 위한 데이터셋 만들기
|
||||
|
||||
[Hub](https://huggingface.co/datasets?task_categories=task_categories:text-to-image&sort=downloads) 에는 모델 교육을 위한 많은 데이터셋이 있지만,
|
||||
관심이 있거나 사용하고 싶은 데이터셋을 찾을 수 없는 경우 🤗 [Datasets](https://huggingface.co/docs/datasets) 라이브러리를 사용하여 데이터셋을 만들 수 있습니다.
|
||||
관심이 있거나 사용하고 싶은 데이터셋을 찾을 수 없는 경우 🤗 [Datasets](hf.co/docs/datasets) 라이브러리를 사용하여 데이터셋을 만들 수 있습니다.
|
||||
데이터셋 구조는 모델을 학습하려는 작업에 따라 달라집니다.
|
||||
가장 기본적인 데이터셋 구조는 unconditional 이미지 생성과 같은 작업을 위한 이미지 디렉토리입니다.
|
||||
또 다른 데이터셋 구조는 이미지 디렉토리와 text-to-image 생성과 같은 작업에 해당하는 텍스트 캡션이 포함된 텍스트 파일일 수 있습니다.
|
||||
|
||||
@@ -36,7 +36,7 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
[cloneofsimo](https://github.com/cloneofsimo)는 인기 있는 [lora](https://github.com/cloneofsimo/lora) GitHub 리포지토리에서 Stable Diffusion을 위한 LoRA 학습을 최초로 시도했습니다. 🧨 Diffusers는 [text-to-image 생성](https://github.com/huggingface/diffusers/tree/main/examples/text_to_image#training-with-lora) 및 [DreamBooth](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth#training-with-low-rank-adaptation-of-large-language-models-lora)을 지원합니다. 이 가이드는 두 가지를 모두 수행하는 방법을 보여줍니다.
|
||||
|
||||
모델을 저장하거나 커뮤니티와 공유하려면 Hugging Face 계정에 로그인하세요(아직 계정이 없는 경우 [생성](https://huggingface.co/join)하세요):
|
||||
모델을 저장하거나 커뮤니티와 공유하려면 Hugging Face 계정에 로그인하세요(아직 계정이 없는 경우 [생성](hf.co/join)하세요):
|
||||
|
||||
```bash
|
||||
huggingface-cli login
|
||||
|
||||
@@ -76,7 +76,7 @@ huggingface-cli login
|
||||
... output_dir = "ddpm-butterflies-128" # 로컬 및 HF Hub에 저장되는 모델명
|
||||
|
||||
... push_to_hub = True # 저장된 모델을 HF Hub에 업로드할지 여부
|
||||
... hub_private_repo = None
|
||||
... hub_private_repo = False
|
||||
... overwrite_output_dir = True # 노트북을 다시 실행할 때 이전 모델에 덮어씌울지
|
||||
... seed = 0
|
||||
|
||||
|
||||
@@ -872,9 +872,10 @@ def prepare_rotary_positional_embeddings(
|
||||
crops_coords=grid_crops_coords,
|
||||
grid_size=(grid_height, grid_width),
|
||||
temporal_size=num_frames,
|
||||
device=device,
|
||||
)
|
||||
|
||||
freqs_cos = freqs_cos.to(device=device)
|
||||
freqs_sin = freqs_sin.to(device=device)
|
||||
return freqs_cos, freqs_sin
|
||||
|
||||
|
||||
|
||||
@@ -894,9 +894,10 @@ def prepare_rotary_positional_embeddings(
|
||||
crops_coords=grid_crops_coords,
|
||||
grid_size=(grid_height, grid_width),
|
||||
temporal_size=num_frames,
|
||||
device=device,
|
||||
)
|
||||
|
||||
freqs_cos = freqs_cos.to(device=device)
|
||||
freqs_sin = freqs_sin.to(device=device)
|
||||
return freqs_cos, freqs_sin
|
||||
|
||||
|
||||
|
||||
+22
-128
@@ -11,22 +11,22 @@ Please also check out our [Community Scripts](https://github.com/huggingface/dif
|
||||
| Example | Description | Code Example | Colab | Author |
|
||||
|:--------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------:|
|
||||
|Adaptive Mask Inpainting|Adaptive Mask Inpainting algorithm from [Beyond the Contact: Discovering Comprehensive Affordance for 3D Objects from Pre-trained 2D Diffusion Models](https://github.com/snuvclab/coma) (ECCV '24, Oral) provides a way to insert human inside the scene image without altering the background, by inpainting with adapting mask.|[Adaptive Mask Inpainting](#adaptive-mask-inpainting)|-|[Hyeonwoo Kim](https://sshowbiz.xyz),[Sookwan Han](https://jellyheadandrew.github.io)|
|
||||
|Flux with CFG|[Flux with CFG](https://github.com/ToTheBeginning/PuLID/blob/main/docs/pulid_for_flux.md) provides an implementation of using CFG in [Flux](https://blackforestlabs.ai/announcing-black-forest-labs/).|[Flux with CFG](#flux-with-cfg)|[Notebook](https://github.com/huggingface/notebooks/blob/main/diffusers/flux_with_cfg.ipynb)|[Linoy Tsaban](https://github.com/linoytsaban), [Apolinário](https://github.com/apolinario), and [Sayak Paul](https://github.com/sayakpaul)|
|
||||
|Flux with CFG|[Flux with CFG](https://github.com/ToTheBeginning/PuLID/blob/main/docs/pulid_for_flux.md) provides an implementation of using CFG in [Flux](https://blackforestlabs.ai/announcing-black-forest-labs/).|[Flux with CFG](#flux-with-cfg)|NA|[Linoy Tsaban](https://github.com/linoytsaban), [Apolinário](https://github.com/apolinario), and [Sayak Paul](https://github.com/sayakpaul)|
|
||||
|Differential Diffusion|[Differential Diffusion](https://github.com/exx8/differential-diffusion) modifies an image according to a text prompt, and according to a map that specifies the amount of change in each region.|[Differential Diffusion](#differential-diffusion)|[](https://huggingface.co/spaces/exx8/differential-diffusion) [](https://colab.research.google.com/github/exx8/differential-diffusion/blob/main/examples/SD2.ipynb)|[Eran Levin](https://github.com/exx8) and [Ohad Fried](https://www.ohadf.com/)|
|
||||
| HD-Painter | [HD-Painter](https://github.com/Picsart-AI-Research/HD-Painter) enables prompt-faithfull and high resolution (up to 2k) image inpainting upon any diffusion-based image inpainting method. | [HD-Painter](#hd-painter) | [](https://huggingface.co/spaces/PAIR/HD-Painter) | [Manukyan Hayk](https://github.com/haikmanukyan) and [Sargsyan Andranik](https://github.com/AndranikSargsyan) |
|
||||
| Marigold Monocular Depth Estimation | A universal monocular depth estimator, utilizing Stable Diffusion, delivering sharp predictions in the wild. (See the [project page](https://marigoldmonodepth.github.io) and [full codebase](https://github.com/prs-eth/marigold) for more details.) | [Marigold Depth Estimation](#marigold-depth-estimation) | [](https://huggingface.co/spaces/toshas/marigold) [](https://colab.research.google.com/drive/12G8reD13DdpMie5ZQlaFNo2WCGeNUH-u?usp=sharing) | [Bingxin Ke](https://github.com/markkua) and [Anton Obukhov](https://github.com/toshas) |
|
||||
| LLM-grounded Diffusion (LMD+) | LMD greatly improves the prompt following ability of text-to-image generation models by introducing an LLM as a front-end prompt parser and layout planner. [Project page.](https://llm-grounded-diffusion.github.io/) [See our full codebase (also with diffusers).](https://github.com/TonyLianLong/LLM-groundedDiffusion) | [LLM-grounded Diffusion (LMD+)](#llm-grounded-diffusion) | [Huggingface Demo](https://huggingface.co/spaces/longlian/llm-grounded-diffusion) [](https://colab.research.google.com/drive/1SXzMSeAB-LJYISb2yrUOdypLz4OYWUKj) | [Long (Tony) Lian](https://tonylian.com/) |
|
||||
| CLIP Guided Stable Diffusion | Doing CLIP guidance for text to image generation with Stable Diffusion | [CLIP Guided Stable Diffusion](#clip-guided-stable-diffusion) | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/CLIP_Guided_Stable_diffusion_with_diffusers.ipynb) | [Suraj Patil](https://github.com/patil-suraj/) |
|
||||
| One Step U-Net (Dummy) | Example showcasing of how to use Community Pipelines (see <https://github.com/huggingface/diffusers/issues/841>) | [One Step U-Net](#one-step-unet) | - | [Patrick von Platen](https://github.com/patrickvonplaten/) |
|
||||
| Stable Diffusion Interpolation | Interpolate the latent space of Stable Diffusion between different prompts/seeds | [Stable Diffusion Interpolation](#stable-diffusion-interpolation) | [Notebook](https://github.com/huggingface/notebooks/blob/main/diffusers/stable_diffusion_interpolation.ipynb) | [Nate Raw](https://github.com/nateraw/) |
|
||||
| Stable Diffusion Interpolation | Interpolate the latent space of Stable Diffusion between different prompts/seeds | [Stable Diffusion Interpolation](#stable-diffusion-interpolation) | - | [Nate Raw](https://github.com/nateraw/) |
|
||||
| Stable Diffusion Mega | **One** Stable Diffusion Pipeline with all functionalities of [Text2Image](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py), [Image2Image](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py) and [Inpainting](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py) | [Stable Diffusion Mega](#stable-diffusion-mega) | [Notebook](https://github.com/huggingface/notebooks/blob/main/diffusers/stable_diffusion_mega.ipynb) | [Patrick von Platen](https://github.com/patrickvonplaten/) |
|
||||
| Long Prompt Weighting Stable Diffusion | **One** Stable Diffusion Pipeline without tokens length limit, and support parsing weighting in prompt. | [Long Prompt Weighting Stable Diffusion](#long-prompt-weighting-stable-diffusion) | [Notebook](https://github.com/huggingface/notebooks/blob/main/diffusers/long_prompt_weighting_stable_diffusion.ipynb) | [SkyTNT](https://github.com/SkyTNT) |
|
||||
| Long Prompt Weighting Stable Diffusion | **One** Stable Diffusion Pipeline without tokens length limit, and support parsing weighting in prompt. | [Long Prompt Weighting Stable Diffusion](#long-prompt-weighting-stable-diffusion) | - | [SkyTNT](https://github.com/SkyTNT) |
|
||||
| Speech to Image | Using automatic-speech-recognition to transcribe text and Stable Diffusion to generate images | [Speech to Image](#speech-to-image) |[Notebook](https://github.com/huggingface/notebooks/blob/main/diffusers/speech_to_image.ipynb) | [Mikail Duzenli](https://github.com/MikailINTech)
|
||||
| Wild Card Stable Diffusion | Stable Diffusion Pipeline that supports prompts that contain wildcard terms (indicated by surrounding double underscores), with values instantiated randomly from a corresponding txt file or a dictionary of possible values | [Wildcard Stable Diffusion](#wildcard-stable-diffusion) | [Notebook](https://github.com/huggingface/notebooks/blob/main/diffusers/wildcard_stable_diffusion.ipynb) | [Shyam Sudhakaran](https://github.com/shyamsn97) |
|
||||
| Wild Card Stable Diffusion | Stable Diffusion Pipeline that supports prompts that contain wildcard terms (indicated by surrounding double underscores), with values instantiated randomly from a corresponding txt file or a dictionary of possible values | [Wildcard Stable Diffusion](#wildcard-stable-diffusion) | - | [Shyam Sudhakaran](https://github.com/shyamsn97) |
|
||||
| [Composable Stable Diffusion](https://energy-based-model.github.io/Compositional-Visual-Generation-with-Composable-Diffusion-Models/) | Stable Diffusion Pipeline that supports prompts that contain "|" in prompts (as an AND condition) and weights (separated by "|" as well) to positively / negatively weight prompts. | [Composable Stable Diffusion](#composable-stable-diffusion) | - | [Mark Rich](https://github.com/MarkRich) |
|
||||
| Seed Resizing Stable Diffusion | Stable Diffusion Pipeline that supports resizing an image and retaining the concepts of the 512 by 512 generation. | [Seed Resizing](#seed-resizing) | - | [Mark Rich](https://github.com/MarkRich) |
|
||||
| Imagic Stable Diffusion | Stable Diffusion Pipeline that enables writing a text prompt to edit an existing image | [Imagic Stable Diffusion](#imagic-stable-diffusion) | - | [Mark Rich](https://github.com/MarkRich) |
|
||||
| Multilingual Stable Diffusion | Stable Diffusion Pipeline that supports prompts in 50 different languages. | [Multilingual Stable Diffusion](#multilingual-stable-diffusion-pipeline) | [Notebook](https://github.com/huggingface/notebooks/blob/main/diffusers/multilingual_stable_diffusion.ipynb) | [Juan Carlos Piñeros](https://github.com/juancopi81) |
|
||||
| Multilingual Stable Diffusion | Stable Diffusion Pipeline that supports prompts in 50 different languages. | [Multilingual Stable Diffusion](#multilingual-stable-diffusion-pipeline) | - | [Juan Carlos Piñeros](https://github.com/juancopi81) |
|
||||
| GlueGen Stable Diffusion | Stable Diffusion Pipeline that supports prompts in different languages using GlueGen adapter. | [GlueGen Stable Diffusion](#gluegen-stable-diffusion-pipeline) | - | [Phạm Hồng Vinh](https://github.com/rootonchair) |
|
||||
| Image to Image Inpainting Stable Diffusion | Stable Diffusion Pipeline that enables the overlaying of two images and subsequent inpainting | [Image to Image Inpainting Stable Diffusion](#image-to-image-inpainting-stable-diffusion) | - | [Alex McKinney](https://github.com/vvvm23) |
|
||||
| Text Based Inpainting Stable Diffusion | Stable Diffusion Inpainting Pipeline that enables passing a text prompt to generate the mask for inpainting | [Text Based Inpainting Stable Diffusion](#text-based-inpainting-stable-diffusion) | - | [Dhruv Karan](https://github.com/unography) |
|
||||
@@ -41,8 +41,8 @@ Please also check out our [Community Scripts](https://github.com/huggingface/dif
|
||||
| DDIM Noise Comparative Analysis Pipeline | Investigating how the diffusion models learn visual concepts from each noise level (which is a contribution of [P2 weighting (CVPR 2022)](https://arxiv.org/abs/2204.00227)) | [DDIM Noise Comparative Analysis Pipeline](#ddim-noise-comparative-analysis-pipeline) | - | [Aengus (Duc-Anh)](https://github.com/aengusng8) |
|
||||
| CLIP Guided Img2Img Stable Diffusion Pipeline | Doing CLIP guidance for image to image generation with Stable Diffusion | [CLIP Guided Img2Img Stable Diffusion](#clip-guided-img2img-stable-diffusion) | - | [Nipun Jindal](https://github.com/nipunjindal/) |
|
||||
| TensorRT Stable Diffusion Text to Image Pipeline | Accelerates the Stable Diffusion Text2Image Pipeline using TensorRT | [TensorRT Stable Diffusion Text to Image Pipeline](#tensorrt-text2image-stable-diffusion-pipeline) | - | [Asfiya Baig](https://github.com/asfiyab-nvidia) |
|
||||
| EDICT Image Editing Pipeline | Diffusion pipeline for text-guided image editing | [EDICT Image Editing Pipeline](#edict-image-editing-pipeline) | [Notebook](https://github.com/huggingface/notebooks/blob/main/diffusers/edict_image_pipeline.ipynb) | [Joqsan Azocar](https://github.com/Joqsan) |
|
||||
| Stable Diffusion RePaint | Stable Diffusion pipeline using [RePaint](https://arxiv.org/abs/2201.09865) for inpainting. | [Stable Diffusion RePaint](#stable-diffusion-repaint )|[Notebook](https://github.com/huggingface/notebooks/blob/main/diffusers/stable_diffusion_repaint.ipynb)| [Markus Pobitzer](https://github.com/Markus-Pobitzer) |
|
||||
| EDICT Image Editing Pipeline | Diffusion pipeline for text-guided image editing | [EDICT Image Editing Pipeline](#edict-image-editing-pipeline) | - | [Joqsan Azocar](https://github.com/Joqsan) |
|
||||
| Stable Diffusion RePaint | Stable Diffusion pipeline using [RePaint](https://arxiv.org/abs/2201.09865) for inpainting. | [Stable Diffusion RePaint](#stable-diffusion-repaint ) | - | [Markus Pobitzer](https://github.com/Markus-Pobitzer) |
|
||||
| TensorRT Stable Diffusion Image to Image Pipeline | Accelerates the Stable Diffusion Image2Image Pipeline using TensorRT | [TensorRT Stable Diffusion Image to Image Pipeline](#tensorrt-image2image-stable-diffusion-pipeline) | - | [Asfiya Baig](https://github.com/asfiyab-nvidia) |
|
||||
| Stable Diffusion IPEX Pipeline | Accelerate Stable Diffusion inference pipeline with BF16/FP32 precision on Intel Xeon CPUs with [IPEX](https://github.com/intel/intel-extension-for-pytorch) | [Stable Diffusion on IPEX](#stable-diffusion-on-ipex) | - | [Yingjie Han](https://github.com/yingjie-han/) |
|
||||
| CLIP Guided Images Mixing Stable Diffusion Pipeline | Сombine images using usual diffusion models. | [CLIP Guided Images Mixing Using Stable Diffusion](#clip-guided-images-mixing-with-stable-diffusion) | - | [Karachev Denis](https://github.com/TheDenk) |
|
||||
@@ -67,7 +67,7 @@ Please also check out our [Community Scripts](https://github.com/huggingface/dif
|
||||
| Rerender A Video Pipeline | Implementation of [[SIGGRAPH Asia 2023] Rerender A Video: Zero-Shot Text-Guided Video-to-Video Translation](https://arxiv.org/abs/2306.07954) | [Rerender A Video Pipeline](#rerender-a-video) | - | [Yifan Zhou](https://github.com/SingleZombie) |
|
||||
| StyleAligned Pipeline | Implementation of [Style Aligned Image Generation via Shared Attention](https://arxiv.org/abs/2312.02133) | [StyleAligned Pipeline](#stylealigned-pipeline) | [](https://drive.google.com/file/d/15X2E0jFPTajUIjS0FzX50OaHsCbP2lQ0/view?usp=sharing) | [Aryan V S](https://github.com/a-r-r-o-w) |
|
||||
| AnimateDiff Image-To-Video Pipeline | Experimental Image-To-Video support for AnimateDiff (open to improvements) | [AnimateDiff Image To Video Pipeline](#animatediff-image-to-video-pipeline) | [](https://drive.google.com/file/d/1TvzCDPHhfFtdcJZe4RLloAwyoLKuttWK/view?usp=sharing) | [Aryan V S](https://github.com/a-r-r-o-w) |
|
||||
| IP Adapter FaceID Stable Diffusion | Stable Diffusion Pipeline that supports IP Adapter Face ID | [IP Adapter Face ID](#ip-adapter-face-id) |[Notebook](https://github.com/huggingface/notebooks/blob/main/diffusers/ip_adapter_face_id.ipynb)| [Fabio Rigano](https://github.com/fabiorigano) |
|
||||
| IP Adapter FaceID Stable Diffusion | Stable Diffusion Pipeline that supports IP Adapter Face ID | [IP Adapter Face ID](#ip-adapter-face-id) | - | [Fabio Rigano](https://github.com/fabiorigano) |
|
||||
| InstantID Pipeline | Stable Diffusion XL Pipeline that supports InstantID | [InstantID Pipeline](#instantid-pipeline) | [](https://huggingface.co/spaces/InstantX/InstantID) | [Haofan Wang](https://github.com/haofanwang) |
|
||||
| UFOGen Scheduler | Scheduler for UFOGen Model (compatible with Stable Diffusion pipelines) | [UFOGen Scheduler](#ufogen-scheduler) | - | [dg845](https://github.com/dg845) |
|
||||
| Stable Diffusion XL IPEX Pipeline | Accelerate Stable Diffusion XL inference pipeline with BF16/FP32 precision on Intel Xeon CPUs with [IPEX](https://github.com/intel/intel-extension-for-pytorch) | [Stable Diffusion XL on IPEX](#stable-diffusion-xl-on-ipex) | - | [Dan Li](https://github.com/ustcuna/) |
|
||||
@@ -251,30 +251,24 @@ Example usage:
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
model_name = "black-forest-labs/FLUX.1-dev"
|
||||
prompt = "a watercolor painting of a unicorn"
|
||||
negative_prompt = "pink"
|
||||
|
||||
# Load the diffusion pipeline
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
model_name,
|
||||
"black-forest-labs/FLUX.1-dev",
|
||||
torch_dtype=torch.bfloat16,
|
||||
custom_pipeline="pipeline_flux_with_cfg"
|
||||
)
|
||||
pipeline.enable_model_cpu_offload()
|
||||
prompt = "a watercolor painting of a unicorn"
|
||||
negative_prompt = "pink"
|
||||
|
||||
# Generate the image
|
||||
img = pipeline(
|
||||
prompt=prompt,
|
||||
negative_prompt=negative_prompt,
|
||||
true_cfg=1.5,
|
||||
guidance_scale=3.5,
|
||||
num_images_per_prompt=1,
|
||||
generator=torch.manual_seed(0)
|
||||
).images[0]
|
||||
|
||||
# Save the generated image
|
||||
img.save("cfg_flux.png")
|
||||
print("Image generated and saved successfully.")
|
||||
```
|
||||
|
||||
### Differential Diffusion
|
||||
@@ -847,8 +841,6 @@ out = pipe(
|
||||
wildcard_files=["object.txt", "animal.txt"],
|
||||
num_prompt_samples=1
|
||||
)
|
||||
out.images[0].save("image.png")
|
||||
torch.cuda.empty_cache()
|
||||
```
|
||||
|
||||
### Composable Stable diffusion
|
||||
@@ -2625,17 +2617,16 @@ for obj in range(bs):
|
||||
|
||||
### Stable Diffusion XL Reference
|
||||
|
||||
This pipeline uses the Reference. Refer to the [Stable Diffusion Reference](https://github.com/huggingface/diffusers/blob/main/examples/community/README.md#stable-diffusion-reference) section for more information.
|
||||
This pipeline uses the Reference. Refer to the [stable_diffusion_reference](https://github.com/huggingface/diffusers/blob/main/examples/community/README.md#stable-diffusion-reference).
|
||||
|
||||
```py
|
||||
import torch
|
||||
# from diffusers import DiffusionPipeline
|
||||
from PIL import Image
|
||||
from diffusers.utils import load_image
|
||||
from diffusers import DiffusionPipeline
|
||||
from diffusers.schedulers import UniPCMultistepScheduler
|
||||
|
||||
from .stable_diffusion_xl_reference import StableDiffusionXLReferencePipeline
|
||||
|
||||
input_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl_reference_input_cat.jpg")
|
||||
input_image = load_image("https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png")
|
||||
|
||||
# pipe = DiffusionPipeline.from_pretrained(
|
||||
# "stabilityai/stable-diffusion-xl-base-1.0",
|
||||
@@ -2653,7 +2644,7 @@ pipe = StableDiffusionXLReferencePipeline.from_pretrained(
|
||||
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
result_img = pipe(ref_image=input_image,
|
||||
prompt="a dog",
|
||||
prompt="1girl",
|
||||
num_inference_steps=20,
|
||||
reference_attn=True,
|
||||
reference_adain=True).images[0]
|
||||
@@ -2661,14 +2652,14 @@ result_img = pipe(ref_image=input_image,
|
||||
|
||||
Reference Image
|
||||
|
||||

|
||||

|
||||
|
||||
Output Image
|
||||
|
||||
`prompt: a dog`
|
||||
`prompt: 1 girl`
|
||||
|
||||
`reference_attn=False, reference_adain=True, num_inference_steps=20`
|
||||

|
||||
`reference_attn=True, reference_adain=True, num_inference_steps=20`
|
||||

|
||||
|
||||
Reference Image
|
||||

|
||||
@@ -2690,88 +2681,6 @@ Output Image
|
||||
`reference_attn=True, reference_adain=True, num_inference_steps=20`
|
||||

|
||||
|
||||
### Stable Diffusion XL ControlNet Reference
|
||||
|
||||
This pipeline uses the Reference Control and with ControlNet. Refer to the [Stable Diffusion ControlNet Reference](https://github.com/huggingface/diffusers/blob/main/examples/community/README.md#stable-diffusion-controlnet-reference) and [Stable Diffusion XL Reference](https://github.com/huggingface/diffusers/blob/main/examples/community/README.md#stable-diffusion-xl-reference) sections for more information.
|
||||
|
||||
```py
|
||||
from diffusers import ControlNetModel, AutoencoderKL
|
||||
from diffusers.schedulers import UniPCMultistepScheduler
|
||||
from diffusers.utils import load_image
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
import cv2
|
||||
from PIL import Image
|
||||
|
||||
from .stable_diffusion_xl_controlnet_reference import StableDiffusionXLControlNetReferencePipeline
|
||||
|
||||
# download an image
|
||||
canny_image = load_image(
|
||||
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl_reference_input_cat.jpg"
|
||||
)
|
||||
|
||||
ref_image = load_image(
|
||||
"https://hf.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png"
|
||||
)
|
||||
|
||||
# initialize the models and pipeline
|
||||
controlnet_conditioning_scale = 0.5 # recommended for good generalization
|
||||
controlnet = ControlNetModel.from_pretrained(
|
||||
"diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16
|
||||
)
|
||||
vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
|
||||
pipe = StableDiffusionXLControlNetReferencePipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, vae=vae, torch_dtype=torch.float16
|
||||
).to("cuda:0")
|
||||
|
||||
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
# get canny image
|
||||
image = np.array(canny_image)
|
||||
image = cv2.Canny(image, 100, 200)
|
||||
image = image[:, :, None]
|
||||
image = np.concatenate([image, image, image], axis=2)
|
||||
canny_image = Image.fromarray(image)
|
||||
|
||||
# generate image
|
||||
image = pipe(
|
||||
prompt="a cat",
|
||||
num_inference_steps=20,
|
||||
controlnet_conditioning_scale=controlnet_conditioning_scale,
|
||||
image=canny_image,
|
||||
ref_image=ref_image,
|
||||
reference_attn=False,
|
||||
reference_adain=True,
|
||||
style_fidelity=1.0,
|
||||
generator=torch.Generator("cuda").manual_seed(42)
|
||||
).images[0]
|
||||
```
|
||||
|
||||
Canny ControlNet Image
|
||||
|
||||

|
||||
|
||||
Reference Image
|
||||
|
||||

|
||||
|
||||
Output Image
|
||||
|
||||
`prompt: a cat`
|
||||
|
||||
`reference_attn=True, reference_adain=True, num_inference_steps=20, style_fidelity=1.0`
|
||||
|
||||

|
||||
|
||||
`reference_attn=False, reference_adain=True, num_inference_steps=20, style_fidelity=1.0`
|
||||
|
||||

|
||||
|
||||
`reference_attn=True, reference_adain=False, num_inference_steps=20, style_fidelity=1.0`
|
||||
|
||||

|
||||
|
||||
### Stable diffusion fabric pipeline
|
||||
|
||||
FABRIC approach applicable to a wide range of popular diffusion models, which exploits
|
||||
@@ -3467,20 +3376,6 @@ best quality, 3persons in garden, a boy blue shirt BREAK
|
||||
best quality, 3persons in garden, an old man red suit
|
||||
```
|
||||
|
||||
### Use base prompt
|
||||
|
||||
You can use a base prompt to apply the prompt to all areas. You can set a base prompt by adding `ADDBASE` at the end. Base prompts can also be combined with common prompts, but the base prompt must be specified first.
|
||||
|
||||
```
|
||||
2d animation style ADDBASE
|
||||
masterpiece, high quality ADDCOMM
|
||||
(blue sky)++ BREAK
|
||||
green hair twintail BREAK
|
||||
book shelf BREAK
|
||||
messy desk BREAK
|
||||
orange++ dress and sofa
|
||||
```
|
||||
|
||||
### Negative prompt
|
||||
|
||||
Negative prompts are equally effective across all regions, but it is possible to set region-specific prompts for negative prompts as well. The number of BREAKs must be the same as the number of prompts. If the number of prompts does not match, the negative prompts will be used without being divided into regions.
|
||||
@@ -3511,7 +3406,6 @@ pipe(prompt=prompt, rp_args=rp_args)
|
||||
### Optional Parameters
|
||||
|
||||
- `save_mask`: In `Prompt` mode, choose whether to output the generated mask along with the image. The default is `False`.
|
||||
- `base_ratio`: Used with `ADDBASE`. Sets the ratio of the base prompt; if base ratio is set to 0.2, then resulting images will consist of `20%*BASE_PROMPT + 80%*REGION_PROMPT`
|
||||
|
||||
The Pipeline supports `compel` syntax. Input prompts using the `compel` structure will be automatically applied and processed.
|
||||
|
||||
@@ -4800,4 +4694,4 @@ with torch.no_grad():
|
||||
```
|
||||
|
||||
In the folder examples/pixart there is also a script that can be used to train new models.
|
||||
Please check the script `train_controlnet_hf_diffusers.sh` on how to start the training.
|
||||
Please check the script `train_controlnet_hf_diffusers.sh` on how to start the training.
|
||||
@@ -6,9 +6,9 @@ If a community script doesn't work as expected, please open an issue and ping th
|
||||
|
||||
| Example | Description | Code Example | Colab | Author |
|
||||
|:--------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------:|
|
||||
| Using IP-Adapter with Negative Noise | Using negative noise with IP-adapter to better control the generation (see the [original post](https://github.com/huggingface/diffusers/discussions/7167) on the forum for more details) | [IP-Adapter Negative Noise](#ip-adapter-negative-noise) |[Notebook](https://github.com/huggingface/notebooks/blob/main/diffusers/ip_adapter_negative_noise.ipynb) | [Álvaro Somoza](https://github.com/asomoza)|
|
||||
| Asymmetric Tiling |configure seamless image tiling independently for the X and Y axes | [Asymmetric Tiling](#Asymmetric-Tiling ) |[Notebook](https://github.com/huggingface/notebooks/blob/main/diffusers/asymetric_tiling.ipynb) | [alexisrolland](https://github.com/alexisrolland)|
|
||||
| Prompt Scheduling Callback |Allows changing prompts during a generation | [Prompt Scheduling-Callback](#Prompt-Scheduling-Callback ) |[Notebook](https://github.com/huggingface/notebooks/blob/main/diffusers/prompt_scheduling_callback.ipynb) | [hlky](https://github.com/hlky)|
|
||||
| Using IP-Adapter with Negative Noise | Using negative noise with IP-adapter to better control the generation (see the [original post](https://github.com/huggingface/diffusers/discussions/7167) on the forum for more details) | [IP-Adapter Negative Noise](#ip-adapter-negative-noise) | https://github.com/huggingface/notebooks/blob/main/diffusers/ip_adapter_negative_noise.ipynb | [Álvaro Somoza](https://github.com/asomoza)|
|
||||
| Asymmetric Tiling |configure seamless image tiling independently for the X and Y axes | [Asymmetric Tiling](#Asymmetric-Tiling ) |https://github.com/huggingface/notebooks/blob/main/diffusers/asymetric_tiling.ipynb | [alexisrolland](https://github.com/alexisrolland)|
|
||||
| Prompt Scheduling Callback |Allows changing prompts during a generation | [Prompt Scheduling-Callback](#Prompt-Scheduling-Callback ) |https://github.com/huggingface/notebooks/blob/main/diffusers/prompt_scheduling_callback.ipynb | [hlky](https://github.com/hlky)|
|
||||
|
||||
|
||||
## Example usages
|
||||
@@ -241,15 +241,27 @@ from diffusers import StableDiffusionPipeline
|
||||
from diffusers.callbacks import PipelineCallback, MultiPipelineCallbacks
|
||||
from diffusers.configuration_utils import register_to_config
|
||||
import torch
|
||||
from typing import Any, Dict, Tuple, Union
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
|
||||
class SDPromptSchedulingCallback(PipelineCallback):
|
||||
pipeline: StableDiffusionPipeline = StableDiffusionPipeline.from_pretrained(
|
||||
"stable-diffusion-v1-5/stable-diffusion-v1-5",
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
use_safetensors=True,
|
||||
).to("cuda")
|
||||
pipeline.safety_checker = None
|
||||
pipeline.requires_safety_checker = False
|
||||
|
||||
|
||||
class SDPromptScheduleCallback(PipelineCallback):
|
||||
@register_to_config
|
||||
def __init__(
|
||||
self,
|
||||
encoded_prompt: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
|
||||
cutoff_step_ratio=None,
|
||||
prompt: str,
|
||||
negative_prompt: Optional[str] = None,
|
||||
num_images_per_prompt: int = 1,
|
||||
cutoff_step_ratio=1.0,
|
||||
cutoff_step_index=None,
|
||||
):
|
||||
super().__init__(
|
||||
@@ -263,10 +275,6 @@ class SDPromptSchedulingCallback(PipelineCallback):
|
||||
) -> Dict[str, Any]:
|
||||
cutoff_step_ratio = self.config.cutoff_step_ratio
|
||||
cutoff_step_index = self.config.cutoff_step_index
|
||||
if isinstance(self.config.encoded_prompt, tuple):
|
||||
prompt_embeds, negative_prompt_embeds = self.config.encoded_prompt
|
||||
else:
|
||||
prompt_embeds = self.config.encoded_prompt
|
||||
|
||||
# Use cutoff_step_index if it's not None, otherwise use cutoff_step_ratio
|
||||
cutoff_step = (
|
||||
@@ -276,164 +284,34 @@ class SDPromptSchedulingCallback(PipelineCallback):
|
||||
)
|
||||
|
||||
if step_index == cutoff_step:
|
||||
prompt_embeds, negative_prompt_embeds = pipeline.encode_prompt(
|
||||
prompt=self.config.prompt,
|
||||
negative_prompt=self.config.negative_prompt,
|
||||
device=pipeline._execution_device,
|
||||
num_images_per_prompt=self.config.num_images_per_prompt,
|
||||
do_classifier_free_guidance=pipeline.do_classifier_free_guidance,
|
||||
)
|
||||
if pipeline.do_classifier_free_guidance:
|
||||
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
|
||||
callback_kwargs[self.tensor_inputs[0]] = prompt_embeds
|
||||
return callback_kwargs
|
||||
|
||||
|
||||
pipeline: StableDiffusionPipeline = StableDiffusionPipeline.from_pretrained(
|
||||
"stable-diffusion-v1-5/stable-diffusion-v1-5",
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
use_safetensors=True,
|
||||
).to("cuda")
|
||||
pipeline.safety_checker = None
|
||||
pipeline.requires_safety_checker = False
|
||||
|
||||
callback = MultiPipelineCallbacks(
|
||||
[
|
||||
SDPromptSchedulingCallback(
|
||||
encoded_prompt=pipeline.encode_prompt(
|
||||
prompt=f"prompt {index}",
|
||||
negative_prompt=f"negative prompt {index}",
|
||||
device=pipeline._execution_device,
|
||||
num_images_per_prompt=1,
|
||||
# pipeline.do_classifier_free_guidance can't be accessed until after pipeline is ran
|
||||
do_classifier_free_guidance=True,
|
||||
),
|
||||
cutoff_step_index=index,
|
||||
) for index in range(1, 20)
|
||||
SDPromptScheduleCallback(
|
||||
prompt="Official portrait of a smiling world war ii general, female, cheerful, happy, detailed face, 20th century, highly detailed, cinematic lighting, digital art painting by Greg Rutkowski",
|
||||
negative_prompt="Deformed, ugly, bad anatomy",
|
||||
cutoff_step_ratio=0.25,
|
||||
)
|
||||
]
|
||||
)
|
||||
|
||||
image = pipeline(
|
||||
prompt="prompt"
|
||||
negative_prompt="negative prompt",
|
||||
prompt="Official portrait of a smiling world war ii general, male, cheerful, happy, detailed face, 20th century, highly detailed, cinematic lighting, digital art painting by Greg Rutkowski",
|
||||
negative_prompt="Deformed, ugly, bad anatomy",
|
||||
callback_on_step_end=callback,
|
||||
callback_on_step_end_tensor_inputs=["prompt_embeds"],
|
||||
).images[0]
|
||||
torch.cuda.empty_cache()
|
||||
image.save('image.png')
|
||||
```
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
from diffusers.callbacks import PipelineCallback, MultiPipelineCallbacks
|
||||
from diffusers.configuration_utils import register_to_config
|
||||
import torch
|
||||
from typing import Any, Dict, Tuple, Union
|
||||
|
||||
|
||||
class SDXLPromptSchedulingCallback(PipelineCallback):
|
||||
@register_to_config
|
||||
def __init__(
|
||||
self,
|
||||
encoded_prompt: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
|
||||
add_text_embeds: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
|
||||
add_time_ids: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
|
||||
cutoff_step_ratio=None,
|
||||
cutoff_step_index=None,
|
||||
):
|
||||
super().__init__(
|
||||
cutoff_step_ratio=cutoff_step_ratio, cutoff_step_index=cutoff_step_index
|
||||
)
|
||||
|
||||
tensor_inputs = ["prompt_embeds", "add_text_embeds", "add_time_ids"]
|
||||
|
||||
def callback_fn(
|
||||
self, pipeline, step_index, timestep, callback_kwargs
|
||||
) -> Dict[str, Any]:
|
||||
cutoff_step_ratio = self.config.cutoff_step_ratio
|
||||
cutoff_step_index = self.config.cutoff_step_index
|
||||
if isinstance(self.config.encoded_prompt, tuple):
|
||||
prompt_embeds, negative_prompt_embeds = self.config.encoded_prompt
|
||||
else:
|
||||
prompt_embeds = self.config.encoded_prompt
|
||||
if isinstance(self.config.add_text_embeds, tuple):
|
||||
add_text_embeds, negative_add_text_embeds = self.config.add_text_embeds
|
||||
else:
|
||||
add_text_embeds = self.config.add_text_embeds
|
||||
if isinstance(self.config.add_time_ids, tuple):
|
||||
add_time_ids, negative_add_time_ids = self.config.add_time_ids
|
||||
else:
|
||||
add_time_ids = self.config.add_time_ids
|
||||
|
||||
# Use cutoff_step_index if it's not None, otherwise use cutoff_step_ratio
|
||||
cutoff_step = (
|
||||
cutoff_step_index
|
||||
if cutoff_step_index is not None
|
||||
else int(pipeline.num_timesteps * cutoff_step_ratio)
|
||||
)
|
||||
|
||||
if step_index == cutoff_step:
|
||||
if pipeline.do_classifier_free_guidance:
|
||||
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
|
||||
add_text_embeds = torch.cat([negative_add_text_embeds, add_text_embeds])
|
||||
add_time_ids = torch.cat([negative_add_time_ids, add_time_ids])
|
||||
callback_kwargs[self.tensor_inputs[0]] = prompt_embeds
|
||||
callback_kwargs[self.tensor_inputs[1]] = add_text_embeds
|
||||
callback_kwargs[self.tensor_inputs[2]] = add_time_ids
|
||||
return callback_kwargs
|
||||
|
||||
|
||||
pipeline: StableDiffusionXLPipeline = StableDiffusionXLPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
use_safetensors=True,
|
||||
).to("cuda")
|
||||
|
||||
callbacks = []
|
||||
for index in range(1, 20):
|
||||
(
|
||||
prompt_embeds,
|
||||
negative_prompt_embeds,
|
||||
pooled_prompt_embeds,
|
||||
negative_pooled_prompt_embeds,
|
||||
) = pipeline.encode_prompt(
|
||||
prompt=f"prompt {index}",
|
||||
negative_prompt=f"prompt {index}",
|
||||
device=pipeline._execution_device,
|
||||
num_images_per_prompt=1,
|
||||
# pipeline.do_classifier_free_guidance can't be accessed until after pipeline is ran
|
||||
do_classifier_free_guidance=True,
|
||||
)
|
||||
text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
|
||||
add_time_ids = pipeline._get_add_time_ids(
|
||||
(1024, 1024),
|
||||
(0, 0),
|
||||
(1024, 1024),
|
||||
dtype=prompt_embeds.dtype,
|
||||
text_encoder_projection_dim=text_encoder_projection_dim,
|
||||
)
|
||||
negative_add_time_ids = pipeline._get_add_time_ids(
|
||||
(1024, 1024),
|
||||
(0, 0),
|
||||
(1024, 1024),
|
||||
dtype=prompt_embeds.dtype,
|
||||
text_encoder_projection_dim=text_encoder_projection_dim,
|
||||
)
|
||||
callbacks.append(
|
||||
SDXLPromptSchedulingCallback(
|
||||
encoded_prompt=(prompt_embeds, negative_prompt_embeds),
|
||||
add_text_embeds=(pooled_prompt_embeds, negative_pooled_prompt_embeds),
|
||||
add_time_ids=(add_time_ids, negative_add_time_ids),
|
||||
cutoff_step_index=index,
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
callback = MultiPipelineCallbacks(callbacks)
|
||||
|
||||
image = pipeline(
|
||||
prompt="prompt",
|
||||
negative_prompt="negative prompt",
|
||||
callback_on_step_end=callback,
|
||||
callback_on_step_end_tensor_inputs=[
|
||||
"prompt_embeds",
|
||||
"add_text_embeds",
|
||||
"add_time_ids",
|
||||
],
|
||||
).images[0]
|
||||
```
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -3,12 +3,13 @@ from typing import Dict, Optional
|
||||
|
||||
import torch
|
||||
import torchvision.transforms.functional as FF
|
||||
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
|
||||
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
|
||||
|
||||
from diffusers import StableDiffusionPipeline
|
||||
from diffusers.models import AutoencoderKL, UNet2DConditionModel
|
||||
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
|
||||
from diffusers.schedulers import KarrasDiffusionSchedulers
|
||||
from diffusers.utils import USE_PEFT_BACKEND
|
||||
|
||||
|
||||
try:
|
||||
@@ -16,7 +17,6 @@ try:
|
||||
except ImportError:
|
||||
Compel = None
|
||||
|
||||
KBASE = "ADDBASE"
|
||||
KCOMM = "ADDCOMM"
|
||||
KBRK = "BREAK"
|
||||
|
||||
@@ -34,11 +34,6 @@ class RegionalPromptingStableDiffusionPipeline(StableDiffusionPipeline):
|
||||
|
||||
Optional
|
||||
rp_args["save_mask"]: True/False (save masks in prompt mode)
|
||||
rp_args["power"]: int (power for attention maps in prompt mode)
|
||||
rp_args["base_ratio"]:
|
||||
float (Sets the ratio of the base prompt)
|
||||
ex) 0.2 (20%*BASE_PROMPT + 80%*REGION_PROMPT)
|
||||
[Use base prompt](https://github.com/hako-mikan/sd-webui-regional-prompter?tab=readme-ov-file#use-base-prompt)
|
||||
|
||||
Pipeline for text-to-image generation using Stable Diffusion.
|
||||
|
||||
@@ -75,7 +70,6 @@ class RegionalPromptingStableDiffusionPipeline(StableDiffusionPipeline):
|
||||
scheduler: KarrasDiffusionSchedulers,
|
||||
safety_checker: StableDiffusionSafetyChecker,
|
||||
feature_extractor: CLIPImageProcessor,
|
||||
image_encoder: CLIPVisionModelWithProjection = None,
|
||||
requires_safety_checker: bool = True,
|
||||
):
|
||||
super().__init__(
|
||||
@@ -86,7 +80,6 @@ class RegionalPromptingStableDiffusionPipeline(StableDiffusionPipeline):
|
||||
scheduler,
|
||||
safety_checker,
|
||||
feature_extractor,
|
||||
image_encoder,
|
||||
requires_safety_checker,
|
||||
)
|
||||
self.register_modules(
|
||||
@@ -97,7 +90,6 @@ class RegionalPromptingStableDiffusionPipeline(StableDiffusionPipeline):
|
||||
scheduler=scheduler,
|
||||
safety_checker=safety_checker,
|
||||
feature_extractor=feature_extractor,
|
||||
image_encoder=image_encoder,
|
||||
)
|
||||
|
||||
@torch.no_grad()
|
||||
@@ -118,40 +110,17 @@ class RegionalPromptingStableDiffusionPipeline(StableDiffusionPipeline):
|
||||
rp_args: Dict[str, str] = None,
|
||||
):
|
||||
active = KBRK in prompt[0] if isinstance(prompt, list) else KBRK in prompt
|
||||
use_base = KBASE in prompt[0] if isinstance(prompt, list) else KBASE in prompt
|
||||
if negative_prompt is None:
|
||||
negative_prompt = "" if isinstance(prompt, str) else [""] * len(prompt)
|
||||
|
||||
device = self._execution_device
|
||||
regions = 0
|
||||
|
||||
self.base_ratio = float(rp_args["base_ratio"]) if "base_ratio" in rp_args else 0.0
|
||||
self.power = int(rp_args["power"]) if "power" in rp_args else 1
|
||||
|
||||
prompts = prompt if isinstance(prompt, list) else [prompt]
|
||||
n_prompts = negative_prompt if isinstance(prompt, list) else [negative_prompt]
|
||||
n_prompts = negative_prompt if isinstance(prompt, str) else [negative_prompt]
|
||||
self.batch = batch = num_images_per_prompt * len(prompts)
|
||||
|
||||
if use_base:
|
||||
bases = prompts.copy()
|
||||
n_bases = n_prompts.copy()
|
||||
|
||||
for i, prompt in enumerate(prompts):
|
||||
parts = prompt.split(KBASE)
|
||||
if len(parts) == 2:
|
||||
bases[i], prompts[i] = parts
|
||||
elif len(parts) > 2:
|
||||
raise ValueError(f"Multiple instances of {KBASE} found in prompt: {prompt}")
|
||||
for i, prompt in enumerate(n_prompts):
|
||||
n_parts = prompt.split(KBASE)
|
||||
if len(n_parts) == 2:
|
||||
n_bases[i], n_prompts[i] = n_parts
|
||||
elif len(n_parts) > 2:
|
||||
raise ValueError(f"Multiple instances of {KBASE} found in negative prompt: {prompt}")
|
||||
|
||||
all_bases_cn, _ = promptsmaker(bases, num_images_per_prompt)
|
||||
all_n_bases_cn, _ = promptsmaker(n_bases, num_images_per_prompt)
|
||||
|
||||
all_prompts_cn, all_prompts_p = promptsmaker(prompts, num_images_per_prompt)
|
||||
all_n_prompts_cn, _ = promptsmaker(n_prompts, num_images_per_prompt)
|
||||
|
||||
@@ -168,16 +137,8 @@ class RegionalPromptingStableDiffusionPipeline(StableDiffusionPipeline):
|
||||
|
||||
conds = getcompelembs(all_prompts_cn)
|
||||
unconds = getcompelembs(all_n_prompts_cn)
|
||||
base_embs = getcompelembs(all_bases_cn) if use_base else None
|
||||
base_n_embs = getcompelembs(all_n_bases_cn) if use_base else None
|
||||
# When using base, it seems more reasonable to use base prompts as prompt_embeddings rather than regional prompts
|
||||
embs = getcompelembs(prompts) if not use_base else base_embs
|
||||
n_embs = getcompelembs(n_prompts) if not use_base else base_n_embs
|
||||
|
||||
if use_base and self.base_ratio > 0:
|
||||
conds = self.base_ratio * base_embs + (1 - self.base_ratio) * conds
|
||||
unconds = self.base_ratio * base_n_embs + (1 - self.base_ratio) * unconds
|
||||
|
||||
embs = getcompelembs(prompts)
|
||||
n_embs = getcompelembs(n_prompts)
|
||||
prompt = negative_prompt = None
|
||||
else:
|
||||
conds = self.encode_prompt(prompts, device, 1, True)[0]
|
||||
@@ -186,18 +147,6 @@ class RegionalPromptingStableDiffusionPipeline(StableDiffusionPipeline):
|
||||
if equal
|
||||
else self.encode_prompt(all_n_prompts_cn, device, 1, True)[0]
|
||||
)
|
||||
|
||||
if use_base and self.base_ratio > 0:
|
||||
base_embs = self.encode_prompt(bases, device, 1, True)[0]
|
||||
base_n_embs = (
|
||||
self.encode_prompt(n_bases, device, 1, True)[0]
|
||||
if equal
|
||||
else self.encode_prompt(all_n_bases_cn, device, 1, True)[0]
|
||||
)
|
||||
|
||||
conds = self.base_ratio * base_embs + (1 - self.base_ratio) * conds
|
||||
unconds = self.base_ratio * base_n_embs + (1 - self.base_ratio) * unconds
|
||||
|
||||
embs = n_embs = None
|
||||
|
||||
if not active:
|
||||
@@ -276,6 +225,8 @@ class RegionalPromptingStableDiffusionPipeline(StableDiffusionPipeline):
|
||||
|
||||
residual = hidden_states
|
||||
|
||||
args = () if USE_PEFT_BACKEND else (scale,)
|
||||
|
||||
if attn.spatial_norm is not None:
|
||||
hidden_states = attn.spatial_norm(hidden_states, temb)
|
||||
|
||||
@@ -296,15 +247,16 @@ class RegionalPromptingStableDiffusionPipeline(StableDiffusionPipeline):
|
||||
if attn.group_norm is not None:
|
||||
hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
|
||||
|
||||
query = attn.to_q(hidden_states)
|
||||
args = () if USE_PEFT_BACKEND else (scale,)
|
||||
query = attn.to_q(hidden_states, *args)
|
||||
|
||||
if encoder_hidden_states is None:
|
||||
encoder_hidden_states = hidden_states
|
||||
elif attn.norm_cross:
|
||||
encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
|
||||
|
||||
key = attn.to_k(encoder_hidden_states)
|
||||
value = attn.to_v(encoder_hidden_states)
|
||||
key = attn.to_k(encoder_hidden_states, *args)
|
||||
value = attn.to_v(encoder_hidden_states, *args)
|
||||
|
||||
inner_dim = key.shape[-1]
|
||||
head_dim = inner_dim // attn.heads
|
||||
@@ -331,7 +283,7 @@ class RegionalPromptingStableDiffusionPipeline(StableDiffusionPipeline):
|
||||
hidden_states = hidden_states.to(query.dtype)
|
||||
|
||||
# linear proj
|
||||
hidden_states = attn.to_out[0](hidden_states)
|
||||
hidden_states = attn.to_out[0](hidden_states, *args)
|
||||
# dropout
|
||||
hidden_states = attn.to_out[1](hidden_states)
|
||||
|
||||
@@ -458,9 +410,9 @@ def promptsmaker(prompts, batch):
|
||||
add = ""
|
||||
if KCOMM in prompt:
|
||||
add, prompt = prompt.split(KCOMM)
|
||||
add = add.strip() + " "
|
||||
prompts = [p.strip() for p in prompt.split(KBRK)]
|
||||
out_p.append([add + p for i, p in enumerate(prompts)])
|
||||
add = add + " "
|
||||
prompts = prompt.split(KBRK)
|
||||
out_p.append([add + p for p in prompts])
|
||||
out = [None] * batch * len(out_p[0]) * len(out_p)
|
||||
for p, prs in enumerate(out_p): # inputs prompts
|
||||
for r, pr in enumerate(prs): # prompts for regions
|
||||
@@ -497,6 +449,7 @@ def make_cells(ratios):
|
||||
add = []
|
||||
startend(add, inratios[1:])
|
||||
icells.append(add)
|
||||
|
||||
return ocells, icells, sum(len(cell) for cell in icells)
|
||||
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,6 +1,5 @@
|
||||
# Based on stable_diffusion_reference.py
|
||||
|
||||
import inspect
|
||||
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
@@ -8,33 +7,28 @@ import PIL.Image
|
||||
import torch
|
||||
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
|
||||
from diffusers.image_processor import PipelineImageInput
|
||||
from diffusers.models.attention import BasicTransformerBlock
|
||||
from diffusers.models.unets.unet_2d_blocks import CrossAttnDownBlock2D, CrossAttnUpBlock2D, DownBlock2D, UpBlock2D
|
||||
from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
|
||||
from diffusers.utils import PIL_INTERPOLATION, deprecate, is_torch_xla_available, logging, replace_example_docstring
|
||||
from diffusers.models.unets.unet_2d_blocks import (
|
||||
CrossAttnDownBlock2D,
|
||||
CrossAttnUpBlock2D,
|
||||
DownBlock2D,
|
||||
UpBlock2D,
|
||||
)
|
||||
from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput
|
||||
from diffusers.utils import PIL_INTERPOLATION, logging
|
||||
from diffusers.utils.torch_utils import randn_tensor
|
||||
|
||||
|
||||
if is_torch_xla_available():
|
||||
import torch_xla.core.xla_model as xm # type: ignore
|
||||
|
||||
XLA_AVAILABLE = True
|
||||
else:
|
||||
XLA_AVAILABLE = False
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
EXAMPLE_DOC_STRING = """
|
||||
Examples:
|
||||
```py
|
||||
>>> import torch
|
||||
>>> from diffusers.schedulers import UniPCMultistepScheduler
|
||||
>>> from diffusers import UniPCMultistepScheduler
|
||||
>>> from diffusers.utils import load_image
|
||||
|
||||
>>> input_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl_reference_input_cat.jpg")
|
||||
>>> input_image = load_image("https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png")
|
||||
|
||||
>>> pipe = StableDiffusionXLReferencePipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
@@ -44,7 +38,7 @@ EXAMPLE_DOC_STRING = """
|
||||
|
||||
>>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
|
||||
>>> result_img = pipe(ref_image=input_image,
|
||||
prompt="a dog",
|
||||
prompt="1girl",
|
||||
num_inference_steps=20,
|
||||
reference_attn=True,
|
||||
reference_adain=True).images[0]
|
||||
@@ -62,6 +56,8 @@ def torch_dfs(model: torch.nn.Module):
|
||||
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
|
||||
|
||||
|
||||
def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
|
||||
"""
|
||||
Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
|
||||
@@ -76,102 +72,33 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
|
||||
return noise_cfg
|
||||
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
|
||||
def retrieve_timesteps(
|
||||
scheduler,
|
||||
num_inference_steps: Optional[int] = None,
|
||||
device: Optional[Union[str, torch.device]] = None,
|
||||
timesteps: Optional[List[int]] = None,
|
||||
sigmas: Optional[List[float]] = None,
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
|
||||
custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
|
||||
|
||||
Args:
|
||||
scheduler (`SchedulerMixin`):
|
||||
The scheduler to get timesteps from.
|
||||
num_inference_steps (`int`):
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
|
||||
must be `None`.
|
||||
device (`str` or `torch.device`, *optional*):
|
||||
The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
|
||||
timesteps (`List[int]`, *optional*):
|
||||
Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
|
||||
`num_inference_steps` and `sigmas` must be `None`.
|
||||
sigmas (`List[float]`, *optional*):
|
||||
Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
|
||||
`num_inference_steps` and `timesteps` must be `None`.
|
||||
|
||||
Returns:
|
||||
`Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
|
||||
second element is the number of inference steps.
|
||||
"""
|
||||
if timesteps is not None and sigmas is not None:
|
||||
raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
|
||||
if timesteps is not None:
|
||||
accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
|
||||
if not accepts_timesteps:
|
||||
raise ValueError(
|
||||
f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
|
||||
f" timestep schedules. Please check whether you are using the correct scheduler."
|
||||
)
|
||||
scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
|
||||
timesteps = scheduler.timesteps
|
||||
num_inference_steps = len(timesteps)
|
||||
elif sigmas is not None:
|
||||
accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
|
||||
if not accept_sigmas:
|
||||
raise ValueError(
|
||||
f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
|
||||
f" sigmas schedules. Please check whether you are using the correct scheduler."
|
||||
)
|
||||
scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
|
||||
timesteps = scheduler.timesteps
|
||||
num_inference_steps = len(timesteps)
|
||||
else:
|
||||
scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
|
||||
timesteps = scheduler.timesteps
|
||||
return timesteps, num_inference_steps
|
||||
|
||||
|
||||
class StableDiffusionXLReferencePipeline(StableDiffusionXLPipeline):
|
||||
def prepare_ref_latents(self, refimage, batch_size, dtype, device, generator, do_classifier_free_guidance):
|
||||
refimage = refimage.to(device=device)
|
||||
if self.vae.dtype == torch.float16 and self.vae.config.force_upcast:
|
||||
self.upcast_vae()
|
||||
refimage = refimage.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
|
||||
if refimage.dtype != self.vae.dtype:
|
||||
refimage = refimage.to(dtype=self.vae.dtype)
|
||||
# encode the mask image into latents space so we can concatenate it to the latents
|
||||
if isinstance(generator, list):
|
||||
ref_image_latents = [
|
||||
self.vae.encode(refimage[i : i + 1]).latent_dist.sample(generator=generator[i])
|
||||
for i in range(batch_size)
|
||||
]
|
||||
ref_image_latents = torch.cat(ref_image_latents, dim=0)
|
||||
else:
|
||||
ref_image_latents = self.vae.encode(refimage).latent_dist.sample(generator=generator)
|
||||
ref_image_latents = self.vae.config.scaling_factor * ref_image_latents
|
||||
def _default_height_width(self, height, width, image):
|
||||
# NOTE: It is possible that a list of images have different
|
||||
# dimensions for each image, so just checking the first image
|
||||
# is not _exactly_ correct, but it is simple.
|
||||
while isinstance(image, list):
|
||||
image = image[0]
|
||||
|
||||
# duplicate mask and ref_image_latents for each generation per prompt, using mps friendly method
|
||||
if ref_image_latents.shape[0] < batch_size:
|
||||
if not batch_size % ref_image_latents.shape[0] == 0:
|
||||
raise ValueError(
|
||||
"The passed images and the required batch size don't match. Images are supposed to be duplicated"
|
||||
f" to a total batch size of {batch_size}, but {ref_image_latents.shape[0]} images were passed."
|
||||
" Make sure the number of images that you pass is divisible by the total requested batch size."
|
||||
)
|
||||
ref_image_latents = ref_image_latents.repeat(batch_size // ref_image_latents.shape[0], 1, 1, 1)
|
||||
if height is None:
|
||||
if isinstance(image, PIL.Image.Image):
|
||||
height = image.height
|
||||
elif isinstance(image, torch.Tensor):
|
||||
height = image.shape[2]
|
||||
|
||||
ref_image_latents = torch.cat([ref_image_latents] * 2) if do_classifier_free_guidance else ref_image_latents
|
||||
height = (height // 8) * 8 # round down to nearest multiple of 8
|
||||
|
||||
# aligning device to prevent device errors when concating it with the latent model input
|
||||
ref_image_latents = ref_image_latents.to(device=device, dtype=dtype)
|
||||
return ref_image_latents
|
||||
if width is None:
|
||||
if isinstance(image, PIL.Image.Image):
|
||||
width = image.width
|
||||
elif isinstance(image, torch.Tensor):
|
||||
width = image.shape[3]
|
||||
|
||||
def prepare_ref_image(
|
||||
width = (width // 8) * 8
|
||||
|
||||
return height, width
|
||||
|
||||
def prepare_image(
|
||||
self,
|
||||
image,
|
||||
width,
|
||||
@@ -224,42 +151,41 @@ class StableDiffusionXLReferencePipeline(StableDiffusionXLPipeline):
|
||||
|
||||
return image
|
||||
|
||||
def check_ref_inputs(
|
||||
self,
|
||||
ref_image,
|
||||
reference_guidance_start,
|
||||
reference_guidance_end,
|
||||
style_fidelity,
|
||||
reference_attn,
|
||||
reference_adain,
|
||||
):
|
||||
ref_image_is_pil = isinstance(ref_image, PIL.Image.Image)
|
||||
ref_image_is_tensor = isinstance(ref_image, torch.Tensor)
|
||||
def prepare_ref_latents(self, refimage, batch_size, dtype, device, generator, do_classifier_free_guidance):
|
||||
refimage = refimage.to(device=device)
|
||||
if self.vae.dtype == torch.float16 and self.vae.config.force_upcast:
|
||||
self.upcast_vae()
|
||||
refimage = refimage.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
|
||||
if refimage.dtype != self.vae.dtype:
|
||||
refimage = refimage.to(dtype=self.vae.dtype)
|
||||
# encode the mask image into latents space so we can concatenate it to the latents
|
||||
if isinstance(generator, list):
|
||||
ref_image_latents = [
|
||||
self.vae.encode(refimage[i : i + 1]).latent_dist.sample(generator=generator[i])
|
||||
for i in range(batch_size)
|
||||
]
|
||||
ref_image_latents = torch.cat(ref_image_latents, dim=0)
|
||||
else:
|
||||
ref_image_latents = self.vae.encode(refimage).latent_dist.sample(generator=generator)
|
||||
ref_image_latents = self.vae.config.scaling_factor * ref_image_latents
|
||||
|
||||
if not ref_image_is_pil and not ref_image_is_tensor:
|
||||
raise TypeError(
|
||||
f"ref image must be passed and be one of PIL image or torch tensor, but is {type(ref_image)}"
|
||||
)
|
||||
# duplicate mask and ref_image_latents for each generation per prompt, using mps friendly method
|
||||
if ref_image_latents.shape[0] < batch_size:
|
||||
if not batch_size % ref_image_latents.shape[0] == 0:
|
||||
raise ValueError(
|
||||
"The passed images and the required batch size don't match. Images are supposed to be duplicated"
|
||||
f" to a total batch size of {batch_size}, but {ref_image_latents.shape[0]} images were passed."
|
||||
" Make sure the number of images that you pass is divisible by the total requested batch size."
|
||||
)
|
||||
ref_image_latents = ref_image_latents.repeat(batch_size // ref_image_latents.shape[0], 1, 1, 1)
|
||||
|
||||
if not reference_attn and not reference_adain:
|
||||
raise ValueError("`reference_attn` or `reference_adain` must be True.")
|
||||
ref_image_latents = torch.cat([ref_image_latents] * 2) if do_classifier_free_guidance else ref_image_latents
|
||||
|
||||
if style_fidelity < 0.0:
|
||||
raise ValueError(f"style fidelity: {style_fidelity} can't be smaller than 0.")
|
||||
if style_fidelity > 1.0:
|
||||
raise ValueError(f"style fidelity: {style_fidelity} can't be larger than 1.0.")
|
||||
|
||||
if reference_guidance_start >= reference_guidance_end:
|
||||
raise ValueError(
|
||||
f"reference guidance start: {reference_guidance_start} cannot be larger or equal to reference guidance end: {reference_guidance_end}."
|
||||
)
|
||||
if reference_guidance_start < 0.0:
|
||||
raise ValueError(f"reference guidance start: {reference_guidance_start} can't be smaller than 0.")
|
||||
if reference_guidance_end > 1.0:
|
||||
raise ValueError(f"reference guidance end: {reference_guidance_end} can't be larger than 1.0.")
|
||||
# aligning device to prevent device errors when concating it with the latent model input
|
||||
ref_image_latents = ref_image_latents.to(device=device, dtype=dtype)
|
||||
return ref_image_latents
|
||||
|
||||
@torch.no_grad()
|
||||
@replace_example_docstring(EXAMPLE_DOC_STRING)
|
||||
def __call__(
|
||||
self,
|
||||
prompt: Union[str, List[str]] = None,
|
||||
@@ -268,8 +194,6 @@ class StableDiffusionXLReferencePipeline(StableDiffusionXLPipeline):
|
||||
height: Optional[int] = None,
|
||||
width: Optional[int] = None,
|
||||
num_inference_steps: int = 50,
|
||||
timesteps: List[int] = None,
|
||||
sigmas: List[float] = None,
|
||||
denoising_end: Optional[float] = None,
|
||||
guidance_scale: float = 5.0,
|
||||
negative_prompt: Optional[Union[str, List[str]]] = None,
|
||||
@@ -282,220 +206,28 @@ class StableDiffusionXLReferencePipeline(StableDiffusionXLPipeline):
|
||||
negative_prompt_embeds: Optional[torch.Tensor] = None,
|
||||
pooled_prompt_embeds: Optional[torch.Tensor] = None,
|
||||
negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
|
||||
ip_adapter_image: Optional[PipelineImageInput] = None,
|
||||
ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
|
||||
output_type: Optional[str] = "pil",
|
||||
return_dict: bool = True,
|
||||
callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
|
||||
callback_steps: int = 1,
|
||||
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
|
||||
guidance_rescale: float = 0.0,
|
||||
original_size: Optional[Tuple[int, int]] = None,
|
||||
crops_coords_top_left: Tuple[int, int] = (0, 0),
|
||||
target_size: Optional[Tuple[int, int]] = None,
|
||||
negative_original_size: Optional[Tuple[int, int]] = None,
|
||||
negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
|
||||
negative_target_size: Optional[Tuple[int, int]] = None,
|
||||
clip_skip: Optional[int] = None,
|
||||
callback_on_step_end: Optional[
|
||||
Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
|
||||
] = None,
|
||||
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
|
||||
attention_auto_machine_weight: float = 1.0,
|
||||
gn_auto_machine_weight: float = 1.0,
|
||||
reference_guidance_start: float = 0.0,
|
||||
reference_guidance_end: float = 1.0,
|
||||
style_fidelity: float = 0.5,
|
||||
reference_attn: bool = True,
|
||||
reference_adain: bool = True,
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
Function invoked when calling the pipeline for generation.
|
||||
|
||||
Args:
|
||||
prompt (`str` or `List[str]`, *optional*):
|
||||
The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
|
||||
instead.
|
||||
prompt_2 (`str` or `List[str]`, *optional*):
|
||||
The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
|
||||
used in both text-encoders
|
||||
ref_image (`torch.Tensor`, `PIL.Image.Image`):
|
||||
The Reference Control input condition. Reference Control uses this input condition to generate guidance to Unet. If
|
||||
the type is specified as `Torch.Tensor`, it is passed to Reference Control as is. `PIL.Image.Image` can
|
||||
also be accepted as an image.
|
||||
height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
|
||||
The height in pixels of the generated image. This is set to 1024 by default for the best results.
|
||||
Anything below 512 pixels won't work well for
|
||||
[stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
|
||||
and checkpoints that are not specifically fine-tuned on low resolutions.
|
||||
width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
|
||||
The width in pixels of the generated image. This is set to 1024 by default for the best results.
|
||||
Anything below 512 pixels won't work well for
|
||||
[stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
|
||||
and checkpoints that are not specifically fine-tuned on low resolutions.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
expense of slower inference.
|
||||
timesteps (`List[int]`, *optional*):
|
||||
Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
|
||||
in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
|
||||
passed will be used. Must be in descending order.
|
||||
sigmas (`List[float]`, *optional*):
|
||||
Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
|
||||
their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
|
||||
will be used.
|
||||
denoising_end (`float`, *optional*):
|
||||
When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
|
||||
completed before it is intentionally prematurely terminated. As a result, the returned sample will
|
||||
still retain a substantial amount of noise as determined by the discrete timesteps selected by the
|
||||
scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
|
||||
"Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
|
||||
Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
|
||||
guidance_scale (`float`, *optional*, defaults to 5.0):
|
||||
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
||||
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
||||
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
||||
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
||||
usually at the expense of lower image quality.
|
||||
negative_prompt (`str` or `List[str]`, *optional*):
|
||||
The prompt or prompts not to guide the image generation. If not defined, one has to pass
|
||||
`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
|
||||
less than `1`).
|
||||
negative_prompt_2 (`str` or `List[str]`, *optional*):
|
||||
The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
|
||||
`text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
|
||||
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of images to generate per prompt.
|
||||
eta (`float`, *optional*, defaults to 0.0):
|
||||
Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
|
||||
[`schedulers.DDIMScheduler`], will be ignored for others.
|
||||
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
||||
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
|
||||
to make generation deterministic.
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.Tensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
negative_prompt_embeds (`torch.Tensor`, *optional*):
|
||||
Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
|
||||
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
|
||||
argument.
|
||||
pooled_prompt_embeds (`torch.Tensor`, *optional*):
|
||||
Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
|
||||
If not provided, pooled text embeddings will be generated from `prompt` input argument.
|
||||
negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
|
||||
Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
|
||||
weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
|
||||
input argument.
|
||||
ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
|
||||
ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
|
||||
IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
|
||||
contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
|
||||
provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generate image. Choose between
|
||||
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
|
||||
of a plain tuple.
|
||||
cross_attention_kwargs (`dict`, *optional*):
|
||||
A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
|
||||
`self.processor` in
|
||||
[diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
|
||||
guidance_rescale (`float`, *optional*, defaults to 0.0):
|
||||
Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
|
||||
Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
|
||||
[Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
|
||||
Guidance rescale factor should fix overexposure when using zero terminal SNR.
|
||||
original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
|
||||
If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
|
||||
`original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
|
||||
explained in section 2.2 of
|
||||
[https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
|
||||
crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
|
||||
`crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
|
||||
`crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
|
||||
`crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
|
||||
[https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
|
||||
target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
|
||||
For most cases, `target_size` should be set to the desired height and width of the generated image. If
|
||||
not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
|
||||
section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
|
||||
negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
|
||||
To negatively condition the generation process based on a specific image resolution. Part of SDXL's
|
||||
micro-conditioning as explained in section 2.2 of
|
||||
[https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
|
||||
information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
|
||||
negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
|
||||
To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
|
||||
micro-conditioning as explained in section 2.2 of
|
||||
[https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
|
||||
information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
|
||||
negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
|
||||
To negatively condition the generation process based on a target image resolution. It should be as same
|
||||
as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
|
||||
[https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
|
||||
information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
|
||||
callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
|
||||
A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
|
||||
each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
|
||||
DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
|
||||
list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
`._callback_tensor_inputs` attribute of your pipeline class.
|
||||
attention_auto_machine_weight (`float`):
|
||||
Weight of using reference query for self attention's context.
|
||||
If attention_auto_machine_weight=1.0, use reference query for all self attention's context.
|
||||
gn_auto_machine_weight (`float`):
|
||||
Weight of using reference adain. If gn_auto_machine_weight=2.0, use all reference adain plugins.
|
||||
reference_guidance_start (`float`, *optional*, defaults to 0.0):
|
||||
The percentage of total steps at which the reference ControlNet starts applying.
|
||||
reference_guidance_end (`float`, *optional*, defaults to 1.0):
|
||||
The percentage of total steps at which the reference ControlNet stops applying.
|
||||
style_fidelity (`float`):
|
||||
style fidelity of ref_uncond_xt. If style_fidelity=1.0, control more important,
|
||||
elif style_fidelity=0.0, prompt more important, else balanced.
|
||||
reference_attn (`bool`):
|
||||
Whether to use reference query for self attention's context.
|
||||
reference_adain (`bool`):
|
||||
Whether to use reference adain.
|
||||
|
||||
Examples:
|
||||
|
||||
Returns:
|
||||
[`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`:
|
||||
[`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
|
||||
`tuple`. When returning a tuple, the first element is a list with the generated images.
|
||||
"""
|
||||
|
||||
callback = kwargs.pop("callback", None)
|
||||
callback_steps = kwargs.pop("callback_steps", None)
|
||||
|
||||
if callback is not None:
|
||||
deprecate(
|
||||
"callback",
|
||||
"1.0.0",
|
||||
"Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
|
||||
)
|
||||
if callback_steps is not None:
|
||||
deprecate(
|
||||
"callback_steps",
|
||||
"1.0.0",
|
||||
"Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
|
||||
)
|
||||
|
||||
if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
|
||||
callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
|
||||
assert reference_attn or reference_adain, "`reference_attn` or `reference_adain` must be True."
|
||||
|
||||
# 0. Default height and width to unet
|
||||
# height, width = self._default_height_width(height, width, ref_image)
|
||||
|
||||
height = height or self.default_sample_size * self.vae_scale_factor
|
||||
width = width or self.default_sample_size * self.vae_scale_factor
|
||||
|
||||
original_size = original_size or (height, width)
|
||||
target_size = target_size or (height, width)
|
||||
|
||||
@@ -512,27 +244,8 @@ class StableDiffusionXLReferencePipeline(StableDiffusionXLPipeline):
|
||||
negative_prompt_embeds,
|
||||
pooled_prompt_embeds,
|
||||
negative_pooled_prompt_embeds,
|
||||
ip_adapter_image,
|
||||
ip_adapter_image_embeds,
|
||||
callback_on_step_end_tensor_inputs,
|
||||
)
|
||||
|
||||
self.check_ref_inputs(
|
||||
ref_image,
|
||||
reference_guidance_start,
|
||||
reference_guidance_end,
|
||||
style_fidelity,
|
||||
reference_attn,
|
||||
reference_adain,
|
||||
)
|
||||
|
||||
self._guidance_scale = guidance_scale
|
||||
self._guidance_rescale = guidance_rescale
|
||||
self._clip_skip = clip_skip
|
||||
self._cross_attention_kwargs = cross_attention_kwargs
|
||||
self._denoising_end = denoising_end
|
||||
self._interrupt = False
|
||||
|
||||
# 2. Define call parameters
|
||||
if prompt is not None and isinstance(prompt, str):
|
||||
batch_size = 1
|
||||
@@ -543,11 +256,15 @@ class StableDiffusionXLReferencePipeline(StableDiffusionXLPipeline):
|
||||
|
||||
device = self._execution_device
|
||||
|
||||
# 3. Encode input prompt
|
||||
lora_scale = (
|
||||
self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
|
||||
)
|
||||
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
|
||||
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
|
||||
# corresponds to doing no classifier free guidance.
|
||||
do_classifier_free_guidance = guidance_scale > 1.0
|
||||
|
||||
# 3. Encode input prompt
|
||||
text_encoder_lora_scale = (
|
||||
cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
|
||||
)
|
||||
(
|
||||
prompt_embeds,
|
||||
negative_prompt_embeds,
|
||||
@@ -558,19 +275,17 @@ class StableDiffusionXLReferencePipeline(StableDiffusionXLPipeline):
|
||||
prompt_2=prompt_2,
|
||||
device=device,
|
||||
num_images_per_prompt=num_images_per_prompt,
|
||||
do_classifier_free_guidance=self.do_classifier_free_guidance,
|
||||
do_classifier_free_guidance=do_classifier_free_guidance,
|
||||
negative_prompt=negative_prompt,
|
||||
negative_prompt_2=negative_prompt_2,
|
||||
prompt_embeds=prompt_embeds,
|
||||
negative_prompt_embeds=negative_prompt_embeds,
|
||||
pooled_prompt_embeds=pooled_prompt_embeds,
|
||||
negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
|
||||
lora_scale=lora_scale,
|
||||
clip_skip=self.clip_skip,
|
||||
lora_scale=text_encoder_lora_scale,
|
||||
)
|
||||
|
||||
# 4. Preprocess reference image
|
||||
ref_image = self.prepare_ref_image(
|
||||
ref_image = self.prepare_image(
|
||||
image=ref_image,
|
||||
width=width,
|
||||
height=height,
|
||||
@@ -581,9 +296,9 @@ class StableDiffusionXLReferencePipeline(StableDiffusionXLPipeline):
|
||||
)
|
||||
|
||||
# 5. Prepare timesteps
|
||||
timesteps, num_inference_steps = retrieve_timesteps(
|
||||
self.scheduler, num_inference_steps, device, timesteps, sigmas
|
||||
)
|
||||
self.scheduler.set_timesteps(num_inference_steps, device=device)
|
||||
|
||||
timesteps = self.scheduler.timesteps
|
||||
|
||||
# 6. Prepare latent variables
|
||||
num_channels_latents = self.unet.config.in_channels
|
||||
@@ -597,7 +312,6 @@ class StableDiffusionXLReferencePipeline(StableDiffusionXLPipeline):
|
||||
generator,
|
||||
latents,
|
||||
)
|
||||
|
||||
# 7. Prepare reference latent variables
|
||||
ref_image_latents = self.prepare_ref_latents(
|
||||
ref_image,
|
||||
@@ -605,21 +319,13 @@ class StableDiffusionXLReferencePipeline(StableDiffusionXLPipeline):
|
||||
prompt_embeds.dtype,
|
||||
device,
|
||||
generator,
|
||||
self.do_classifier_free_guidance,
|
||||
do_classifier_free_guidance,
|
||||
)
|
||||
|
||||
# 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
|
||||
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
|
||||
|
||||
# 8.1 Create tensor stating which reference controlnets to keep
|
||||
reference_keeps = []
|
||||
for i in range(len(timesteps)):
|
||||
reference_keep = 1.0 - float(
|
||||
i / len(timesteps) < reference_guidance_start or (i + 1) / len(timesteps) > reference_guidance_end
|
||||
)
|
||||
reference_keeps.append(reference_keep)
|
||||
|
||||
# 8.2 Modify self attention and group norm
|
||||
# 9. Modify self attebtion and group norm
|
||||
MODE = "write"
|
||||
uc_mask = (
|
||||
torch.Tensor([1] * batch_size * num_images_per_prompt + [0] * batch_size * num_images_per_prompt)
|
||||
@@ -627,8 +333,6 @@ class StableDiffusionXLReferencePipeline(StableDiffusionXLPipeline):
|
||||
.bool()
|
||||
)
|
||||
|
||||
do_classifier_free_guidance = self.do_classifier_free_guidance
|
||||
|
||||
def hacked_basic_transformer_inner_forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
@@ -900,7 +604,7 @@ class StableDiffusionXLReferencePipeline(StableDiffusionXLPipeline):
|
||||
return hidden_states
|
||||
|
||||
def hacked_UpBlock2D_forward(
|
||||
self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None, *args, **kwargs
|
||||
self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None, **kwargs
|
||||
):
|
||||
eps = 1e-6
|
||||
for i, resnet in enumerate(self.resnets):
|
||||
@@ -980,7 +684,7 @@ class StableDiffusionXLReferencePipeline(StableDiffusionXLPipeline):
|
||||
module.var_bank = []
|
||||
module.gn_weight *= 2
|
||||
|
||||
# 9. Prepare added time ids & embeddings
|
||||
# 10. Prepare added time ids & embeddings
|
||||
add_text_embeds = pooled_prompt_embeds
|
||||
if self.text_encoder_2 is None:
|
||||
text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
|
||||
@@ -994,101 +698,62 @@ class StableDiffusionXLReferencePipeline(StableDiffusionXLPipeline):
|
||||
dtype=prompt_embeds.dtype,
|
||||
text_encoder_projection_dim=text_encoder_projection_dim,
|
||||
)
|
||||
if negative_original_size is not None and negative_target_size is not None:
|
||||
negative_add_time_ids = self._get_add_time_ids(
|
||||
negative_original_size,
|
||||
negative_crops_coords_top_left,
|
||||
negative_target_size,
|
||||
dtype=prompt_embeds.dtype,
|
||||
text_encoder_projection_dim=text_encoder_projection_dim,
|
||||
)
|
||||
else:
|
||||
negative_add_time_ids = add_time_ids
|
||||
|
||||
if self.do_classifier_free_guidance:
|
||||
if do_classifier_free_guidance:
|
||||
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
|
||||
add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
|
||||
add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
|
||||
add_time_ids = torch.cat([add_time_ids, add_time_ids], dim=0)
|
||||
|
||||
prompt_embeds = prompt_embeds.to(device)
|
||||
add_text_embeds = add_text_embeds.to(device)
|
||||
add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
|
||||
|
||||
if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
|
||||
image_embeds = self.prepare_ip_adapter_image_embeds(
|
||||
ip_adapter_image,
|
||||
ip_adapter_image_embeds,
|
||||
device,
|
||||
batch_size * num_images_per_prompt,
|
||||
self.do_classifier_free_guidance,
|
||||
)
|
||||
|
||||
# 10. Denoising loop
|
||||
# 11. Denoising loop
|
||||
num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
|
||||
|
||||
# 10.1 Apply denoising_end
|
||||
if (
|
||||
self.denoising_end is not None
|
||||
and isinstance(self.denoising_end, float)
|
||||
and self.denoising_end > 0
|
||||
and self.denoising_end < 1
|
||||
):
|
||||
if denoising_end is not None and isinstance(denoising_end, float) and denoising_end > 0 and denoising_end < 1:
|
||||
discrete_timestep_cutoff = int(
|
||||
round(
|
||||
self.scheduler.config.num_train_timesteps
|
||||
- (self.denoising_end * self.scheduler.config.num_train_timesteps)
|
||||
- (denoising_end * self.scheduler.config.num_train_timesteps)
|
||||
)
|
||||
)
|
||||
num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
|
||||
timesteps = timesteps[:num_inference_steps]
|
||||
|
||||
# 11. Optionally get Guidance Scale Embedding
|
||||
timestep_cond = None
|
||||
if self.unet.config.time_cond_proj_dim is not None:
|
||||
guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
|
||||
timestep_cond = self.get_guidance_scale_embedding(
|
||||
guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
|
||||
).to(device=device, dtype=latents.dtype)
|
||||
|
||||
self._num_timesteps = len(timesteps)
|
||||
with self.progress_bar(total=num_inference_steps) as progress_bar:
|
||||
for i, t in enumerate(timesteps):
|
||||
if self.interrupt:
|
||||
continue
|
||||
|
||||
# expand the latents if we are doing classifier free guidance
|
||||
latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
|
||||
latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
|
||||
|
||||
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
|
||||
|
||||
# predict the noise residual
|
||||
added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
|
||||
if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
|
||||
added_cond_kwargs["image_embeds"] = image_embeds
|
||||
|
||||
# ref only part
|
||||
if reference_keeps[i] > 0:
|
||||
noise = randn_tensor(
|
||||
ref_image_latents.shape, generator=generator, device=device, dtype=ref_image_latents.dtype
|
||||
)
|
||||
ref_xt = self.scheduler.add_noise(
|
||||
ref_image_latents,
|
||||
noise,
|
||||
t.reshape(
|
||||
1,
|
||||
),
|
||||
)
|
||||
ref_xt = self.scheduler.scale_model_input(ref_xt, t)
|
||||
noise = randn_tensor(
|
||||
ref_image_latents.shape, generator=generator, device=device, dtype=ref_image_latents.dtype
|
||||
)
|
||||
ref_xt = self.scheduler.add_noise(
|
||||
ref_image_latents,
|
||||
noise,
|
||||
t.reshape(
|
||||
1,
|
||||
),
|
||||
)
|
||||
ref_xt = self.scheduler.scale_model_input(ref_xt, t)
|
||||
|
||||
MODE = "write"
|
||||
self.unet(
|
||||
ref_xt,
|
||||
t,
|
||||
encoder_hidden_states=prompt_embeds,
|
||||
cross_attention_kwargs=cross_attention_kwargs,
|
||||
added_cond_kwargs=added_cond_kwargs,
|
||||
return_dict=False,
|
||||
)
|
||||
MODE = "write"
|
||||
|
||||
self.unet(
|
||||
ref_xt,
|
||||
t,
|
||||
encoder_hidden_states=prompt_embeds,
|
||||
cross_attention_kwargs=cross_attention_kwargs,
|
||||
added_cond_kwargs=added_cond_kwargs,
|
||||
return_dict=False,
|
||||
)
|
||||
|
||||
# predict the noise residual
|
||||
MODE = "read"
|
||||
@@ -1096,44 +761,22 @@ class StableDiffusionXLReferencePipeline(StableDiffusionXLPipeline):
|
||||
latent_model_input,
|
||||
t,
|
||||
encoder_hidden_states=prompt_embeds,
|
||||
timestep_cond=timestep_cond,
|
||||
cross_attention_kwargs=self.cross_attention_kwargs,
|
||||
cross_attention_kwargs=cross_attention_kwargs,
|
||||
added_cond_kwargs=added_cond_kwargs,
|
||||
return_dict=False,
|
||||
)[0]
|
||||
|
||||
# perform guidance
|
||||
if self.do_classifier_free_guidance:
|
||||
if do_classifier_free_guidance:
|
||||
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
|
||||
noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
|
||||
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
|
||||
|
||||
if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
|
||||
if do_classifier_free_guidance and guidance_rescale > 0.0:
|
||||
# Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
|
||||
noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
|
||||
noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
|
||||
|
||||
# compute the previous noisy sample x_t -> x_t-1
|
||||
latents_dtype = latents.dtype
|
||||
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
|
||||
if latents.dtype != latents_dtype:
|
||||
if torch.backends.mps.is_available():
|
||||
# some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
|
||||
latents = latents.to(latents_dtype)
|
||||
|
||||
if callback_on_step_end is not None:
|
||||
callback_kwargs = {}
|
||||
for k in callback_on_step_end_tensor_inputs:
|
||||
callback_kwargs[k] = locals()[k]
|
||||
callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
|
||||
|
||||
latents = callback_outputs.pop("latents", latents)
|
||||
prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
|
||||
negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
|
||||
add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
|
||||
negative_pooled_prompt_embeds = callback_outputs.pop(
|
||||
"negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
|
||||
)
|
||||
add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
|
||||
negative_add_time_ids = callback_outputs.pop("negative_add_time_ids", negative_add_time_ids)
|
||||
|
||||
# call the callback, if provided
|
||||
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
|
||||
@@ -1142,9 +785,6 @@ class StableDiffusionXLReferencePipeline(StableDiffusionXLPipeline):
|
||||
step_idx = i // getattr(self.scheduler, "order", 1)
|
||||
callback(step_idx, t, latents)
|
||||
|
||||
if XLA_AVAILABLE:
|
||||
xm.mark_step()
|
||||
|
||||
if not output_type == "latent":
|
||||
# make sure the VAE is in float32 mode, as it overflows in float16
|
||||
needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
|
||||
@@ -1152,43 +792,25 @@ class StableDiffusionXLReferencePipeline(StableDiffusionXLPipeline):
|
||||
if needs_upcasting:
|
||||
self.upcast_vae()
|
||||
latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
|
||||
elif latents.dtype != self.vae.dtype:
|
||||
if torch.backends.mps.is_available():
|
||||
# some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
|
||||
self.vae = self.vae.to(latents.dtype)
|
||||
|
||||
# unscale/denormalize the latents
|
||||
# denormalize with the mean and std if available and not None
|
||||
has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None
|
||||
has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None
|
||||
if has_latents_mean and has_latents_std:
|
||||
latents_mean = (
|
||||
torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype)
|
||||
)
|
||||
latents_std = (
|
||||
torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype)
|
||||
)
|
||||
latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean
|
||||
else:
|
||||
latents = latents / self.vae.config.scaling_factor
|
||||
|
||||
image = self.vae.decode(latents, return_dict=False)[0]
|
||||
image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
|
||||
|
||||
# cast back to fp16 if needed
|
||||
if needs_upcasting:
|
||||
self.vae.to(dtype=torch.float16)
|
||||
else:
|
||||
image = latents
|
||||
return StableDiffusionXLPipelineOutput(images=image)
|
||||
|
||||
if not output_type == "latent":
|
||||
# apply watermark if available
|
||||
if self.watermark is not None:
|
||||
image = self.watermark.apply_watermark(image)
|
||||
# apply watermark if available
|
||||
if self.watermark is not None:
|
||||
image = self.watermark.apply_watermark(image)
|
||||
|
||||
image = self.image_processor.postprocess(image, output_type=output_type)
|
||||
image = self.image_processor.postprocess(image, output_type=output_type)
|
||||
|
||||
# Offload all models
|
||||
self.maybe_free_model_hooks()
|
||||
# Offload last model to CPU
|
||||
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
|
||||
self.final_offload_hook.offload()
|
||||
|
||||
if not return_dict:
|
||||
return (image,)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# ControlNet training example for Stable Diffusion 3/3.5 (SD3/3.5)
|
||||
# ControlNet training example for Stable Diffusion 3 (SD3)
|
||||
|
||||
The `train_controlnet_sd3.py` script shows how to implement the ControlNet training procedure and adapt it for [Stable Diffusion 3](https://arxiv.org/abs/2403.03206) and [Stable Diffusion 3.5](https://stability.ai/news/introducing-stable-diffusion-3-5).
|
||||
The `train_controlnet_sd3.py` script shows how to implement the ControlNet training procedure and adapt it for [Stable Diffusion 3](https://arxiv.org/abs/2403.03206).
|
||||
|
||||
## Running locally with PyTorch
|
||||
|
||||
@@ -51,9 +51,9 @@ Please download the dataset and unzip it in the directory `fill50k` in the `exam
|
||||
|
||||
## Training
|
||||
|
||||
First download the SD3 model from [Hugging Face Hub](https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers) or the SD3.5 model from [Hugging Face Hub](https://huggingface.co/stabilityai/stable-diffusion-3.5-medium). We will use it as a base model for the ControlNet training.
|
||||
First download the SD3 model from [Hugging Face Hub](https://huggingface.co/stabilityai/stable-diffusion-3-medium). We will use it as a base model for the ControlNet training.
|
||||
> [!NOTE]
|
||||
> As the model is gated, before using it with diffusers you first need to go to the [Stable Diffusion 3 Medium Hugging Face page](https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers) or [Stable Diffusion 3.5 Large Hugging Face page](https://huggingface.co/stabilityai/stable-diffusion-3.5-medium), fill in the form and accept the gate. Once you are in, you need to log in so that your system knows you’ve accepted the gate. Use the command below to log in:
|
||||
> As the model is gated, before using it with diffusers you first need to go to the [Stable Diffusion 3 Medium Hugging Face page](https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers), fill in the form and accept the gate. Once you are in, you need to log in so that your system knows you’ve accepted the gate. Use the command below to log in:
|
||||
|
||||
```bash
|
||||
huggingface-cli login
|
||||
@@ -90,8 +90,6 @@ accelerate launch train_controlnet_sd3.py \
|
||||
--gradient_accumulation_steps=4
|
||||
```
|
||||
|
||||
To train a ControlNet model for Stable Diffusion 3.5, replace the `MODEL_DIR` with `stabilityai/stable-diffusion-3.5-medium`.
|
||||
|
||||
To better track our training experiments, we're using flags `validation_image`, `validation_prompt`, and `validation_steps` to allow the script to do a few validation inference runs. This allows us to qualitatively check if the training is progressing as expected.
|
||||
|
||||
Our experiments were conducted on a single 40GB A100 GPU.
|
||||
@@ -126,8 +124,6 @@ image = pipe(
|
||||
image.save("./output.png")
|
||||
```
|
||||
|
||||
Similarly, for SD3.5, replace the `base_model_path` with `stabilityai/stable-diffusion-3.5-medium` and controlnet_path `DavyMorgan/sd35-controlnet-out'.
|
||||
|
||||
## Notes
|
||||
|
||||
### GPU usage
|
||||
@@ -139,8 +135,6 @@ Make sure to use the right GPU when configuring the [accelerator](https://huggin
|
||||
|
||||
## Example results
|
||||
|
||||
### SD3
|
||||
|
||||
#### After 500 steps with batch size 8
|
||||
|
||||
| | |
|
||||
@@ -156,20 +150,3 @@ Make sure to use the right GPU when configuring the [accelerator](https://huggin
|
||||
|| pale golden rod circle with old lace background |
|
||||
 |  |
|
||||
|
||||
### SD3.5
|
||||
|
||||
#### After 500 steps with batch size 8
|
||||
|
||||
| | |
|
||||
|-------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------:|
|
||||
|| pale golden rod circle with old lace background |
|
||||
 |  |
|
||||
|
||||
|
||||
#### After 3000 steps with batch size 8:
|
||||
|
||||
| | |
|
||||
|-------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------:|
|
||||
|| pale golden rod circle with old lace background |
|
||||
 |  |
|
||||
|
||||
|
||||
@@ -138,27 +138,6 @@ class ControlNetSD3(ExamplesTestsAccelerate):
|
||||
self.assertTrue(os.path.isfile(os.path.join(tmpdir, "diffusion_pytorch_model.safetensors")))
|
||||
|
||||
|
||||
class ControlNetSD35(ExamplesTestsAccelerate):
|
||||
def test_controlnet_sd3(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
test_args = f"""
|
||||
examples/controlnet/train_controlnet_sd3.py
|
||||
--pretrained_model_name_or_path=hf-internal-testing/tiny-sd35-pipe
|
||||
--dataset_name=hf-internal-testing/fill10
|
||||
--output_dir={tmpdir}
|
||||
--resolution=64
|
||||
--train_batch_size=1
|
||||
--gradient_accumulation_steps=1
|
||||
--controlnet_model_name_or_path=DavyMorgan/tiny-controlnet-sd35
|
||||
--max_train_steps=4
|
||||
--checkpointing_steps=2
|
||||
""".split()
|
||||
|
||||
run_command(self._launch_args + test_args)
|
||||
|
||||
self.assertTrue(os.path.isfile(os.path.join(tmpdir, "diffusion_pytorch_model.safetensors")))
|
||||
|
||||
|
||||
class ControlNetflux(ExamplesTestsAccelerate):
|
||||
def test_controlnet_flux(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
|
||||
@@ -571,6 +571,9 @@ def parse_args(input_args=None):
|
||||
if args.dataset_name is None and args.train_data_dir is None:
|
||||
raise ValueError("Specify either `--dataset_name` or `--train_data_dir`")
|
||||
|
||||
if args.dataset_name is not None and args.train_data_dir is not None:
|
||||
raise ValueError("Specify only one of `--dataset_name` or `--train_data_dir`")
|
||||
|
||||
if args.proportion_empty_prompts < 0 or args.proportion_empty_prompts > 1:
|
||||
raise ValueError("`--proportion_empty_prompts` must be in the range [0, 1].")
|
||||
|
||||
@@ -612,7 +615,6 @@ def make_train_dataset(args, tokenizer, accelerator):
|
||||
args.dataset_name,
|
||||
args.dataset_config_name,
|
||||
cache_dir=args.cache_dir,
|
||||
data_dir=args.train_data_dir,
|
||||
)
|
||||
else:
|
||||
if args.train_data_dir is not None:
|
||||
|
||||
@@ -263,12 +263,6 @@ def parse_args(input_args=None):
|
||||
help="Path to pretrained controlnet model or model identifier from huggingface.co/models."
|
||||
" If not specified controlnet weights are initialized from unet.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num_extra_conditioning_channels",
|
||||
type=int,
|
||||
default=0,
|
||||
help="Number of extra conditioning channels for controlnet.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--revision",
|
||||
type=str,
|
||||
@@ -545,9 +539,6 @@ def parse_args(input_args=None):
|
||||
default=77,
|
||||
help="Maximum sequence length to use with with the T5 text encoder",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dataset_preprocess_batch_size", type=int, default=1000, help="Batch size for preprocessing dataset."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--validation_prompt",
|
||||
type=str,
|
||||
@@ -995,9 +986,7 @@ def main(args):
|
||||
controlnet = SD3ControlNetModel.from_pretrained(args.controlnet_model_name_or_path)
|
||||
else:
|
||||
logger.info("Initializing controlnet weights from transformer")
|
||||
controlnet = SD3ControlNetModel.from_transformer(
|
||||
transformer, num_extra_conditioning_channels=args.num_extra_conditioning_channels
|
||||
)
|
||||
controlnet = SD3ControlNetModel.from_transformer(transformer)
|
||||
|
||||
transformer.requires_grad_(False)
|
||||
vae.requires_grad_(False)
|
||||
@@ -1134,12 +1123,7 @@ def main(args):
|
||||
# fingerprint used by the cache for the other processes to load the result
|
||||
# details: https://github.com/huggingface/diffusers/pull/4038#discussion_r1266078401
|
||||
new_fingerprint = Hasher.hash(args)
|
||||
train_dataset = train_dataset.map(
|
||||
compute_embeddings_fn,
|
||||
batched=True,
|
||||
batch_size=args.dataset_preprocess_batch_size,
|
||||
new_fingerprint=new_fingerprint,
|
||||
)
|
||||
train_dataset = train_dataset.map(compute_embeddings_fn, batched=True, new_fingerprint=new_fingerprint)
|
||||
|
||||
del text_encoder_one, text_encoder_two, text_encoder_three
|
||||
del tokenizer_one, tokenizer_two, tokenizer_three
|
||||
|
||||
@@ -598,6 +598,9 @@ def parse_args(input_args=None):
|
||||
if args.dataset_name is None and args.train_data_dir is None:
|
||||
raise ValueError("Specify either `--dataset_name` or `--train_data_dir`")
|
||||
|
||||
if args.dataset_name is not None and args.train_data_dir is not None:
|
||||
raise ValueError("Specify only one of `--dataset_name` or `--train_data_dir`")
|
||||
|
||||
if args.proportion_empty_prompts < 0 or args.proportion_empty_prompts > 1:
|
||||
raise ValueError("`--proportion_empty_prompts` must be in the range [0, 1].")
|
||||
|
||||
@@ -639,7 +642,6 @@ def get_train_dataset(args, accelerator):
|
||||
args.dataset_name,
|
||||
args.dataset_config_name,
|
||||
cache_dir=args.cache_dir,
|
||||
data_dir=args.train_data_dir,
|
||||
)
|
||||
else:
|
||||
if args.train_data_dir is not None:
|
||||
|
||||
@@ -118,7 +118,7 @@ accelerate launch train_dreambooth_flux.py \
|
||||
|
||||
To better track our training experiments, we're using the following flags in the command above:
|
||||
|
||||
* `report_to="wandb` will ensure the training runs are tracked on [Weights and Biases](https://wandb.ai/site). To use it, be sure to install `wandb` with `pip install wandb`. Don't forget to call `wandb login <your_api_key>` before training if you haven't done it before.
|
||||
* `report_to="wandb` will ensure the training runs are tracked on Weights and Biases. To use it, be sure to install `wandb` with `pip install wandb`.
|
||||
* `validation_prompt` and `validation_epochs` to allow the script to do a few validation inference runs. This allows us to qualitatively check if the training is progressing as expected.
|
||||
|
||||
> [!NOTE]
|
||||
|
||||
@@ -105,7 +105,7 @@ accelerate launch train_dreambooth_sd3.py \
|
||||
|
||||
To better track our training experiments, we're using the following flags in the command above:
|
||||
|
||||
* `report_to="wandb` will ensure the training runs are tracked on [Weights and Biases](https://wandb.ai/site). To use it, be sure to install `wandb` with `pip install wandb`. Don't forget to call `wandb login <your_api_key>` before training if you haven't done it before.
|
||||
* `report_to="wandb` will ensure the training runs are tracked on Weights and Biases. To use it, be sure to install `wandb` with `pip install wandb`.
|
||||
* `validation_prompt` and `validation_epochs` to allow the script to do a few validation inference runs. This allows us to qualitatively check if the training is progressing as expected.
|
||||
|
||||
> [!NOTE]
|
||||
|
||||
@@ -99,7 +99,7 @@ accelerate launch train_dreambooth_lora_sdxl.py \
|
||||
|
||||
To better track our training experiments, we're using the following flags in the command above:
|
||||
|
||||
* `report_to="wandb` will ensure the training runs are tracked on [Weights and Biases](https://wandb.ai/site). To use it, be sure to install `wandb` with `pip install wandb`. Don't forget to call `wandb login <your_api_key>` before training if you haven't done it before.
|
||||
* `report_to="wandb` will ensure the training runs are tracked on Weights and Biases. To use it, be sure to install `wandb` with `pip install wandb`.
|
||||
* `validation_prompt` and `validation_epochs` to allow the script to do a few validation inference runs. This allows us to qualitatively check if the training is progressing as expected.
|
||||
|
||||
Our experiments were conducted on a single 40GB A100 GPU.
|
||||
|
||||
@@ -1300,17 +1300,16 @@ def main(args):
|
||||
# Since we predict the noise instead of x_0, the original formulation is slightly changed.
|
||||
# This is discussed in Section 4.2 of the same paper.
|
||||
snr = compute_snr(noise_scheduler, timesteps)
|
||||
base_weight = (
|
||||
torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(dim=1)[0] / snr
|
||||
)
|
||||
|
||||
if noise_scheduler.config.prediction_type == "v_prediction":
|
||||
# Velocity objective needs to be floored to an SNR weight of one.
|
||||
divisor = snr + 1
|
||||
mse_loss_weights = base_weight + 1
|
||||
else:
|
||||
divisor = snr
|
||||
|
||||
mse_loss_weights = (
|
||||
torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(dim=1)[0] / divisor
|
||||
)
|
||||
|
||||
# Epsilon and sample both use the same loss weights.
|
||||
mse_loss_weights = base_weight
|
||||
loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
|
||||
loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights
|
||||
loss = loss.mean()
|
||||
|
||||
@@ -1294,13 +1294,10 @@ def main(args):
|
||||
for model in models:
|
||||
if isinstance(model, type(unwrap_model(transformer))):
|
||||
transformer_lora_layers_to_save = get_peft_model_state_dict(model)
|
||||
elif isinstance(model, type(unwrap_model(text_encoder_one))): # or text_encoder_two
|
||||
# both text encoders are of the same class, so we check hidden size to distinguish between the two
|
||||
hidden_size = unwrap_model(model).config.hidden_size
|
||||
if hidden_size == 768:
|
||||
text_encoder_one_lora_layers_to_save = get_peft_model_state_dict(model)
|
||||
elif hidden_size == 1280:
|
||||
text_encoder_two_lora_layers_to_save = get_peft_model_state_dict(model)
|
||||
elif isinstance(model, type(unwrap_model(text_encoder_one))):
|
||||
text_encoder_one_lora_layers_to_save = get_peft_model_state_dict(model)
|
||||
elif isinstance(model, type(unwrap_model(text_encoder_two))):
|
||||
text_encoder_two_lora_layers_to_save = get_peft_model_state_dict(model)
|
||||
else:
|
||||
raise ValueError(f"unexpected save model: {model.__class__}")
|
||||
|
||||
|
||||
@@ -1,204 +0,0 @@
|
||||
# Training Flux Control
|
||||
|
||||
This (experimental) example shows how to train Control LoRAs with [Flux](https://huggingface.co/black-forest-labs/FLUX.1-dev) by conditioning it with additional structural controls (like depth maps, poses, etc.). We provide a script for full fine-tuning, too, refer to [this section](#full-fine-tuning). To know more about Flux Control family, refer to the following resources:
|
||||
|
||||
* [Docs](https://github.com/black-forest-labs/flux/blob/main/docs/structural-conditioning.md) by Black Forest Labs
|
||||
* Diffusers docs ([1](https://huggingface.co/docs/diffusers/main/en/api/pipelines/flux#canny-control), [2](https://huggingface.co/docs/diffusers/main/en/api/pipelines/flux#depth-control))
|
||||
|
||||
To incorporate additional condition latents, we expand the input features of Flux.1-Dev from 64 to 128. The first 64 channels correspond to the original input latents to be denoised, while the latter 64 channels correspond to control latents. This expansion happens on the `x_embedder` layer, where the combined latents are projected to the expected feature dimension of rest of the network. Inference is performed using the `FluxControlPipeline`.
|
||||
|
||||
> [!NOTE]
|
||||
> **Gated model**
|
||||
>
|
||||
> As the model is gated, before using it with diffusers you first need to go to the [FLUX.1 [dev] Hugging Face page](https://huggingface.co/black-forest-labs/FLUX.1-dev), fill in the form and accept the gate. Once you are in, you need to log in so that your system knows you’ve accepted the gate. Use the command below to log in:
|
||||
|
||||
```bash
|
||||
huggingface-cli login
|
||||
```
|
||||
|
||||
The example command below shows how to launch fine-tuning for pose conditions. The dataset ([`raulc0399/open_pose_controlnet`](https://huggingface.co/datasets/raulc0399/open_pose_controlnet)) being used here already has the pose conditions of the original images, so we don't have to compute them.
|
||||
|
||||
```bash
|
||||
accelerate launch train_control_lora_flux.py \
|
||||
--pretrained_model_name_or_path="black-forest-labs/FLUX.1-dev" \
|
||||
--dataset_name="raulc0399/open_pose_controlnet" \
|
||||
--output_dir="pose-control-lora" \
|
||||
--mixed_precision="bf16" \
|
||||
--train_batch_size=1 \
|
||||
--rank=64 \
|
||||
--gradient_accumulation_steps=4 \
|
||||
--gradient_checkpointing \
|
||||
--use_8bit_adam \
|
||||
--learning_rate=1e-4 \
|
||||
--report_to="wandb" \
|
||||
--lr_scheduler="constant" \
|
||||
--lr_warmup_steps=0 \
|
||||
--max_train_steps=5000 \
|
||||
--validation_image="openpose.png" \
|
||||
--validation_prompt="A couple, 4k photo, highly detailed" \
|
||||
--offload \
|
||||
--seed="0" \
|
||||
--push_to_hub
|
||||
```
|
||||
|
||||
`openpose.png` comes from [here](https://huggingface.co/Adapter/t2iadapter/resolve/main/openpose.png).
|
||||
|
||||
You need to install `diffusers` from the branch of [this PR](https://github.com/huggingface/diffusers/pull/9999). When it's merged, you should install `diffusers` from the `main`.
|
||||
|
||||
The training script exposes additional CLI args that might be useful to experiment with:
|
||||
|
||||
* `use_lora_bias`: When set, additionally trains the biases of the `lora_B` layer.
|
||||
* `train_norm_layers`: When set, additionally trains the normalization scales. Takes care of saving and loading.
|
||||
* `lora_layers`: Specify the layers you want to apply LoRA to. If you specify "all-linear", all the linear layers will be LoRA-attached.
|
||||
|
||||
### Training with DeepSpeed
|
||||
|
||||
It's possible to train with [DeepSpeed](https://github.com/microsoft/DeepSpeed), specifically leveraging the Zero2 system optimization. To use it, save the following config to an YAML file (feel free to modify as needed):
|
||||
|
||||
```yaml
|
||||
compute_environment: LOCAL_MACHINE
|
||||
debug: false
|
||||
deepspeed_config:
|
||||
gradient_accumulation_steps: 1
|
||||
gradient_clipping: 1.0
|
||||
offload_optimizer_device: cpu
|
||||
offload_param_device: cpu
|
||||
zero3_init_flag: false
|
||||
zero_stage: 2
|
||||
distributed_type: DEEPSPEED
|
||||
downcast_bf16: 'no'
|
||||
enable_cpu_affinity: false
|
||||
machine_rank: 0
|
||||
main_training_function: main
|
||||
mixed_precision: bf16
|
||||
num_machines: 1
|
||||
num_processes: 1
|
||||
rdzv_backend: static
|
||||
same_network: true
|
||||
tpu_env: []
|
||||
tpu_use_cluster: false
|
||||
tpu_use_sudo: false
|
||||
use_cpu: false
|
||||
```
|
||||
|
||||
And then while launching training, pass the config file:
|
||||
|
||||
```bash
|
||||
accelerate launch --config_file=CONFIG_FILE.yaml ...
|
||||
```
|
||||
|
||||
### Inference
|
||||
|
||||
The pose images in our dataset were computed using the [`controlnet_aux`](https://github.com/huggingface/controlnet_aux) library. Let's install it first:
|
||||
|
||||
```bash
|
||||
pip install controlnet_aux
|
||||
```
|
||||
|
||||
And then we are ready:
|
||||
|
||||
```py
|
||||
from controlnet_aux import OpenposeDetector
|
||||
from diffusers import FluxControlPipeline
|
||||
from diffusers.utils import load_image
|
||||
from PIL import Image
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16).to("cuda")
|
||||
pipe.load_lora_weights("...") # change this.
|
||||
|
||||
open_pose = OpenposeDetector.from_pretrained("lllyasviel/Annotators")
|
||||
|
||||
# prepare pose condition.
|
||||
url = "https://huggingface.co/Adapter/t2iadapter/resolve/main/people.jpg"
|
||||
image = load_image(url)
|
||||
image = open_pose(image, detect_resolution=512, image_resolution=1024)
|
||||
image = np.array(image)[:, :, ::-1]
|
||||
image = Image.fromarray(np.uint8(image))
|
||||
|
||||
prompt = "A couple, 4k photo, highly detailed"
|
||||
|
||||
gen_images = pipe(
|
||||
prompt=prompt,
|
||||
condition_image=image,
|
||||
num_inference_steps=50,
|
||||
joint_attention_kwargs={"scale": 0.9},
|
||||
guidance_scale=25.,
|
||||
).images[0]
|
||||
gen_images.save("output.png")
|
||||
```
|
||||
|
||||
## Full fine-tuning
|
||||
|
||||
We provide a non-LoRA version of the training script `train_control_flux.py`. Here is an example command:
|
||||
|
||||
```bash
|
||||
accelerate launch --config_file=accelerate_ds2.yaml train_control_flux.py \
|
||||
--pretrained_model_name_or_path="black-forest-labs/FLUX.1-dev" \
|
||||
--dataset_name="raulc0399/open_pose_controlnet" \
|
||||
--output_dir="pose-control" \
|
||||
--mixed_precision="bf16" \
|
||||
--train_batch_size=2 \
|
||||
--dataloader_num_workers=4 \
|
||||
--gradient_accumulation_steps=4 \
|
||||
--gradient_checkpointing \
|
||||
--use_8bit_adam \
|
||||
--proportion_empty_prompts=0.2 \
|
||||
--learning_rate=5e-5 \
|
||||
--adam_weight_decay=1e-4 \
|
||||
--report_to="wandb" \
|
||||
--lr_scheduler="cosine" \
|
||||
--lr_warmup_steps=1000 \
|
||||
--checkpointing_steps=1000 \
|
||||
--max_train_steps=10000 \
|
||||
--validation_steps=200 \
|
||||
--validation_image "2_pose_1024.jpg" "3_pose_1024.jpg" \
|
||||
--validation_prompt "two friends sitting by each other enjoying a day at the park, full hd, cinematic" "person enjoying a day at the park, full hd, cinematic" \
|
||||
--offload \
|
||||
--seed="0" \
|
||||
--push_to_hub
|
||||
```
|
||||
|
||||
Change the `validation_image` and `validation_prompt` as needed.
|
||||
|
||||
For inference, this time, we will run:
|
||||
|
||||
```py
|
||||
from controlnet_aux import OpenposeDetector
|
||||
from diffusers import FluxControlPipeline, FluxTransformer2DModel
|
||||
from diffusers.utils import load_image
|
||||
from PIL import Image
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
transformer = FluxTransformer2DModel.from_pretrained("...") # change this.
|
||||
pipe = FluxControlPipeline.from_pretrained(
|
||||
"black-forest-labs/FLUX.1-dev", transformer=transformer, torch_dtype=torch.bfloat16
|
||||
).to("cuda")
|
||||
|
||||
open_pose = OpenposeDetector.from_pretrained("lllyasviel/Annotators")
|
||||
|
||||
# prepare pose condition.
|
||||
url = "https://huggingface.co/Adapter/t2iadapter/resolve/main/people.jpg"
|
||||
image = load_image(url)
|
||||
image = open_pose(image, detect_resolution=512, image_resolution=1024)
|
||||
image = np.array(image)[:, :, ::-1]
|
||||
image = Image.fromarray(np.uint8(image))
|
||||
|
||||
prompt = "A couple, 4k photo, highly detailed"
|
||||
|
||||
gen_images = pipe(
|
||||
prompt=prompt,
|
||||
condition_image=image,
|
||||
num_inference_steps=50,
|
||||
guidance_scale=25.,
|
||||
).images[0]
|
||||
gen_images.save("output.png")
|
||||
```
|
||||
|
||||
## Things to note
|
||||
|
||||
* The scripts provided in this directory are experimental and educational. This means we may have to tweak things around to get good results on a given condition. We believe this is best done with the community 🤗
|
||||
* The scripts are not memory-optimized but we offload the VAE and the text encoders to CPU when they are not used.
|
||||
* We can extract LoRAs from the fully fine-tuned model. While we currently don't provide any utilities for that, users are welcome to refer to [this script](https://github.com/Stability-AI/stability-ComfyUI-nodes/blob/master/control_lora_create.py) that provides a similar functionality.
|
||||
@@ -1,6 +0,0 @@
|
||||
transformers==4.47.0
|
||||
wandb
|
||||
torch
|
||||
torchvision
|
||||
accelerate==1.2.0
|
||||
peft>=0.14.0
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,175 +0,0 @@
|
||||
# Search models on Civitai and Hugging Face
|
||||
|
||||
The [auto_diffusers](https://github.com/suzukimain/auto_diffusers) library provides additional functionalities to Diffusers such as searching for models on Civitai and the Hugging Face Hub.
|
||||
Please refer to the original library [here](https://pypi.org/project/auto-diffusers/)
|
||||
|
||||
## Installation
|
||||
|
||||
Before running the scripts, make sure to install the library's training dependencies:
|
||||
|
||||
> [!IMPORTANT]
|
||||
> To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the installation up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment.
|
||||
|
||||
```bash
|
||||
git clone https://github.com/huggingface/diffusers
|
||||
cd diffusers
|
||||
pip install .
|
||||
```
|
||||
Set up the pipeline. You can also cd to this folder and run it.
|
||||
```bash
|
||||
!wget https://raw.githubusercontent.com/suzukimain/auto_diffusers/refs/heads/master/src/auto_diffusers/pipeline_easy.py
|
||||
```
|
||||
|
||||
## Load from Civitai
|
||||
```python
|
||||
from pipeline_easy import (
|
||||
EasyPipelineForText2Image,
|
||||
EasyPipelineForImage2Image,
|
||||
EasyPipelineForInpainting,
|
||||
)
|
||||
|
||||
# Text-to-Image
|
||||
pipeline = EasyPipelineForText2Image.from_civitai(
|
||||
"search_word",
|
||||
base_model="SD 1.5",
|
||||
).to("cuda")
|
||||
|
||||
|
||||
# Image-to-Image
|
||||
pipeline = EasyPipelineForImage2Image.from_civitai(
|
||||
"search_word",
|
||||
base_model="SD 1.5",
|
||||
).to("cuda")
|
||||
|
||||
|
||||
# Inpainting
|
||||
pipeline = EasyPipelineForInpainting.from_civitai(
|
||||
"search_word",
|
||||
base_model="SD 1.5",
|
||||
).to("cuda")
|
||||
```
|
||||
|
||||
## Load from Hugging Face
|
||||
```python
|
||||
from pipeline_easy import (
|
||||
EasyPipelineForText2Image,
|
||||
EasyPipelineForImage2Image,
|
||||
EasyPipelineForInpainting,
|
||||
)
|
||||
|
||||
# Text-to-Image
|
||||
pipeline = EasyPipelineForText2Image.from_huggingface(
|
||||
"search_word",
|
||||
checkpoint_format="diffusers",
|
||||
).to("cuda")
|
||||
|
||||
|
||||
# Image-to-Image
|
||||
pipeline = EasyPipelineForImage2Image.from_huggingface(
|
||||
"search_word",
|
||||
checkpoint_format="diffusers",
|
||||
).to("cuda")
|
||||
|
||||
|
||||
# Inpainting
|
||||
pipeline = EasyPipelineForInpainting.from_huggingface(
|
||||
"search_word",
|
||||
checkpoint_format="diffusers",
|
||||
).to("cuda")
|
||||
```
|
||||
|
||||
|
||||
## Search Civitai and Huggingface
|
||||
|
||||
```python
|
||||
from pipeline_easy import (
|
||||
search_huggingface,
|
||||
search_civitai,
|
||||
)
|
||||
|
||||
# Search Lora
|
||||
Lora = search_civitai(
|
||||
"Keyword_to_search_Lora",
|
||||
model_type="LORA",
|
||||
base_model = "SD 1.5",
|
||||
download=True,
|
||||
)
|
||||
# Load Lora into the pipeline.
|
||||
pipeline.load_lora_weights(Lora)
|
||||
|
||||
|
||||
# Search TextualInversion
|
||||
TextualInversion = search_civitai(
|
||||
"EasyNegative",
|
||||
model_type="TextualInversion",
|
||||
base_model = "SD 1.5",
|
||||
download=True
|
||||
)
|
||||
# Load TextualInversion into the pipeline.
|
||||
pipeline.load_textual_inversion(TextualInversion, token="EasyNegative")
|
||||
```
|
||||
|
||||
### Search Civitai
|
||||
|
||||
> [!TIP]
|
||||
> **If an error occurs, insert the `token` and run again.**
|
||||
|
||||
#### `EasyPipeline.from_civitai` parameters
|
||||
|
||||
| Name | Type | Default | Description |
|
||||
|:---------------:|:----------------------:|:-------------:|:-----------------------------------------------------------------------------------:|
|
||||
| search_word | string, Path | ー | The search query string. Can be a keyword, Civitai URL, local directory or file path. |
|
||||
| model_type | string | `Checkpoint` | The type of model to search for. <br>(for example `Checkpoint`, `TextualInversion`, `Controlnet`, `LORA`, `Hypernetwork`, `AestheticGradient`, `Poses`) |
|
||||
| base_model | string | None | Trained model tag (for example `SD 1.5`, `SD 3.5`, `SDXL 1.0`) |
|
||||
| torch_dtype | string, torch.dtype | None | Override the default `torch.dtype` and load the model with another dtype. |
|
||||
| force_download | bool | False | Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. |
|
||||
| cache_dir | string, Path | None | Path to the folder where cached files are stored. |
|
||||
| resume | bool | False | Whether to resume an incomplete download. |
|
||||
| token | string | None | API token for Civitai authentication. |
|
||||
|
||||
|
||||
#### `search_civitai` parameters
|
||||
|
||||
| Name | Type | Default | Description |
|
||||
|:---------------:|:--------------:|:-------------:|:-----------------------------------------------------------------------------------:|
|
||||
| search_word | string, Path | ー | The search query string. Can be a keyword, Civitai URL, local directory or file path. |
|
||||
| model_type | string | `Checkpoint` | The type of model to search for. <br>(for example `Checkpoint`, `TextualInversion`, `Controlnet`, `LORA`, `Hypernetwork`, `AestheticGradient`, `Poses`) |
|
||||
| base_model | string | None | Trained model tag (for example `SD 1.5`, `SD 3.5`, `SDXL 1.0`) |
|
||||
| download | bool | False | Whether to download the model. |
|
||||
| force_download | bool | False | Whether to force the download if the model already exists. |
|
||||
| cache_dir | string, Path | None | Path to the folder where cached files are stored. |
|
||||
| resume | bool | False | Whether to resume an incomplete download. |
|
||||
| token | string | None | API token for Civitai authentication. |
|
||||
| include_params | bool | False | Whether to include parameters in the returned data. |
|
||||
| skip_error | bool | False | Whether to skip errors and return None. |
|
||||
|
||||
### Search Huggingface
|
||||
|
||||
> [!TIP]
|
||||
> **If an error occurs, insert the `token` and run again.**
|
||||
|
||||
#### `EasyPipeline.from_huggingface` parameters
|
||||
|
||||
| Name | Type | Default | Description |
|
||||
|:---------------------:|:-------------------:|:--------------:|:----------------------------------------------------------------:|
|
||||
| search_word | string, Path | ー | The search query string. Can be a keyword, Hugging Face URL, local directory or file path, or a Hugging Face path (`<creator>/<repo>`). |
|
||||
| checkpoint_format | string | `single_file` | The format of the model checkpoint.<br>● `single_file` to search for `single file checkpoint` <br>●`diffusers` to search for `multifolder diffusers format checkpoint` |
|
||||
| torch_dtype | string, torch.dtype | None | Override the default `torch.dtype` and load the model with another dtype. |
|
||||
| force_download | bool | False | Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. |
|
||||
| cache_dir | string, Path | None | Path to a directory where a downloaded pretrained model configuration is cached if the standard cache is not used. |
|
||||
| token | string, bool | None | The token to use as HTTP bearer authorization for remote files. |
|
||||
|
||||
|
||||
#### `search_huggingface` parameters
|
||||
|
||||
| Name | Type | Default | Description |
|
||||
|:---------------------:|:-------------------:|:--------------:|:----------------------------------------------------------------:|
|
||||
| search_word | string, Path | ー | The search query string. Can be a keyword, Hugging Face URL, local directory or file path, or a Hugging Face path (`<creator>/<repo>`). |
|
||||
| checkpoint_format | string | `single_file` | The format of the model checkpoint. <br>● `single_file` to search for `single file checkpoint` <br>●`diffusers` to search for `multifolder diffusers format checkpoint` |
|
||||
| pipeline_tag | string | None | Tag to filter models by pipeline. |
|
||||
| download | bool | False | Whether to download the model. |
|
||||
| force_download | bool | False | Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. |
|
||||
| cache_dir | string, Path | None | Path to a directory where a downloaded pretrained model configuration is cached if the standard cache is not used. |
|
||||
| token | string, bool | None | The token to use as HTTP bearer authorization for remote files. |
|
||||
| include_params | bool | False | Whether to include parameters in the returned data. |
|
||||
| skip_error | bool | False | Whether to skip errors and return None. |
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1 +0,0 @@
|
||||
huggingface-hub>=0.26.2
|
||||
@@ -1,226 +0,0 @@
|
||||
# IP Adapter Training Example
|
||||
|
||||
[IP Adapter](https://arxiv.org/abs/2308.06721) is a novel approach designed to enhance text-to-image models such as Stable Diffusion by enabling them to generate images based on image prompts rather than text prompts alone. Unlike traditional methods that rely solely on complex text prompts, IP Adapter introduces the concept of using image prompts, leveraging the idea that "an image is worth a thousand words." By decoupling cross-attention layers for text and image features, IP Adapter effectively integrates image prompts into the generation process without the need for extensive fine-tuning or large computing resources.
|
||||
|
||||
## Training locally with PyTorch
|
||||
|
||||
### Installing the dependencies
|
||||
|
||||
Before running the scripts, make sure to install the library's training dependencies:
|
||||
|
||||
**Important**
|
||||
|
||||
To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/huggingface/diffusers
|
||||
cd diffusers
|
||||
pip install -e .
|
||||
```
|
||||
|
||||
Then cd in the example folder and run
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
|
||||
|
||||
```bash
|
||||
accelerate config
|
||||
```
|
||||
|
||||
Or for a default accelerate configuration without answering questions about your environment
|
||||
|
||||
```bash
|
||||
accelerate config default
|
||||
```
|
||||
|
||||
Or if your environment doesn't support an interactive shell e.g. a notebook
|
||||
|
||||
```python
|
||||
from accelerate.utils import write_basic_config
|
||||
write_basic_config()
|
||||
```
|
||||
|
||||
Certainly! Below is the documentation in pure Markdown format:
|
||||
|
||||
### Accelerate Launch Command Documentation
|
||||
|
||||
#### Description:
|
||||
The Accelerate launch command is used to train a model using multiple GPUs and mixed precision training. It launches the training script `tutorial_train_ip-adapter.py` with specified parameters and configurations.
|
||||
|
||||
#### Usage Example:
|
||||
|
||||
```
|
||||
accelerate launch --mixed_precision "fp16" \
|
||||
tutorial_train_ip-adapter.py \
|
||||
--pretrained_model_name_or_path="runwayml/stable-diffusion-v1-5/" \
|
||||
--image_encoder_path="{image_encoder_path}" \
|
||||
--data_json_file="{data.json}" \
|
||||
--data_root_path="{image_path}" \
|
||||
--mixed_precision="fp16" \
|
||||
--resolution=512 \
|
||||
--train_batch_size=8 \
|
||||
--dataloader_num_workers=4 \
|
||||
--learning_rate=1e-04 \
|
||||
--weight_decay=0.01 \
|
||||
--output_dir="{output_dir}" \
|
||||
--save_steps=10000
|
||||
```
|
||||
|
||||
### Multi-GPU Script:
|
||||
```
|
||||
accelerate launch --num_processes 8 --multi_gpu --mixed_precision "fp16" \
|
||||
tutorial_train_ip-adapter.py \
|
||||
--pretrained_model_name_or_path="runwayml/stable-diffusion-v1-5/" \
|
||||
--image_encoder_path="{image_encoder_path}" \
|
||||
--data_json_file="{data.json}" \
|
||||
--data_root_path="{image_path}" \
|
||||
--mixed_precision="fp16" \
|
||||
--resolution=512 \
|
||||
--train_batch_size=8 \
|
||||
--dataloader_num_workers=4 \
|
||||
--learning_rate=1e-04 \
|
||||
--weight_decay=0.01 \
|
||||
--output_dir="{output_dir}" \
|
||||
--save_steps=10000
|
||||
```
|
||||
|
||||
#### Parameters:
|
||||
- `--num_processes`: Number of processes to launch for distributed training (in this example, 8 processes).
|
||||
- `--multi_gpu`: Flag indicating the usage of multiple GPUs for training.
|
||||
- `--mixed_precision "fp16"`: Enables mixed precision training with 16-bit floating-point precision.
|
||||
- `tutorial_train_ip-adapter.py`: Name of the training script to be executed.
|
||||
- `--pretrained_model_name_or_path`: Path or identifier for a pretrained model.
|
||||
- `--image_encoder_path`: Path to the CLIP image encoder.
|
||||
- `--data_json_file`: Path to the training data in JSON format.
|
||||
- `--data_root_path`: Root path where training images are located.
|
||||
- `--resolution`: Resolution of input images (512x512 in this example).
|
||||
- `--train_batch_size`: Batch size for training data (8 in this example).
|
||||
- `--dataloader_num_workers`: Number of subprocesses for data loading (4 in this example).
|
||||
- `--learning_rate`: Learning rate for training (1e-04 in this example).
|
||||
- `--weight_decay`: Weight decay for regularization (0.01 in this example).
|
||||
- `--output_dir`: Directory to save model checkpoints and predictions.
|
||||
- `--save_steps`: Frequency of saving checkpoints during training (10000 in this example).
|
||||
|
||||
### Inference
|
||||
|
||||
#### Description:
|
||||
The provided inference code is used to load a trained model checkpoint and extract the components related to image projection and IP (Image Processing) adapter. These components are then saved into a binary file for later use in inference.
|
||||
|
||||
#### Usage Example:
|
||||
```python
|
||||
from safetensors.torch import load_file, save_file
|
||||
|
||||
# Load the trained model checkpoint in safetensors format
|
||||
ckpt = "checkpoint-50000/pytorch_model.safetensors"
|
||||
sd = load_file(ckpt) # Using safetensors load function
|
||||
|
||||
# Extract image projection and IP adapter components
|
||||
image_proj_sd = {}
|
||||
ip_sd = {}
|
||||
|
||||
for k in sd:
|
||||
if k.startswith("unet"):
|
||||
pass # Skip unet-related keys
|
||||
elif k.startswith("image_proj_model"):
|
||||
image_proj_sd[k.replace("image_proj_model.", "")] = sd[k]
|
||||
elif k.startswith("adapter_modules"):
|
||||
ip_sd[k.replace("adapter_modules.", "")] = sd[k]
|
||||
|
||||
# Save the components into separate safetensors files
|
||||
save_file(image_proj_sd, "image_proj.safetensors")
|
||||
save_file(ip_sd, "ip_adapter.safetensors")
|
||||
```
|
||||
|
||||
### Sample Inference Script using the CLIP Model
|
||||
|
||||
```python
|
||||
|
||||
import torch
|
||||
from safetensors.torch import load_file
|
||||
from transformers import CLIPProcessor, CLIPModel # Using the Hugging Face CLIP model
|
||||
|
||||
# Load model components from safetensors
|
||||
image_proj_ckpt = "image_proj.safetensors"
|
||||
ip_adapter_ckpt = "ip_adapter.safetensors"
|
||||
|
||||
# Load the saved weights
|
||||
image_proj_sd = load_file(image_proj_ckpt)
|
||||
ip_adapter_sd = load_file(ip_adapter_ckpt)
|
||||
|
||||
# Define the model Parameters
|
||||
class ImageProjectionModel(torch.nn.Module):
|
||||
def __init__(self, input_dim=768, output_dim=512): # CLIP's default embedding size is 768
|
||||
super().__init__()
|
||||
self.model = torch.nn.Linear(input_dim, output_dim)
|
||||
|
||||
def forward(self, x):
|
||||
return self.model(x)
|
||||
|
||||
class IPAdapterModel(torch.nn.Module):
|
||||
def __init__(self, input_dim=512, output_dim=10): # Example for 10 classes
|
||||
super().__init__()
|
||||
self.model = torch.nn.Linear(input_dim, output_dim)
|
||||
|
||||
def forward(self, x):
|
||||
return self.model(x)
|
||||
|
||||
# Initialize models
|
||||
image_proj_model = ImageProjectionModel()
|
||||
ip_adapter_model = IPAdapterModel()
|
||||
|
||||
# Load weights into models
|
||||
image_proj_model.load_state_dict(image_proj_sd)
|
||||
ip_adapter_model.load_state_dict(ip_adapter_sd)
|
||||
|
||||
# Set models to evaluation mode
|
||||
image_proj_model.eval()
|
||||
ip_adapter_model.eval()
|
||||
|
||||
#Inference pipeline
|
||||
def inference(image_tensor):
|
||||
"""
|
||||
Run inference using the loaded models.
|
||||
|
||||
Args:
|
||||
image_tensor: Preprocessed image tensor from CLIPProcessor
|
||||
|
||||
Returns:
|
||||
Final inference results
|
||||
"""
|
||||
with torch.no_grad():
|
||||
# Step 1: Project the image features
|
||||
image_proj = image_proj_model(image_tensor)
|
||||
|
||||
# Step 2: Pass the projected features through the IP Adapter
|
||||
result = ip_adapter_model(image_proj)
|
||||
|
||||
return result
|
||||
|
||||
# Using CLIP for image preprocessing
|
||||
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
|
||||
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
|
||||
|
||||
#Image file path
|
||||
image_path = "path/to/image.jpg"
|
||||
|
||||
# Preprocess the image
|
||||
inputs = processor(images=image_path, return_tensors="pt")
|
||||
image_features = clip_model.get_image_features(inputs["pixel_values"])
|
||||
|
||||
# Normalize the image features as per CLIP's recommendations
|
||||
image_features = image_features / image_features.norm(dim=-1, keepdim=True)
|
||||
|
||||
# Run inference
|
||||
output = inference(image_features)
|
||||
print("Inference output:", output)
|
||||
```
|
||||
|
||||
#### Parameters:
|
||||
- `ckpt`: Path to the trained model checkpoint file.
|
||||
- `map_location="cpu"`: Specifies that the model should be loaded onto the CPU.
|
||||
- `image_proj_sd`: Dictionary to store the components related to image projection.
|
||||
- `ip_sd`: Dictionary to store the components related to the IP adapter.
|
||||
- `"unet"`, `"image_proj_model"`, `"adapter_modules"`: Prefixes indicating components of the model.
|
||||
@@ -1,4 +0,0 @@
|
||||
accelerate
|
||||
torchvision
|
||||
transformers>=4.25.1
|
||||
ip_adapter
|
||||
@@ -1,415 +0,0 @@
|
||||
import argparse
|
||||
import itertools
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from accelerate import Accelerator
|
||||
from accelerate.utils import ProjectConfiguration
|
||||
from ip_adapter.attention_processor_faceid import LoRAAttnProcessor, LoRAIPAttnProcessor
|
||||
from ip_adapter.ip_adapter_faceid import MLPProjModel
|
||||
from PIL import Image
|
||||
from torchvision import transforms
|
||||
from transformers import CLIPTextModel, CLIPTokenizer
|
||||
|
||||
from diffusers import AutoencoderKL, DDPMScheduler, UNet2DConditionModel
|
||||
|
||||
|
||||
# Dataset
|
||||
class MyDataset(torch.utils.data.Dataset):
|
||||
def __init__(
|
||||
self, json_file, tokenizer, size=512, t_drop_rate=0.05, i_drop_rate=0.05, ti_drop_rate=0.05, image_root_path=""
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
self.tokenizer = tokenizer
|
||||
self.size = size
|
||||
self.i_drop_rate = i_drop_rate
|
||||
self.t_drop_rate = t_drop_rate
|
||||
self.ti_drop_rate = ti_drop_rate
|
||||
self.image_root_path = image_root_path
|
||||
|
||||
self.data = json.load(
|
||||
open(json_file)
|
||||
) # list of dict: [{"image_file": "1.png", "id_embed_file": "faceid.bin"}]
|
||||
|
||||
self.transform = transforms.Compose(
|
||||
[
|
||||
transforms.Resize(self.size, interpolation=transforms.InterpolationMode.BILINEAR),
|
||||
transforms.CenterCrop(self.size),
|
||||
transforms.ToTensor(),
|
||||
transforms.Normalize([0.5], [0.5]),
|
||||
]
|
||||
)
|
||||
|
||||
def __getitem__(self, idx):
|
||||
item = self.data[idx]
|
||||
text = item["text"]
|
||||
image_file = item["image_file"]
|
||||
|
||||
# read image
|
||||
raw_image = Image.open(os.path.join(self.image_root_path, image_file))
|
||||
image = self.transform(raw_image.convert("RGB"))
|
||||
|
||||
face_id_embed = torch.load(item["id_embed_file"], map_location="cpu")
|
||||
face_id_embed = torch.from_numpy(face_id_embed)
|
||||
|
||||
# drop
|
||||
drop_image_embed = 0
|
||||
rand_num = random.random()
|
||||
if rand_num < self.i_drop_rate:
|
||||
drop_image_embed = 1
|
||||
elif rand_num < (self.i_drop_rate + self.t_drop_rate):
|
||||
text = ""
|
||||
elif rand_num < (self.i_drop_rate + self.t_drop_rate + self.ti_drop_rate):
|
||||
text = ""
|
||||
drop_image_embed = 1
|
||||
if drop_image_embed:
|
||||
face_id_embed = torch.zeros_like(face_id_embed)
|
||||
# get text and tokenize
|
||||
text_input_ids = self.tokenizer(
|
||||
text,
|
||||
max_length=self.tokenizer.model_max_length,
|
||||
padding="max_length",
|
||||
truncation=True,
|
||||
return_tensors="pt",
|
||||
).input_ids
|
||||
|
||||
return {
|
||||
"image": image,
|
||||
"text_input_ids": text_input_ids,
|
||||
"face_id_embed": face_id_embed,
|
||||
"drop_image_embed": drop_image_embed,
|
||||
}
|
||||
|
||||
def __len__(self):
|
||||
return len(self.data)
|
||||
|
||||
|
||||
def collate_fn(data):
|
||||
images = torch.stack([example["image"] for example in data])
|
||||
text_input_ids = torch.cat([example["text_input_ids"] for example in data], dim=0)
|
||||
face_id_embed = torch.stack([example["face_id_embed"] for example in data])
|
||||
drop_image_embeds = [example["drop_image_embed"] for example in data]
|
||||
|
||||
return {
|
||||
"images": images,
|
||||
"text_input_ids": text_input_ids,
|
||||
"face_id_embed": face_id_embed,
|
||||
"drop_image_embeds": drop_image_embeds,
|
||||
}
|
||||
|
||||
|
||||
class IPAdapter(torch.nn.Module):
|
||||
"""IP-Adapter"""
|
||||
|
||||
def __init__(self, unet, image_proj_model, adapter_modules, ckpt_path=None):
|
||||
super().__init__()
|
||||
self.unet = unet
|
||||
self.image_proj_model = image_proj_model
|
||||
self.adapter_modules = adapter_modules
|
||||
|
||||
if ckpt_path is not None:
|
||||
self.load_from_checkpoint(ckpt_path)
|
||||
|
||||
def forward(self, noisy_latents, timesteps, encoder_hidden_states, image_embeds):
|
||||
ip_tokens = self.image_proj_model(image_embeds)
|
||||
encoder_hidden_states = torch.cat([encoder_hidden_states, ip_tokens], dim=1)
|
||||
# Predict the noise residual
|
||||
noise_pred = self.unet(noisy_latents, timesteps, encoder_hidden_states).sample
|
||||
return noise_pred
|
||||
|
||||
def load_from_checkpoint(self, ckpt_path: str):
|
||||
# Calculate original checksums
|
||||
orig_ip_proj_sum = torch.sum(torch.stack([torch.sum(p) for p in self.image_proj_model.parameters()]))
|
||||
orig_adapter_sum = torch.sum(torch.stack([torch.sum(p) for p in self.adapter_modules.parameters()]))
|
||||
|
||||
state_dict = torch.load(ckpt_path, map_location="cpu")
|
||||
|
||||
# Load state dict for image_proj_model and adapter_modules
|
||||
self.image_proj_model.load_state_dict(state_dict["image_proj"], strict=True)
|
||||
self.adapter_modules.load_state_dict(state_dict["ip_adapter"], strict=True)
|
||||
|
||||
# Calculate new checksums
|
||||
new_ip_proj_sum = torch.sum(torch.stack([torch.sum(p) for p in self.image_proj_model.parameters()]))
|
||||
new_adapter_sum = torch.sum(torch.stack([torch.sum(p) for p in self.adapter_modules.parameters()]))
|
||||
|
||||
# Verify if the weights have changed
|
||||
assert orig_ip_proj_sum != new_ip_proj_sum, "Weights of image_proj_model did not change!"
|
||||
assert orig_adapter_sum != new_adapter_sum, "Weights of adapter_modules did not change!"
|
||||
|
||||
print(f"Successfully loaded weights from checkpoint {ckpt_path}")
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description="Simple example of a training script.")
|
||||
parser.add_argument(
|
||||
"--pretrained_model_name_or_path",
|
||||
type=str,
|
||||
default=None,
|
||||
required=True,
|
||||
help="Path to pretrained model or model identifier from huggingface.co/models.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--pretrained_ip_adapter_path",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to pretrained ip adapter model. If not specified weights are initialized randomly.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--data_json_file",
|
||||
type=str,
|
||||
default=None,
|
||||
required=True,
|
||||
help="Training data",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--data_root_path",
|
||||
type=str,
|
||||
default="",
|
||||
required=True,
|
||||
help="Training data root path",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--image_encoder_path",
|
||||
type=str,
|
||||
default=None,
|
||||
required=True,
|
||||
help="Path to CLIP image encoder",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output_dir",
|
||||
type=str,
|
||||
default="sd-ip_adapter",
|
||||
help="The output directory where the model predictions and checkpoints will be written.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--logging_dir",
|
||||
type=str,
|
||||
default="logs",
|
||||
help=(
|
||||
"[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
|
||||
" *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--resolution",
|
||||
type=int,
|
||||
default=512,
|
||||
help=("The resolution for input images"),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--learning_rate",
|
||||
type=float,
|
||||
default=1e-4,
|
||||
help="Learning rate to use.",
|
||||
)
|
||||
parser.add_argument("--weight_decay", type=float, default=1e-2, help="Weight decay to use.")
|
||||
parser.add_argument("--num_train_epochs", type=int, default=100)
|
||||
parser.add_argument(
|
||||
"--train_batch_size", type=int, default=8, help="Batch size (per device) for the training dataloader."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dataloader_num_workers",
|
||||
type=int,
|
||||
default=0,
|
||||
help=(
|
||||
"Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--save_steps",
|
||||
type=int,
|
||||
default=2000,
|
||||
help=("Save a checkpoint of the training state every X updates"),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--mixed_precision",
|
||||
type=str,
|
||||
default=None,
|
||||
choices=["no", "fp16", "bf16"],
|
||||
help=(
|
||||
"Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
|
||||
" 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the"
|
||||
" flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--report_to",
|
||||
type=str,
|
||||
default="tensorboard",
|
||||
help=(
|
||||
'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
|
||||
' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
|
||||
),
|
||||
)
|
||||
parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
|
||||
|
||||
args = parser.parse_args()
|
||||
env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
|
||||
if env_local_rank != -1 and env_local_rank != args.local_rank:
|
||||
args.local_rank = env_local_rank
|
||||
|
||||
return args
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
logging_dir = Path(args.output_dir, args.logging_dir)
|
||||
|
||||
accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
|
||||
|
||||
accelerator = Accelerator(
|
||||
mixed_precision=args.mixed_precision,
|
||||
log_with=args.report_to,
|
||||
project_config=accelerator_project_config,
|
||||
)
|
||||
|
||||
if accelerator.is_main_process:
|
||||
if args.output_dir is not None:
|
||||
os.makedirs(args.output_dir, exist_ok=True)
|
||||
|
||||
# Load scheduler, tokenizer and models.
|
||||
noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
|
||||
tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer")
|
||||
text_encoder = CLIPTextModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="text_encoder")
|
||||
vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae")
|
||||
unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet")
|
||||
# image_encoder = CLIPVisionModelWithProjection.from_pretrained(args.image_encoder_path)
|
||||
# freeze parameters of models to save more memory
|
||||
unet.requires_grad_(False)
|
||||
vae.requires_grad_(False)
|
||||
text_encoder.requires_grad_(False)
|
||||
# image_encoder.requires_grad_(False)
|
||||
|
||||
# ip-adapter
|
||||
image_proj_model = MLPProjModel(
|
||||
cross_attention_dim=unet.config.cross_attention_dim,
|
||||
id_embeddings_dim=512,
|
||||
num_tokens=4,
|
||||
)
|
||||
# init adapter modules
|
||||
lora_rank = 128
|
||||
attn_procs = {}
|
||||
unet_sd = unet.state_dict()
|
||||
for name in unet.attn_processors.keys():
|
||||
cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
|
||||
if name.startswith("mid_block"):
|
||||
hidden_size = unet.config.block_out_channels[-1]
|
||||
elif name.startswith("up_blocks"):
|
||||
block_id = int(name[len("up_blocks.")])
|
||||
hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
|
||||
elif name.startswith("down_blocks"):
|
||||
block_id = int(name[len("down_blocks.")])
|
||||
hidden_size = unet.config.block_out_channels[block_id]
|
||||
if cross_attention_dim is None:
|
||||
attn_procs[name] = LoRAAttnProcessor(
|
||||
hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, rank=lora_rank
|
||||
)
|
||||
else:
|
||||
layer_name = name.split(".processor")[0]
|
||||
weights = {
|
||||
"to_k_ip.weight": unet_sd[layer_name + ".to_k.weight"],
|
||||
"to_v_ip.weight": unet_sd[layer_name + ".to_v.weight"],
|
||||
}
|
||||
attn_procs[name] = LoRAIPAttnProcessor(
|
||||
hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, rank=lora_rank
|
||||
)
|
||||
attn_procs[name].load_state_dict(weights, strict=False)
|
||||
unet.set_attn_processor(attn_procs)
|
||||
adapter_modules = torch.nn.ModuleList(unet.attn_processors.values())
|
||||
|
||||
ip_adapter = IPAdapter(unet, image_proj_model, adapter_modules, args.pretrained_ip_adapter_path)
|
||||
|
||||
weight_dtype = torch.float32
|
||||
if accelerator.mixed_precision == "fp16":
|
||||
weight_dtype = torch.float16
|
||||
elif accelerator.mixed_precision == "bf16":
|
||||
weight_dtype = torch.bfloat16
|
||||
# unet.to(accelerator.device, dtype=weight_dtype)
|
||||
vae.to(accelerator.device, dtype=weight_dtype)
|
||||
text_encoder.to(accelerator.device, dtype=weight_dtype)
|
||||
# image_encoder.to(accelerator.device, dtype=weight_dtype)
|
||||
|
||||
# optimizer
|
||||
params_to_opt = itertools.chain(ip_adapter.image_proj_model.parameters(), ip_adapter.adapter_modules.parameters())
|
||||
optimizer = torch.optim.AdamW(params_to_opt, lr=args.learning_rate, weight_decay=args.weight_decay)
|
||||
|
||||
# dataloader
|
||||
train_dataset = MyDataset(
|
||||
args.data_json_file, tokenizer=tokenizer, size=args.resolution, image_root_path=args.data_root_path
|
||||
)
|
||||
train_dataloader = torch.utils.data.DataLoader(
|
||||
train_dataset,
|
||||
shuffle=True,
|
||||
collate_fn=collate_fn,
|
||||
batch_size=args.train_batch_size,
|
||||
num_workers=args.dataloader_num_workers,
|
||||
)
|
||||
|
||||
# Prepare everything with our `accelerator`.
|
||||
ip_adapter, optimizer, train_dataloader = accelerator.prepare(ip_adapter, optimizer, train_dataloader)
|
||||
|
||||
global_step = 0
|
||||
for epoch in range(0, args.num_train_epochs):
|
||||
begin = time.perf_counter()
|
||||
for step, batch in enumerate(train_dataloader):
|
||||
load_data_time = time.perf_counter() - begin
|
||||
with accelerator.accumulate(ip_adapter):
|
||||
# Convert images to latent space
|
||||
with torch.no_grad():
|
||||
latents = vae.encode(
|
||||
batch["images"].to(accelerator.device, dtype=weight_dtype)
|
||||
).latent_dist.sample()
|
||||
latents = latents * vae.config.scaling_factor
|
||||
|
||||
# Sample noise that we'll add to the latents
|
||||
noise = torch.randn_like(latents)
|
||||
bsz = latents.shape[0]
|
||||
# Sample a random timestep for each image
|
||||
timesteps = torch.randint(0, noise_scheduler.num_train_timesteps, (bsz,), device=latents.device)
|
||||
timesteps = timesteps.long()
|
||||
|
||||
# Add noise to the latents according to the noise magnitude at each timestep
|
||||
# (this is the forward diffusion process)
|
||||
noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
|
||||
|
||||
image_embeds = batch["face_id_embed"].to(accelerator.device, dtype=weight_dtype)
|
||||
|
||||
with torch.no_grad():
|
||||
encoder_hidden_states = text_encoder(batch["text_input_ids"].to(accelerator.device))[0]
|
||||
|
||||
noise_pred = ip_adapter(noisy_latents, timesteps, encoder_hidden_states, image_embeds)
|
||||
|
||||
loss = F.mse_loss(noise_pred.float(), noise.float(), reduction="mean")
|
||||
|
||||
# Gather the losses across all processes for logging (if we use distributed training).
|
||||
avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean().item()
|
||||
|
||||
# Backpropagate
|
||||
accelerator.backward(loss)
|
||||
optimizer.step()
|
||||
optimizer.zero_grad()
|
||||
|
||||
if accelerator.is_main_process:
|
||||
print(
|
||||
"Epoch {}, step {}, data_time: {}, time: {}, step_loss: {}".format(
|
||||
epoch, step, load_data_time, time.perf_counter() - begin, avg_loss
|
||||
)
|
||||
)
|
||||
|
||||
global_step += 1
|
||||
|
||||
if global_step % args.save_steps == 0:
|
||||
save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
|
||||
accelerator.save_state(save_path)
|
||||
|
||||
begin = time.perf_counter()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,422 +0,0 @@
|
||||
import argparse
|
||||
import itertools
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from accelerate import Accelerator
|
||||
from accelerate.utils import ProjectConfiguration
|
||||
from ip_adapter.ip_adapter import ImageProjModel
|
||||
from ip_adapter.utils import is_torch2_available
|
||||
from PIL import Image
|
||||
from torchvision import transforms
|
||||
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
|
||||
|
||||
from diffusers import AutoencoderKL, DDPMScheduler, UNet2DConditionModel
|
||||
|
||||
|
||||
if is_torch2_available():
|
||||
from ip_adapter.attention_processor import AttnProcessor2_0 as AttnProcessor
|
||||
from ip_adapter.attention_processor import IPAttnProcessor2_0 as IPAttnProcessor
|
||||
else:
|
||||
from ip_adapter.attention_processor import AttnProcessor, IPAttnProcessor
|
||||
|
||||
|
||||
# Dataset
|
||||
class MyDataset(torch.utils.data.Dataset):
|
||||
def __init__(
|
||||
self, json_file, tokenizer, size=512, t_drop_rate=0.05, i_drop_rate=0.05, ti_drop_rate=0.05, image_root_path=""
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
self.tokenizer = tokenizer
|
||||
self.size = size
|
||||
self.i_drop_rate = i_drop_rate
|
||||
self.t_drop_rate = t_drop_rate
|
||||
self.ti_drop_rate = ti_drop_rate
|
||||
self.image_root_path = image_root_path
|
||||
|
||||
self.data = json.load(open(json_file)) # list of dict: [{"image_file": "1.png", "text": "A dog"}]
|
||||
|
||||
self.transform = transforms.Compose(
|
||||
[
|
||||
transforms.Resize(self.size, interpolation=transforms.InterpolationMode.BILINEAR),
|
||||
transforms.CenterCrop(self.size),
|
||||
transforms.ToTensor(),
|
||||
transforms.Normalize([0.5], [0.5]),
|
||||
]
|
||||
)
|
||||
self.clip_image_processor = CLIPImageProcessor()
|
||||
|
||||
def __getitem__(self, idx):
|
||||
item = self.data[idx]
|
||||
text = item["text"]
|
||||
image_file = item["image_file"]
|
||||
|
||||
# read image
|
||||
raw_image = Image.open(os.path.join(self.image_root_path, image_file))
|
||||
image = self.transform(raw_image.convert("RGB"))
|
||||
clip_image = self.clip_image_processor(images=raw_image, return_tensors="pt").pixel_values
|
||||
|
||||
# drop
|
||||
drop_image_embed = 0
|
||||
rand_num = random.random()
|
||||
if rand_num < self.i_drop_rate:
|
||||
drop_image_embed = 1
|
||||
elif rand_num < (self.i_drop_rate + self.t_drop_rate):
|
||||
text = ""
|
||||
elif rand_num < (self.i_drop_rate + self.t_drop_rate + self.ti_drop_rate):
|
||||
text = ""
|
||||
drop_image_embed = 1
|
||||
# get text and tokenize
|
||||
text_input_ids = self.tokenizer(
|
||||
text,
|
||||
max_length=self.tokenizer.model_max_length,
|
||||
padding="max_length",
|
||||
truncation=True,
|
||||
return_tensors="pt",
|
||||
).input_ids
|
||||
|
||||
return {
|
||||
"image": image,
|
||||
"text_input_ids": text_input_ids,
|
||||
"clip_image": clip_image,
|
||||
"drop_image_embed": drop_image_embed,
|
||||
}
|
||||
|
||||
def __len__(self):
|
||||
return len(self.data)
|
||||
|
||||
|
||||
def collate_fn(data):
|
||||
images = torch.stack([example["image"] for example in data])
|
||||
text_input_ids = torch.cat([example["text_input_ids"] for example in data], dim=0)
|
||||
clip_images = torch.cat([example["clip_image"] for example in data], dim=0)
|
||||
drop_image_embeds = [example["drop_image_embed"] for example in data]
|
||||
|
||||
return {
|
||||
"images": images,
|
||||
"text_input_ids": text_input_ids,
|
||||
"clip_images": clip_images,
|
||||
"drop_image_embeds": drop_image_embeds,
|
||||
}
|
||||
|
||||
|
||||
class IPAdapter(torch.nn.Module):
|
||||
"""IP-Adapter"""
|
||||
|
||||
def __init__(self, unet, image_proj_model, adapter_modules, ckpt_path=None):
|
||||
super().__init__()
|
||||
self.unet = unet
|
||||
self.image_proj_model = image_proj_model
|
||||
self.adapter_modules = adapter_modules
|
||||
|
||||
if ckpt_path is not None:
|
||||
self.load_from_checkpoint(ckpt_path)
|
||||
|
||||
def forward(self, noisy_latents, timesteps, encoder_hidden_states, image_embeds):
|
||||
ip_tokens = self.image_proj_model(image_embeds)
|
||||
encoder_hidden_states = torch.cat([encoder_hidden_states, ip_tokens], dim=1)
|
||||
# Predict the noise residual
|
||||
noise_pred = self.unet(noisy_latents, timesteps, encoder_hidden_states).sample
|
||||
return noise_pred
|
||||
|
||||
def load_from_checkpoint(self, ckpt_path: str):
|
||||
# Calculate original checksums
|
||||
orig_ip_proj_sum = torch.sum(torch.stack([torch.sum(p) for p in self.image_proj_model.parameters()]))
|
||||
orig_adapter_sum = torch.sum(torch.stack([torch.sum(p) for p in self.adapter_modules.parameters()]))
|
||||
|
||||
state_dict = torch.load(ckpt_path, map_location="cpu")
|
||||
|
||||
# Load state dict for image_proj_model and adapter_modules
|
||||
self.image_proj_model.load_state_dict(state_dict["image_proj"], strict=True)
|
||||
self.adapter_modules.load_state_dict(state_dict["ip_adapter"], strict=True)
|
||||
|
||||
# Calculate new checksums
|
||||
new_ip_proj_sum = torch.sum(torch.stack([torch.sum(p) for p in self.image_proj_model.parameters()]))
|
||||
new_adapter_sum = torch.sum(torch.stack([torch.sum(p) for p in self.adapter_modules.parameters()]))
|
||||
|
||||
# Verify if the weights have changed
|
||||
assert orig_ip_proj_sum != new_ip_proj_sum, "Weights of image_proj_model did not change!"
|
||||
assert orig_adapter_sum != new_adapter_sum, "Weights of adapter_modules did not change!"
|
||||
|
||||
print(f"Successfully loaded weights from checkpoint {ckpt_path}")
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description="Simple example of a training script.")
|
||||
parser.add_argument(
|
||||
"--pretrained_model_name_or_path",
|
||||
type=str,
|
||||
default=None,
|
||||
required=True,
|
||||
help="Path to pretrained model or model identifier from huggingface.co/models.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--pretrained_ip_adapter_path",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to pretrained ip adapter model. If not specified weights are initialized randomly.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--data_json_file",
|
||||
type=str,
|
||||
default=None,
|
||||
required=True,
|
||||
help="Training data",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--data_root_path",
|
||||
type=str,
|
||||
default="",
|
||||
required=True,
|
||||
help="Training data root path",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--image_encoder_path",
|
||||
type=str,
|
||||
default=None,
|
||||
required=True,
|
||||
help="Path to CLIP image encoder",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output_dir",
|
||||
type=str,
|
||||
default="sd-ip_adapter",
|
||||
help="The output directory where the model predictions and checkpoints will be written.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--logging_dir",
|
||||
type=str,
|
||||
default="logs",
|
||||
help=(
|
||||
"[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
|
||||
" *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--resolution",
|
||||
type=int,
|
||||
default=512,
|
||||
help=("The resolution for input images"),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--learning_rate",
|
||||
type=float,
|
||||
default=1e-4,
|
||||
help="Learning rate to use.",
|
||||
)
|
||||
parser.add_argument("--weight_decay", type=float, default=1e-2, help="Weight decay to use.")
|
||||
parser.add_argument("--num_train_epochs", type=int, default=100)
|
||||
parser.add_argument(
|
||||
"--train_batch_size", type=int, default=8, help="Batch size (per device) for the training dataloader."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dataloader_num_workers",
|
||||
type=int,
|
||||
default=0,
|
||||
help=(
|
||||
"Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--save_steps",
|
||||
type=int,
|
||||
default=2000,
|
||||
help=("Save a checkpoint of the training state every X updates"),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--mixed_precision",
|
||||
type=str,
|
||||
default=None,
|
||||
choices=["no", "fp16", "bf16"],
|
||||
help=(
|
||||
"Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
|
||||
" 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the"
|
||||
" flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--report_to",
|
||||
type=str,
|
||||
default="tensorboard",
|
||||
help=(
|
||||
'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
|
||||
' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
|
||||
),
|
||||
)
|
||||
parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
|
||||
|
||||
args = parser.parse_args()
|
||||
env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
|
||||
if env_local_rank != -1 and env_local_rank != args.local_rank:
|
||||
args.local_rank = env_local_rank
|
||||
|
||||
return args
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
logging_dir = Path(args.output_dir, args.logging_dir)
|
||||
|
||||
accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
|
||||
|
||||
accelerator = Accelerator(
|
||||
mixed_precision=args.mixed_precision,
|
||||
log_with=args.report_to,
|
||||
project_config=accelerator_project_config,
|
||||
)
|
||||
|
||||
if accelerator.is_main_process:
|
||||
if args.output_dir is not None:
|
||||
os.makedirs(args.output_dir, exist_ok=True)
|
||||
|
||||
# Load scheduler, tokenizer and models.
|
||||
noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
|
||||
tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer")
|
||||
text_encoder = CLIPTextModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="text_encoder")
|
||||
vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae")
|
||||
unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet")
|
||||
image_encoder = CLIPVisionModelWithProjection.from_pretrained(args.image_encoder_path)
|
||||
# freeze parameters of models to save more memory
|
||||
unet.requires_grad_(False)
|
||||
vae.requires_grad_(False)
|
||||
text_encoder.requires_grad_(False)
|
||||
image_encoder.requires_grad_(False)
|
||||
|
||||
# ip-adapter
|
||||
image_proj_model = ImageProjModel(
|
||||
cross_attention_dim=unet.config.cross_attention_dim,
|
||||
clip_embeddings_dim=image_encoder.config.projection_dim,
|
||||
clip_extra_context_tokens=4,
|
||||
)
|
||||
# init adapter modules
|
||||
attn_procs = {}
|
||||
unet_sd = unet.state_dict()
|
||||
for name in unet.attn_processors.keys():
|
||||
cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
|
||||
if name.startswith("mid_block"):
|
||||
hidden_size = unet.config.block_out_channels[-1]
|
||||
elif name.startswith("up_blocks"):
|
||||
block_id = int(name[len("up_blocks.")])
|
||||
hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
|
||||
elif name.startswith("down_blocks"):
|
||||
block_id = int(name[len("down_blocks.")])
|
||||
hidden_size = unet.config.block_out_channels[block_id]
|
||||
if cross_attention_dim is None:
|
||||
attn_procs[name] = AttnProcessor()
|
||||
else:
|
||||
layer_name = name.split(".processor")[0]
|
||||
weights = {
|
||||
"to_k_ip.weight": unet_sd[layer_name + ".to_k.weight"],
|
||||
"to_v_ip.weight": unet_sd[layer_name + ".to_v.weight"],
|
||||
}
|
||||
attn_procs[name] = IPAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)
|
||||
attn_procs[name].load_state_dict(weights)
|
||||
unet.set_attn_processor(attn_procs)
|
||||
adapter_modules = torch.nn.ModuleList(unet.attn_processors.values())
|
||||
|
||||
ip_adapter = IPAdapter(unet, image_proj_model, adapter_modules, args.pretrained_ip_adapter_path)
|
||||
|
||||
weight_dtype = torch.float32
|
||||
if accelerator.mixed_precision == "fp16":
|
||||
weight_dtype = torch.float16
|
||||
elif accelerator.mixed_precision == "bf16":
|
||||
weight_dtype = torch.bfloat16
|
||||
# unet.to(accelerator.device, dtype=weight_dtype)
|
||||
vae.to(accelerator.device, dtype=weight_dtype)
|
||||
text_encoder.to(accelerator.device, dtype=weight_dtype)
|
||||
image_encoder.to(accelerator.device, dtype=weight_dtype)
|
||||
|
||||
# optimizer
|
||||
params_to_opt = itertools.chain(ip_adapter.image_proj_model.parameters(), ip_adapter.adapter_modules.parameters())
|
||||
optimizer = torch.optim.AdamW(params_to_opt, lr=args.learning_rate, weight_decay=args.weight_decay)
|
||||
|
||||
# dataloader
|
||||
train_dataset = MyDataset(
|
||||
args.data_json_file, tokenizer=tokenizer, size=args.resolution, image_root_path=args.data_root_path
|
||||
)
|
||||
train_dataloader = torch.utils.data.DataLoader(
|
||||
train_dataset,
|
||||
shuffle=True,
|
||||
collate_fn=collate_fn,
|
||||
batch_size=args.train_batch_size,
|
||||
num_workers=args.dataloader_num_workers,
|
||||
)
|
||||
|
||||
# Prepare everything with our `accelerator`.
|
||||
ip_adapter, optimizer, train_dataloader = accelerator.prepare(ip_adapter, optimizer, train_dataloader)
|
||||
|
||||
global_step = 0
|
||||
for epoch in range(0, args.num_train_epochs):
|
||||
begin = time.perf_counter()
|
||||
for step, batch in enumerate(train_dataloader):
|
||||
load_data_time = time.perf_counter() - begin
|
||||
with accelerator.accumulate(ip_adapter):
|
||||
# Convert images to latent space
|
||||
with torch.no_grad():
|
||||
latents = vae.encode(
|
||||
batch["images"].to(accelerator.device, dtype=weight_dtype)
|
||||
).latent_dist.sample()
|
||||
latents = latents * vae.config.scaling_factor
|
||||
|
||||
# Sample noise that we'll add to the latents
|
||||
noise = torch.randn_like(latents)
|
||||
bsz = latents.shape[0]
|
||||
# Sample a random timestep for each image
|
||||
timesteps = torch.randint(0, noise_scheduler.num_train_timesteps, (bsz,), device=latents.device)
|
||||
timesteps = timesteps.long()
|
||||
|
||||
# Add noise to the latents according to the noise magnitude at each timestep
|
||||
# (this is the forward diffusion process)
|
||||
noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
|
||||
|
||||
with torch.no_grad():
|
||||
image_embeds = image_encoder(
|
||||
batch["clip_images"].to(accelerator.device, dtype=weight_dtype)
|
||||
).image_embeds
|
||||
image_embeds_ = []
|
||||
for image_embed, drop_image_embed in zip(image_embeds, batch["drop_image_embeds"]):
|
||||
if drop_image_embed == 1:
|
||||
image_embeds_.append(torch.zeros_like(image_embed))
|
||||
else:
|
||||
image_embeds_.append(image_embed)
|
||||
image_embeds = torch.stack(image_embeds_)
|
||||
|
||||
with torch.no_grad():
|
||||
encoder_hidden_states = text_encoder(batch["text_input_ids"].to(accelerator.device))[0]
|
||||
|
||||
noise_pred = ip_adapter(noisy_latents, timesteps, encoder_hidden_states, image_embeds)
|
||||
|
||||
loss = F.mse_loss(noise_pred.float(), noise.float(), reduction="mean")
|
||||
|
||||
# Gather the losses across all processes for logging (if we use distributed training).
|
||||
avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean().item()
|
||||
|
||||
# Backpropagate
|
||||
accelerator.backward(loss)
|
||||
optimizer.step()
|
||||
optimizer.zero_grad()
|
||||
|
||||
if accelerator.is_main_process:
|
||||
print(
|
||||
"Epoch {}, step {}, data_time: {}, time: {}, step_loss: {}".format(
|
||||
epoch, step, load_data_time, time.perf_counter() - begin, avg_loss
|
||||
)
|
||||
)
|
||||
|
||||
global_step += 1
|
||||
|
||||
if global_step % args.save_steps == 0:
|
||||
save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
|
||||
accelerator.save_state(save_path)
|
||||
|
||||
begin = time.perf_counter()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,445 +0,0 @@
|
||||
import argparse
|
||||
import itertools
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from accelerate import Accelerator
|
||||
from accelerate.utils import ProjectConfiguration
|
||||
from ip_adapter.resampler import Resampler
|
||||
from ip_adapter.utils import is_torch2_available
|
||||
from PIL import Image
|
||||
from torchvision import transforms
|
||||
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
|
||||
|
||||
from diffusers import AutoencoderKL, DDPMScheduler, UNet2DConditionModel
|
||||
|
||||
|
||||
if is_torch2_available():
|
||||
from ip_adapter.attention_processor import AttnProcessor2_0 as AttnProcessor
|
||||
from ip_adapter.attention_processor import IPAttnProcessor2_0 as IPAttnProcessor
|
||||
else:
|
||||
from ip_adapter.attention_processor import AttnProcessor, IPAttnProcessor
|
||||
|
||||
|
||||
# Dataset
|
||||
class MyDataset(torch.utils.data.Dataset):
|
||||
def __init__(
|
||||
self, json_file, tokenizer, size=512, t_drop_rate=0.05, i_drop_rate=0.05, ti_drop_rate=0.05, image_root_path=""
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
self.tokenizer = tokenizer
|
||||
self.size = size
|
||||
self.i_drop_rate = i_drop_rate
|
||||
self.t_drop_rate = t_drop_rate
|
||||
self.ti_drop_rate = ti_drop_rate
|
||||
self.image_root_path = image_root_path
|
||||
|
||||
self.data = json.load(open(json_file)) # list of dict: [{"image_file": "1.png", "text": "A dog"}]
|
||||
|
||||
self.transform = transforms.Compose(
|
||||
[
|
||||
transforms.Resize(self.size, interpolation=transforms.InterpolationMode.BILINEAR),
|
||||
transforms.CenterCrop(self.size),
|
||||
transforms.ToTensor(),
|
||||
transforms.Normalize([0.5], [0.5]),
|
||||
]
|
||||
)
|
||||
self.clip_image_processor = CLIPImageProcessor()
|
||||
|
||||
def __getitem__(self, idx):
|
||||
item = self.data[idx]
|
||||
text = item["text"]
|
||||
image_file = item["image_file"]
|
||||
|
||||
# read image
|
||||
raw_image = Image.open(os.path.join(self.image_root_path, image_file))
|
||||
image = self.transform(raw_image.convert("RGB"))
|
||||
clip_image = self.clip_image_processor(images=raw_image, return_tensors="pt").pixel_values
|
||||
|
||||
# drop
|
||||
drop_image_embed = 0
|
||||
rand_num = random.random()
|
||||
if rand_num < self.i_drop_rate:
|
||||
drop_image_embed = 1
|
||||
elif rand_num < (self.i_drop_rate + self.t_drop_rate):
|
||||
text = ""
|
||||
elif rand_num < (self.i_drop_rate + self.t_drop_rate + self.ti_drop_rate):
|
||||
text = ""
|
||||
drop_image_embed = 1
|
||||
# get text and tokenize
|
||||
text_input_ids = self.tokenizer(
|
||||
text,
|
||||
max_length=self.tokenizer.model_max_length,
|
||||
padding="max_length",
|
||||
truncation=True,
|
||||
return_tensors="pt",
|
||||
).input_ids
|
||||
|
||||
return {
|
||||
"image": image,
|
||||
"text_input_ids": text_input_ids,
|
||||
"clip_image": clip_image,
|
||||
"drop_image_embed": drop_image_embed,
|
||||
}
|
||||
|
||||
def __len__(self):
|
||||
return len(self.data)
|
||||
|
||||
|
||||
def collate_fn(data):
|
||||
images = torch.stack([example["image"] for example in data])
|
||||
text_input_ids = torch.cat([example["text_input_ids"] for example in data], dim=0)
|
||||
clip_images = torch.cat([example["clip_image"] for example in data], dim=0)
|
||||
drop_image_embeds = [example["drop_image_embed"] for example in data]
|
||||
|
||||
return {
|
||||
"images": images,
|
||||
"text_input_ids": text_input_ids,
|
||||
"clip_images": clip_images,
|
||||
"drop_image_embeds": drop_image_embeds,
|
||||
}
|
||||
|
||||
|
||||
class IPAdapter(torch.nn.Module):
|
||||
"""IP-Adapter"""
|
||||
|
||||
def __init__(self, unet, image_proj_model, adapter_modules, ckpt_path=None):
|
||||
super().__init__()
|
||||
self.unet = unet
|
||||
self.image_proj_model = image_proj_model
|
||||
self.adapter_modules = adapter_modules
|
||||
|
||||
if ckpt_path is not None:
|
||||
self.load_from_checkpoint(ckpt_path)
|
||||
|
||||
def forward(self, noisy_latents, timesteps, encoder_hidden_states, image_embeds):
|
||||
ip_tokens = self.image_proj_model(image_embeds)
|
||||
encoder_hidden_states = torch.cat([encoder_hidden_states, ip_tokens], dim=1)
|
||||
# Predict the noise residual
|
||||
noise_pred = self.unet(noisy_latents, timesteps, encoder_hidden_states).sample
|
||||
return noise_pred
|
||||
|
||||
def load_from_checkpoint(self, ckpt_path: str):
|
||||
# Calculate original checksums
|
||||
orig_ip_proj_sum = torch.sum(torch.stack([torch.sum(p) for p in self.image_proj_model.parameters()]))
|
||||
orig_adapter_sum = torch.sum(torch.stack([torch.sum(p) for p in self.adapter_modules.parameters()]))
|
||||
|
||||
state_dict = torch.load(ckpt_path, map_location="cpu")
|
||||
|
||||
# Check if 'latents' exists in both the saved state_dict and the current model's state_dict
|
||||
strict_load_image_proj_model = True
|
||||
if "latents" in state_dict["image_proj"] and "latents" in self.image_proj_model.state_dict():
|
||||
# Check if the shapes are mismatched
|
||||
if state_dict["image_proj"]["latents"].shape != self.image_proj_model.state_dict()["latents"].shape:
|
||||
print(f"Shapes of 'image_proj.latents' in checkpoint {ckpt_path} and current model do not match.")
|
||||
print("Removing 'latents' from checkpoint and loading the rest of the weights.")
|
||||
del state_dict["image_proj"]["latents"]
|
||||
strict_load_image_proj_model = False
|
||||
|
||||
# Load state dict for image_proj_model and adapter_modules
|
||||
self.image_proj_model.load_state_dict(state_dict["image_proj"], strict=strict_load_image_proj_model)
|
||||
self.adapter_modules.load_state_dict(state_dict["ip_adapter"], strict=True)
|
||||
|
||||
# Calculate new checksums
|
||||
new_ip_proj_sum = torch.sum(torch.stack([torch.sum(p) for p in self.image_proj_model.parameters()]))
|
||||
new_adapter_sum = torch.sum(torch.stack([torch.sum(p) for p in self.adapter_modules.parameters()]))
|
||||
|
||||
# Verify if the weights have changed
|
||||
assert orig_ip_proj_sum != new_ip_proj_sum, "Weights of image_proj_model did not change!"
|
||||
assert orig_adapter_sum != new_adapter_sum, "Weights of adapter_modules did not change!"
|
||||
|
||||
print(f"Successfully loaded weights from checkpoint {ckpt_path}")
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description="Simple example of a training script.")
|
||||
parser.add_argument(
|
||||
"--pretrained_model_name_or_path",
|
||||
type=str,
|
||||
default=None,
|
||||
required=True,
|
||||
help="Path to pretrained model or model identifier from huggingface.co/models.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--pretrained_ip_adapter_path",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to pretrained ip adapter model. If not specified weights are initialized randomly.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num_tokens",
|
||||
type=int,
|
||||
default=16,
|
||||
help="Number of tokens to query from the CLIP image encoding.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--data_json_file",
|
||||
type=str,
|
||||
default=None,
|
||||
required=True,
|
||||
help="Training data",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--data_root_path",
|
||||
type=str,
|
||||
default="",
|
||||
required=True,
|
||||
help="Training data root path",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--image_encoder_path",
|
||||
type=str,
|
||||
default=None,
|
||||
required=True,
|
||||
help="Path to CLIP image encoder",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output_dir",
|
||||
type=str,
|
||||
default="sd-ip_adapter",
|
||||
help="The output directory where the model predictions and checkpoints will be written.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--logging_dir",
|
||||
type=str,
|
||||
default="logs",
|
||||
help=(
|
||||
"[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
|
||||
" *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--resolution",
|
||||
type=int,
|
||||
default=512,
|
||||
help=("The resolution for input images"),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--learning_rate",
|
||||
type=float,
|
||||
default=1e-4,
|
||||
help="Learning rate to use.",
|
||||
)
|
||||
parser.add_argument("--weight_decay", type=float, default=1e-2, help="Weight decay to use.")
|
||||
parser.add_argument("--num_train_epochs", type=int, default=100)
|
||||
parser.add_argument(
|
||||
"--train_batch_size", type=int, default=8, help="Batch size (per device) for the training dataloader."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dataloader_num_workers",
|
||||
type=int,
|
||||
default=0,
|
||||
help=(
|
||||
"Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--save_steps",
|
||||
type=int,
|
||||
default=2000,
|
||||
help=("Save a checkpoint of the training state every X updates"),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--mixed_precision",
|
||||
type=str,
|
||||
default=None,
|
||||
choices=["no", "fp16", "bf16"],
|
||||
help=(
|
||||
"Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
|
||||
" 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the"
|
||||
" flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--report_to",
|
||||
type=str,
|
||||
default="tensorboard",
|
||||
help=(
|
||||
'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
|
||||
' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
|
||||
),
|
||||
)
|
||||
parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
|
||||
|
||||
args = parser.parse_args()
|
||||
env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
|
||||
if env_local_rank != -1 and env_local_rank != args.local_rank:
|
||||
args.local_rank = env_local_rank
|
||||
|
||||
return args
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
logging_dir = Path(args.output_dir, args.logging_dir)
|
||||
|
||||
accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
|
||||
|
||||
accelerator = Accelerator(
|
||||
mixed_precision=args.mixed_precision,
|
||||
log_with=args.report_to,
|
||||
project_config=accelerator_project_config,
|
||||
)
|
||||
|
||||
if accelerator.is_main_process:
|
||||
if args.output_dir is not None:
|
||||
os.makedirs(args.output_dir, exist_ok=True)
|
||||
|
||||
# Load scheduler, tokenizer and models.
|
||||
noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
|
||||
tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer")
|
||||
text_encoder = CLIPTextModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="text_encoder")
|
||||
vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae")
|
||||
unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet")
|
||||
image_encoder = CLIPVisionModelWithProjection.from_pretrained(args.image_encoder_path)
|
||||
# freeze parameters of models to save more memory
|
||||
unet.requires_grad_(False)
|
||||
vae.requires_grad_(False)
|
||||
text_encoder.requires_grad_(False)
|
||||
image_encoder.requires_grad_(False)
|
||||
|
||||
# ip-adapter-plus
|
||||
image_proj_model = Resampler(
|
||||
dim=unet.config.cross_attention_dim,
|
||||
depth=4,
|
||||
dim_head=64,
|
||||
heads=12,
|
||||
num_queries=args.num_tokens,
|
||||
embedding_dim=image_encoder.config.hidden_size,
|
||||
output_dim=unet.config.cross_attention_dim,
|
||||
ff_mult=4,
|
||||
)
|
||||
# init adapter modules
|
||||
attn_procs = {}
|
||||
unet_sd = unet.state_dict()
|
||||
for name in unet.attn_processors.keys():
|
||||
cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
|
||||
if name.startswith("mid_block"):
|
||||
hidden_size = unet.config.block_out_channels[-1]
|
||||
elif name.startswith("up_blocks"):
|
||||
block_id = int(name[len("up_blocks.")])
|
||||
hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
|
||||
elif name.startswith("down_blocks"):
|
||||
block_id = int(name[len("down_blocks.")])
|
||||
hidden_size = unet.config.block_out_channels[block_id]
|
||||
if cross_attention_dim is None:
|
||||
attn_procs[name] = AttnProcessor()
|
||||
else:
|
||||
layer_name = name.split(".processor")[0]
|
||||
weights = {
|
||||
"to_k_ip.weight": unet_sd[layer_name + ".to_k.weight"],
|
||||
"to_v_ip.weight": unet_sd[layer_name + ".to_v.weight"],
|
||||
}
|
||||
attn_procs[name] = IPAttnProcessor(
|
||||
hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, num_tokens=args.num_tokens
|
||||
)
|
||||
attn_procs[name].load_state_dict(weights)
|
||||
unet.set_attn_processor(attn_procs)
|
||||
adapter_modules = torch.nn.ModuleList(unet.attn_processors.values())
|
||||
|
||||
ip_adapter = IPAdapter(unet, image_proj_model, adapter_modules, args.pretrained_ip_adapter_path)
|
||||
|
||||
weight_dtype = torch.float32
|
||||
if accelerator.mixed_precision == "fp16":
|
||||
weight_dtype = torch.float16
|
||||
elif accelerator.mixed_precision == "bf16":
|
||||
weight_dtype = torch.bfloat16
|
||||
# unet.to(accelerator.device, dtype=weight_dtype)
|
||||
vae.to(accelerator.device, dtype=weight_dtype)
|
||||
text_encoder.to(accelerator.device, dtype=weight_dtype)
|
||||
image_encoder.to(accelerator.device, dtype=weight_dtype)
|
||||
|
||||
# optimizer
|
||||
params_to_opt = itertools.chain(ip_adapter.image_proj_model.parameters(), ip_adapter.adapter_modules.parameters())
|
||||
optimizer = torch.optim.AdamW(params_to_opt, lr=args.learning_rate, weight_decay=args.weight_decay)
|
||||
|
||||
# dataloader
|
||||
train_dataset = MyDataset(
|
||||
args.data_json_file, tokenizer=tokenizer, size=args.resolution, image_root_path=args.data_root_path
|
||||
)
|
||||
train_dataloader = torch.utils.data.DataLoader(
|
||||
train_dataset,
|
||||
shuffle=True,
|
||||
collate_fn=collate_fn,
|
||||
batch_size=args.train_batch_size,
|
||||
num_workers=args.dataloader_num_workers,
|
||||
)
|
||||
|
||||
# Prepare everything with our `accelerator`.
|
||||
ip_adapter, optimizer, train_dataloader = accelerator.prepare(ip_adapter, optimizer, train_dataloader)
|
||||
|
||||
global_step = 0
|
||||
for epoch in range(0, args.num_train_epochs):
|
||||
begin = time.perf_counter()
|
||||
for step, batch in enumerate(train_dataloader):
|
||||
load_data_time = time.perf_counter() - begin
|
||||
with accelerator.accumulate(ip_adapter):
|
||||
# Convert images to latent space
|
||||
with torch.no_grad():
|
||||
latents = vae.encode(
|
||||
batch["images"].to(accelerator.device, dtype=weight_dtype)
|
||||
).latent_dist.sample()
|
||||
latents = latents * vae.config.scaling_factor
|
||||
|
||||
# Sample noise that we'll add to the latents
|
||||
noise = torch.randn_like(latents)
|
||||
bsz = latents.shape[0]
|
||||
# Sample a random timestep for each image
|
||||
timesteps = torch.randint(0, noise_scheduler.num_train_timesteps, (bsz,), device=latents.device)
|
||||
timesteps = timesteps.long()
|
||||
|
||||
# Add noise to the latents according to the noise magnitude at each timestep
|
||||
# (this is the forward diffusion process)
|
||||
noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
|
||||
|
||||
clip_images = []
|
||||
for clip_image, drop_image_embed in zip(batch["clip_images"], batch["drop_image_embeds"]):
|
||||
if drop_image_embed == 1:
|
||||
clip_images.append(torch.zeros_like(clip_image))
|
||||
else:
|
||||
clip_images.append(clip_image)
|
||||
clip_images = torch.stack(clip_images, dim=0)
|
||||
with torch.no_grad():
|
||||
image_embeds = image_encoder(
|
||||
clip_images.to(accelerator.device, dtype=weight_dtype), output_hidden_states=True
|
||||
).hidden_states[-2]
|
||||
|
||||
with torch.no_grad():
|
||||
encoder_hidden_states = text_encoder(batch["text_input_ids"].to(accelerator.device))[0]
|
||||
|
||||
noise_pred = ip_adapter(noisy_latents, timesteps, encoder_hidden_states, image_embeds)
|
||||
|
||||
loss = F.mse_loss(noise_pred.float(), noise.float(), reduction="mean")
|
||||
|
||||
# Gather the losses across all processes for logging (if we use distributed training).
|
||||
avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean().item()
|
||||
|
||||
# Backpropagate
|
||||
accelerator.backward(loss)
|
||||
optimizer.step()
|
||||
optimizer.zero_grad()
|
||||
|
||||
if accelerator.is_main_process:
|
||||
print(
|
||||
"Epoch {}, step {}, data_time: {}, time: {}, step_loss: {}".format(
|
||||
epoch, step, load_data_time, time.perf_counter() - begin, avg_loss
|
||||
)
|
||||
)
|
||||
|
||||
global_step += 1
|
||||
|
||||
if global_step % args.save_steps == 0:
|
||||
save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
|
||||
accelerator.save_state(save_path)
|
||||
|
||||
begin = time.perf_counter()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,520 +0,0 @@
|
||||
import argparse
|
||||
import itertools
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from accelerate import Accelerator
|
||||
from accelerate.utils import ProjectConfiguration
|
||||
from ip_adapter.ip_adapter import ImageProjModel
|
||||
from ip_adapter.utils import is_torch2_available
|
||||
from PIL import Image
|
||||
from torchvision import transforms
|
||||
from transformers import (
|
||||
CLIPImageProcessor,
|
||||
CLIPTextModel,
|
||||
CLIPTextModelWithProjection,
|
||||
CLIPTokenizer,
|
||||
CLIPVisionModelWithProjection,
|
||||
)
|
||||
|
||||
from diffusers import AutoencoderKL, DDPMScheduler, UNet2DConditionModel
|
||||
|
||||
|
||||
if is_torch2_available():
|
||||
from ip_adapter.attention_processor import AttnProcessor2_0 as AttnProcessor
|
||||
from ip_adapter.attention_processor import IPAttnProcessor2_0 as IPAttnProcessor
|
||||
else:
|
||||
from ip_adapter.attention_processor import AttnProcessor, IPAttnProcessor
|
||||
|
||||
|
||||
# Dataset
|
||||
class MyDataset(torch.utils.data.Dataset):
|
||||
def __init__(
|
||||
self,
|
||||
json_file,
|
||||
tokenizer,
|
||||
tokenizer_2,
|
||||
size=1024,
|
||||
center_crop=True,
|
||||
t_drop_rate=0.05,
|
||||
i_drop_rate=0.05,
|
||||
ti_drop_rate=0.05,
|
||||
image_root_path="",
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
self.tokenizer = tokenizer
|
||||
self.tokenizer_2 = tokenizer_2
|
||||
self.size = size
|
||||
self.center_crop = center_crop
|
||||
self.i_drop_rate = i_drop_rate
|
||||
self.t_drop_rate = t_drop_rate
|
||||
self.ti_drop_rate = ti_drop_rate
|
||||
self.image_root_path = image_root_path
|
||||
|
||||
self.data = json.load(open(json_file)) # list of dict: [{"image_file": "1.png", "text": "A dog"}]
|
||||
|
||||
self.transform = transforms.Compose(
|
||||
[
|
||||
transforms.Resize(self.size, interpolation=transforms.InterpolationMode.BILINEAR),
|
||||
transforms.ToTensor(),
|
||||
transforms.Normalize([0.5], [0.5]),
|
||||
]
|
||||
)
|
||||
|
||||
self.clip_image_processor = CLIPImageProcessor()
|
||||
|
||||
def __getitem__(self, idx):
|
||||
item = self.data[idx]
|
||||
text = item["text"]
|
||||
image_file = item["image_file"]
|
||||
|
||||
# read image
|
||||
raw_image = Image.open(os.path.join(self.image_root_path, image_file))
|
||||
|
||||
# original size
|
||||
original_width, original_height = raw_image.size
|
||||
original_size = torch.tensor([original_height, original_width])
|
||||
|
||||
image_tensor = self.transform(raw_image.convert("RGB"))
|
||||
# random crop
|
||||
delta_h = image_tensor.shape[1] - self.size
|
||||
delta_w = image_tensor.shape[2] - self.size
|
||||
assert not all([delta_h, delta_w])
|
||||
|
||||
if self.center_crop:
|
||||
top = delta_h // 2
|
||||
left = delta_w // 2
|
||||
else:
|
||||
top = np.random.randint(0, delta_h + 1)
|
||||
left = np.random.randint(0, delta_w + 1)
|
||||
image = transforms.functional.crop(image_tensor, top=top, left=left, height=self.size, width=self.size)
|
||||
crop_coords_top_left = torch.tensor([top, left])
|
||||
|
||||
clip_image = self.clip_image_processor(images=raw_image, return_tensors="pt").pixel_values
|
||||
|
||||
# drop
|
||||
drop_image_embed = 0
|
||||
rand_num = random.random()
|
||||
if rand_num < self.i_drop_rate:
|
||||
drop_image_embed = 1
|
||||
elif rand_num < (self.i_drop_rate + self.t_drop_rate):
|
||||
text = ""
|
||||
elif rand_num < (self.i_drop_rate + self.t_drop_rate + self.ti_drop_rate):
|
||||
text = ""
|
||||
drop_image_embed = 1
|
||||
|
||||
# get text and tokenize
|
||||
text_input_ids = self.tokenizer(
|
||||
text,
|
||||
max_length=self.tokenizer.model_max_length,
|
||||
padding="max_length",
|
||||
truncation=True,
|
||||
return_tensors="pt",
|
||||
).input_ids
|
||||
|
||||
text_input_ids_2 = self.tokenizer_2(
|
||||
text,
|
||||
max_length=self.tokenizer_2.model_max_length,
|
||||
padding="max_length",
|
||||
truncation=True,
|
||||
return_tensors="pt",
|
||||
).input_ids
|
||||
|
||||
return {
|
||||
"image": image,
|
||||
"text_input_ids": text_input_ids,
|
||||
"text_input_ids_2": text_input_ids_2,
|
||||
"clip_image": clip_image,
|
||||
"drop_image_embed": drop_image_embed,
|
||||
"original_size": original_size,
|
||||
"crop_coords_top_left": crop_coords_top_left,
|
||||
"target_size": torch.tensor([self.size, self.size]),
|
||||
}
|
||||
|
||||
def __len__(self):
|
||||
return len(self.data)
|
||||
|
||||
|
||||
def collate_fn(data):
|
||||
images = torch.stack([example["image"] for example in data])
|
||||
text_input_ids = torch.cat([example["text_input_ids"] for example in data], dim=0)
|
||||
text_input_ids_2 = torch.cat([example["text_input_ids_2"] for example in data], dim=0)
|
||||
clip_images = torch.cat([example["clip_image"] for example in data], dim=0)
|
||||
drop_image_embeds = [example["drop_image_embed"] for example in data]
|
||||
original_size = torch.stack([example["original_size"] for example in data])
|
||||
crop_coords_top_left = torch.stack([example["crop_coords_top_left"] for example in data])
|
||||
target_size = torch.stack([example["target_size"] for example in data])
|
||||
|
||||
return {
|
||||
"images": images,
|
||||
"text_input_ids": text_input_ids,
|
||||
"text_input_ids_2": text_input_ids_2,
|
||||
"clip_images": clip_images,
|
||||
"drop_image_embeds": drop_image_embeds,
|
||||
"original_size": original_size,
|
||||
"crop_coords_top_left": crop_coords_top_left,
|
||||
"target_size": target_size,
|
||||
}
|
||||
|
||||
|
||||
class IPAdapter(torch.nn.Module):
|
||||
"""IP-Adapter"""
|
||||
|
||||
def __init__(self, unet, image_proj_model, adapter_modules, ckpt_path=None):
|
||||
super().__init__()
|
||||
self.unet = unet
|
||||
self.image_proj_model = image_proj_model
|
||||
self.adapter_modules = adapter_modules
|
||||
|
||||
if ckpt_path is not None:
|
||||
self.load_from_checkpoint(ckpt_path)
|
||||
|
||||
def forward(self, noisy_latents, timesteps, encoder_hidden_states, unet_added_cond_kwargs, image_embeds):
|
||||
ip_tokens = self.image_proj_model(image_embeds)
|
||||
encoder_hidden_states = torch.cat([encoder_hidden_states, ip_tokens], dim=1)
|
||||
# Predict the noise residual
|
||||
noise_pred = self.unet(
|
||||
noisy_latents, timesteps, encoder_hidden_states, added_cond_kwargs=unet_added_cond_kwargs
|
||||
).sample
|
||||
return noise_pred
|
||||
|
||||
def load_from_checkpoint(self, ckpt_path: str):
|
||||
# Calculate original checksums
|
||||
orig_ip_proj_sum = torch.sum(torch.stack([torch.sum(p) for p in self.image_proj_model.parameters()]))
|
||||
orig_adapter_sum = torch.sum(torch.stack([torch.sum(p) for p in self.adapter_modules.parameters()]))
|
||||
|
||||
state_dict = torch.load(ckpt_path, map_location="cpu")
|
||||
|
||||
# Load state dict for image_proj_model and adapter_modules
|
||||
self.image_proj_model.load_state_dict(state_dict["image_proj"], strict=True)
|
||||
self.adapter_modules.load_state_dict(state_dict["ip_adapter"], strict=True)
|
||||
|
||||
# Calculate new checksums
|
||||
new_ip_proj_sum = torch.sum(torch.stack([torch.sum(p) for p in self.image_proj_model.parameters()]))
|
||||
new_adapter_sum = torch.sum(torch.stack([torch.sum(p) for p in self.adapter_modules.parameters()]))
|
||||
|
||||
# Verify if the weights have changed
|
||||
assert orig_ip_proj_sum != new_ip_proj_sum, "Weights of image_proj_model did not change!"
|
||||
assert orig_adapter_sum != new_adapter_sum, "Weights of adapter_modules did not change!"
|
||||
|
||||
print(f"Successfully loaded weights from checkpoint {ckpt_path}")
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description="Simple example of a training script.")
|
||||
parser.add_argument(
|
||||
"--pretrained_model_name_or_path",
|
||||
type=str,
|
||||
default=None,
|
||||
required=True,
|
||||
help="Path to pretrained model or model identifier from huggingface.co/models.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--pretrained_ip_adapter_path",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to pretrained ip adapter model. If not specified weights are initialized randomly.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--data_json_file",
|
||||
type=str,
|
||||
default=None,
|
||||
required=True,
|
||||
help="Training data",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--data_root_path",
|
||||
type=str,
|
||||
default="",
|
||||
required=True,
|
||||
help="Training data root path",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--image_encoder_path",
|
||||
type=str,
|
||||
default=None,
|
||||
required=True,
|
||||
help="Path to CLIP image encoder",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output_dir",
|
||||
type=str,
|
||||
default="sd-ip_adapter",
|
||||
help="The output directory where the model predictions and checkpoints will be written.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--logging_dir",
|
||||
type=str,
|
||||
default="logs",
|
||||
help=(
|
||||
"[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
|
||||
" *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--resolution",
|
||||
type=int,
|
||||
default=512,
|
||||
help=("The resolution for input images"),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--learning_rate",
|
||||
type=float,
|
||||
default=1e-4,
|
||||
help="Learning rate to use.",
|
||||
)
|
||||
parser.add_argument("--weight_decay", type=float, default=1e-2, help="Weight decay to use.")
|
||||
parser.add_argument("--num_train_epochs", type=int, default=100)
|
||||
parser.add_argument(
|
||||
"--train_batch_size", type=int, default=8, help="Batch size (per device) for the training dataloader."
|
||||
)
|
||||
parser.add_argument("--noise_offset", type=float, default=None, help="noise offset")
|
||||
parser.add_argument(
|
||||
"--dataloader_num_workers",
|
||||
type=int,
|
||||
default=0,
|
||||
help=(
|
||||
"Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--save_steps",
|
||||
type=int,
|
||||
default=2000,
|
||||
help=("Save a checkpoint of the training state every X updates"),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--mixed_precision",
|
||||
type=str,
|
||||
default=None,
|
||||
choices=["no", "fp16", "bf16"],
|
||||
help=(
|
||||
"Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
|
||||
" 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the"
|
||||
" flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--report_to",
|
||||
type=str,
|
||||
default="tensorboard",
|
||||
help=(
|
||||
'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
|
||||
' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
|
||||
),
|
||||
)
|
||||
parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
|
||||
|
||||
args = parser.parse_args()
|
||||
env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
|
||||
if env_local_rank != -1 and env_local_rank != args.local_rank:
|
||||
args.local_rank = env_local_rank
|
||||
|
||||
return args
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
logging_dir = Path(args.output_dir, args.logging_dir)
|
||||
|
||||
accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
|
||||
|
||||
accelerator = Accelerator(
|
||||
mixed_precision=args.mixed_precision,
|
||||
log_with=args.report_to,
|
||||
project_config=accelerator_project_config,
|
||||
)
|
||||
|
||||
if accelerator.is_main_process:
|
||||
if args.output_dir is not None:
|
||||
os.makedirs(args.output_dir, exist_ok=True)
|
||||
|
||||
# Load scheduler, tokenizer and models.
|
||||
noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
|
||||
tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer")
|
||||
text_encoder = CLIPTextModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="text_encoder")
|
||||
tokenizer_2 = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer_2")
|
||||
text_encoder_2 = CLIPTextModelWithProjection.from_pretrained(
|
||||
args.pretrained_model_name_or_path, subfolder="text_encoder_2"
|
||||
)
|
||||
vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae")
|
||||
unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet")
|
||||
image_encoder = CLIPVisionModelWithProjection.from_pretrained(args.image_encoder_path)
|
||||
# freeze parameters of models to save more memory
|
||||
unet.requires_grad_(False)
|
||||
vae.requires_grad_(False)
|
||||
text_encoder.requires_grad_(False)
|
||||
text_encoder_2.requires_grad_(False)
|
||||
image_encoder.requires_grad_(False)
|
||||
|
||||
# ip-adapter
|
||||
num_tokens = 4
|
||||
image_proj_model = ImageProjModel(
|
||||
cross_attention_dim=unet.config.cross_attention_dim,
|
||||
clip_embeddings_dim=image_encoder.config.projection_dim,
|
||||
clip_extra_context_tokens=num_tokens,
|
||||
)
|
||||
# init adapter modules
|
||||
attn_procs = {}
|
||||
unet_sd = unet.state_dict()
|
||||
for name in unet.attn_processors.keys():
|
||||
cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
|
||||
if name.startswith("mid_block"):
|
||||
hidden_size = unet.config.block_out_channels[-1]
|
||||
elif name.startswith("up_blocks"):
|
||||
block_id = int(name[len("up_blocks.")])
|
||||
hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
|
||||
elif name.startswith("down_blocks"):
|
||||
block_id = int(name[len("down_blocks.")])
|
||||
hidden_size = unet.config.block_out_channels[block_id]
|
||||
if cross_attention_dim is None:
|
||||
attn_procs[name] = AttnProcessor()
|
||||
else:
|
||||
layer_name = name.split(".processor")[0]
|
||||
weights = {
|
||||
"to_k_ip.weight": unet_sd[layer_name + ".to_k.weight"],
|
||||
"to_v_ip.weight": unet_sd[layer_name + ".to_v.weight"],
|
||||
}
|
||||
attn_procs[name] = IPAttnProcessor(
|
||||
hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, num_tokens=num_tokens
|
||||
)
|
||||
attn_procs[name].load_state_dict(weights)
|
||||
unet.set_attn_processor(attn_procs)
|
||||
adapter_modules = torch.nn.ModuleList(unet.attn_processors.values())
|
||||
|
||||
ip_adapter = IPAdapter(unet, image_proj_model, adapter_modules, args.pretrained_ip_adapter_path)
|
||||
|
||||
weight_dtype = torch.float32
|
||||
if accelerator.mixed_precision == "fp16":
|
||||
weight_dtype = torch.float16
|
||||
elif accelerator.mixed_precision == "bf16":
|
||||
weight_dtype = torch.bfloat16
|
||||
# unet.to(accelerator.device, dtype=weight_dtype)
|
||||
vae.to(accelerator.device) # use fp32
|
||||
text_encoder.to(accelerator.device, dtype=weight_dtype)
|
||||
text_encoder_2.to(accelerator.device, dtype=weight_dtype)
|
||||
image_encoder.to(accelerator.device, dtype=weight_dtype)
|
||||
|
||||
# optimizer
|
||||
params_to_opt = itertools.chain(ip_adapter.image_proj_model.parameters(), ip_adapter.adapter_modules.parameters())
|
||||
optimizer = torch.optim.AdamW(params_to_opt, lr=args.learning_rate, weight_decay=args.weight_decay)
|
||||
|
||||
# dataloader
|
||||
train_dataset = MyDataset(
|
||||
args.data_json_file,
|
||||
tokenizer=tokenizer,
|
||||
tokenizer_2=tokenizer_2,
|
||||
size=args.resolution,
|
||||
image_root_path=args.data_root_path,
|
||||
)
|
||||
train_dataloader = torch.utils.data.DataLoader(
|
||||
train_dataset,
|
||||
shuffle=True,
|
||||
collate_fn=collate_fn,
|
||||
batch_size=args.train_batch_size,
|
||||
num_workers=args.dataloader_num_workers,
|
||||
)
|
||||
|
||||
# Prepare everything with our `accelerator`.
|
||||
ip_adapter, optimizer, train_dataloader = accelerator.prepare(ip_adapter, optimizer, train_dataloader)
|
||||
|
||||
global_step = 0
|
||||
for epoch in range(0, args.num_train_epochs):
|
||||
begin = time.perf_counter()
|
||||
for step, batch in enumerate(train_dataloader):
|
||||
load_data_time = time.perf_counter() - begin
|
||||
with accelerator.accumulate(ip_adapter):
|
||||
# Convert images to latent space
|
||||
with torch.no_grad():
|
||||
# vae of sdxl should use fp32
|
||||
latents = vae.encode(batch["images"].to(accelerator.device, dtype=vae.dtype)).latent_dist.sample()
|
||||
latents = latents * vae.config.scaling_factor
|
||||
latents = latents.to(accelerator.device, dtype=weight_dtype)
|
||||
|
||||
# Sample noise that we'll add to the latents
|
||||
noise = torch.randn_like(latents)
|
||||
if args.noise_offset:
|
||||
# https://www.crosslabs.org//blog/diffusion-with-offset-noise
|
||||
noise += args.noise_offset * torch.randn((latents.shape[0], latents.shape[1], 1, 1)).to(
|
||||
accelerator.device, dtype=weight_dtype
|
||||
)
|
||||
|
||||
bsz = latents.shape[0]
|
||||
# Sample a random timestep for each image
|
||||
timesteps = torch.randint(0, noise_scheduler.num_train_timesteps, (bsz,), device=latents.device)
|
||||
timesteps = timesteps.long()
|
||||
|
||||
# Add noise to the latents according to the noise magnitude at each timestep
|
||||
# (this is the forward diffusion process)
|
||||
noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
|
||||
|
||||
with torch.no_grad():
|
||||
image_embeds = image_encoder(
|
||||
batch["clip_images"].to(accelerator.device, dtype=weight_dtype)
|
||||
).image_embeds
|
||||
image_embeds_ = []
|
||||
for image_embed, drop_image_embed in zip(image_embeds, batch["drop_image_embeds"]):
|
||||
if drop_image_embed == 1:
|
||||
image_embeds_.append(torch.zeros_like(image_embed))
|
||||
else:
|
||||
image_embeds_.append(image_embed)
|
||||
image_embeds = torch.stack(image_embeds_)
|
||||
|
||||
with torch.no_grad():
|
||||
encoder_output = text_encoder(
|
||||
batch["text_input_ids"].to(accelerator.device), output_hidden_states=True
|
||||
)
|
||||
text_embeds = encoder_output.hidden_states[-2]
|
||||
encoder_output_2 = text_encoder_2(
|
||||
batch["text_input_ids_2"].to(accelerator.device), output_hidden_states=True
|
||||
)
|
||||
pooled_text_embeds = encoder_output_2[0]
|
||||
text_embeds_2 = encoder_output_2.hidden_states[-2]
|
||||
text_embeds = torch.concat([text_embeds, text_embeds_2], dim=-1) # concat
|
||||
|
||||
# add cond
|
||||
add_time_ids = [
|
||||
batch["original_size"].to(accelerator.device),
|
||||
batch["crop_coords_top_left"].to(accelerator.device),
|
||||
batch["target_size"].to(accelerator.device),
|
||||
]
|
||||
add_time_ids = torch.cat(add_time_ids, dim=1).to(accelerator.device, dtype=weight_dtype)
|
||||
unet_added_cond_kwargs = {"text_embeds": pooled_text_embeds, "time_ids": add_time_ids}
|
||||
|
||||
noise_pred = ip_adapter(noisy_latents, timesteps, text_embeds, unet_added_cond_kwargs, image_embeds)
|
||||
|
||||
loss = F.mse_loss(noise_pred.float(), noise.float(), reduction="mean")
|
||||
|
||||
# Gather the losses across all processes for logging (if we use distributed training).
|
||||
avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean().item()
|
||||
|
||||
# Backpropagate
|
||||
accelerator.backward(loss)
|
||||
optimizer.step()
|
||||
optimizer.zero_grad()
|
||||
|
||||
if accelerator.is_main_process:
|
||||
print(
|
||||
"Epoch {}, step {}, data_time: {}, time: {}, step_loss: {}".format(
|
||||
epoch, step, load_data_time, time.perf_counter() - begin, avg_loss
|
||||
)
|
||||
)
|
||||
|
||||
global_step += 1
|
||||
|
||||
if global_step % args.save_steps == 0:
|
||||
save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
|
||||
accelerator.save_state(save_path)
|
||||
|
||||
begin = time.perf_counter()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -7,14 +7,13 @@ It has been tested on v4 and v5p TPU versions. Training code has been tested on
|
||||
This script implements Distributed Data Parallel using GSPMD feature in XLA compiler
|
||||
where we shard the input batches over the TPU devices.
|
||||
|
||||
As of 10-31-2024, these are some expected step times.
|
||||
As of 9-11-2024, these are some expected step times.
|
||||
|
||||
| accelerator | global batch size | step time (seconds) |
|
||||
| ----------- | ----------------- | --------- |
|
||||
| v5p-512 | 16384 | 1.01 |
|
||||
| v5p-256 | 8192 | 1.01 |
|
||||
| v5p-128 | 4096 | 1.0 |
|
||||
| v5p-64 | 2048 | 1.01 |
|
||||
| v5p-128 | 1024 | 0.245 |
|
||||
| v5p-256 | 2048 | 0.234 |
|
||||
| v5p-512 | 4096 | 0.2498 |
|
||||
|
||||
## Create TPU
|
||||
|
||||
@@ -44,9 +43,8 @@ Install PyTorch and PyTorch/XLA nightly versions:
|
||||
gcloud compute tpus tpu-vm ssh ${TPU_NAME} \
|
||||
--project=${PROJECT_ID} --zone=${ZONE} --worker=all \
|
||||
--command='
|
||||
pip3 install --pre torch==2.6.0.dev20241031+cpu torchvision --index-url https://download.pytorch.org/whl/nightly/cpu
|
||||
pip3 install "torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241031.cxx11-cp310-cp310-linux_x86_64.whl" -f https://storage.googleapis.com/libtpu-releases/index.html
|
||||
pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
|
||||
pip3 install --pre torch==2.5.0.dev20240905+cpu torchvision==0.20.0.dev20240905+cpu --index-url https://download.pytorch.org/whl/nightly/cpu
|
||||
pip3 install "torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.5.0.dev20240905-cp310-cp310-linux_x86_64.whl" -f https://storage.googleapis.com/libtpu-releases/index.html
|
||||
'
|
||||
```
|
||||
|
||||
@@ -90,18 +88,17 @@ are fixed.
|
||||
gcloud compute tpus tpu-vm ssh ${TPU_NAME} \
|
||||
--project=${PROJECT_ID} --zone=${ZONE} --worker=all \
|
||||
--command='
|
||||
export XLA_DISABLE_FUNCTIONALIZATION=0
|
||||
export XLA_DISABLE_FUNCTIONALIZATION=1
|
||||
export PROFILE_DIR=/tmp/
|
||||
export CACHE_DIR=/tmp/
|
||||
export DATASET_NAME=lambdalabs/naruto-blip-captions
|
||||
export PER_HOST_BATCH_SIZE=32 # This is known to work on TPU v4. Can set this to 64 for TPU v5p
|
||||
export TRAIN_STEPS=50
|
||||
export OUTPUT_DIR=/tmp/trained-model/
|
||||
python diffusers/examples/research_projects/pytorch_xla/train_text_to_image_xla.py --pretrained_model_name_or_path=stabilityai/stable-diffusion-2-base --dataset_name=$DATASET_NAME --resolution=512 --center_crop --random_flip --train_batch_size=$PER_HOST_BATCH_SIZE --max_train_steps=$TRAIN_STEPS --learning_rate=1e-06 --mixed_precision=bf16 --profile_duration=80000 --output_dir=$OUTPUT_DIR --dataloader_num_workers=8 --loader_prefetch_size=4 --device_prefetch_size=4'
|
||||
python diffusers/examples/research_projects/pytorch_xla/train_text_to_image_xla.py --pretrained_model_name_or_path=stabilityai/stable-diffusion-2-base --dataset_name=$DATASET_NAME --resolution=512 --center_crop --random_flip --train_batch_size=$PER_HOST_BATCH_SIZE --max_train_steps=$TRAIN_STEPS --learning_rate=1e-06 --mixed_precision=bf16 --profile_duration=80000 --output_dir=$OUTPUT_DIR --dataloader_num_workers=4 --loader_prefetch_size=4 --device_prefetch_size=4'
|
||||
|
||||
```
|
||||
|
||||
Pass `--print_loss` if you would like to see the loss printed at every step. Be aware that printing the loss at every step disrupts the optimized flow execution, thus the step time will be longer.
|
||||
|
||||
### Environment Envs Explained
|
||||
|
||||
* `XLA_DISABLE_FUNCTIONALIZATION`: To optimize the performance for AdamW optimizer.
|
||||
|
||||
@@ -140,43 +140,33 @@ class TrainSD:
|
||||
self.optimizer.step()
|
||||
|
||||
def start_training(self):
|
||||
dataloader_exception = False
|
||||
measure_start_step = args.measure_start_step
|
||||
assert measure_start_step < self.args.max_train_steps
|
||||
total_time = 0
|
||||
for step in range(0, self.args.max_train_steps):
|
||||
times = []
|
||||
last_time = time.time()
|
||||
step = 0
|
||||
while True:
|
||||
if self.global_step >= self.args.max_train_steps:
|
||||
xm.mark_step()
|
||||
break
|
||||
if step == 4 and PROFILE_DIR is not None:
|
||||
xm.wait_device_ops()
|
||||
xp.trace_detached(f"localhost:{PORT}", PROFILE_DIR, duration_ms=args.profile_duration)
|
||||
try:
|
||||
batch = next(self.dataloader)
|
||||
except Exception as e:
|
||||
dataloader_exception = True
|
||||
print(e)
|
||||
break
|
||||
if step == measure_start_step and PROFILE_DIR is not None:
|
||||
xm.wait_device_ops()
|
||||
xp.trace_detached(f"localhost:{PORT}", PROFILE_DIR, duration_ms=args.profile_duration)
|
||||
last_time = time.time()
|
||||
loss = self.step_fn(batch["pixel_values"], batch["input_ids"])
|
||||
step_time = time.time() - last_time
|
||||
if step >= 10:
|
||||
times.append(step_time)
|
||||
print(f"step: {step}, step_time: {step_time}")
|
||||
if step % 5 == 0:
|
||||
print(f"step: {step}, loss: {loss}")
|
||||
last_time = time.time()
|
||||
self.global_step += 1
|
||||
|
||||
def print_loss_closure(step, loss):
|
||||
print(f"Step: {step}, Loss: {loss}")
|
||||
|
||||
if args.print_loss:
|
||||
xm.add_step_closure(
|
||||
print_loss_closure,
|
||||
args=(
|
||||
self.global_step,
|
||||
loss,
|
||||
),
|
||||
)
|
||||
xm.mark_step()
|
||||
if not dataloader_exception:
|
||||
xm.wait_device_ops()
|
||||
total_time = time.time() - last_time
|
||||
print(f"Average step time: {total_time/(self.args.max_train_steps-measure_start_step)}")
|
||||
else:
|
||||
print("dataloader exception happen, skip result")
|
||||
return
|
||||
step += 1
|
||||
# print(f"Average step time: {sum(times)/len(times)}")
|
||||
xm.wait_device_ops()
|
||||
|
||||
def step_fn(
|
||||
self,
|
||||
@@ -190,10 +180,7 @@ class TrainSD:
|
||||
noise = torch.randn_like(latents).to(self.device, dtype=self.weight_dtype)
|
||||
bsz = latents.shape[0]
|
||||
timesteps = torch.randint(
|
||||
0,
|
||||
self.noise_scheduler.config.num_train_timesteps,
|
||||
(bsz,),
|
||||
device=latents.device,
|
||||
0, self.noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device
|
||||
)
|
||||
timesteps = timesteps.long()
|
||||
|
||||
@@ -237,6 +224,9 @@ class TrainSD:
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description="Simple example of a training script.")
|
||||
parser.add_argument(
|
||||
"--input_perturbation", type=float, default=0, help="The scale of input perturbation. Recommended 0.1."
|
||||
)
|
||||
parser.add_argument("--profile_duration", type=int, default=10000, help="Profile duration in ms")
|
||||
parser.add_argument(
|
||||
"--pretrained_model_name_or_path",
|
||||
@@ -268,6 +258,12 @@ def parse_args():
|
||||
" or to a folder containing files that 🤗 Datasets can understand."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dataset_config_name",
|
||||
type=str,
|
||||
default=None,
|
||||
help="The config of the Dataset, leave as None if there's only one config.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--train_data_dir",
|
||||
type=str,
|
||||
@@ -287,6 +283,15 @@ def parse_args():
|
||||
default="text",
|
||||
help="The column of the dataset containing a caption or a list of captions.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_train_samples",
|
||||
type=int,
|
||||
default=None,
|
||||
help=(
|
||||
"For debugging purposes or quicker training, truncate the number of training examples to this "
|
||||
"value if set."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output_dir",
|
||||
type=str,
|
||||
@@ -299,6 +304,7 @@ def parse_args():
|
||||
default=None,
|
||||
help="The directory where the downloaded models and datasets will be stored.",
|
||||
)
|
||||
parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
|
||||
parser.add_argument(
|
||||
"--resolution",
|
||||
type=int,
|
||||
@@ -368,19 +374,12 @@ def parse_args():
|
||||
default=1,
|
||||
help=("Number of subprocesses to use for data loading to cpu."),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--loader_prefetch_factor",
|
||||
type=int,
|
||||
default=2,
|
||||
help=("Number of batches loaded in advance by each worker."),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--device_prefetch_size",
|
||||
type=int,
|
||||
default=1,
|
||||
help=("Number of subprocesses to use for data loading to tpu from cpu. "),
|
||||
)
|
||||
parser.add_argument("--measure_start_step", type=int, default=10, help="Step to start profiling.")
|
||||
parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
|
||||
parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
|
||||
parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
|
||||
@@ -395,8 +394,12 @@ def parse_args():
|
||||
"--mixed_precision",
|
||||
type=str,
|
||||
default=None,
|
||||
choices=["no", "bf16"],
|
||||
help=("Whether to use mixed precision. Bf16 requires PyTorch >= 1.10"),
|
||||
choices=["no", "fp16", "bf16"],
|
||||
help=(
|
||||
"Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
|
||||
" 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the"
|
||||
" flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
|
||||
),
|
||||
)
|
||||
parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
|
||||
parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
|
||||
@@ -406,12 +409,6 @@ def parse_args():
|
||||
default=None,
|
||||
help="The name of the repository to keep in sync with the local `output_dir`.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--print_loss",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help=("Print loss at every step."),
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -439,6 +436,7 @@ def load_dataset(args):
|
||||
# Downloading and loading a dataset from the hub.
|
||||
dataset = datasets.load_dataset(
|
||||
args.dataset_name,
|
||||
args.dataset_config_name,
|
||||
cache_dir=args.cache_dir,
|
||||
data_dir=args.train_data_dir,
|
||||
)
|
||||
@@ -483,7 +481,9 @@ def main(args):
|
||||
_ = xp.start_server(PORT)
|
||||
|
||||
num_devices = xr.global_runtime_device_count()
|
||||
mesh = xs.get_1d_mesh("data")
|
||||
device_ids = np.arange(num_devices)
|
||||
mesh_shape = (num_devices, 1)
|
||||
mesh = xs.Mesh(device_ids, mesh_shape, ("x", "y"))
|
||||
xs.set_global_mesh(mesh)
|
||||
|
||||
text_encoder = CLIPTextModel.from_pretrained(
|
||||
@@ -520,7 +520,6 @@ def main(args):
|
||||
from torch_xla.distributed.fsdp.utils import apply_xla_patch_to_nn_linear
|
||||
|
||||
unet = apply_xla_patch_to_nn_linear(unet, xs.xla_patched_nn_linear_forward)
|
||||
unet.enable_xla_flash_attention(partition_spec=("data", None, None, None))
|
||||
|
||||
vae.requires_grad_(False)
|
||||
text_encoder.requires_grad_(False)
|
||||
@@ -531,12 +530,15 @@ def main(args):
|
||||
# as these weights are only used for inference, keeping weights in full
|
||||
# precision is not required.
|
||||
weight_dtype = torch.float32
|
||||
if args.mixed_precision == "bf16":
|
||||
if args.mixed_precision == "fp16":
|
||||
weight_dtype = torch.float16
|
||||
elif args.mixed_precision == "bf16":
|
||||
weight_dtype = torch.bfloat16
|
||||
|
||||
device = xm.xla_device()
|
||||
print("device: ", device)
|
||||
print("weight_dtype: ", weight_dtype)
|
||||
|
||||
# Move text_encode and vae to device and cast to weight_dtype
|
||||
text_encoder = text_encoder.to(device, dtype=weight_dtype)
|
||||
vae = vae.to(device, dtype=weight_dtype)
|
||||
unet = unet.to(device, dtype=weight_dtype)
|
||||
@@ -604,27 +606,24 @@ def main(args):
|
||||
collate_fn=collate_fn,
|
||||
num_workers=args.dataloader_num_workers,
|
||||
batch_size=args.train_batch_size,
|
||||
prefetch_factor=args.loader_prefetch_factor,
|
||||
)
|
||||
|
||||
train_dataloader = pl.MpDeviceLoader(
|
||||
train_dataloader,
|
||||
device,
|
||||
input_sharding={
|
||||
"pixel_values": xs.ShardingSpec(mesh, ("data", None, None, None), minibatch=True),
|
||||
"input_ids": xs.ShardingSpec(mesh, ("data", None), minibatch=True),
|
||||
"pixel_values": xs.ShardingSpec(mesh, ("x", None, None, None), minibatch=True),
|
||||
"input_ids": xs.ShardingSpec(mesh, ("x", None), minibatch=True),
|
||||
},
|
||||
loader_prefetch_size=args.loader_prefetch_size,
|
||||
device_prefetch_size=args.device_prefetch_size,
|
||||
)
|
||||
|
||||
num_hosts = xr.process_count()
|
||||
num_devices_per_host = num_devices // num_hosts
|
||||
if xm.is_master_ordinal():
|
||||
print("***** Running training *****")
|
||||
print(f"Instantaneous batch size per device = {args.train_batch_size // num_devices_per_host }")
|
||||
print(f"Instantaneous batch size per device = {args.train_batch_size}")
|
||||
print(
|
||||
f"Total train batch size (w. parallel, distributed & accumulation) = {args.train_batch_size * num_hosts}"
|
||||
f"Total train batch size (w. parallel, distributed & accumulation) = {args.train_batch_size * num_devices}"
|
||||
)
|
||||
print(f" Total optimization steps = {args.max_train_steps}")
|
||||
|
||||
|
||||
@@ -483,6 +483,7 @@ def parse_args(input_args=None):
|
||||
# Sanity checks
|
||||
if args.dataset_name is None and args.train_data_dir is None:
|
||||
raise ValueError("Need either a dataset name or a training folder.")
|
||||
|
||||
if args.proportion_empty_prompts < 0 or args.proportion_empty_prompts > 1:
|
||||
raise ValueError("`--proportion_empty_prompts` must be in the range [0, 1].")
|
||||
|
||||
@@ -823,7 +824,9 @@ def main(args):
|
||||
if args.dataset_name is not None:
|
||||
# Downloading and loading a dataset from the hub.
|
||||
dataset = load_dataset(
|
||||
args.dataset_name, args.dataset_config_name, cache_dir=args.cache_dir, data_dir=args.train_data_dir
|
||||
args.dataset_name,
|
||||
args.dataset_config_name,
|
||||
cache_dir=args.cache_dir,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -36,7 +36,7 @@ from diffusers.loaders.single_file_utils import convert_ldm_vae_checkpoint
|
||||
from diffusers.utils.import_utils import is_accelerate_available
|
||||
|
||||
|
||||
CTX = init_empty_weights if is_accelerate_available() else nullcontext
|
||||
CTX = init_empty_weights if is_accelerate_available else nullcontext
|
||||
|
||||
TOKENIZER_MAX_LENGTH = 224
|
||||
|
||||
|
||||
@@ -1,323 +0,0 @@
|
||||
import argparse
|
||||
from typing import Any, Dict
|
||||
|
||||
import torch
|
||||
from huggingface_hub import hf_hub_download
|
||||
from safetensors.torch import load_file
|
||||
|
||||
from diffusers import AutoencoderDC
|
||||
|
||||
|
||||
def remap_qkv_(key: str, state_dict: Dict[str, Any]):
|
||||
qkv = state_dict.pop(key)
|
||||
q, k, v = torch.chunk(qkv, 3, dim=0)
|
||||
parent_module, _, _ = key.rpartition(".qkv.conv.weight")
|
||||
state_dict[f"{parent_module}.to_q.weight"] = q.squeeze()
|
||||
state_dict[f"{parent_module}.to_k.weight"] = k.squeeze()
|
||||
state_dict[f"{parent_module}.to_v.weight"] = v.squeeze()
|
||||
|
||||
|
||||
def remap_proj_conv_(key: str, state_dict: Dict[str, Any]):
|
||||
parent_module, _, _ = key.rpartition(".proj.conv.weight")
|
||||
state_dict[f"{parent_module}.to_out.weight"] = state_dict.pop(key).squeeze()
|
||||
|
||||
|
||||
AE_KEYS_RENAME_DICT = {
|
||||
# common
|
||||
"main.": "",
|
||||
"op_list.": "",
|
||||
"context_module": "attn",
|
||||
"local_module": "conv_out",
|
||||
# NOTE: The below two lines work because scales in the available configs only have a tuple length of 1
|
||||
# If there were more scales, there would be more layers, so a loop would be better to handle this
|
||||
"aggreg.0.0": "to_qkv_multiscale.0.proj_in",
|
||||
"aggreg.0.1": "to_qkv_multiscale.0.proj_out",
|
||||
"depth_conv.conv": "conv_depth",
|
||||
"inverted_conv.conv": "conv_inverted",
|
||||
"point_conv.conv": "conv_point",
|
||||
"point_conv.norm": "norm",
|
||||
"conv.conv.": "conv.",
|
||||
"conv1.conv": "conv1",
|
||||
"conv2.conv": "conv2",
|
||||
"conv2.norm": "norm",
|
||||
"proj.norm": "norm_out",
|
||||
# encoder
|
||||
"encoder.project_in.conv": "encoder.conv_in",
|
||||
"encoder.project_out.0.conv": "encoder.conv_out",
|
||||
"encoder.stages": "encoder.down_blocks",
|
||||
# decoder
|
||||
"decoder.project_in.conv": "decoder.conv_in",
|
||||
"decoder.project_out.0": "decoder.norm_out",
|
||||
"decoder.project_out.2.conv": "decoder.conv_out",
|
||||
"decoder.stages": "decoder.up_blocks",
|
||||
}
|
||||
|
||||
AE_F32C32_KEYS = {
|
||||
# encoder
|
||||
"encoder.project_in.conv": "encoder.conv_in.conv",
|
||||
# decoder
|
||||
"decoder.project_out.2.conv": "decoder.conv_out.conv",
|
||||
}
|
||||
|
||||
AE_F64C128_KEYS = {
|
||||
# encoder
|
||||
"encoder.project_in.conv": "encoder.conv_in.conv",
|
||||
# decoder
|
||||
"decoder.project_out.2.conv": "decoder.conv_out.conv",
|
||||
}
|
||||
|
||||
AE_F128C512_KEYS = {
|
||||
# encoder
|
||||
"encoder.project_in.conv": "encoder.conv_in.conv",
|
||||
# decoder
|
||||
"decoder.project_out.2.conv": "decoder.conv_out.conv",
|
||||
}
|
||||
|
||||
AE_SPECIAL_KEYS_REMAP = {
|
||||
"qkv.conv.weight": remap_qkv_,
|
||||
"proj.conv.weight": remap_proj_conv_,
|
||||
}
|
||||
|
||||
|
||||
def get_state_dict(saved_dict: Dict[str, Any]) -> Dict[str, Any]:
|
||||
state_dict = saved_dict
|
||||
if "model" in saved_dict.keys():
|
||||
state_dict = state_dict["model"]
|
||||
if "module" in saved_dict.keys():
|
||||
state_dict = state_dict["module"]
|
||||
if "state_dict" in saved_dict.keys():
|
||||
state_dict = state_dict["state_dict"]
|
||||
return state_dict
|
||||
|
||||
|
||||
def update_state_dict_(state_dict: Dict[str, Any], old_key: str, new_key: str) -> Dict[str, Any]:
|
||||
state_dict[new_key] = state_dict.pop(old_key)
|
||||
|
||||
|
||||
def convert_ae(config_name: str, dtype: torch.dtype):
|
||||
config = get_ae_config(config_name)
|
||||
hub_id = f"mit-han-lab/{config_name}"
|
||||
ckpt_path = hf_hub_download(hub_id, "model.safetensors")
|
||||
original_state_dict = get_state_dict(load_file(ckpt_path))
|
||||
|
||||
ae = AutoencoderDC(**config).to(dtype=dtype)
|
||||
|
||||
for key in list(original_state_dict.keys()):
|
||||
new_key = key[:]
|
||||
for replace_key, rename_key in AE_KEYS_RENAME_DICT.items():
|
||||
new_key = new_key.replace(replace_key, rename_key)
|
||||
update_state_dict_(original_state_dict, key, new_key)
|
||||
|
||||
for key in list(original_state_dict.keys()):
|
||||
for special_key, handler_fn_inplace in AE_SPECIAL_KEYS_REMAP.items():
|
||||
if special_key not in key:
|
||||
continue
|
||||
handler_fn_inplace(key, original_state_dict)
|
||||
|
||||
ae.load_state_dict(original_state_dict, strict=True)
|
||||
return ae
|
||||
|
||||
|
||||
def get_ae_config(name: str):
|
||||
if name in ["dc-ae-f32c32-sana-1.0"]:
|
||||
config = {
|
||||
"latent_channels": 32,
|
||||
"encoder_block_types": (
|
||||
"ResBlock",
|
||||
"ResBlock",
|
||||
"ResBlock",
|
||||
"EfficientViTBlock",
|
||||
"EfficientViTBlock",
|
||||
"EfficientViTBlock",
|
||||
),
|
||||
"decoder_block_types": (
|
||||
"ResBlock",
|
||||
"ResBlock",
|
||||
"ResBlock",
|
||||
"EfficientViTBlock",
|
||||
"EfficientViTBlock",
|
||||
"EfficientViTBlock",
|
||||
),
|
||||
"encoder_block_out_channels": (128, 256, 512, 512, 1024, 1024),
|
||||
"decoder_block_out_channels": (128, 256, 512, 512, 1024, 1024),
|
||||
"encoder_qkv_multiscales": ((), (), (), (5,), (5,), (5,)),
|
||||
"decoder_qkv_multiscales": ((), (), (), (5,), (5,), (5,)),
|
||||
"encoder_layers_per_block": (2, 2, 2, 3, 3, 3),
|
||||
"decoder_layers_per_block": [3, 3, 3, 3, 3, 3],
|
||||
"downsample_block_type": "conv",
|
||||
"upsample_block_type": "interpolate",
|
||||
"decoder_norm_types": "rms_norm",
|
||||
"decoder_act_fns": "silu",
|
||||
"scaling_factor": 0.41407,
|
||||
}
|
||||
elif name in ["dc-ae-f32c32-in-1.0", "dc-ae-f32c32-mix-1.0"]:
|
||||
AE_KEYS_RENAME_DICT.update(AE_F32C32_KEYS)
|
||||
config = {
|
||||
"latent_channels": 32,
|
||||
"encoder_block_types": [
|
||||
"ResBlock",
|
||||
"ResBlock",
|
||||
"ResBlock",
|
||||
"EfficientViTBlock",
|
||||
"EfficientViTBlock",
|
||||
"EfficientViTBlock",
|
||||
],
|
||||
"decoder_block_types": [
|
||||
"ResBlock",
|
||||
"ResBlock",
|
||||
"ResBlock",
|
||||
"EfficientViTBlock",
|
||||
"EfficientViTBlock",
|
||||
"EfficientViTBlock",
|
||||
],
|
||||
"encoder_block_out_channels": [128, 256, 512, 512, 1024, 1024],
|
||||
"decoder_block_out_channels": [128, 256, 512, 512, 1024, 1024],
|
||||
"encoder_layers_per_block": [0, 4, 8, 2, 2, 2],
|
||||
"decoder_layers_per_block": [0, 5, 10, 2, 2, 2],
|
||||
"encoder_qkv_multiscales": ((), (), (), (), (), ()),
|
||||
"decoder_qkv_multiscales": ((), (), (), (), (), ()),
|
||||
"decoder_norm_types": ["batch_norm", "batch_norm", "batch_norm", "rms_norm", "rms_norm", "rms_norm"],
|
||||
"decoder_act_fns": ["relu", "relu", "relu", "silu", "silu", "silu"],
|
||||
}
|
||||
if name == "dc-ae-f32c32-in-1.0":
|
||||
config["scaling_factor"] = 0.3189
|
||||
elif name == "dc-ae-f32c32-mix-1.0":
|
||||
config["scaling_factor"] = 0.4552
|
||||
elif name in ["dc-ae-f64c128-in-1.0", "dc-ae-f64c128-mix-1.0"]:
|
||||
AE_KEYS_RENAME_DICT.update(AE_F64C128_KEYS)
|
||||
config = {
|
||||
"latent_channels": 128,
|
||||
"encoder_block_types": [
|
||||
"ResBlock",
|
||||
"ResBlock",
|
||||
"ResBlock",
|
||||
"EfficientViTBlock",
|
||||
"EfficientViTBlock",
|
||||
"EfficientViTBlock",
|
||||
"EfficientViTBlock",
|
||||
],
|
||||
"decoder_block_types": [
|
||||
"ResBlock",
|
||||
"ResBlock",
|
||||
"ResBlock",
|
||||
"EfficientViTBlock",
|
||||
"EfficientViTBlock",
|
||||
"EfficientViTBlock",
|
||||
"EfficientViTBlock",
|
||||
],
|
||||
"encoder_block_out_channels": [128, 256, 512, 512, 1024, 1024, 2048],
|
||||
"decoder_block_out_channels": [128, 256, 512, 512, 1024, 1024, 2048],
|
||||
"encoder_layers_per_block": [0, 4, 8, 2, 2, 2, 2],
|
||||
"decoder_layers_per_block": [0, 5, 10, 2, 2, 2, 2],
|
||||
"encoder_qkv_multiscales": ((), (), (), (), (), (), ()),
|
||||
"decoder_qkv_multiscales": ((), (), (), (), (), (), ()),
|
||||
"decoder_norm_types": [
|
||||
"batch_norm",
|
||||
"batch_norm",
|
||||
"batch_norm",
|
||||
"rms_norm",
|
||||
"rms_norm",
|
||||
"rms_norm",
|
||||
"rms_norm",
|
||||
],
|
||||
"decoder_act_fns": ["relu", "relu", "relu", "silu", "silu", "silu", "silu"],
|
||||
}
|
||||
if name == "dc-ae-f64c128-in-1.0":
|
||||
config["scaling_factor"] = 0.2889
|
||||
elif name == "dc-ae-f64c128-mix-1.0":
|
||||
config["scaling_factor"] = 0.4538
|
||||
elif name in ["dc-ae-f128c512-in-1.0", "dc-ae-f128c512-mix-1.0"]:
|
||||
AE_KEYS_RENAME_DICT.update(AE_F128C512_KEYS)
|
||||
config = {
|
||||
"latent_channels": 512,
|
||||
"encoder_block_types": [
|
||||
"ResBlock",
|
||||
"ResBlock",
|
||||
"ResBlock",
|
||||
"EfficientViTBlock",
|
||||
"EfficientViTBlock",
|
||||
"EfficientViTBlock",
|
||||
"EfficientViTBlock",
|
||||
"EfficientViTBlock",
|
||||
],
|
||||
"decoder_block_types": [
|
||||
"ResBlock",
|
||||
"ResBlock",
|
||||
"ResBlock",
|
||||
"EfficientViTBlock",
|
||||
"EfficientViTBlock",
|
||||
"EfficientViTBlock",
|
||||
"EfficientViTBlock",
|
||||
"EfficientViTBlock",
|
||||
],
|
||||
"encoder_block_out_channels": [128, 256, 512, 512, 1024, 1024, 2048, 2048],
|
||||
"decoder_block_out_channels": [128, 256, 512, 512, 1024, 1024, 2048, 2048],
|
||||
"encoder_layers_per_block": [0, 4, 8, 2, 2, 2, 2, 2],
|
||||
"decoder_layers_per_block": [0, 5, 10, 2, 2, 2, 2, 2],
|
||||
"encoder_qkv_multiscales": ((), (), (), (), (), (), (), ()),
|
||||
"decoder_qkv_multiscales": ((), (), (), (), (), (), (), ()),
|
||||
"decoder_norm_types": [
|
||||
"batch_norm",
|
||||
"batch_norm",
|
||||
"batch_norm",
|
||||
"rms_norm",
|
||||
"rms_norm",
|
||||
"rms_norm",
|
||||
"rms_norm",
|
||||
"rms_norm",
|
||||
],
|
||||
"decoder_act_fns": ["relu", "relu", "relu", "silu", "silu", "silu", "silu", "silu"],
|
||||
}
|
||||
if name == "dc-ae-f128c512-in-1.0":
|
||||
config["scaling_factor"] = 0.4883
|
||||
elif name == "dc-ae-f128c512-mix-1.0":
|
||||
config["scaling_factor"] = 0.3620
|
||||
else:
|
||||
raise ValueError("Invalid config name provided.")
|
||||
|
||||
return config
|
||||
|
||||
|
||||
def get_args():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--config_name",
|
||||
type=str,
|
||||
default="dc-ae-f32c32-sana-1.0",
|
||||
choices=[
|
||||
"dc-ae-f32c32-sana-1.0",
|
||||
"dc-ae-f32c32-in-1.0",
|
||||
"dc-ae-f32c32-mix-1.0",
|
||||
"dc-ae-f64c128-in-1.0",
|
||||
"dc-ae-f64c128-mix-1.0",
|
||||
"dc-ae-f128c512-in-1.0",
|
||||
"dc-ae-f128c512-mix-1.0",
|
||||
],
|
||||
help="The DCAE checkpoint to convert",
|
||||
)
|
||||
parser.add_argument("--output_path", type=str, required=True, help="Path where converted model should be saved")
|
||||
parser.add_argument("--dtype", default="fp32", help="Torch dtype to save the model in.")
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
DTYPE_MAPPING = {
|
||||
"fp32": torch.float32,
|
||||
"fp16": torch.float16,
|
||||
"bf16": torch.bfloat16,
|
||||
}
|
||||
|
||||
VARIANT_MAPPING = {
|
||||
"fp32": None,
|
||||
"fp16": "fp16",
|
||||
"bf16": "bf16",
|
||||
}
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = get_args()
|
||||
|
||||
dtype = DTYPE_MAPPING[args.dtype]
|
||||
variant = VARIANT_MAPPING[args.dtype]
|
||||
|
||||
ae = convert_ae(args.config_name, dtype)
|
||||
ae.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB", variant=variant)
|
||||
@@ -31,14 +31,12 @@ python scripts/convert_flux_to_diffusers.py \
|
||||
--vae
|
||||
"""
|
||||
|
||||
CTX = init_empty_weights if is_accelerate_available() else nullcontext
|
||||
CTX = init_empty_weights if is_accelerate_available else nullcontext
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--original_state_dict_repo_id", default=None, type=str)
|
||||
parser.add_argument("--filename", default="flux.safetensors", type=str)
|
||||
parser.add_argument("--checkpoint_path", default=None, type=str)
|
||||
parser.add_argument("--in_channels", type=int, default=64)
|
||||
parser.add_argument("--out_channels", type=int, default=None)
|
||||
parser.add_argument("--vae", action="store_true")
|
||||
parser.add_argument("--transformer", action="store_true")
|
||||
parser.add_argument("--output_path", type=str)
|
||||
@@ -281,13 +279,10 @@ def main(args):
|
||||
num_single_layers = 38
|
||||
inner_dim = 3072
|
||||
mlp_ratio = 4.0
|
||||
|
||||
converted_transformer_state_dict = convert_flux_transformer_checkpoint_to_diffusers(
|
||||
original_ckpt, num_layers, num_single_layers, inner_dim, mlp_ratio=mlp_ratio
|
||||
)
|
||||
transformer = FluxTransformer2DModel(
|
||||
in_channels=args.in_channels, out_channels=args.out_channels, guidance_embeds=has_guidance
|
||||
)
|
||||
transformer = FluxTransformer2DModel(guidance_embeds=has_guidance)
|
||||
transformer.load_state_dict(converted_transformer_state_dict, strict=True)
|
||||
|
||||
print(
|
||||
|
||||
@@ -1,209 +0,0 @@
|
||||
import argparse
|
||||
from typing import Any, Dict
|
||||
|
||||
import torch
|
||||
from safetensors.torch import load_file
|
||||
from transformers import T5EncoderModel, T5Tokenizer
|
||||
|
||||
from diffusers import AutoencoderKLLTXVideo, FlowMatchEulerDiscreteScheduler, LTXPipeline, LTXVideoTransformer3DModel
|
||||
|
||||
|
||||
def remove_keys_(key: str, state_dict: Dict[str, Any]):
|
||||
state_dict.pop(key)
|
||||
|
||||
|
||||
TOKENIZER_MAX_LENGTH = 128
|
||||
|
||||
TRANSFORMER_KEYS_RENAME_DICT = {
|
||||
"patchify_proj": "proj_in",
|
||||
"adaln_single": "time_embed",
|
||||
"q_norm": "norm_q",
|
||||
"k_norm": "norm_k",
|
||||
}
|
||||
|
||||
TRANSFORMER_SPECIAL_KEYS_REMAP = {}
|
||||
|
||||
VAE_KEYS_RENAME_DICT = {
|
||||
# decoder
|
||||
"up_blocks.0": "mid_block",
|
||||
"up_blocks.1": "up_blocks.0",
|
||||
"up_blocks.2": "up_blocks.1.upsamplers.0",
|
||||
"up_blocks.3": "up_blocks.1",
|
||||
"up_blocks.4": "up_blocks.2.conv_in",
|
||||
"up_blocks.5": "up_blocks.2.upsamplers.0",
|
||||
"up_blocks.6": "up_blocks.2",
|
||||
"up_blocks.7": "up_blocks.3.conv_in",
|
||||
"up_blocks.8": "up_blocks.3.upsamplers.0",
|
||||
"up_blocks.9": "up_blocks.3",
|
||||
# encoder
|
||||
"down_blocks.0": "down_blocks.0",
|
||||
"down_blocks.1": "down_blocks.0.downsamplers.0",
|
||||
"down_blocks.2": "down_blocks.0.conv_out",
|
||||
"down_blocks.3": "down_blocks.1",
|
||||
"down_blocks.4": "down_blocks.1.downsamplers.0",
|
||||
"down_blocks.5": "down_blocks.1.conv_out",
|
||||
"down_blocks.6": "down_blocks.2",
|
||||
"down_blocks.7": "down_blocks.2.downsamplers.0",
|
||||
"down_blocks.8": "down_blocks.3",
|
||||
"down_blocks.9": "mid_block",
|
||||
# common
|
||||
"conv_shortcut": "conv_shortcut.conv",
|
||||
"res_blocks": "resnets",
|
||||
"norm3.norm": "norm3",
|
||||
"per_channel_statistics.mean-of-means": "latents_mean",
|
||||
"per_channel_statistics.std-of-means": "latents_std",
|
||||
}
|
||||
|
||||
VAE_SPECIAL_KEYS_REMAP = {
|
||||
"per_channel_statistics.channel": remove_keys_,
|
||||
"per_channel_statistics.mean-of-means": remove_keys_,
|
||||
"per_channel_statistics.mean-of-stds": remove_keys_,
|
||||
}
|
||||
|
||||
|
||||
def get_state_dict(saved_dict: Dict[str, Any]) -> Dict[str, Any]:
|
||||
state_dict = saved_dict
|
||||
if "model" in saved_dict.keys():
|
||||
state_dict = state_dict["model"]
|
||||
if "module" in saved_dict.keys():
|
||||
state_dict = state_dict["module"]
|
||||
if "state_dict" in saved_dict.keys():
|
||||
state_dict = state_dict["state_dict"]
|
||||
return state_dict
|
||||
|
||||
|
||||
def update_state_dict_inplace(state_dict: Dict[str, Any], old_key: str, new_key: str) -> Dict[str, Any]:
|
||||
state_dict[new_key] = state_dict.pop(old_key)
|
||||
|
||||
|
||||
def convert_transformer(
|
||||
ckpt_path: str,
|
||||
dtype: torch.dtype,
|
||||
):
|
||||
PREFIX_KEY = ""
|
||||
|
||||
original_state_dict = get_state_dict(load_file(ckpt_path))
|
||||
transformer = LTXVideoTransformer3DModel().to(dtype=dtype)
|
||||
|
||||
for key in list(original_state_dict.keys()):
|
||||
new_key = key[len(PREFIX_KEY) :]
|
||||
for replace_key, rename_key in TRANSFORMER_KEYS_RENAME_DICT.items():
|
||||
new_key = new_key.replace(replace_key, rename_key)
|
||||
update_state_dict_inplace(original_state_dict, key, new_key)
|
||||
|
||||
for key in list(original_state_dict.keys()):
|
||||
for special_key, handler_fn_inplace in TRANSFORMER_SPECIAL_KEYS_REMAP.items():
|
||||
if special_key not in key:
|
||||
continue
|
||||
handler_fn_inplace(key, original_state_dict)
|
||||
|
||||
transformer.load_state_dict(original_state_dict, strict=True)
|
||||
return transformer
|
||||
|
||||
|
||||
def convert_vae(ckpt_path: str, dtype: torch.dtype):
|
||||
original_state_dict = get_state_dict(load_file(ckpt_path))
|
||||
vae = AutoencoderKLLTXVideo().to(dtype=dtype)
|
||||
|
||||
for key in list(original_state_dict.keys()):
|
||||
new_key = key[:]
|
||||
for replace_key, rename_key in VAE_KEYS_RENAME_DICT.items():
|
||||
new_key = new_key.replace(replace_key, rename_key)
|
||||
update_state_dict_inplace(original_state_dict, key, new_key)
|
||||
|
||||
for key in list(original_state_dict.keys()):
|
||||
for special_key, handler_fn_inplace in VAE_SPECIAL_KEYS_REMAP.items():
|
||||
if special_key not in key:
|
||||
continue
|
||||
handler_fn_inplace(key, original_state_dict)
|
||||
|
||||
vae.load_state_dict(original_state_dict, strict=True)
|
||||
return vae
|
||||
|
||||
|
||||
def get_args():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--transformer_ckpt_path", type=str, default=None, help="Path to original transformer checkpoint"
|
||||
)
|
||||
parser.add_argument("--vae_ckpt_path", type=str, default=None, help="Path to original vae checkpoint")
|
||||
parser.add_argument(
|
||||
"--text_encoder_cache_dir", type=str, default=None, help="Path to text encoder cache directory"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--typecast_text_encoder",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="Whether or not to apply fp16/bf16 precision to text_encoder",
|
||||
)
|
||||
parser.add_argument("--save_pipeline", action="store_true")
|
||||
parser.add_argument("--output_path", type=str, required=True, help="Path where converted model should be saved")
|
||||
parser.add_argument("--dtype", default="fp32", help="Torch dtype to save the model in.")
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
DTYPE_MAPPING = {
|
||||
"fp32": torch.float32,
|
||||
"fp16": torch.float16,
|
||||
"bf16": torch.bfloat16,
|
||||
}
|
||||
|
||||
VARIANT_MAPPING = {
|
||||
"fp32": None,
|
||||
"fp16": "fp16",
|
||||
"bf16": "bf16",
|
||||
}
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = get_args()
|
||||
|
||||
transformer = None
|
||||
dtype = DTYPE_MAPPING[args.dtype]
|
||||
variant = VARIANT_MAPPING[args.dtype]
|
||||
|
||||
if args.save_pipeline:
|
||||
assert args.transformer_ckpt_path is not None and args.vae_ckpt_path is not None
|
||||
|
||||
if args.transformer_ckpt_path is not None:
|
||||
transformer: LTXVideoTransformer3DModel = convert_transformer(args.transformer_ckpt_path, dtype)
|
||||
if not args.save_pipeline:
|
||||
transformer.save_pretrained(
|
||||
args.output_path, safe_serialization=True, max_shard_size="5GB", variant=variant
|
||||
)
|
||||
|
||||
if args.vae_ckpt_path is not None:
|
||||
vae: AutoencoderKLLTXVideo = convert_vae(args.vae_ckpt_path, dtype)
|
||||
if not args.save_pipeline:
|
||||
vae.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB", variant=variant)
|
||||
|
||||
if args.save_pipeline:
|
||||
text_encoder_id = "google/t5-v1_1-xxl"
|
||||
tokenizer = T5Tokenizer.from_pretrained(text_encoder_id, model_max_length=TOKENIZER_MAX_LENGTH)
|
||||
text_encoder = T5EncoderModel.from_pretrained(text_encoder_id, cache_dir=args.text_encoder_cache_dir)
|
||||
|
||||
if args.typecast_text_encoder:
|
||||
text_encoder = text_encoder.to(dtype=dtype)
|
||||
|
||||
# Apparently, the conversion does not work anymore without this :shrug:
|
||||
for param in text_encoder.parameters():
|
||||
param.data = param.data.contiguous()
|
||||
|
||||
scheduler = FlowMatchEulerDiscreteScheduler(
|
||||
use_dynamic_shifting=True,
|
||||
base_shift=0.95,
|
||||
max_shift=2.05,
|
||||
base_image_seq_len=1024,
|
||||
max_image_seq_len=4096,
|
||||
shift_terminal=0.1,
|
||||
)
|
||||
|
||||
pipe = LTXPipeline(
|
||||
scheduler=scheduler,
|
||||
vae=vae,
|
||||
text_encoder=text_encoder,
|
||||
tokenizer=tokenizer,
|
||||
transformer=transformer,
|
||||
)
|
||||
|
||||
pipe.save_pretrained(args.output_path, safe_serialization=True, variant=variant, max_shard_size="5GB")
|
||||
@@ -10,7 +10,7 @@ from diffusers import AutoencoderKLMochi, FlowMatchEulerDiscreteScheduler, Mochi
|
||||
from diffusers.utils.import_utils import is_accelerate_available
|
||||
|
||||
|
||||
CTX = init_empty_weights if is_accelerate_available() else nullcontext
|
||||
CTX = init_empty_weights if is_accelerate_available else nullcontext
|
||||
|
||||
TOKENIZER_MAX_LENGTH = 256
|
||||
|
||||
|
||||
@@ -1,307 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
from contextlib import nullcontext
|
||||
|
||||
import torch
|
||||
from accelerate import init_empty_weights
|
||||
from huggingface_hub import hf_hub_download, snapshot_download
|
||||
from termcolor import colored
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderDC,
|
||||
DPMSolverMultistepScheduler,
|
||||
FlowMatchEulerDiscreteScheduler,
|
||||
SanaPipeline,
|
||||
SanaTransformer2DModel,
|
||||
)
|
||||
from diffusers.models.modeling_utils import load_model_dict_into_meta
|
||||
from diffusers.utils.import_utils import is_accelerate_available
|
||||
|
||||
|
||||
CTX = init_empty_weights if is_accelerate_available else nullcontext
|
||||
|
||||
ckpt_ids = [
|
||||
"Efficient-Large-Model/Sana_1600M_1024px_MultiLing/checkpoints/Sana_1600M_1024px_MultiLing.pth",
|
||||
"Efficient-Large-Model/Sana_1600M_1024px_BF16/checkpoints/Sana_1600M_1024px_BF16.pth",
|
||||
"Efficient-Large-Model/Sana_1600M_512px_MultiLing/checkpoints/Sana_1600M_512px_MultiLing.pth",
|
||||
"Efficient-Large-Model/Sana_1600M_1024px/checkpoints/Sana_1600M_1024px.pth",
|
||||
"Efficient-Large-Model/Sana_1600M_512px/checkpoints/Sana_1600M_512px.pth",
|
||||
"Efficient-Large-Model/Sana_600M_1024px/checkpoints/Sana_600M_1024px_MultiLing.pth",
|
||||
"Efficient-Large-Model/Sana_600M_512px/checkpoints/Sana_600M_512px_MultiLing.pth",
|
||||
]
|
||||
# https://github.com/NVlabs/Sana/blob/main/scripts/inference.py
|
||||
|
||||
|
||||
def main(args):
|
||||
cache_dir_path = os.path.expanduser("~/.cache/huggingface/hub")
|
||||
|
||||
if args.orig_ckpt_path is None or args.orig_ckpt_path in ckpt_ids:
|
||||
ckpt_id = args.orig_ckpt_path or ckpt_ids[0]
|
||||
snapshot_download(
|
||||
repo_id=f"{'/'.join(ckpt_id.split('/')[:2])}",
|
||||
cache_dir=cache_dir_path,
|
||||
repo_type="model",
|
||||
)
|
||||
file_path = hf_hub_download(
|
||||
repo_id=f"{'/'.join(ckpt_id.split('/')[:2])}",
|
||||
filename=f"{'/'.join(ckpt_id.split('/')[2:])}",
|
||||
cache_dir=cache_dir_path,
|
||||
repo_type="model",
|
||||
)
|
||||
else:
|
||||
file_path = args.orig_ckpt_path
|
||||
|
||||
print(colored(f"Loading checkpoint from {file_path}", "green", attrs=["bold"]))
|
||||
all_state_dict = torch.load(file_path, weights_only=True)
|
||||
state_dict = all_state_dict.pop("state_dict")
|
||||
converted_state_dict = {}
|
||||
|
||||
# Patch embeddings.
|
||||
converted_state_dict["patch_embed.proj.weight"] = state_dict.pop("x_embedder.proj.weight")
|
||||
converted_state_dict["patch_embed.proj.bias"] = state_dict.pop("x_embedder.proj.bias")
|
||||
|
||||
# Caption projection.
|
||||
converted_state_dict["caption_projection.linear_1.weight"] = state_dict.pop("y_embedder.y_proj.fc1.weight")
|
||||
converted_state_dict["caption_projection.linear_1.bias"] = state_dict.pop("y_embedder.y_proj.fc1.bias")
|
||||
converted_state_dict["caption_projection.linear_2.weight"] = state_dict.pop("y_embedder.y_proj.fc2.weight")
|
||||
converted_state_dict["caption_projection.linear_2.bias"] = state_dict.pop("y_embedder.y_proj.fc2.bias")
|
||||
|
||||
# AdaLN-single LN
|
||||
converted_state_dict["time_embed.emb.timestep_embedder.linear_1.weight"] = state_dict.pop(
|
||||
"t_embedder.mlp.0.weight"
|
||||
)
|
||||
converted_state_dict["time_embed.emb.timestep_embedder.linear_1.bias"] = state_dict.pop("t_embedder.mlp.0.bias")
|
||||
converted_state_dict["time_embed.emb.timestep_embedder.linear_2.weight"] = state_dict.pop(
|
||||
"t_embedder.mlp.2.weight"
|
||||
)
|
||||
converted_state_dict["time_embed.emb.timestep_embedder.linear_2.bias"] = state_dict.pop("t_embedder.mlp.2.bias")
|
||||
|
||||
# Shared norm.
|
||||
converted_state_dict["time_embed.linear.weight"] = state_dict.pop("t_block.1.weight")
|
||||
converted_state_dict["time_embed.linear.bias"] = state_dict.pop("t_block.1.bias")
|
||||
|
||||
# y norm
|
||||
converted_state_dict["caption_norm.weight"] = state_dict.pop("attention_y_norm.weight")
|
||||
|
||||
flow_shift = 3.0
|
||||
if args.model_type == "SanaMS_1600M_P1_D20":
|
||||
layer_num = 20
|
||||
elif args.model_type == "SanaMS_600M_P1_D28":
|
||||
layer_num = 28
|
||||
else:
|
||||
raise ValueError(f"{args.model_type} is not supported.")
|
||||
|
||||
for depth in range(layer_num):
|
||||
# Transformer blocks.
|
||||
converted_state_dict[f"transformer_blocks.{depth}.scale_shift_table"] = state_dict.pop(
|
||||
f"blocks.{depth}.scale_shift_table"
|
||||
)
|
||||
|
||||
# Linear Attention is all you need 🤘
|
||||
# Self attention.
|
||||
q, k, v = torch.chunk(state_dict.pop(f"blocks.{depth}.attn.qkv.weight"), 3, dim=0)
|
||||
converted_state_dict[f"transformer_blocks.{depth}.attn1.to_q.weight"] = q
|
||||
converted_state_dict[f"transformer_blocks.{depth}.attn1.to_k.weight"] = k
|
||||
converted_state_dict[f"transformer_blocks.{depth}.attn1.to_v.weight"] = v
|
||||
# Projection.
|
||||
converted_state_dict[f"transformer_blocks.{depth}.attn1.to_out.0.weight"] = state_dict.pop(
|
||||
f"blocks.{depth}.attn.proj.weight"
|
||||
)
|
||||
converted_state_dict[f"transformer_blocks.{depth}.attn1.to_out.0.bias"] = state_dict.pop(
|
||||
f"blocks.{depth}.attn.proj.bias"
|
||||
)
|
||||
|
||||
# Feed-forward.
|
||||
converted_state_dict[f"transformer_blocks.{depth}.ff.conv_inverted.weight"] = state_dict.pop(
|
||||
f"blocks.{depth}.mlp.inverted_conv.conv.weight"
|
||||
)
|
||||
converted_state_dict[f"transformer_blocks.{depth}.ff.conv_inverted.bias"] = state_dict.pop(
|
||||
f"blocks.{depth}.mlp.inverted_conv.conv.bias"
|
||||
)
|
||||
converted_state_dict[f"transformer_blocks.{depth}.ff.conv_depth.weight"] = state_dict.pop(
|
||||
f"blocks.{depth}.mlp.depth_conv.conv.weight"
|
||||
)
|
||||
converted_state_dict[f"transformer_blocks.{depth}.ff.conv_depth.bias"] = state_dict.pop(
|
||||
f"blocks.{depth}.mlp.depth_conv.conv.bias"
|
||||
)
|
||||
converted_state_dict[f"transformer_blocks.{depth}.ff.conv_point.weight"] = state_dict.pop(
|
||||
f"blocks.{depth}.mlp.point_conv.conv.weight"
|
||||
)
|
||||
|
||||
# Cross-attention.
|
||||
q = state_dict.pop(f"blocks.{depth}.cross_attn.q_linear.weight")
|
||||
q_bias = state_dict.pop(f"blocks.{depth}.cross_attn.q_linear.bias")
|
||||
k, v = torch.chunk(state_dict.pop(f"blocks.{depth}.cross_attn.kv_linear.weight"), 2, dim=0)
|
||||
k_bias, v_bias = torch.chunk(state_dict.pop(f"blocks.{depth}.cross_attn.kv_linear.bias"), 2, dim=0)
|
||||
|
||||
converted_state_dict[f"transformer_blocks.{depth}.attn2.to_q.weight"] = q
|
||||
converted_state_dict[f"transformer_blocks.{depth}.attn2.to_q.bias"] = q_bias
|
||||
converted_state_dict[f"transformer_blocks.{depth}.attn2.to_k.weight"] = k
|
||||
converted_state_dict[f"transformer_blocks.{depth}.attn2.to_k.bias"] = k_bias
|
||||
converted_state_dict[f"transformer_blocks.{depth}.attn2.to_v.weight"] = v
|
||||
converted_state_dict[f"transformer_blocks.{depth}.attn2.to_v.bias"] = v_bias
|
||||
|
||||
converted_state_dict[f"transformer_blocks.{depth}.attn2.to_out.0.weight"] = state_dict.pop(
|
||||
f"blocks.{depth}.cross_attn.proj.weight"
|
||||
)
|
||||
converted_state_dict[f"transformer_blocks.{depth}.attn2.to_out.0.bias"] = state_dict.pop(
|
||||
f"blocks.{depth}.cross_attn.proj.bias"
|
||||
)
|
||||
|
||||
# Final block.
|
||||
converted_state_dict["proj_out.weight"] = state_dict.pop("final_layer.linear.weight")
|
||||
converted_state_dict["proj_out.bias"] = state_dict.pop("final_layer.linear.bias")
|
||||
converted_state_dict["scale_shift_table"] = state_dict.pop("final_layer.scale_shift_table")
|
||||
|
||||
# Transformer
|
||||
with CTX():
|
||||
transformer = SanaTransformer2DModel(
|
||||
in_channels=32,
|
||||
out_channels=32,
|
||||
num_attention_heads=model_kwargs[args.model_type]["num_attention_heads"],
|
||||
attention_head_dim=model_kwargs[args.model_type]["attention_head_dim"],
|
||||
num_layers=model_kwargs[args.model_type]["num_layers"],
|
||||
num_cross_attention_heads=model_kwargs[args.model_type]["num_cross_attention_heads"],
|
||||
cross_attention_head_dim=model_kwargs[args.model_type]["cross_attention_head_dim"],
|
||||
cross_attention_dim=model_kwargs[args.model_type]["cross_attention_dim"],
|
||||
caption_channels=2304,
|
||||
mlp_ratio=2.5,
|
||||
attention_bias=False,
|
||||
sample_size=args.image_size // 32,
|
||||
patch_size=1,
|
||||
norm_elementwise_affine=False,
|
||||
norm_eps=1e-6,
|
||||
)
|
||||
|
||||
if is_accelerate_available():
|
||||
load_model_dict_into_meta(transformer, converted_state_dict)
|
||||
else:
|
||||
transformer.load_state_dict(converted_state_dict, strict=True, assign=True)
|
||||
|
||||
try:
|
||||
state_dict.pop("y_embedder.y_embedding")
|
||||
state_dict.pop("pos_embed")
|
||||
except KeyError:
|
||||
print("y_embedder.y_embedding or pos_embed not found in the state_dict")
|
||||
|
||||
assert len(state_dict) == 0, f"State dict is not empty, {state_dict.keys()}"
|
||||
|
||||
num_model_params = sum(p.numel() for p in transformer.parameters())
|
||||
print(f"Total number of transformer parameters: {num_model_params}")
|
||||
|
||||
transformer = transformer.to(weight_dtype)
|
||||
|
||||
if not args.save_full_pipeline:
|
||||
print(
|
||||
colored(
|
||||
f"Only saving transformer model of {args.model_type}. "
|
||||
f"Set --save_full_pipeline to save the whole SanaPipeline",
|
||||
"green",
|
||||
attrs=["bold"],
|
||||
)
|
||||
)
|
||||
transformer.save_pretrained(
|
||||
os.path.join(args.dump_path, "transformer"), safe_serialization=True, max_shard_size="5GB", variant=variant
|
||||
)
|
||||
else:
|
||||
print(colored(f"Saving the whole SanaPipeline containing {args.model_type}", "green", attrs=["bold"]))
|
||||
# VAE
|
||||
ae = AutoencoderDC.from_pretrained("mit-han-lab/dc-ae-f32c32-sana-1.0-diffusers", torch_dtype=torch.float32)
|
||||
|
||||
# Text Encoder
|
||||
text_encoder_model_path = "google/gemma-2-2b-it"
|
||||
tokenizer = AutoTokenizer.from_pretrained(text_encoder_model_path)
|
||||
tokenizer.padding_side = "right"
|
||||
text_encoder = AutoModelForCausalLM.from_pretrained(
|
||||
text_encoder_model_path, torch_dtype=torch.bfloat16
|
||||
).get_decoder()
|
||||
|
||||
# Scheduler
|
||||
if args.scheduler_type == "flow-dpm_solver":
|
||||
scheduler = DPMSolverMultistepScheduler(
|
||||
flow_shift=flow_shift,
|
||||
use_flow_sigmas=True,
|
||||
prediction_type="flow_prediction",
|
||||
)
|
||||
elif args.scheduler_type == "flow-euler":
|
||||
scheduler = FlowMatchEulerDiscreteScheduler(shift=flow_shift)
|
||||
else:
|
||||
raise ValueError(f"Scheduler type {args.scheduler_type} is not supported")
|
||||
|
||||
pipe = SanaPipeline(
|
||||
tokenizer=tokenizer,
|
||||
text_encoder=text_encoder,
|
||||
transformer=transformer,
|
||||
vae=ae,
|
||||
scheduler=scheduler,
|
||||
)
|
||||
pipe.save_pretrained(args.dump_path, safe_serialization=True, max_shard_size="5GB", variant=variant)
|
||||
|
||||
|
||||
DTYPE_MAPPING = {
|
||||
"fp32": torch.float32,
|
||||
"fp16": torch.float16,
|
||||
"bf16": torch.bfloat16,
|
||||
}
|
||||
|
||||
VARIANT_MAPPING = {
|
||||
"fp32": None,
|
||||
"fp16": "fp16",
|
||||
"bf16": "bf16",
|
||||
}
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument(
|
||||
"--orig_ckpt_path", default=None, type=str, required=False, help="Path to the checkpoint to convert."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--image_size",
|
||||
default=1024,
|
||||
type=int,
|
||||
choices=[512, 1024],
|
||||
required=False,
|
||||
help="Image size of pretrained model, 512 or 1024.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model_type", default="SanaMS_1600M_P1_D20", type=str, choices=["SanaMS_1600M_P1_D20", "SanaMS_600M_P1_D28"]
|
||||
)
|
||||
parser.add_argument(
|
||||
"--scheduler_type", default="flow-dpm_solver", type=str, choices=["flow-dpm_solver", "flow-euler"]
|
||||
)
|
||||
parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output pipeline.")
|
||||
parser.add_argument("--save_full_pipeline", action="store_true", help="save all the pipelien elemets in one.")
|
||||
parser.add_argument("--dtype", default="fp32", type=str, choices=["fp32", "fp16", "bf16"], help="Weight dtype.")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
model_kwargs = {
|
||||
"SanaMS_1600M_P1_D20": {
|
||||
"num_attention_heads": 70,
|
||||
"attention_head_dim": 32,
|
||||
"num_cross_attention_heads": 20,
|
||||
"cross_attention_head_dim": 112,
|
||||
"cross_attention_dim": 2240,
|
||||
"num_layers": 20,
|
||||
},
|
||||
"SanaMS_600M_P1_D28": {
|
||||
"num_attention_heads": 36,
|
||||
"attention_head_dim": 32,
|
||||
"num_cross_attention_heads": 16,
|
||||
"cross_attention_head_dim": 72,
|
||||
"cross_attention_dim": 1152,
|
||||
"num_layers": 28,
|
||||
},
|
||||
}
|
||||
|
||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
weight_dtype = DTYPE_MAPPING[args.dtype]
|
||||
variant = VARIANT_MAPPING[args.dtype]
|
||||
|
||||
main(args)
|
||||
@@ -1,185 +0,0 @@
|
||||
"""
|
||||
A script to convert Stable Diffusion 3.5 ControlNet checkpoints to the Diffusers format.
|
||||
|
||||
Example:
|
||||
Convert a SD3.5 ControlNet checkpoint to Diffusers format using local file:
|
||||
```bash
|
||||
python scripts/convert_sd3_controlnet_to_diffusers.py \
|
||||
--checkpoint_path "path/to/local/sd3.5_large_controlnet_canny.safetensors" \
|
||||
--output_path "output/sd35-controlnet-canny" \
|
||||
--dtype "fp16" # optional, defaults to fp32
|
||||
```
|
||||
|
||||
Or download and convert from HuggingFace repository:
|
||||
```bash
|
||||
python scripts/convert_sd3_controlnet_to_diffusers.py \
|
||||
--original_state_dict_repo_id "stabilityai/stable-diffusion-3.5-controlnets" \
|
||||
--filename "sd3.5_large_controlnet_canny.safetensors" \
|
||||
--output_path "/raid/yiyi/sd35-controlnet-canny-diffusers" \
|
||||
--dtype "fp32" # optional, defaults to fp32
|
||||
```
|
||||
|
||||
Note:
|
||||
The script supports the following ControlNet types from SD3.5:
|
||||
- Canny edge detection
|
||||
- Depth estimation
|
||||
- Blur detection
|
||||
|
||||
The checkpoint files can be downloaded from:
|
||||
https://huggingface.co/stabilityai/stable-diffusion-3.5-controlnets
|
||||
"""
|
||||
|
||||
import argparse
|
||||
|
||||
import safetensors.torch
|
||||
import torch
|
||||
from huggingface_hub import hf_hub_download
|
||||
|
||||
from diffusers import SD3ControlNetModel
|
||||
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--checkpoint_path", type=str, default=None, help="Path to local checkpoint file")
|
||||
parser.add_argument(
|
||||
"--original_state_dict_repo_id", type=str, default=None, help="HuggingFace repo ID containing the checkpoint"
|
||||
)
|
||||
parser.add_argument("--filename", type=str, default=None, help="Filename of the checkpoint in the HF repo")
|
||||
parser.add_argument("--output_path", type=str, required=True, help="Path to save the converted model")
|
||||
parser.add_argument(
|
||||
"--dtype", type=str, default="fp32", help="Data type for the converted model (fp16, bf16, or fp32)"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
def load_original_checkpoint(args):
|
||||
if args.original_state_dict_repo_id is not None:
|
||||
if args.filename is None:
|
||||
raise ValueError("When using `original_state_dict_repo_id`, `filename` must also be specified")
|
||||
print(f"Downloading checkpoint from {args.original_state_dict_repo_id}/{args.filename}")
|
||||
ckpt_path = hf_hub_download(repo_id=args.original_state_dict_repo_id, filename=args.filename)
|
||||
elif args.checkpoint_path is not None:
|
||||
print(f"Loading checkpoint from local path: {args.checkpoint_path}")
|
||||
ckpt_path = args.checkpoint_path
|
||||
else:
|
||||
raise ValueError("Please provide either `original_state_dict_repo_id` or a local `checkpoint_path`")
|
||||
|
||||
original_state_dict = safetensors.torch.load_file(ckpt_path)
|
||||
return original_state_dict
|
||||
|
||||
|
||||
def convert_sd3_controlnet_checkpoint_to_diffusers(original_state_dict):
|
||||
converted_state_dict = {}
|
||||
|
||||
# Direct mappings for controlnet blocks
|
||||
for i in range(19): # 19 controlnet blocks
|
||||
converted_state_dict[f"controlnet_blocks.{i}.weight"] = original_state_dict[f"controlnet_blocks.{i}.weight"]
|
||||
converted_state_dict[f"controlnet_blocks.{i}.bias"] = original_state_dict[f"controlnet_blocks.{i}.bias"]
|
||||
|
||||
# Positional embeddings
|
||||
converted_state_dict["pos_embed_input.proj.weight"] = original_state_dict["pos_embed_input.proj.weight"]
|
||||
converted_state_dict["pos_embed_input.proj.bias"] = original_state_dict["pos_embed_input.proj.bias"]
|
||||
|
||||
# Time and text embeddings
|
||||
time_text_mappings = {
|
||||
"time_text_embed.timestep_embedder.linear_1.weight": "time_text_embed.timestep_embedder.linear_1.weight",
|
||||
"time_text_embed.timestep_embedder.linear_1.bias": "time_text_embed.timestep_embedder.linear_1.bias",
|
||||
"time_text_embed.timestep_embedder.linear_2.weight": "time_text_embed.timestep_embedder.linear_2.weight",
|
||||
"time_text_embed.timestep_embedder.linear_2.bias": "time_text_embed.timestep_embedder.linear_2.bias",
|
||||
"time_text_embed.text_embedder.linear_1.weight": "time_text_embed.text_embedder.linear_1.weight",
|
||||
"time_text_embed.text_embedder.linear_1.bias": "time_text_embed.text_embedder.linear_1.bias",
|
||||
"time_text_embed.text_embedder.linear_2.weight": "time_text_embed.text_embedder.linear_2.weight",
|
||||
"time_text_embed.text_embedder.linear_2.bias": "time_text_embed.text_embedder.linear_2.bias",
|
||||
}
|
||||
|
||||
for new_key, old_key in time_text_mappings.items():
|
||||
if old_key in original_state_dict:
|
||||
converted_state_dict[new_key] = original_state_dict[old_key]
|
||||
|
||||
# Transformer blocks
|
||||
for i in range(19):
|
||||
# Split QKV into separate Q, K, V
|
||||
qkv_weight = original_state_dict[f"transformer_blocks.{i}.attn.qkv.weight"]
|
||||
qkv_bias = original_state_dict[f"transformer_blocks.{i}.attn.qkv.bias"]
|
||||
q, k, v = torch.chunk(qkv_weight, 3, dim=0)
|
||||
q_bias, k_bias, v_bias = torch.chunk(qkv_bias, 3, dim=0)
|
||||
|
||||
block_mappings = {
|
||||
f"transformer_blocks.{i}.attn.to_q.weight": q,
|
||||
f"transformer_blocks.{i}.attn.to_q.bias": q_bias,
|
||||
f"transformer_blocks.{i}.attn.to_k.weight": k,
|
||||
f"transformer_blocks.{i}.attn.to_k.bias": k_bias,
|
||||
f"transformer_blocks.{i}.attn.to_v.weight": v,
|
||||
f"transformer_blocks.{i}.attn.to_v.bias": v_bias,
|
||||
# Output projections
|
||||
f"transformer_blocks.{i}.attn.to_out.0.weight": original_state_dict[
|
||||
f"transformer_blocks.{i}.attn.proj.weight"
|
||||
],
|
||||
f"transformer_blocks.{i}.attn.to_out.0.bias": original_state_dict[
|
||||
f"transformer_blocks.{i}.attn.proj.bias"
|
||||
],
|
||||
# Feed forward
|
||||
f"transformer_blocks.{i}.ff.net.0.proj.weight": original_state_dict[
|
||||
f"transformer_blocks.{i}.mlp.fc1.weight"
|
||||
],
|
||||
f"transformer_blocks.{i}.ff.net.0.proj.bias": original_state_dict[f"transformer_blocks.{i}.mlp.fc1.bias"],
|
||||
f"transformer_blocks.{i}.ff.net.2.weight": original_state_dict[f"transformer_blocks.{i}.mlp.fc2.weight"],
|
||||
f"transformer_blocks.{i}.ff.net.2.bias": original_state_dict[f"transformer_blocks.{i}.mlp.fc2.bias"],
|
||||
# Norms
|
||||
f"transformer_blocks.{i}.norm1.linear.weight": original_state_dict[
|
||||
f"transformer_blocks.{i}.adaLN_modulation.1.weight"
|
||||
],
|
||||
f"transformer_blocks.{i}.norm1.linear.bias": original_state_dict[
|
||||
f"transformer_blocks.{i}.adaLN_modulation.1.bias"
|
||||
],
|
||||
}
|
||||
converted_state_dict.update(block_mappings)
|
||||
|
||||
return converted_state_dict
|
||||
|
||||
|
||||
def main(args):
|
||||
original_ckpt = load_original_checkpoint(args)
|
||||
original_dtype = next(iter(original_ckpt.values())).dtype
|
||||
|
||||
# Initialize dtype with fp32 as default
|
||||
if args.dtype == "fp16":
|
||||
dtype = torch.float16
|
||||
elif args.dtype == "bf16":
|
||||
dtype = torch.bfloat16
|
||||
elif args.dtype == "fp32":
|
||||
dtype = torch.float32
|
||||
else:
|
||||
raise ValueError(f"Unsupported dtype: {args.dtype}. Must be one of: fp16, bf16, fp32")
|
||||
|
||||
if dtype != original_dtype:
|
||||
print(
|
||||
f"Converting checkpoint from {original_dtype} to {dtype}. This can lead to unexpected results, proceed with caution."
|
||||
)
|
||||
|
||||
converted_controlnet_state_dict = convert_sd3_controlnet_checkpoint_to_diffusers(original_ckpt)
|
||||
|
||||
controlnet = SD3ControlNetModel(
|
||||
patch_size=2,
|
||||
in_channels=16,
|
||||
num_layers=19,
|
||||
attention_head_dim=64,
|
||||
num_attention_heads=38,
|
||||
joint_attention_dim=None,
|
||||
caption_projection_dim=2048,
|
||||
pooled_projection_dim=2048,
|
||||
out_channels=16,
|
||||
pos_embed_max_size=None,
|
||||
pos_embed_type=None,
|
||||
use_pos_embed=False,
|
||||
force_zeros_for_pooled_projection=False,
|
||||
)
|
||||
|
||||
controlnet.load_state_dict(converted_controlnet_state_dict, strict=True)
|
||||
|
||||
print(f"Saving SD3 ControlNet in Diffusers format in {args.output_path}.")
|
||||
controlnet.to(dtype).save_pretrained(args.output_path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main(args)
|
||||
@@ -11,7 +11,7 @@ from diffusers.models.modeling_utils import load_model_dict_into_meta
|
||||
from diffusers.utils.import_utils import is_accelerate_available
|
||||
|
||||
|
||||
CTX = init_empty_weights if is_accelerate_available() else nullcontext
|
||||
CTX = init_empty_weights if is_accelerate_available else nullcontext
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--checkpoint_path", type=str)
|
||||
|
||||
@@ -80,11 +80,9 @@ else:
|
||||
"AllegroTransformer3DModel",
|
||||
"AsymmetricAutoencoderKL",
|
||||
"AuraFlowTransformer2DModel",
|
||||
"AutoencoderDC",
|
||||
"AutoencoderKL",
|
||||
"AutoencoderKLAllegro",
|
||||
"AutoencoderKLCogVideoX",
|
||||
"AutoencoderKLLTXVideo",
|
||||
"AutoencoderKLMochi",
|
||||
"AutoencoderKLTemporalDecoder",
|
||||
"AutoencoderOobleck",
|
||||
@@ -93,7 +91,6 @@ else:
|
||||
"CogView3PlusTransformer2DModel",
|
||||
"ConsistencyDecoderVAE",
|
||||
"ControlNetModel",
|
||||
"ControlNetUnionModel",
|
||||
"ControlNetXSAdapter",
|
||||
"DiTTransformer2DModel",
|
||||
"FluxControlNetModel",
|
||||
@@ -105,16 +102,13 @@ else:
|
||||
"I2VGenXLUNet",
|
||||
"Kandinsky3UNet",
|
||||
"LatteTransformer3DModel",
|
||||
"LTXVideoTransformer3DModel",
|
||||
"LuminaNextDiT2DModel",
|
||||
"MochiTransformer3DModel",
|
||||
"ModelMixin",
|
||||
"MotionAdapter",
|
||||
"MultiAdapter",
|
||||
"MultiControlNetModel",
|
||||
"PixArtTransformer2DModel",
|
||||
"PriorTransformer",
|
||||
"SanaTransformer2DModel",
|
||||
"SD3ControlNetModel",
|
||||
"SD3MultiControlNetModel",
|
||||
"SD3Transformer2DModel",
|
||||
@@ -274,16 +268,12 @@ else:
|
||||
"CogVideoXVideoToVideoPipeline",
|
||||
"CogView3PlusPipeline",
|
||||
"CycleDiffusionPipeline",
|
||||
"FluxControlImg2ImgPipeline",
|
||||
"FluxControlNetImg2ImgPipeline",
|
||||
"FluxControlNetInpaintPipeline",
|
||||
"FluxControlNetPipeline",
|
||||
"FluxControlPipeline",
|
||||
"FluxFillPipeline",
|
||||
"FluxImg2ImgPipeline",
|
||||
"FluxInpaintPipeline",
|
||||
"FluxPipeline",
|
||||
"FluxPriorReduxPipeline",
|
||||
"HunyuanDiTControlNetPipeline",
|
||||
"HunyuanDiTPAGPipeline",
|
||||
"HunyuanDiTPipeline",
|
||||
@@ -320,8 +310,6 @@ else:
|
||||
"LDMTextToImagePipeline",
|
||||
"LEditsPPPipelineStableDiffusion",
|
||||
"LEditsPPPipelineStableDiffusionXL",
|
||||
"LTXImageToVideoPipeline",
|
||||
"LTXPipeline",
|
||||
"LuminaText2ImgPipeline",
|
||||
"MarigoldDepthPipeline",
|
||||
"MarigoldNormalsPipeline",
|
||||
@@ -332,9 +320,6 @@ else:
|
||||
"PixArtAlphaPipeline",
|
||||
"PixArtSigmaPAGPipeline",
|
||||
"PixArtSigmaPipeline",
|
||||
"ReduxImageEncoder",
|
||||
"SanaPAGPipeline",
|
||||
"SanaPipeline",
|
||||
"SemanticStableDiffusionPipeline",
|
||||
"ShapEImg2ImgPipeline",
|
||||
"ShapEPipeline",
|
||||
@@ -347,8 +332,6 @@ else:
|
||||
"StableDiffusion3ControlNetPipeline",
|
||||
"StableDiffusion3Img2ImgPipeline",
|
||||
"StableDiffusion3InpaintPipeline",
|
||||
"StableDiffusion3PAGImg2ImgPipeline",
|
||||
"StableDiffusion3PAGImg2ImgPipeline",
|
||||
"StableDiffusion3PAGPipeline",
|
||||
"StableDiffusion3Pipeline",
|
||||
"StableDiffusionAdapterPipeline",
|
||||
@@ -372,7 +355,6 @@ else:
|
||||
"StableDiffusionLDM3DPipeline",
|
||||
"StableDiffusionModelEditingPipeline",
|
||||
"StableDiffusionPAGImg2ImgPipeline",
|
||||
"StableDiffusionPAGInpaintPipeline",
|
||||
"StableDiffusionPAGPipeline",
|
||||
"StableDiffusionPanoramaPipeline",
|
||||
"StableDiffusionParadigmsPipeline",
|
||||
@@ -387,9 +369,6 @@ else:
|
||||
"StableDiffusionXLControlNetPAGImg2ImgPipeline",
|
||||
"StableDiffusionXLControlNetPAGPipeline",
|
||||
"StableDiffusionXLControlNetPipeline",
|
||||
"StableDiffusionXLControlNetUnionImg2ImgPipeline",
|
||||
"StableDiffusionXLControlNetUnionInpaintPipeline",
|
||||
"StableDiffusionXLControlNetUnionPipeline",
|
||||
"StableDiffusionXLControlNetXSPipeline",
|
||||
"StableDiffusionXLImg2ImgPipeline",
|
||||
"StableDiffusionXLInpaintPipeline",
|
||||
@@ -586,11 +565,9 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
AllegroTransformer3DModel,
|
||||
AsymmetricAutoencoderKL,
|
||||
AuraFlowTransformer2DModel,
|
||||
AutoencoderDC,
|
||||
AutoencoderKL,
|
||||
AutoencoderKLAllegro,
|
||||
AutoencoderKLCogVideoX,
|
||||
AutoencoderKLLTXVideo,
|
||||
AutoencoderKLMochi,
|
||||
AutoencoderKLTemporalDecoder,
|
||||
AutoencoderOobleck,
|
||||
@@ -599,7 +576,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
CogView3PlusTransformer2DModel,
|
||||
ConsistencyDecoderVAE,
|
||||
ControlNetModel,
|
||||
ControlNetUnionModel,
|
||||
ControlNetXSAdapter,
|
||||
DiTTransformer2DModel,
|
||||
FluxControlNetModel,
|
||||
@@ -611,16 +587,13 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
I2VGenXLUNet,
|
||||
Kandinsky3UNet,
|
||||
LatteTransformer3DModel,
|
||||
LTXVideoTransformer3DModel,
|
||||
LuminaNextDiT2DModel,
|
||||
MochiTransformer3DModel,
|
||||
ModelMixin,
|
||||
MotionAdapter,
|
||||
MultiAdapter,
|
||||
MultiControlNetModel,
|
||||
PixArtTransformer2DModel,
|
||||
PriorTransformer,
|
||||
SanaTransformer2DModel,
|
||||
SD3ControlNetModel,
|
||||
SD3MultiControlNetModel,
|
||||
SD3Transformer2DModel,
|
||||
@@ -759,16 +732,12 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
CogVideoXVideoToVideoPipeline,
|
||||
CogView3PlusPipeline,
|
||||
CycleDiffusionPipeline,
|
||||
FluxControlImg2ImgPipeline,
|
||||
FluxControlNetImg2ImgPipeline,
|
||||
FluxControlNetInpaintPipeline,
|
||||
FluxControlNetPipeline,
|
||||
FluxControlPipeline,
|
||||
FluxFillPipeline,
|
||||
FluxImg2ImgPipeline,
|
||||
FluxInpaintPipeline,
|
||||
FluxPipeline,
|
||||
FluxPriorReduxPipeline,
|
||||
HunyuanDiTControlNetPipeline,
|
||||
HunyuanDiTPAGPipeline,
|
||||
HunyuanDiTPipeline,
|
||||
@@ -805,8 +774,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
LDMTextToImagePipeline,
|
||||
LEditsPPPipelineStableDiffusion,
|
||||
LEditsPPPipelineStableDiffusionXL,
|
||||
LTXImageToVideoPipeline,
|
||||
LTXPipeline,
|
||||
LuminaText2ImgPipeline,
|
||||
MarigoldDepthPipeline,
|
||||
MarigoldNormalsPipeline,
|
||||
@@ -817,9 +784,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
PixArtAlphaPipeline,
|
||||
PixArtSigmaPAGPipeline,
|
||||
PixArtSigmaPipeline,
|
||||
ReduxImageEncoder,
|
||||
SanaPAGPipeline,
|
||||
SanaPipeline,
|
||||
SemanticStableDiffusionPipeline,
|
||||
ShapEImg2ImgPipeline,
|
||||
ShapEPipeline,
|
||||
@@ -831,7 +795,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
StableDiffusion3ControlNetPipeline,
|
||||
StableDiffusion3Img2ImgPipeline,
|
||||
StableDiffusion3InpaintPipeline,
|
||||
StableDiffusion3PAGImg2ImgPipeline,
|
||||
StableDiffusion3PAGPipeline,
|
||||
StableDiffusion3Pipeline,
|
||||
StableDiffusionAdapterPipeline,
|
||||
@@ -855,7 +818,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
StableDiffusionLDM3DPipeline,
|
||||
StableDiffusionModelEditingPipeline,
|
||||
StableDiffusionPAGImg2ImgPipeline,
|
||||
StableDiffusionPAGInpaintPipeline,
|
||||
StableDiffusionPAGPipeline,
|
||||
StableDiffusionPanoramaPipeline,
|
||||
StableDiffusionParadigmsPipeline,
|
||||
@@ -870,9 +832,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
StableDiffusionXLControlNetPAGImg2ImgPipeline,
|
||||
StableDiffusionXLControlNetPAGPipeline,
|
||||
StableDiffusionXLControlNetPipeline,
|
||||
StableDiffusionXLControlNetUnionImg2ImgPipeline,
|
||||
StableDiffusionXLControlNetUnionInpaintPipeline,
|
||||
StableDiffusionXLControlNetUnionPipeline,
|
||||
StableDiffusionXLControlNetXSPipeline,
|
||||
StableDiffusionXLImg2ImgPipeline,
|
||||
StableDiffusionXLInpaintPipeline,
|
||||
|
||||
@@ -170,7 +170,7 @@ class ConfigMixin:
|
||||
|
||||
if push_to_hub:
|
||||
commit_message = kwargs.pop("commit_message", None)
|
||||
private = kwargs.pop("private", None)
|
||||
private = kwargs.pop("private", False)
|
||||
create_pr = kwargs.pop("create_pr", False)
|
||||
token = kwargs.pop("token", None)
|
||||
repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
|
||||
|
||||
@@ -236,7 +236,7 @@ class VaeImageProcessor(ConfigMixin):
|
||||
`np.ndarray` or `torch.Tensor`:
|
||||
The denormalized image array.
|
||||
"""
|
||||
return (images * 0.5 + 0.5).clamp(0, 1)
|
||||
return (images / 2 + 0.5).clamp(0, 1)
|
||||
|
||||
@staticmethod
|
||||
def convert_to_rgb(image: PIL.Image.Image) -> PIL.Image.Image:
|
||||
@@ -537,26 +537,6 @@ class VaeImageProcessor(ConfigMixin):
|
||||
|
||||
return image
|
||||
|
||||
def _denormalize_conditionally(
|
||||
self, images: torch.Tensor, do_denormalize: Optional[List[bool]] = None
|
||||
) -> torch.Tensor:
|
||||
r"""
|
||||
Denormalize a batch of images based on a condition list.
|
||||
|
||||
Args:
|
||||
images (`torch.Tensor`):
|
||||
The input image tensor.
|
||||
do_denormalize (`Optional[List[bool]`, *optional*, defaults to `None`):
|
||||
A list of booleans indicating whether to denormalize each image in the batch. If `None`, will use the
|
||||
value of `do_normalize` in the `VaeImageProcessor` config.
|
||||
"""
|
||||
if do_denormalize is None:
|
||||
return self.denormalize(images) if self.config.do_normalize else images
|
||||
|
||||
return torch.stack(
|
||||
[self.denormalize(images[i]) if do_denormalize[i] else images[i] for i in range(images.shape[0])]
|
||||
)
|
||||
|
||||
def get_default_height_width(
|
||||
self,
|
||||
image: Union[PIL.Image.Image, np.ndarray, torch.Tensor],
|
||||
@@ -772,7 +752,12 @@ class VaeImageProcessor(ConfigMixin):
|
||||
if output_type == "latent":
|
||||
return image
|
||||
|
||||
image = self._denormalize_conditionally(image, do_denormalize)
|
||||
if do_denormalize is None:
|
||||
do_denormalize = [self.config.do_normalize] * image.shape[0]
|
||||
|
||||
image = torch.stack(
|
||||
[self.denormalize(image[i]) if do_denormalize[i] else image[i] for i in range(image.shape[0])]
|
||||
)
|
||||
|
||||
if output_type == "pt":
|
||||
return image
|
||||
@@ -981,7 +966,12 @@ class VaeImageProcessorLDM3D(VaeImageProcessor):
|
||||
deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False)
|
||||
output_type = "np"
|
||||
|
||||
image = self._denormalize_conditionally(image, do_denormalize)
|
||||
if do_denormalize is None:
|
||||
do_denormalize = [self.config.do_normalize] * image.shape[0]
|
||||
|
||||
image = torch.stack(
|
||||
[self.denormalize(image[i]) if do_denormalize[i] else image[i] for i in range(image.shape[0])]
|
||||
)
|
||||
|
||||
image = self.pt_to_numpy(image)
|
||||
|
||||
|
||||
@@ -68,7 +68,6 @@ if is_torch_available():
|
||||
"LoraLoaderMixin",
|
||||
"FluxLoraLoaderMixin",
|
||||
"CogVideoXLoraLoaderMixin",
|
||||
"Mochi1LoraLoaderMixin",
|
||||
]
|
||||
_import_structure["textual_inversion"] = ["TextualInversionLoaderMixin"]
|
||||
_import_structure["ip_adapter"] = ["IPAdapterMixin"]
|
||||
@@ -89,7 +88,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
CogVideoXLoraLoaderMixin,
|
||||
FluxLoraLoaderMixin,
|
||||
LoraLoaderMixin,
|
||||
Mochi1LoraLoaderMixin,
|
||||
SD3LoraLoaderMixin,
|
||||
StableDiffusionLoraLoaderMixin,
|
||||
StableDiffusionXLLoraLoaderMixin,
|
||||
|
||||
@@ -187,7 +187,7 @@ class IPAdapterMixin:
|
||||
state_dict = pretrained_model_name_or_path_or_dict
|
||||
|
||||
keys = list(state_dict.keys())
|
||||
if "image_proj" not in keys and "ip_adapter" not in keys:
|
||||
if keys != ["image_proj", "ip_adapter"]:
|
||||
raise ValueError("Required keys are (`image_proj` and `ip_adapter`) missing from the state dict.")
|
||||
|
||||
state_dicts.append(state_dict)
|
||||
|
||||
@@ -636,15 +636,10 @@ def _convert_xlabs_flux_lora_to_diffusers(old_state_dict):
|
||||
block_num = re.search(r"single_blocks\.(\d+)", old_key).group(1)
|
||||
new_key = f"transformer.single_transformer_blocks.{block_num}"
|
||||
|
||||
if "proj_lora" in old_key:
|
||||
if "proj_lora1" in old_key or "proj_lora2" in old_key:
|
||||
new_key += ".proj_out"
|
||||
elif "qkv_lora" in old_key and "up" not in old_key:
|
||||
handle_qkv(
|
||||
old_state_dict,
|
||||
new_state_dict,
|
||||
old_key,
|
||||
[f"transformer.single_transformer_blocks.{block_num}.norm.linear"],
|
||||
)
|
||||
elif "qkv_lora1" in old_key or "qkv_lora2" in old_key:
|
||||
new_key += ".norm.linear"
|
||||
|
||||
if "down" in old_key:
|
||||
new_key += ".lora_A.weight"
|
||||
@@ -663,309 +658,3 @@ def _convert_xlabs_flux_lora_to_diffusers(old_state_dict):
|
||||
raise ValueError(f"`old_state_dict` should be at this point but has: {list(old_state_dict.keys())}.")
|
||||
|
||||
return new_state_dict
|
||||
|
||||
|
||||
def _convert_bfl_flux_control_lora_to_diffusers(original_state_dict):
|
||||
converted_state_dict = {}
|
||||
original_state_dict_keys = list(original_state_dict.keys())
|
||||
num_layers = 19
|
||||
num_single_layers = 38
|
||||
inner_dim = 3072
|
||||
mlp_ratio = 4.0
|
||||
|
||||
def swap_scale_shift(weight):
|
||||
shift, scale = weight.chunk(2, dim=0)
|
||||
new_weight = torch.cat([scale, shift], dim=0)
|
||||
return new_weight
|
||||
|
||||
for lora_key in ["lora_A", "lora_B"]:
|
||||
## time_text_embed.timestep_embedder <- time_in
|
||||
converted_state_dict[
|
||||
f"time_text_embed.timestep_embedder.linear_1.{lora_key}.weight"
|
||||
] = original_state_dict.pop(f"time_in.in_layer.{lora_key}.weight")
|
||||
if f"time_in.in_layer.{lora_key}.bias" in original_state_dict_keys:
|
||||
converted_state_dict[
|
||||
f"time_text_embed.timestep_embedder.linear_1.{lora_key}.bias"
|
||||
] = original_state_dict.pop(f"time_in.in_layer.{lora_key}.bias")
|
||||
|
||||
converted_state_dict[
|
||||
f"time_text_embed.timestep_embedder.linear_2.{lora_key}.weight"
|
||||
] = original_state_dict.pop(f"time_in.out_layer.{lora_key}.weight")
|
||||
if f"time_in.out_layer.{lora_key}.bias" in original_state_dict_keys:
|
||||
converted_state_dict[
|
||||
f"time_text_embed.timestep_embedder.linear_2.{lora_key}.bias"
|
||||
] = original_state_dict.pop(f"time_in.out_layer.{lora_key}.bias")
|
||||
|
||||
## time_text_embed.text_embedder <- vector_in
|
||||
converted_state_dict[f"time_text_embed.text_embedder.linear_1.{lora_key}.weight"] = original_state_dict.pop(
|
||||
f"vector_in.in_layer.{lora_key}.weight"
|
||||
)
|
||||
if f"vector_in.in_layer.{lora_key}.bias" in original_state_dict_keys:
|
||||
converted_state_dict[f"time_text_embed.text_embedder.linear_1.{lora_key}.bias"] = original_state_dict.pop(
|
||||
f"vector_in.in_layer.{lora_key}.bias"
|
||||
)
|
||||
|
||||
converted_state_dict[f"time_text_embed.text_embedder.linear_2.{lora_key}.weight"] = original_state_dict.pop(
|
||||
f"vector_in.out_layer.{lora_key}.weight"
|
||||
)
|
||||
if f"vector_in.out_layer.{lora_key}.bias" in original_state_dict_keys:
|
||||
converted_state_dict[f"time_text_embed.text_embedder.linear_2.{lora_key}.bias"] = original_state_dict.pop(
|
||||
f"vector_in.out_layer.{lora_key}.bias"
|
||||
)
|
||||
|
||||
# guidance
|
||||
has_guidance = any("guidance" in k for k in original_state_dict)
|
||||
if has_guidance:
|
||||
converted_state_dict[
|
||||
f"time_text_embed.guidance_embedder.linear_1.{lora_key}.weight"
|
||||
] = original_state_dict.pop(f"guidance_in.in_layer.{lora_key}.weight")
|
||||
if f"guidance_in.in_layer.{lora_key}.bias" in original_state_dict_keys:
|
||||
converted_state_dict[
|
||||
f"time_text_embed.guidance_embedder.linear_1.{lora_key}.bias"
|
||||
] = original_state_dict.pop(f"guidance_in.in_layer.{lora_key}.bias")
|
||||
|
||||
converted_state_dict[
|
||||
f"time_text_embed.guidance_embedder.linear_2.{lora_key}.weight"
|
||||
] = original_state_dict.pop(f"guidance_in.out_layer.{lora_key}.weight")
|
||||
if f"guidance_in.out_layer.{lora_key}.bias" in original_state_dict_keys:
|
||||
converted_state_dict[
|
||||
f"time_text_embed.guidance_embedder.linear_2.{lora_key}.bias"
|
||||
] = original_state_dict.pop(f"guidance_in.out_layer.{lora_key}.bias")
|
||||
|
||||
# context_embedder
|
||||
converted_state_dict[f"context_embedder.{lora_key}.weight"] = original_state_dict.pop(
|
||||
f"txt_in.{lora_key}.weight"
|
||||
)
|
||||
if f"txt_in.{lora_key}.bias" in original_state_dict_keys:
|
||||
converted_state_dict[f"context_embedder.{lora_key}.bias"] = original_state_dict.pop(
|
||||
f"txt_in.{lora_key}.bias"
|
||||
)
|
||||
|
||||
# x_embedder
|
||||
converted_state_dict[f"x_embedder.{lora_key}.weight"] = original_state_dict.pop(f"img_in.{lora_key}.weight")
|
||||
if f"img_in.{lora_key}.bias" in original_state_dict_keys:
|
||||
converted_state_dict[f"x_embedder.{lora_key}.bias"] = original_state_dict.pop(f"img_in.{lora_key}.bias")
|
||||
|
||||
# double transformer blocks
|
||||
for i in range(num_layers):
|
||||
block_prefix = f"transformer_blocks.{i}."
|
||||
|
||||
for lora_key in ["lora_A", "lora_B"]:
|
||||
# norms
|
||||
converted_state_dict[f"{block_prefix}norm1.linear.{lora_key}.weight"] = original_state_dict.pop(
|
||||
f"double_blocks.{i}.img_mod.lin.{lora_key}.weight"
|
||||
)
|
||||
if f"double_blocks.{i}.img_mod.lin.{lora_key}.bias" in original_state_dict_keys:
|
||||
converted_state_dict[f"{block_prefix}norm1.linear.{lora_key}.bias"] = original_state_dict.pop(
|
||||
f"double_blocks.{i}.img_mod.lin.{lora_key}.bias"
|
||||
)
|
||||
|
||||
converted_state_dict[f"{block_prefix}norm1_context.linear.{lora_key}.weight"] = original_state_dict.pop(
|
||||
f"double_blocks.{i}.txt_mod.lin.{lora_key}.weight"
|
||||
)
|
||||
if f"double_blocks.{i}.txt_mod.lin.{lora_key}.bias" in original_state_dict_keys:
|
||||
converted_state_dict[f"{block_prefix}norm1_context.linear.{lora_key}.bias"] = original_state_dict.pop(
|
||||
f"double_blocks.{i}.txt_mod.lin.{lora_key}.bias"
|
||||
)
|
||||
|
||||
# Q, K, V
|
||||
if lora_key == "lora_A":
|
||||
sample_lora_weight = original_state_dict.pop(f"double_blocks.{i}.img_attn.qkv.{lora_key}.weight")
|
||||
converted_state_dict[f"{block_prefix}attn.to_v.{lora_key}.weight"] = torch.cat([sample_lora_weight])
|
||||
converted_state_dict[f"{block_prefix}attn.to_q.{lora_key}.weight"] = torch.cat([sample_lora_weight])
|
||||
converted_state_dict[f"{block_prefix}attn.to_k.{lora_key}.weight"] = torch.cat([sample_lora_weight])
|
||||
|
||||
context_lora_weight = original_state_dict.pop(f"double_blocks.{i}.txt_attn.qkv.{lora_key}.weight")
|
||||
converted_state_dict[f"{block_prefix}attn.add_q_proj.{lora_key}.weight"] = torch.cat(
|
||||
[context_lora_weight]
|
||||
)
|
||||
converted_state_dict[f"{block_prefix}attn.add_k_proj.{lora_key}.weight"] = torch.cat(
|
||||
[context_lora_weight]
|
||||
)
|
||||
converted_state_dict[f"{block_prefix}attn.add_v_proj.{lora_key}.weight"] = torch.cat(
|
||||
[context_lora_weight]
|
||||
)
|
||||
else:
|
||||
sample_q, sample_k, sample_v = torch.chunk(
|
||||
original_state_dict.pop(f"double_blocks.{i}.img_attn.qkv.{lora_key}.weight"), 3, dim=0
|
||||
)
|
||||
converted_state_dict[f"{block_prefix}attn.to_q.{lora_key}.weight"] = torch.cat([sample_q])
|
||||
converted_state_dict[f"{block_prefix}attn.to_k.{lora_key}.weight"] = torch.cat([sample_k])
|
||||
converted_state_dict[f"{block_prefix}attn.to_v.{lora_key}.weight"] = torch.cat([sample_v])
|
||||
|
||||
context_q, context_k, context_v = torch.chunk(
|
||||
original_state_dict.pop(f"double_blocks.{i}.txt_attn.qkv.{lora_key}.weight"), 3, dim=0
|
||||
)
|
||||
converted_state_dict[f"{block_prefix}attn.add_q_proj.{lora_key}.weight"] = torch.cat([context_q])
|
||||
converted_state_dict[f"{block_prefix}attn.add_k_proj.{lora_key}.weight"] = torch.cat([context_k])
|
||||
converted_state_dict[f"{block_prefix}attn.add_v_proj.{lora_key}.weight"] = torch.cat([context_v])
|
||||
|
||||
if f"double_blocks.{i}.img_attn.qkv.{lora_key}.bias" in original_state_dict_keys:
|
||||
sample_q_bias, sample_k_bias, sample_v_bias = torch.chunk(
|
||||
original_state_dict.pop(f"double_blocks.{i}.img_attn.qkv.{lora_key}.bias"), 3, dim=0
|
||||
)
|
||||
converted_state_dict[f"{block_prefix}attn.to_q.{lora_key}.bias"] = torch.cat([sample_q_bias])
|
||||
converted_state_dict[f"{block_prefix}attn.to_k.{lora_key}.bias"] = torch.cat([sample_k_bias])
|
||||
converted_state_dict[f"{block_prefix}attn.to_v.{lora_key}.bias"] = torch.cat([sample_v_bias])
|
||||
|
||||
if f"double_blocks.{i}.txt_attn.qkv.{lora_key}.bias" in original_state_dict_keys:
|
||||
context_q_bias, context_k_bias, context_v_bias = torch.chunk(
|
||||
original_state_dict.pop(f"double_blocks.{i}.txt_attn.qkv.{lora_key}.bias"), 3, dim=0
|
||||
)
|
||||
converted_state_dict[f"{block_prefix}attn.add_q_proj.{lora_key}.bias"] = torch.cat([context_q_bias])
|
||||
converted_state_dict[f"{block_prefix}attn.add_k_proj.{lora_key}.bias"] = torch.cat([context_k_bias])
|
||||
converted_state_dict[f"{block_prefix}attn.add_v_proj.{lora_key}.bias"] = torch.cat([context_v_bias])
|
||||
|
||||
# ff img_mlp
|
||||
converted_state_dict[f"{block_prefix}ff.net.0.proj.{lora_key}.weight"] = original_state_dict.pop(
|
||||
f"double_blocks.{i}.img_mlp.0.{lora_key}.weight"
|
||||
)
|
||||
if f"double_blocks.{i}.img_mlp.0.{lora_key}.bias" in original_state_dict_keys:
|
||||
converted_state_dict[f"{block_prefix}ff.net.0.proj.{lora_key}.bias"] = original_state_dict.pop(
|
||||
f"double_blocks.{i}.img_mlp.0.{lora_key}.bias"
|
||||
)
|
||||
|
||||
converted_state_dict[f"{block_prefix}ff.net.2.{lora_key}.weight"] = original_state_dict.pop(
|
||||
f"double_blocks.{i}.img_mlp.2.{lora_key}.weight"
|
||||
)
|
||||
if f"double_blocks.{i}.img_mlp.2.{lora_key}.bias" in original_state_dict_keys:
|
||||
converted_state_dict[f"{block_prefix}ff.net.2.{lora_key}.bias"] = original_state_dict.pop(
|
||||
f"double_blocks.{i}.img_mlp.2.{lora_key}.bias"
|
||||
)
|
||||
|
||||
converted_state_dict[f"{block_prefix}ff_context.net.0.proj.{lora_key}.weight"] = original_state_dict.pop(
|
||||
f"double_blocks.{i}.txt_mlp.0.{lora_key}.weight"
|
||||
)
|
||||
if f"double_blocks.{i}.txt_mlp.0.{lora_key}.bias" in original_state_dict_keys:
|
||||
converted_state_dict[f"{block_prefix}ff_context.net.0.proj.{lora_key}.bias"] = original_state_dict.pop(
|
||||
f"double_blocks.{i}.txt_mlp.0.{lora_key}.bias"
|
||||
)
|
||||
|
||||
converted_state_dict[f"{block_prefix}ff_context.net.2.{lora_key}.weight"] = original_state_dict.pop(
|
||||
f"double_blocks.{i}.txt_mlp.2.{lora_key}.weight"
|
||||
)
|
||||
if f"double_blocks.{i}.txt_mlp.2.{lora_key}.bias" in original_state_dict_keys:
|
||||
converted_state_dict[f"{block_prefix}ff_context.net.2.{lora_key}.bias"] = original_state_dict.pop(
|
||||
f"double_blocks.{i}.txt_mlp.2.{lora_key}.bias"
|
||||
)
|
||||
|
||||
# output projections.
|
||||
converted_state_dict[f"{block_prefix}attn.to_out.0.{lora_key}.weight"] = original_state_dict.pop(
|
||||
f"double_blocks.{i}.img_attn.proj.{lora_key}.weight"
|
||||
)
|
||||
if f"double_blocks.{i}.img_attn.proj.{lora_key}.bias" in original_state_dict_keys:
|
||||
converted_state_dict[f"{block_prefix}attn.to_out.0.{lora_key}.bias"] = original_state_dict.pop(
|
||||
f"double_blocks.{i}.img_attn.proj.{lora_key}.bias"
|
||||
)
|
||||
converted_state_dict[f"{block_prefix}attn.to_add_out.{lora_key}.weight"] = original_state_dict.pop(
|
||||
f"double_blocks.{i}.txt_attn.proj.{lora_key}.weight"
|
||||
)
|
||||
if f"double_blocks.{i}.txt_attn.proj.{lora_key}.bias" in original_state_dict_keys:
|
||||
converted_state_dict[f"{block_prefix}attn.to_add_out.{lora_key}.bias"] = original_state_dict.pop(
|
||||
f"double_blocks.{i}.txt_attn.proj.{lora_key}.bias"
|
||||
)
|
||||
|
||||
# qk_norm
|
||||
converted_state_dict[f"{block_prefix}attn.norm_q.weight"] = original_state_dict.pop(
|
||||
f"double_blocks.{i}.img_attn.norm.query_norm.scale"
|
||||
)
|
||||
converted_state_dict[f"{block_prefix}attn.norm_k.weight"] = original_state_dict.pop(
|
||||
f"double_blocks.{i}.img_attn.norm.key_norm.scale"
|
||||
)
|
||||
converted_state_dict[f"{block_prefix}attn.norm_added_q.weight"] = original_state_dict.pop(
|
||||
f"double_blocks.{i}.txt_attn.norm.query_norm.scale"
|
||||
)
|
||||
converted_state_dict[f"{block_prefix}attn.norm_added_k.weight"] = original_state_dict.pop(
|
||||
f"double_blocks.{i}.txt_attn.norm.key_norm.scale"
|
||||
)
|
||||
|
||||
# single transfomer blocks
|
||||
for i in range(num_single_layers):
|
||||
block_prefix = f"single_transformer_blocks.{i}."
|
||||
|
||||
for lora_key in ["lora_A", "lora_B"]:
|
||||
# norm.linear <- single_blocks.0.modulation.lin
|
||||
converted_state_dict[f"{block_prefix}norm.linear.{lora_key}.weight"] = original_state_dict.pop(
|
||||
f"single_blocks.{i}.modulation.lin.{lora_key}.weight"
|
||||
)
|
||||
if f"single_blocks.{i}.modulation.lin.{lora_key}.bias" in original_state_dict_keys:
|
||||
converted_state_dict[f"{block_prefix}norm.linear.{lora_key}.bias"] = original_state_dict.pop(
|
||||
f"single_blocks.{i}.modulation.lin.{lora_key}.bias"
|
||||
)
|
||||
|
||||
# Q, K, V, mlp
|
||||
mlp_hidden_dim = int(inner_dim * mlp_ratio)
|
||||
split_size = (inner_dim, inner_dim, inner_dim, mlp_hidden_dim)
|
||||
|
||||
if lora_key == "lora_A":
|
||||
lora_weight = original_state_dict.pop(f"single_blocks.{i}.linear1.{lora_key}.weight")
|
||||
converted_state_dict[f"{block_prefix}attn.to_q.{lora_key}.weight"] = torch.cat([lora_weight])
|
||||
converted_state_dict[f"{block_prefix}attn.to_k.{lora_key}.weight"] = torch.cat([lora_weight])
|
||||
converted_state_dict[f"{block_prefix}attn.to_v.{lora_key}.weight"] = torch.cat([lora_weight])
|
||||
converted_state_dict[f"{block_prefix}proj_mlp.{lora_key}.weight"] = torch.cat([lora_weight])
|
||||
|
||||
if f"single_blocks.{i}.linear1.{lora_key}.bias" in original_state_dict_keys:
|
||||
lora_bias = original_state_dict.pop(f"single_blocks.{i}.linear1.{lora_key}.bias")
|
||||
converted_state_dict[f"{block_prefix}attn.to_q.{lora_key}.bias"] = torch.cat([lora_bias])
|
||||
converted_state_dict[f"{block_prefix}attn.to_k.{lora_key}.bias"] = torch.cat([lora_bias])
|
||||
converted_state_dict[f"{block_prefix}attn.to_v.{lora_key}.bias"] = torch.cat([lora_bias])
|
||||
converted_state_dict[f"{block_prefix}proj_mlp.{lora_key}.bias"] = torch.cat([lora_bias])
|
||||
else:
|
||||
q, k, v, mlp = torch.split(
|
||||
original_state_dict.pop(f"single_blocks.{i}.linear1.{lora_key}.weight"), split_size, dim=0
|
||||
)
|
||||
converted_state_dict[f"{block_prefix}attn.to_q.{lora_key}.weight"] = torch.cat([q])
|
||||
converted_state_dict[f"{block_prefix}attn.to_k.{lora_key}.weight"] = torch.cat([k])
|
||||
converted_state_dict[f"{block_prefix}attn.to_v.{lora_key}.weight"] = torch.cat([v])
|
||||
converted_state_dict[f"{block_prefix}proj_mlp.{lora_key}.weight"] = torch.cat([mlp])
|
||||
|
||||
if f"single_blocks.{i}.linear1.{lora_key}.bias" in original_state_dict_keys:
|
||||
q_bias, k_bias, v_bias, mlp_bias = torch.split(
|
||||
original_state_dict.pop(f"single_blocks.{i}.linear1.{lora_key}.bias"), split_size, dim=0
|
||||
)
|
||||
converted_state_dict[f"{block_prefix}attn.to_q.{lora_key}.bias"] = torch.cat([q_bias])
|
||||
converted_state_dict[f"{block_prefix}attn.to_k.{lora_key}.bias"] = torch.cat([k_bias])
|
||||
converted_state_dict[f"{block_prefix}attn.to_v.{lora_key}.bias"] = torch.cat([v_bias])
|
||||
converted_state_dict[f"{block_prefix}proj_mlp.{lora_key}.bias"] = torch.cat([mlp_bias])
|
||||
|
||||
# output projections.
|
||||
converted_state_dict[f"{block_prefix}proj_out.{lora_key}.weight"] = original_state_dict.pop(
|
||||
f"single_blocks.{i}.linear2.{lora_key}.weight"
|
||||
)
|
||||
if f"single_blocks.{i}.linear2.{lora_key}.bias" in original_state_dict_keys:
|
||||
converted_state_dict[f"{block_prefix}proj_out.{lora_key}.bias"] = original_state_dict.pop(
|
||||
f"single_blocks.{i}.linear2.{lora_key}.bias"
|
||||
)
|
||||
|
||||
# qk norm
|
||||
converted_state_dict[f"{block_prefix}attn.norm_q.weight"] = original_state_dict.pop(
|
||||
f"single_blocks.{i}.norm.query_norm.scale"
|
||||
)
|
||||
converted_state_dict[f"{block_prefix}attn.norm_k.weight"] = original_state_dict.pop(
|
||||
f"single_blocks.{i}.norm.key_norm.scale"
|
||||
)
|
||||
|
||||
for lora_key in ["lora_A", "lora_B"]:
|
||||
converted_state_dict[f"proj_out.{lora_key}.weight"] = original_state_dict.pop(
|
||||
f"final_layer.linear.{lora_key}.weight"
|
||||
)
|
||||
if f"final_layer.linear.{lora_key}.bias" in original_state_dict_keys:
|
||||
converted_state_dict[f"proj_out.{lora_key}.bias"] = original_state_dict.pop(
|
||||
f"final_layer.linear.{lora_key}.bias"
|
||||
)
|
||||
|
||||
converted_state_dict[f"norm_out.linear.{lora_key}.weight"] = swap_scale_shift(
|
||||
original_state_dict.pop(f"final_layer.adaLN_modulation.1.{lora_key}.weight")
|
||||
)
|
||||
if f"final_layer.adaLN_modulation.1.{lora_key}.bias" in original_state_dict_keys:
|
||||
converted_state_dict[f"norm_out.linear.{lora_key}.bias"] = swap_scale_shift(
|
||||
original_state_dict.pop(f"final_layer.adaLN_modulation.1.{lora_key}.bias")
|
||||
)
|
||||
|
||||
if len(original_state_dict) > 0:
|
||||
raise ValueError(f"`original_state_dict` should be empty at this point but has {original_state_dict.keys()=}.")
|
||||
|
||||
for key in list(converted_state_dict.keys()):
|
||||
converted_state_dict[f"transformer.{key}"] = converted_state_dict.pop(key)
|
||||
|
||||
return converted_state_dict
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
from typing import Callable, Dict, List, Optional, Union
|
||||
|
||||
@@ -35,7 +34,6 @@ from ..utils import (
|
||||
)
|
||||
from .lora_base import LORA_WEIGHT_NAME, LORA_WEIGHT_NAME_SAFE, LoraBaseMixin, _fetch_state_dict # noqa
|
||||
from .lora_conversion_utils import (
|
||||
_convert_bfl_flux_control_lora_to_diffusers,
|
||||
_convert_kohya_flux_lora_to_diffusers,
|
||||
_convert_non_diffusers_lora_to_diffusers,
|
||||
_convert_xlabs_flux_lora_to_diffusers,
|
||||
@@ -63,8 +61,6 @@ TEXT_ENCODER_NAME = "text_encoder"
|
||||
UNET_NAME = "unet"
|
||||
TRANSFORMER_NAME = "transformer"
|
||||
|
||||
_MODULE_NAME_TO_ATTRIBUTE_MAP_FLUX = {"x_embedder": "in_channels"}
|
||||
|
||||
|
||||
class StableDiffusionLoraLoaderMixin(LoraBaseMixin):
|
||||
r"""
|
||||
@@ -412,7 +408,6 @@ class StableDiffusionLoraLoaderMixin(LoraBaseMixin):
|
||||
}
|
||||
|
||||
lora_config_kwargs = get_peft_kwargs(rank, network_alphas, text_encoder_lora_state_dict, is_unet=False)
|
||||
|
||||
if "use_dora" in lora_config_kwargs:
|
||||
if lora_config_kwargs["use_dora"]:
|
||||
if is_peft_version("<", "0.9.0"):
|
||||
@@ -422,17 +417,6 @@ class StableDiffusionLoraLoaderMixin(LoraBaseMixin):
|
||||
else:
|
||||
if is_peft_version("<", "0.9.0"):
|
||||
lora_config_kwargs.pop("use_dora")
|
||||
|
||||
if "lora_bias" in lora_config_kwargs:
|
||||
if lora_config_kwargs["lora_bias"]:
|
||||
if is_peft_version("<=", "0.13.2"):
|
||||
raise ValueError(
|
||||
"You need `peft` 0.14.0 at least to use `bias` in LoRAs. Please upgrade your installation of `peft`."
|
||||
)
|
||||
else:
|
||||
if is_peft_version("<=", "0.13.2"):
|
||||
lora_config_kwargs.pop("lora_bias")
|
||||
|
||||
lora_config = LoraConfig(**lora_config_kwargs)
|
||||
|
||||
# adapter_name
|
||||
@@ -955,7 +939,6 @@ class StableDiffusionXLLoraLoaderMixin(LoraBaseMixin):
|
||||
}
|
||||
|
||||
lora_config_kwargs = get_peft_kwargs(rank, network_alphas, text_encoder_lora_state_dict, is_unet=False)
|
||||
|
||||
if "use_dora" in lora_config_kwargs:
|
||||
if lora_config_kwargs["use_dora"]:
|
||||
if is_peft_version("<", "0.9.0"):
|
||||
@@ -965,17 +948,6 @@ class StableDiffusionXLLoraLoaderMixin(LoraBaseMixin):
|
||||
else:
|
||||
if is_peft_version("<", "0.9.0"):
|
||||
lora_config_kwargs.pop("use_dora")
|
||||
|
||||
if "lora_bias" in lora_config_kwargs:
|
||||
if lora_config_kwargs["lora_bias"]:
|
||||
if is_peft_version("<=", "0.13.2"):
|
||||
raise ValueError(
|
||||
"You need `peft` 0.14.0 at least to use `bias` in LoRAs. Please upgrade your installation of `peft`."
|
||||
)
|
||||
else:
|
||||
if is_peft_version("<=", "0.13.2"):
|
||||
lora_config_kwargs.pop("lora_bias")
|
||||
|
||||
lora_config = LoraConfig(**lora_config_kwargs)
|
||||
|
||||
# adapter_name
|
||||
@@ -1464,7 +1436,6 @@ class SD3LoraLoaderMixin(LoraBaseMixin):
|
||||
}
|
||||
|
||||
lora_config_kwargs = get_peft_kwargs(rank, network_alphas, text_encoder_lora_state_dict, is_unet=False)
|
||||
|
||||
if "use_dora" in lora_config_kwargs:
|
||||
if lora_config_kwargs["use_dora"]:
|
||||
if is_peft_version("<", "0.9.0"):
|
||||
@@ -1474,17 +1445,6 @@ class SD3LoraLoaderMixin(LoraBaseMixin):
|
||||
else:
|
||||
if is_peft_version("<", "0.9.0"):
|
||||
lora_config_kwargs.pop("use_dora")
|
||||
|
||||
if "lora_bias" in lora_config_kwargs:
|
||||
if lora_config_kwargs["lora_bias"]:
|
||||
if is_peft_version("<=", "0.13.2"):
|
||||
raise ValueError(
|
||||
"You need `peft` 0.14.0 at least to use `bias` in LoRAs. Please upgrade your installation of `peft`."
|
||||
)
|
||||
else:
|
||||
if is_peft_version("<=", "0.13.2"):
|
||||
lora_config_kwargs.pop("lora_bias")
|
||||
|
||||
lora_config = LoraConfig(**lora_config_kwargs)
|
||||
|
||||
# adapter_name
|
||||
@@ -1652,7 +1612,6 @@ class FluxLoraLoaderMixin(LoraBaseMixin):
|
||||
_lora_loadable_modules = ["transformer", "text_encoder"]
|
||||
transformer_name = TRANSFORMER_NAME
|
||||
text_encoder_name = TEXT_ENCODER_NAME
|
||||
_control_lora_supported_norm_keys = ["norm_q", "norm_k", "norm_added_q", "norm_added_k"]
|
||||
|
||||
@classmethod
|
||||
@validate_hf_hub_args
|
||||
@@ -1762,11 +1721,6 @@ class FluxLoraLoaderMixin(LoraBaseMixin):
|
||||
# xlabs doesn't use `alpha`.
|
||||
return (state_dict, None) if return_alphas else state_dict
|
||||
|
||||
is_bfl_control = any("query_norm.scale" in k for k in state_dict)
|
||||
if is_bfl_control:
|
||||
state_dict = _convert_bfl_flux_control_lora_to_diffusers(state_dict)
|
||||
return (state_dict, None) if return_alphas else state_dict
|
||||
|
||||
# For state dicts like
|
||||
# https://huggingface.co/TheLastBen/Jon_Snow_Flux_LoRA
|
||||
keys = list(state_dict.keys())
|
||||
@@ -1833,54 +1787,23 @@ class FluxLoraLoaderMixin(LoraBaseMixin):
|
||||
pretrained_model_name_or_path_or_dict, return_alphas=True, **kwargs
|
||||
)
|
||||
|
||||
has_lora_keys = any("lora" in key for key in state_dict.keys())
|
||||
|
||||
# Flux Control LoRAs also have norm keys
|
||||
has_norm_keys = any(
|
||||
norm_key in key for key in state_dict.keys() for norm_key in self._control_lora_supported_norm_keys
|
||||
)
|
||||
|
||||
if not (has_lora_keys or has_norm_keys):
|
||||
is_correct_format = all("lora" in key for key in state_dict.keys())
|
||||
if not is_correct_format:
|
||||
raise ValueError("Invalid LoRA checkpoint.")
|
||||
|
||||
transformer_lora_state_dict = {
|
||||
k: state_dict.pop(k) for k in list(state_dict.keys()) if "transformer." in k and "lora" in k
|
||||
}
|
||||
transformer_norm_state_dict = {
|
||||
k: state_dict.pop(k)
|
||||
for k in list(state_dict.keys())
|
||||
if "transformer." in k and any(norm_key in k for norm_key in self._control_lora_supported_norm_keys)
|
||||
}
|
||||
|
||||
transformer = getattr(self, self.transformer_name) if not hasattr(self, "transformer") else self.transformer
|
||||
has_param_with_expanded_shape = self._maybe_expand_transformer_param_shape_or_error_(
|
||||
transformer, transformer_lora_state_dict, transformer_norm_state_dict
|
||||
)
|
||||
|
||||
if has_param_with_expanded_shape:
|
||||
logger.info(
|
||||
"The LoRA weights contain parameters that have different shapes that expected by the transformer. "
|
||||
"As a result, the state_dict of the transformer has been expanded to match the LoRA parameter shapes. "
|
||||
"To get a comprehensive list of parameter names that were modified, enable debug logging."
|
||||
)
|
||||
|
||||
if len(transformer_lora_state_dict) > 0:
|
||||
transformer_state_dict = {k: v for k, v in state_dict.items() if "transformer." in k}
|
||||
if len(transformer_state_dict) > 0:
|
||||
self.load_lora_into_transformer(
|
||||
transformer_lora_state_dict,
|
||||
state_dict,
|
||||
network_alphas=network_alphas,
|
||||
transformer=transformer,
|
||||
transformer=getattr(self, self.transformer_name)
|
||||
if not hasattr(self, "transformer")
|
||||
else self.transformer,
|
||||
adapter_name=adapter_name,
|
||||
_pipeline=self,
|
||||
low_cpu_mem_usage=low_cpu_mem_usage,
|
||||
)
|
||||
|
||||
if len(transformer_norm_state_dict) > 0:
|
||||
transformer._transformer_norm_layers = self._load_norm_into_transformer(
|
||||
transformer_norm_state_dict,
|
||||
transformer=transformer,
|
||||
discard_original_layers=False,
|
||||
)
|
||||
|
||||
text_encoder_state_dict = {k: v for k, v in state_dict.items() if "text_encoder." in k}
|
||||
if len(text_encoder_state_dict) > 0:
|
||||
self.load_lora_into_text_encoder(
|
||||
@@ -1937,60 +1860,6 @@ class FluxLoraLoaderMixin(LoraBaseMixin):
|
||||
low_cpu_mem_usage=low_cpu_mem_usage,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def _load_norm_into_transformer(
|
||||
cls,
|
||||
state_dict,
|
||||
transformer,
|
||||
prefix=None,
|
||||
discard_original_layers=False,
|
||||
) -> Dict[str, torch.Tensor]:
|
||||
# Remove prefix if present
|
||||
prefix = prefix or cls.transformer_name
|
||||
for key in list(state_dict.keys()):
|
||||
if key.split(".")[0] == prefix:
|
||||
state_dict[key[len(f"{prefix}.") :]] = state_dict.pop(key)
|
||||
|
||||
# Find invalid keys
|
||||
transformer_state_dict = transformer.state_dict()
|
||||
transformer_keys = set(transformer_state_dict.keys())
|
||||
state_dict_keys = set(state_dict.keys())
|
||||
extra_keys = list(state_dict_keys - transformer_keys)
|
||||
|
||||
if extra_keys:
|
||||
logger.warning(
|
||||
f"Unsupported keys found in state dict when trying to load normalization layers into the transformer. The following keys will be ignored:\n{extra_keys}."
|
||||
)
|
||||
|
||||
for key in extra_keys:
|
||||
state_dict.pop(key)
|
||||
|
||||
# Save the layers that are going to be overwritten so that unload_lora_weights can work as expected
|
||||
overwritten_layers_state_dict = {}
|
||||
if not discard_original_layers:
|
||||
for key in state_dict.keys():
|
||||
overwritten_layers_state_dict[key] = transformer_state_dict[key].clone()
|
||||
|
||||
logger.info(
|
||||
"The provided state dict contains normalization layers in addition to LoRA layers. The normalization layers will directly update the state_dict of the transformer "
|
||||
'as opposed to the LoRA layers that will co-exist separately until the "fuse_lora()" method is called. That is to say, the normalization layers will always be directly '
|
||||
"fused into the transformer and can only be unfused if `discard_original_layers=True` is passed. This might also have implications when dealing with multiple LoRAs. "
|
||||
"If you notice something unexpected, please open an issue: https://github.com/huggingface/diffusers/issues."
|
||||
)
|
||||
|
||||
# We can't load with strict=True because the current state_dict does not contain all the transformer keys
|
||||
incompatible_keys = transformer.load_state_dict(state_dict, strict=False)
|
||||
unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
|
||||
|
||||
# We shouldn't expect to see the supported norm keys here being present in the unexpected keys.
|
||||
if unexpected_keys:
|
||||
if any(norm_key in k for k in unexpected_keys for norm_key in cls._control_lora_supported_norm_keys):
|
||||
raise ValueError(
|
||||
f"Found {unexpected_keys} as unexpected keys while trying to load norm layers into the transformer."
|
||||
)
|
||||
|
||||
return overwritten_layers_state_dict
|
||||
|
||||
@classmethod
|
||||
# Copied from diffusers.loaders.lora_pipeline.StableDiffusionLoraLoaderMixin.load_lora_into_text_encoder
|
||||
def load_lora_into_text_encoder(
|
||||
@@ -2093,7 +1962,6 @@ class FluxLoraLoaderMixin(LoraBaseMixin):
|
||||
}
|
||||
|
||||
lora_config_kwargs = get_peft_kwargs(rank, network_alphas, text_encoder_lora_state_dict, is_unet=False)
|
||||
|
||||
if "use_dora" in lora_config_kwargs:
|
||||
if lora_config_kwargs["use_dora"]:
|
||||
if is_peft_version("<", "0.9.0"):
|
||||
@@ -2103,17 +1971,6 @@ class FluxLoraLoaderMixin(LoraBaseMixin):
|
||||
else:
|
||||
if is_peft_version("<", "0.9.0"):
|
||||
lora_config_kwargs.pop("use_dora")
|
||||
|
||||
if "lora_bias" in lora_config_kwargs:
|
||||
if lora_config_kwargs["lora_bias"]:
|
||||
if is_peft_version("<=", "0.13.2"):
|
||||
raise ValueError(
|
||||
"You need `peft` 0.14.0 at least to use `bias` in LoRAs. Please upgrade your installation of `peft`."
|
||||
)
|
||||
else:
|
||||
if is_peft_version("<=", "0.13.2"):
|
||||
lora_config_kwargs.pop("lora_bias")
|
||||
|
||||
lora_config = LoraConfig(**lora_config_kwargs)
|
||||
|
||||
# adapter_name
|
||||
@@ -2198,6 +2055,7 @@ class FluxLoraLoaderMixin(LoraBaseMixin):
|
||||
safe_serialization=safe_serialization,
|
||||
)
|
||||
|
||||
# Copied from diffusers.loaders.lora_pipeline.StableDiffusionLoraLoaderMixin.fuse_lora with unet->transformer
|
||||
def fuse_lora(
|
||||
self,
|
||||
components: List[str] = ["transformer", "text_encoder"],
|
||||
@@ -2237,19 +2095,6 @@ class FluxLoraLoaderMixin(LoraBaseMixin):
|
||||
pipeline.fuse_lora(lora_scale=0.7)
|
||||
```
|
||||
"""
|
||||
|
||||
transformer = getattr(self, self.transformer_name) if not hasattr(self, "transformer") else self.transformer
|
||||
if (
|
||||
hasattr(transformer, "_transformer_norm_layers")
|
||||
and isinstance(transformer._transformer_norm_layers, dict)
|
||||
and len(transformer._transformer_norm_layers.keys()) > 0
|
||||
):
|
||||
logger.info(
|
||||
"The provided state dict contains normalization layers in addition to LoRA layers. The normalization layers will be directly updated the state_dict of the transformer "
|
||||
"as opposed to the LoRA layers that will co-exist separately until the 'fuse_lora()' method is called. That is to say, the normalization layers will always be directly "
|
||||
"fused into the transformer and can only be unfused if `discard_original_layers=True` is passed."
|
||||
)
|
||||
|
||||
super().fuse_lora(
|
||||
components=components, lora_scale=lora_scale, safe_fusing=safe_fusing, adapter_names=adapter_names
|
||||
)
|
||||
@@ -2268,118 +2113,8 @@ class FluxLoraLoaderMixin(LoraBaseMixin):
|
||||
Args:
|
||||
components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from.
|
||||
"""
|
||||
transformer = getattr(self, self.transformer_name) if not hasattr(self, "transformer") else self.transformer
|
||||
if hasattr(transformer, "_transformer_norm_layers") and transformer._transformer_norm_layers:
|
||||
transformer.load_state_dict(transformer._transformer_norm_layers, strict=False)
|
||||
|
||||
super().unfuse_lora(components=components)
|
||||
|
||||
# We override this here account for `_transformer_norm_layers`.
|
||||
def unload_lora_weights(self):
|
||||
super().unload_lora_weights()
|
||||
|
||||
transformer = getattr(self, self.transformer_name) if not hasattr(self, "transformer") else self.transformer
|
||||
if hasattr(transformer, "_transformer_norm_layers") and transformer._transformer_norm_layers:
|
||||
transformer.load_state_dict(transformer._transformer_norm_layers, strict=False)
|
||||
transformer._transformer_norm_layers = None
|
||||
|
||||
@classmethod
|
||||
def _maybe_expand_transformer_param_shape_or_error_(
|
||||
cls,
|
||||
transformer: torch.nn.Module,
|
||||
lora_state_dict=None,
|
||||
norm_state_dict=None,
|
||||
prefix=None,
|
||||
) -> bool:
|
||||
"""
|
||||
Control LoRA expands the shape of the input layer from (3072, 64) to (3072, 128). This method handles that and
|
||||
generalizes things a bit so that any parameter that needs expansion receives appropriate treatement.
|
||||
"""
|
||||
state_dict = {}
|
||||
if lora_state_dict is not None:
|
||||
state_dict.update(lora_state_dict)
|
||||
if norm_state_dict is not None:
|
||||
state_dict.update(norm_state_dict)
|
||||
|
||||
# Remove prefix if present
|
||||
prefix = prefix or cls.transformer_name
|
||||
for key in list(state_dict.keys()):
|
||||
if key.split(".")[0] == prefix:
|
||||
state_dict[key[len(f"{prefix}.") :]] = state_dict.pop(key)
|
||||
|
||||
# Expand transformer parameter shapes if they don't match lora
|
||||
has_param_with_shape_update = False
|
||||
|
||||
for name, module in transformer.named_modules():
|
||||
if isinstance(module, torch.nn.Linear):
|
||||
module_weight = module.weight.data
|
||||
module_bias = module.bias.data if module.bias is not None else None
|
||||
bias = module_bias is not None
|
||||
|
||||
lora_A_weight_name = f"{name}.lora_A.weight"
|
||||
lora_B_weight_name = f"{name}.lora_B.weight"
|
||||
if lora_A_weight_name not in state_dict.keys():
|
||||
continue
|
||||
|
||||
in_features = state_dict[lora_A_weight_name].shape[1]
|
||||
out_features = state_dict[lora_B_weight_name].shape[0]
|
||||
|
||||
# This means there's no need for an expansion in the params, so we simply skip.
|
||||
if tuple(module_weight.shape) == (out_features, in_features):
|
||||
continue
|
||||
|
||||
module_out_features, module_in_features = module_weight.shape
|
||||
if out_features < module_out_features or in_features < module_in_features:
|
||||
raise NotImplementedError(
|
||||
f"Only LoRAs with input/output features higher than the current module's input/output features "
|
||||
f"are currently supported. The provided LoRA contains {in_features=} and {out_features=}, which "
|
||||
f"are lower than {module_in_features=} and {module_out_features=}. If you require support for "
|
||||
f"this please open an issue at https://github.com/huggingface/diffusers/issues."
|
||||
)
|
||||
|
||||
debug_message = (
|
||||
f'Expanding the nn.Linear input/output features for module="{name}" because the provided LoRA '
|
||||
f"checkpoint contains higher number of features than expected. The number of input_features will be "
|
||||
f"expanded from {module_in_features} to {in_features}"
|
||||
)
|
||||
if module_out_features != out_features:
|
||||
debug_message += (
|
||||
", and the number of output features will be "
|
||||
f"expanded from {module_out_features} to {out_features}."
|
||||
)
|
||||
else:
|
||||
debug_message += "."
|
||||
logger.debug(debug_message)
|
||||
|
||||
has_param_with_shape_update = True
|
||||
parent_module_name, _, current_module_name = name.rpartition(".")
|
||||
parent_module = transformer.get_submodule(parent_module_name)
|
||||
|
||||
# TODO: consider initializing this under meta device for optims.
|
||||
expanded_module = torch.nn.Linear(
|
||||
in_features, out_features, bias=bias, device=module_weight.device, dtype=module_weight.dtype
|
||||
)
|
||||
# Only weights are expanded and biases are not.
|
||||
new_weight = torch.zeros_like(
|
||||
expanded_module.weight.data, device=module_weight.device, dtype=module_weight.dtype
|
||||
)
|
||||
slices = tuple(slice(0, dim) for dim in module_weight.shape)
|
||||
new_weight[slices] = module_weight
|
||||
expanded_module.weight.data.copy_(new_weight)
|
||||
if module_bias is not None:
|
||||
expanded_module.bias.data.copy_(module_bias)
|
||||
|
||||
setattr(parent_module, current_module_name, expanded_module)
|
||||
|
||||
if current_module_name in _MODULE_NAME_TO_ATTRIBUTE_MAP_FLUX:
|
||||
attribute_name = _MODULE_NAME_TO_ATTRIBUTE_MAP_FLUX[current_module_name]
|
||||
new_value = int(expanded_module.weight.data.shape[1])
|
||||
old_value = getattr(transformer.config, attribute_name)
|
||||
setattr(transformer.config, attribute_name, new_value)
|
||||
logger.info(f"Set the {attribute_name} attribute of the model to {new_value} from {old_value}.")
|
||||
|
||||
return has_param_with_shape_update
|
||||
|
||||
|
||||
# The reason why we subclass from `StableDiffusionLoraLoaderMixin` here is because Amused initially
|
||||
# relied on `StableDiffusionLoraLoaderMixin` for its LoRA support.
|
||||
@@ -2534,7 +2269,6 @@ class AmusedLoraLoaderMixin(StableDiffusionLoraLoaderMixin):
|
||||
}
|
||||
|
||||
lora_config_kwargs = get_peft_kwargs(rank, network_alphas, text_encoder_lora_state_dict, is_unet=False)
|
||||
|
||||
if "use_dora" in lora_config_kwargs:
|
||||
if lora_config_kwargs["use_dora"]:
|
||||
if is_peft_version("<", "0.9.0"):
|
||||
@@ -2544,17 +2278,6 @@ class AmusedLoraLoaderMixin(StableDiffusionLoraLoaderMixin):
|
||||
else:
|
||||
if is_peft_version("<", "0.9.0"):
|
||||
lora_config_kwargs.pop("use_dora")
|
||||
|
||||
if "lora_bias" in lora_config_kwargs:
|
||||
if lora_config_kwargs["lora_bias"]:
|
||||
if is_peft_version("<=", "0.13.2"):
|
||||
raise ValueError(
|
||||
"You need `peft` 0.14.0 at least to use `bias` in LoRAs. Please upgrade your installation of `peft`."
|
||||
)
|
||||
else:
|
||||
if is_peft_version("<=", "0.13.2"):
|
||||
lora_config_kwargs.pop("lora_bias")
|
||||
|
||||
lora_config = LoraConfig(**lora_config_kwargs)
|
||||
|
||||
# adapter_name
|
||||
@@ -2641,7 +2364,7 @@ class AmusedLoraLoaderMixin(StableDiffusionLoraLoaderMixin):
|
||||
|
||||
class CogVideoXLoraLoaderMixin(LoraBaseMixin):
|
||||
r"""
|
||||
Load LoRA layers into [`CogVideoXTransformer3DModel`]. Specific to [`CogVideoXPipeline`].
|
||||
Load LoRA layers into [`CogVideoXTransformer3DModel`]. Specific to [`CogVideoX`].
|
||||
"""
|
||||
|
||||
_lora_loadable_modules = ["transformer"]
|
||||
@@ -2946,314 +2669,6 @@ class CogVideoXLoraLoaderMixin(LoraBaseMixin):
|
||||
super().unfuse_lora(components=components)
|
||||
|
||||
|
||||
class Mochi1LoraLoaderMixin(LoraBaseMixin):
|
||||
r"""
|
||||
Load LoRA layers into [`MochiTransformer3DModel`]. Specific to [`MochiPipeline`].
|
||||
"""
|
||||
|
||||
_lora_loadable_modules = ["transformer"]
|
||||
transformer_name = TRANSFORMER_NAME
|
||||
|
||||
@classmethod
|
||||
@validate_hf_hub_args
|
||||
# Copied from diffusers.loaders.lora_pipeline.SD3LoraLoaderMixin.lora_state_dict
|
||||
def lora_state_dict(
|
||||
cls,
|
||||
pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
Return state dict for lora weights and the network alphas.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
We support loading A1111 formatted LoRA checkpoints in a limited capacity.
|
||||
|
||||
This function is experimental and might change in the future.
|
||||
|
||||
</Tip>
|
||||
|
||||
Parameters:
|
||||
pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`):
|
||||
Can be either:
|
||||
|
||||
- A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
|
||||
the Hub.
|
||||
- A path to a *directory* (for example `./my_model_directory`) containing the model weights saved
|
||||
with [`ModelMixin.save_pretrained`].
|
||||
- A [torch state
|
||||
dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict).
|
||||
|
||||
cache_dir (`Union[str, os.PathLike]`, *optional*):
|
||||
Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
|
||||
is not used.
|
||||
force_download (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
|
||||
cached versions if they exist.
|
||||
|
||||
proxies (`Dict[str, str]`, *optional*):
|
||||
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
|
||||
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
|
||||
local_files_only (`bool`, *optional*, defaults to `False`):
|
||||
Whether to only load local model weights and configuration files or not. If set to `True`, the model
|
||||
won't be downloaded from the Hub.
|
||||
token (`str` or *bool*, *optional*):
|
||||
The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
|
||||
`diffusers-cli login` (stored in `~/.huggingface`) is used.
|
||||
revision (`str`, *optional*, defaults to `"main"`):
|
||||
The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
|
||||
allowed by Git.
|
||||
subfolder (`str`, *optional*, defaults to `""`):
|
||||
The subfolder location of a model file within a larger model repository on the Hub or locally.
|
||||
|
||||
"""
|
||||
# Load the main state dict first which has the LoRA layers for either of
|
||||
# transformer and text encoder or both.
|
||||
cache_dir = kwargs.pop("cache_dir", None)
|
||||
force_download = kwargs.pop("force_download", False)
|
||||
proxies = kwargs.pop("proxies", None)
|
||||
local_files_only = kwargs.pop("local_files_only", None)
|
||||
token = kwargs.pop("token", None)
|
||||
revision = kwargs.pop("revision", None)
|
||||
subfolder = kwargs.pop("subfolder", None)
|
||||
weight_name = kwargs.pop("weight_name", None)
|
||||
use_safetensors = kwargs.pop("use_safetensors", None)
|
||||
|
||||
allow_pickle = False
|
||||
if use_safetensors is None:
|
||||
use_safetensors = True
|
||||
allow_pickle = True
|
||||
|
||||
user_agent = {
|
||||
"file_type": "attn_procs_weights",
|
||||
"framework": "pytorch",
|
||||
}
|
||||
|
||||
state_dict = _fetch_state_dict(
|
||||
pretrained_model_name_or_path_or_dict=pretrained_model_name_or_path_or_dict,
|
||||
weight_name=weight_name,
|
||||
use_safetensors=use_safetensors,
|
||||
local_files_only=local_files_only,
|
||||
cache_dir=cache_dir,
|
||||
force_download=force_download,
|
||||
proxies=proxies,
|
||||
token=token,
|
||||
revision=revision,
|
||||
subfolder=subfolder,
|
||||
user_agent=user_agent,
|
||||
allow_pickle=allow_pickle,
|
||||
)
|
||||
|
||||
is_dora_scale_present = any("dora_scale" in k for k in state_dict)
|
||||
if is_dora_scale_present:
|
||||
warn_msg = "It seems like you are using a DoRA checkpoint that is not compatible in Diffusers at the moment. So, we are going to filter out the keys associated to 'dora_scale` from the state dict. If you think this is a mistake please open an issue https://github.com/huggingface/diffusers/issues/new."
|
||||
logger.warning(warn_msg)
|
||||
state_dict = {k: v for k, v in state_dict.items() if "dora_scale" not in k}
|
||||
|
||||
return state_dict
|
||||
|
||||
# Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.load_lora_weights
|
||||
def load_lora_weights(
|
||||
self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], adapter_name=None, **kwargs
|
||||
):
|
||||
"""
|
||||
Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.transformer` and
|
||||
`self.text_encoder`. All kwargs are forwarded to `self.lora_state_dict`. See
|
||||
[`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is loaded.
|
||||
See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_transformer`] for more details on how the state
|
||||
dict is loaded into `self.transformer`.
|
||||
|
||||
Parameters:
|
||||
pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`):
|
||||
See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`].
|
||||
adapter_name (`str`, *optional*):
|
||||
Adapter name to be used for referencing the loaded adapter model. If not specified, it will use
|
||||
`default_{i}` where i is the total number of adapters being loaded.
|
||||
low_cpu_mem_usage (`bool`, *optional*):
|
||||
Speed up model loading by only loading the pretrained LoRA weights and not initializing the random
|
||||
weights.
|
||||
kwargs (`dict`, *optional*):
|
||||
See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`].
|
||||
"""
|
||||
if not USE_PEFT_BACKEND:
|
||||
raise ValueError("PEFT backend is required for this method.")
|
||||
|
||||
low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT_LORA)
|
||||
if low_cpu_mem_usage and is_peft_version("<", "0.13.0"):
|
||||
raise ValueError(
|
||||
"`low_cpu_mem_usage=True` is not compatible with this `peft` version. Please update it with `pip install -U peft`."
|
||||
)
|
||||
|
||||
# if a dict is passed, copy it instead of modifying it inplace
|
||||
if isinstance(pretrained_model_name_or_path_or_dict, dict):
|
||||
pretrained_model_name_or_path_or_dict = pretrained_model_name_or_path_or_dict.copy()
|
||||
|
||||
# First, ensure that the checkpoint is a compatible one and can be successfully loaded.
|
||||
state_dict = self.lora_state_dict(pretrained_model_name_or_path_or_dict, **kwargs)
|
||||
|
||||
is_correct_format = all("lora" in key for key in state_dict.keys())
|
||||
if not is_correct_format:
|
||||
raise ValueError("Invalid LoRA checkpoint.")
|
||||
|
||||
self.load_lora_into_transformer(
|
||||
state_dict,
|
||||
transformer=getattr(self, self.transformer_name) if not hasattr(self, "transformer") else self.transformer,
|
||||
adapter_name=adapter_name,
|
||||
_pipeline=self,
|
||||
low_cpu_mem_usage=low_cpu_mem_usage,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
# Copied from diffusers.loaders.lora_pipeline.SD3LoraLoaderMixin.load_lora_into_transformer with SD3Transformer2DModel->CogVideoXTransformer3DModel
|
||||
def load_lora_into_transformer(
|
||||
cls, state_dict, transformer, adapter_name=None, _pipeline=None, low_cpu_mem_usage=False
|
||||
):
|
||||
"""
|
||||
This will load the LoRA layers specified in `state_dict` into `transformer`.
|
||||
|
||||
Parameters:
|
||||
state_dict (`dict`):
|
||||
A standard state dict containing the lora layer parameters. The keys can either be indexed directly
|
||||
into the unet or prefixed with an additional `unet` which can be used to distinguish between text
|
||||
encoder lora layers.
|
||||
transformer (`CogVideoXTransformer3DModel`):
|
||||
The Transformer model to load the LoRA layers into.
|
||||
adapter_name (`str`, *optional*):
|
||||
Adapter name to be used for referencing the loaded adapter model. If not specified, it will use
|
||||
`default_{i}` where i is the total number of adapters being loaded.
|
||||
low_cpu_mem_usage (`bool`, *optional*):
|
||||
Speed up model loading by only loading the pretrained LoRA weights and not initializing the random
|
||||
weights.
|
||||
"""
|
||||
if low_cpu_mem_usage and is_peft_version("<", "0.13.0"):
|
||||
raise ValueError(
|
||||
"`low_cpu_mem_usage=True` is not compatible with this `peft` version. Please update it with `pip install -U peft`."
|
||||
)
|
||||
|
||||
# Load the layers corresponding to transformer.
|
||||
logger.info(f"Loading {cls.transformer_name}.")
|
||||
transformer.load_lora_adapter(
|
||||
state_dict,
|
||||
network_alphas=None,
|
||||
adapter_name=adapter_name,
|
||||
_pipeline=_pipeline,
|
||||
low_cpu_mem_usage=low_cpu_mem_usage,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
# Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.save_lora_weights
|
||||
def save_lora_weights(
|
||||
cls,
|
||||
save_directory: Union[str, os.PathLike],
|
||||
transformer_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None,
|
||||
is_main_process: bool = True,
|
||||
weight_name: str = None,
|
||||
save_function: Callable = None,
|
||||
safe_serialization: bool = True,
|
||||
):
|
||||
r"""
|
||||
Save the LoRA parameters corresponding to the UNet and text encoder.
|
||||
|
||||
Arguments:
|
||||
save_directory (`str` or `os.PathLike`):
|
||||
Directory to save LoRA parameters to. Will be created if it doesn't exist.
|
||||
transformer_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`):
|
||||
State dict of the LoRA layers corresponding to the `transformer`.
|
||||
is_main_process (`bool`, *optional*, defaults to `True`):
|
||||
Whether the process calling this is the main process or not. Useful during distributed training and you
|
||||
need to call this function on all processes. In this case, set `is_main_process=True` only on the main
|
||||
process to avoid race conditions.
|
||||
save_function (`Callable`):
|
||||
The function to use to save the state dictionary. Useful during distributed training when you need to
|
||||
replace `torch.save` with another method. Can be configured with the environment variable
|
||||
`DIFFUSERS_SAVE_MODE`.
|
||||
safe_serialization (`bool`, *optional*, defaults to `True`):
|
||||
Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`.
|
||||
"""
|
||||
state_dict = {}
|
||||
|
||||
if not transformer_lora_layers:
|
||||
raise ValueError("You must pass `transformer_lora_layers`.")
|
||||
|
||||
if transformer_lora_layers:
|
||||
state_dict.update(cls.pack_weights(transformer_lora_layers, cls.transformer_name))
|
||||
|
||||
# Save the model
|
||||
cls.write_lora_layers(
|
||||
state_dict=state_dict,
|
||||
save_directory=save_directory,
|
||||
is_main_process=is_main_process,
|
||||
weight_name=weight_name,
|
||||
save_function=save_function,
|
||||
safe_serialization=safe_serialization,
|
||||
)
|
||||
|
||||
# Copied from diffusers.loaders.lora_pipeline.StableDiffusionLoraLoaderMixin.fuse_lora with unet->transformer
|
||||
def fuse_lora(
|
||||
self,
|
||||
components: List[str] = ["transformer", "text_encoder"],
|
||||
lora_scale: float = 1.0,
|
||||
safe_fusing: bool = False,
|
||||
adapter_names: Optional[List[str]] = None,
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
Fuses the LoRA parameters into the original parameters of the corresponding blocks.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
This is an experimental API.
|
||||
|
||||
</Tip>
|
||||
|
||||
Args:
|
||||
components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into.
|
||||
lora_scale (`float`, defaults to 1.0):
|
||||
Controls how much to influence the outputs with the LoRA parameters.
|
||||
safe_fusing (`bool`, defaults to `False`):
|
||||
Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them.
|
||||
adapter_names (`List[str]`, *optional*):
|
||||
Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused.
|
||||
|
||||
Example:
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
|
||||
pipeline.fuse_lora(lora_scale=0.7)
|
||||
```
|
||||
"""
|
||||
super().fuse_lora(
|
||||
components=components, lora_scale=lora_scale, safe_fusing=safe_fusing, adapter_names=adapter_names
|
||||
)
|
||||
|
||||
# Copied from diffusers.loaders.lora_pipeline.StableDiffusionLoraLoaderMixin.unfuse_lora with unet->transformer
|
||||
def unfuse_lora(self, components: List[str] = ["transformer", "text_encoder"], **kwargs):
|
||||
r"""
|
||||
Reverses the effect of
|
||||
[`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora).
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
This is an experimental API.
|
||||
|
||||
</Tip>
|
||||
|
||||
Args:
|
||||
components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from.
|
||||
unfuse_transformer (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters.
|
||||
unfuse_text_encoder (`bool`, defaults to `True`):
|
||||
Whether to unfuse the text encoder LoRA parameters. If the text encoder wasn't monkey-patched with the
|
||||
LoRA parameters then it won't have any effect.
|
||||
"""
|
||||
super().unfuse_lora(components=components)
|
||||
|
||||
|
||||
class LoraLoaderMixin(StableDiffusionLoraLoaderMixin):
|
||||
def __init__(self, *args, **kwargs):
|
||||
deprecation_message = "LoraLoaderMixin is deprecated and this will be removed in a future version. Please use `StableDiffusionLoraLoaderMixin`, instead."
|
||||
|
||||
@@ -52,61 +52,9 @@ _SET_ADAPTER_SCALE_FN_MAPPING = {
|
||||
"SD3Transformer2DModel": lambda model_cls, weights: weights,
|
||||
"FluxTransformer2DModel": lambda model_cls, weights: weights,
|
||||
"CogVideoXTransformer3DModel": lambda model_cls, weights: weights,
|
||||
"MochiTransformer3DModel": lambda model_cls, weights: weights,
|
||||
}
|
||||
|
||||
|
||||
def _maybe_adjust_config(config):
|
||||
"""
|
||||
We may run into some ambiguous configuration values when a model has module names, sharing a common prefix
|
||||
(`proj_out.weight` and `blocks.transformer.proj_out.weight`, for example) and they have different LoRA ranks. This
|
||||
method removes the ambiguity by following what is described here:
|
||||
https://github.com/huggingface/diffusers/pull/9985#issuecomment-2493840028.
|
||||
"""
|
||||
rank_pattern = config["rank_pattern"].copy()
|
||||
target_modules = config["target_modules"]
|
||||
original_r = config["r"]
|
||||
|
||||
for key in list(rank_pattern.keys()):
|
||||
key_rank = rank_pattern[key]
|
||||
|
||||
# try to detect ambiguity
|
||||
# `target_modules` can also be a str, in which case this loop would loop
|
||||
# over the chars of the str. The technically correct way to match LoRA keys
|
||||
# in PEFT is to use LoraModel._check_target_module_exists (lora_config, key).
|
||||
# But this cuts it for now.
|
||||
exact_matches = [mod for mod in target_modules if mod == key]
|
||||
substring_matches = [mod for mod in target_modules if key in mod and mod != key]
|
||||
ambiguous_key = key
|
||||
|
||||
if exact_matches and substring_matches:
|
||||
# if ambiguous we update the rank associated with the ambiguous key (`proj_out`, for example)
|
||||
config["r"] = key_rank
|
||||
# remove the ambiguous key from `rank_pattern` and update its rank to `r`, instead
|
||||
del config["rank_pattern"][key]
|
||||
for mod in substring_matches:
|
||||
# avoid overwriting if the module already has a specific rank
|
||||
if mod not in config["rank_pattern"]:
|
||||
config["rank_pattern"][mod] = original_r
|
||||
|
||||
# update the rest of the keys with the `original_r`
|
||||
for mod in target_modules:
|
||||
if mod != ambiguous_key and mod not in config["rank_pattern"]:
|
||||
config["rank_pattern"][mod] = original_r
|
||||
|
||||
# handle alphas to deal with cases like
|
||||
# https://github.com/huggingface/diffusers/pull/9999#issuecomment-2516180777
|
||||
has_different_ranks = len(config["rank_pattern"]) > 1 and list(config["rank_pattern"])[0] != config["r"]
|
||||
if has_different_ranks:
|
||||
config["lora_alpha"] = config["r"]
|
||||
alpha_pattern = {}
|
||||
for module_name, rank in config["rank_pattern"].items():
|
||||
alpha_pattern[module_name] = rank
|
||||
config["alpha_pattern"] = alpha_pattern
|
||||
|
||||
return config
|
||||
|
||||
|
||||
class PeftAdapterMixin:
|
||||
"""
|
||||
A class containing all functions for loading and using adapters weights that are supported in PEFT library. For
|
||||
@@ -205,7 +153,6 @@ class PeftAdapterMixin:
|
||||
weights.
|
||||
"""
|
||||
from peft import LoraConfig, inject_adapter_in_model, set_peft_model_state_dict
|
||||
from peft.tuners.tuners_utils import BaseTunerLayer
|
||||
|
||||
cache_dir = kwargs.pop("cache_dir", None)
|
||||
force_download = kwargs.pop("force_download", False)
|
||||
@@ -268,9 +215,7 @@ class PeftAdapterMixin:
|
||||
|
||||
rank = {}
|
||||
for key, val in state_dict.items():
|
||||
# Cannot figure out rank from lora layers that don't have atleast 2 dimensions.
|
||||
# Bias layers in LoRA only have a single dimension
|
||||
if "lora_B" in key and val.ndim > 1:
|
||||
if "lora_B" in key:
|
||||
rank[key] = val.shape[1]
|
||||
|
||||
if network_alphas is not None and len(network_alphas) >= 1:
|
||||
@@ -278,8 +223,6 @@ class PeftAdapterMixin:
|
||||
network_alphas = {k.replace(f"{prefix}.", ""): v for k, v in network_alphas.items() if k in alpha_keys}
|
||||
|
||||
lora_config_kwargs = get_peft_kwargs(rank, network_alpha_dict=network_alphas, peft_state_dict=state_dict)
|
||||
lora_config_kwargs = _maybe_adjust_config(lora_config_kwargs)
|
||||
|
||||
if "use_dora" in lora_config_kwargs:
|
||||
if lora_config_kwargs["use_dora"]:
|
||||
if is_peft_version("<", "0.9.0"):
|
||||
@@ -289,18 +232,8 @@ class PeftAdapterMixin:
|
||||
else:
|
||||
if is_peft_version("<", "0.9.0"):
|
||||
lora_config_kwargs.pop("use_dora")
|
||||
|
||||
if "lora_bias" in lora_config_kwargs:
|
||||
if lora_config_kwargs["lora_bias"]:
|
||||
if is_peft_version("<=", "0.13.2"):
|
||||
raise ValueError(
|
||||
"You need `peft` 0.14.0 at least to use `lora_bias` in LoRAs. Please upgrade your installation of `peft`."
|
||||
)
|
||||
else:
|
||||
if is_peft_version("<=", "0.13.2"):
|
||||
lora_config_kwargs.pop("lora_bias")
|
||||
|
||||
lora_config = LoraConfig(**lora_config_kwargs)
|
||||
|
||||
# adapter_name
|
||||
if adapter_name is None:
|
||||
adapter_name = get_adapter_name(self)
|
||||
@@ -317,22 +250,8 @@ class PeftAdapterMixin:
|
||||
if is_peft_version(">=", "0.13.1"):
|
||||
peft_kwargs["low_cpu_mem_usage"] = low_cpu_mem_usage
|
||||
|
||||
# To handle scenarios where we cannot successfully set state dict. If it's unsucessful,
|
||||
# we should also delete the `peft_config` associated to the `adapter_name`.
|
||||
try:
|
||||
inject_adapter_in_model(lora_config, self, adapter_name=adapter_name, **peft_kwargs)
|
||||
incompatible_keys = set_peft_model_state_dict(self, state_dict, adapter_name, **peft_kwargs)
|
||||
except RuntimeError as e:
|
||||
for module in self.modules():
|
||||
if isinstance(module, BaseTunerLayer):
|
||||
active_adapters = module.active_adapters
|
||||
for active_adapter in active_adapters:
|
||||
if adapter_name in active_adapter:
|
||||
module.delete_adapter(adapter_name)
|
||||
|
||||
self.peft_config.pop(adapter_name)
|
||||
logger.error(f"Loading {adapter_name} was unsucessful with the following error: \n{e}")
|
||||
raise
|
||||
inject_adapter_in_model(lora_config, self, adapter_name=adapter_name, **peft_kwargs)
|
||||
incompatible_keys = set_peft_model_state_dict(self, state_dict, adapter_name, **peft_kwargs)
|
||||
|
||||
warn_msg = ""
|
||||
if incompatible_keys is not None:
|
||||
|
||||
@@ -23,13 +23,10 @@ from ..utils import deprecate, is_accelerate_available, logging
|
||||
from .single_file_utils import (
|
||||
SingleFileComponentError,
|
||||
convert_animatediff_checkpoint_to_diffusers,
|
||||
convert_autoencoder_dc_checkpoint_to_diffusers,
|
||||
convert_controlnet_checkpoint,
|
||||
convert_flux_transformer_checkpoint_to_diffusers,
|
||||
convert_ldm_unet_checkpoint,
|
||||
convert_ldm_vae_checkpoint,
|
||||
convert_ltx_transformer_checkpoint_to_diffusers,
|
||||
convert_ltx_vae_checkpoint_to_diffusers,
|
||||
convert_sd3_transformer_checkpoint_to_diffusers,
|
||||
convert_stable_cascade_unet_single_file_to_diffusers,
|
||||
create_controlnet_diffusers_config_from_ldm,
|
||||
@@ -85,15 +82,6 @@ SINGLE_FILE_LOADABLE_CLASSES = {
|
||||
"checkpoint_mapping_fn": convert_flux_transformer_checkpoint_to_diffusers,
|
||||
"default_subfolder": "transformer",
|
||||
},
|
||||
"LTXVideoTransformer3DModel": {
|
||||
"checkpoint_mapping_fn": convert_ltx_transformer_checkpoint_to_diffusers,
|
||||
"default_subfolder": "transformer",
|
||||
},
|
||||
"AutoencoderKLLTXVideo": {
|
||||
"checkpoint_mapping_fn": convert_ltx_vae_checkpoint_to_diffusers,
|
||||
"default_subfolder": "vae",
|
||||
},
|
||||
"AutoencoderDC": {"checkpoint_mapping_fn": convert_autoencoder_dc_checkpoint_to_diffusers},
|
||||
}
|
||||
|
||||
|
||||
@@ -231,7 +219,7 @@ class FromOriginalModelMixin:
|
||||
mapping_functions = SINGLE_FILE_LOADABLE_CLASSES[mapping_class_name]
|
||||
|
||||
checkpoint_mapping_fn = mapping_functions["checkpoint_mapping_fn"]
|
||||
if original_config is not None:
|
||||
if original_config:
|
||||
if "config_mapping_fn" in mapping_functions:
|
||||
config_mapping_fn = mapping_functions["config_mapping_fn"]
|
||||
else:
|
||||
@@ -255,7 +243,7 @@ class FromOriginalModelMixin:
|
||||
original_config=original_config, checkpoint=checkpoint, **config_mapping_kwargs
|
||||
)
|
||||
else:
|
||||
if config is not None:
|
||||
if config:
|
||||
if isinstance(config, str):
|
||||
default_pretrained_model_config_name = config
|
||||
else:
|
||||
@@ -281,8 +269,6 @@ class FromOriginalModelMixin:
|
||||
pretrained_model_name_or_path=default_pretrained_model_config_name,
|
||||
subfolder=subfolder,
|
||||
local_files_only=local_files_only,
|
||||
token=token,
|
||||
revision=revision,
|
||||
)
|
||||
expected_kwargs, optional_kwargs = cls._get_signature_keys(cls)
|
||||
|
||||
|
||||
@@ -62,14 +62,7 @@ CHECKPOINT_KEY_NAMES = {
|
||||
"xl_base": "conditioner.embedders.1.model.transformer.resblocks.9.mlp.c_proj.bias",
|
||||
"xl_refiner": "conditioner.embedders.0.model.transformer.resblocks.9.mlp.c_proj.bias",
|
||||
"upscale": "model.diffusion_model.input_blocks.10.0.skip_connection.bias",
|
||||
"controlnet": [
|
||||
"control_model.time_embed.0.weight",
|
||||
"controlnet_cond_embedding.conv_in.weight",
|
||||
],
|
||||
# TODO: find non-Diffusers keys for controlnet_xl
|
||||
"controlnet_xl": "add_embedding.linear_1.weight",
|
||||
"controlnet_xl_large": "down_blocks.1.attentions.0.transformer_blocks.0.attn1.to_k.weight",
|
||||
"controlnet_xl_mid": "down_blocks.1.attentions.0.norm.weight",
|
||||
"controlnet": "control_model.time_embed.0.weight",
|
||||
"playground-v2-5": "edm_mean",
|
||||
"inpainting": "model.diffusion_model.input_blocks.0.0.weight",
|
||||
"clip": "cond_stage_model.transformer.text_model.embeddings.position_embedding.weight",
|
||||
@@ -92,14 +85,6 @@ CHECKPOINT_KEY_NAMES = {
|
||||
"double_blocks.0.img_attn.norm.key_norm.scale",
|
||||
"model.diffusion_model.double_blocks.0.img_attn.norm.key_norm.scale",
|
||||
],
|
||||
"ltx-video": [
|
||||
(
|
||||
"model.diffusion_model.patchify_proj.weight",
|
||||
"model.diffusion_model.transformer_blocks.27.scale_shift_table",
|
||||
),
|
||||
],
|
||||
"autoencoder-dc": "decoder.stages.1.op_list.0.main.conv.conv.bias",
|
||||
"autoencoder-dc-sana": "encoder.project_in.conv.bias",
|
||||
}
|
||||
|
||||
DIFFUSERS_DEFAULT_PIPELINE_PATHS = {
|
||||
@@ -111,9 +96,6 @@ DIFFUSERS_DEFAULT_PIPELINE_PATHS = {
|
||||
"inpainting": {"pretrained_model_name_or_path": "stable-diffusion-v1-5/stable-diffusion-inpainting"},
|
||||
"inpainting_v2": {"pretrained_model_name_or_path": "stabilityai/stable-diffusion-2-inpainting"},
|
||||
"controlnet": {"pretrained_model_name_or_path": "lllyasviel/control_v11p_sd15_canny"},
|
||||
"controlnet_xl_large": {"pretrained_model_name_or_path": "diffusers/controlnet-canny-sdxl-1.0"},
|
||||
"controlnet_xl_mid": {"pretrained_model_name_or_path": "diffusers/controlnet-canny-sdxl-1.0-mid"},
|
||||
"controlnet_xl_small": {"pretrained_model_name_or_path": "diffusers/controlnet-canny-sdxl-1.0-small"},
|
||||
"v2": {"pretrained_model_name_or_path": "stabilityai/stable-diffusion-2-1"},
|
||||
"v1": {"pretrained_model_name_or_path": "stable-diffusion-v1-5/stable-diffusion-v1-5"},
|
||||
"stable_cascade_stage_b": {"pretrained_model_name_or_path": "stabilityai/stable-cascade", "subfolder": "decoder"},
|
||||
@@ -135,9 +117,6 @@ DIFFUSERS_DEFAULT_PIPELINE_PATHS = {
|
||||
"sd35_large": {
|
||||
"pretrained_model_name_or_path": "stabilityai/stable-diffusion-3.5-large",
|
||||
},
|
||||
"sd35_medium": {
|
||||
"pretrained_model_name_or_path": "stabilityai/stable-diffusion-3.5-medium",
|
||||
},
|
||||
"animatediff_v1": {"pretrained_model_name_or_path": "guoyww/animatediff-motion-adapter-v1-5"},
|
||||
"animatediff_v2": {"pretrained_model_name_or_path": "guoyww/animatediff-motion-adapter-v1-5-2"},
|
||||
"animatediff_v3": {"pretrained_model_name_or_path": "guoyww/animatediff-motion-adapter-v1-5-3"},
|
||||
@@ -146,11 +125,6 @@ DIFFUSERS_DEFAULT_PIPELINE_PATHS = {
|
||||
"animatediff_rgb": {"pretrained_model_name_or_path": "guoyww/animatediff-sparsectrl-rgb"},
|
||||
"flux-dev": {"pretrained_model_name_or_path": "black-forest-labs/FLUX.1-dev"},
|
||||
"flux-schnell": {"pretrained_model_name_or_path": "black-forest-labs/FLUX.1-schnell"},
|
||||
"ltx-video": {"pretrained_model_name_or_path": "Lightricks/LTX-Video"},
|
||||
"autoencoder-dc-f128c512": {"pretrained_model_name_or_path": "mit-han-lab/dc-ae-f128c512-mix-1.0-diffusers"},
|
||||
"autoencoder-dc-f64c128": {"pretrained_model_name_or_path": "mit-han-lab/dc-ae-f64c128-mix-1.0-diffusers"},
|
||||
"autoencoder-dc-f32c32": {"pretrained_model_name_or_path": "mit-han-lab/dc-ae-f32c32-mix-1.0-diffusers"},
|
||||
"autoencoder-dc-f32c32-sana": {"pretrained_model_name_or_path": "mit-han-lab/dc-ae-f32c32-sana-1.0-diffusers"},
|
||||
}
|
||||
|
||||
# Use to configure model sample size when original config is provided
|
||||
@@ -507,16 +481,8 @@ def infer_diffusers_model_type(checkpoint):
|
||||
elif CHECKPOINT_KEY_NAMES["upscale"] in checkpoint:
|
||||
model_type = "upscale"
|
||||
|
||||
elif any(key in checkpoint for key in CHECKPOINT_KEY_NAMES["controlnet"]):
|
||||
if CHECKPOINT_KEY_NAMES["controlnet_xl"] in checkpoint:
|
||||
if CHECKPOINT_KEY_NAMES["controlnet_xl_large"] in checkpoint:
|
||||
model_type = "controlnet_xl_large"
|
||||
elif CHECKPOINT_KEY_NAMES["controlnet_xl_mid"] in checkpoint:
|
||||
model_type = "controlnet_xl_mid"
|
||||
else:
|
||||
model_type = "controlnet_xl_small"
|
||||
else:
|
||||
model_type = "controlnet"
|
||||
elif CHECKPOINT_KEY_NAMES["controlnet"] in checkpoint:
|
||||
model_type = "controlnet"
|
||||
|
||||
elif (
|
||||
CHECKPOINT_KEY_NAMES["stable_cascade_stage_c"] in checkpoint
|
||||
@@ -543,10 +509,7 @@ def infer_diffusers_model_type(checkpoint):
|
||||
model_type = "stable_cascade_stage_b"
|
||||
|
||||
elif CHECKPOINT_KEY_NAMES["sd3"] in checkpoint and checkpoint[CHECKPOINT_KEY_NAMES["sd3"]].shape[-1] == 9216:
|
||||
if checkpoint["model.diffusion_model.pos_embed"].shape[1] == 36864:
|
||||
model_type = "sd3"
|
||||
elif checkpoint["model.diffusion_model.pos_embed"].shape[1] == 147456:
|
||||
model_type = "sd35_medium"
|
||||
model_type = "sd3"
|
||||
|
||||
elif CHECKPOINT_KEY_NAMES["sd35_large"] in checkpoint:
|
||||
model_type = "sd35_large"
|
||||
@@ -577,26 +540,6 @@ def infer_diffusers_model_type(checkpoint):
|
||||
model_type = "flux-dev"
|
||||
else:
|
||||
model_type = "flux-schnell"
|
||||
|
||||
elif any(all(key in checkpoint for key in key_list) for key_list in CHECKPOINT_KEY_NAMES["ltx-video"]):
|
||||
model_type = "ltx-video"
|
||||
|
||||
elif CHECKPOINT_KEY_NAMES["autoencoder-dc"] in checkpoint:
|
||||
encoder_key = "encoder.project_in.conv.conv.bias"
|
||||
decoder_key = "decoder.project_in.main.conv.weight"
|
||||
|
||||
if CHECKPOINT_KEY_NAMES["autoencoder-dc-sana"] in checkpoint:
|
||||
model_type = "autoencoder-dc-f32c32-sana"
|
||||
|
||||
elif checkpoint[encoder_key].shape[-1] == 64 and checkpoint[decoder_key].shape[1] == 32:
|
||||
model_type = "autoencoder-dc-f32c32"
|
||||
|
||||
elif checkpoint[encoder_key].shape[-1] == 64 and checkpoint[decoder_key].shape[1] == 128:
|
||||
model_type = "autoencoder-dc-f64c128"
|
||||
|
||||
else:
|
||||
model_type = "autoencoder-dc-f128c512"
|
||||
|
||||
else:
|
||||
model_type = "v1"
|
||||
|
||||
@@ -1129,9 +1072,6 @@ def convert_controlnet_checkpoint(
|
||||
config,
|
||||
**kwargs,
|
||||
):
|
||||
# Return checkpoint if it's already been converted
|
||||
if "time_embedding.linear_1.weight" in checkpoint:
|
||||
return checkpoint
|
||||
# Some controlnet ckpt files are distributed independently from the rest of the
|
||||
# model components i.e. https://huggingface.co/thibaud/controlnet-sd21/
|
||||
if "time_embed.0.weight" in checkpoint:
|
||||
@@ -2231,165 +2171,3 @@ def convert_flux_transformer_checkpoint_to_diffusers(checkpoint, **kwargs):
|
||||
)
|
||||
|
||||
return converted_state_dict
|
||||
|
||||
|
||||
def convert_ltx_transformer_checkpoint_to_diffusers(checkpoint, **kwargs):
|
||||
converted_state_dict = {
|
||||
key: checkpoint.pop(key) for key in list(checkpoint.keys()) if "model.diffusion_model." in key
|
||||
}
|
||||
|
||||
TRANSFORMER_KEYS_RENAME_DICT = {
|
||||
"model.diffusion_model.": "",
|
||||
"patchify_proj": "proj_in",
|
||||
"adaln_single": "time_embed",
|
||||
"q_norm": "norm_q",
|
||||
"k_norm": "norm_k",
|
||||
}
|
||||
|
||||
TRANSFORMER_SPECIAL_KEYS_REMAP = {}
|
||||
|
||||
for key in list(converted_state_dict.keys()):
|
||||
new_key = key
|
||||
for replace_key, rename_key in TRANSFORMER_KEYS_RENAME_DICT.items():
|
||||
new_key = new_key.replace(replace_key, rename_key)
|
||||
converted_state_dict[new_key] = converted_state_dict.pop(key)
|
||||
|
||||
for key in list(converted_state_dict.keys()):
|
||||
for special_key, handler_fn_inplace in TRANSFORMER_SPECIAL_KEYS_REMAP.items():
|
||||
if special_key not in key:
|
||||
continue
|
||||
handler_fn_inplace(key, converted_state_dict)
|
||||
|
||||
return converted_state_dict
|
||||
|
||||
|
||||
def convert_ltx_vae_checkpoint_to_diffusers(checkpoint, **kwargs):
|
||||
converted_state_dict = {key: checkpoint.pop(key) for key in list(checkpoint.keys()) if "vae." in key}
|
||||
|
||||
def remove_keys_(key: str, state_dict):
|
||||
state_dict.pop(key)
|
||||
|
||||
VAE_KEYS_RENAME_DICT = {
|
||||
# common
|
||||
"vae.": "",
|
||||
# decoder
|
||||
"up_blocks.0": "mid_block",
|
||||
"up_blocks.1": "up_blocks.0",
|
||||
"up_blocks.2": "up_blocks.1.upsamplers.0",
|
||||
"up_blocks.3": "up_blocks.1",
|
||||
"up_blocks.4": "up_blocks.2.conv_in",
|
||||
"up_blocks.5": "up_blocks.2.upsamplers.0",
|
||||
"up_blocks.6": "up_blocks.2",
|
||||
"up_blocks.7": "up_blocks.3.conv_in",
|
||||
"up_blocks.8": "up_blocks.3.upsamplers.0",
|
||||
"up_blocks.9": "up_blocks.3",
|
||||
# encoder
|
||||
"down_blocks.0": "down_blocks.0",
|
||||
"down_blocks.1": "down_blocks.0.downsamplers.0",
|
||||
"down_blocks.2": "down_blocks.0.conv_out",
|
||||
"down_blocks.3": "down_blocks.1",
|
||||
"down_blocks.4": "down_blocks.1.downsamplers.0",
|
||||
"down_blocks.5": "down_blocks.1.conv_out",
|
||||
"down_blocks.6": "down_blocks.2",
|
||||
"down_blocks.7": "down_blocks.2.downsamplers.0",
|
||||
"down_blocks.8": "down_blocks.3",
|
||||
"down_blocks.9": "mid_block",
|
||||
# common
|
||||
"conv_shortcut": "conv_shortcut.conv",
|
||||
"res_blocks": "resnets",
|
||||
"norm3.norm": "norm3",
|
||||
"per_channel_statistics.mean-of-means": "latents_mean",
|
||||
"per_channel_statistics.std-of-means": "latents_std",
|
||||
}
|
||||
|
||||
VAE_SPECIAL_KEYS_REMAP = {
|
||||
"per_channel_statistics.channel": remove_keys_,
|
||||
"per_channel_statistics.mean-of-means": remove_keys_,
|
||||
"per_channel_statistics.mean-of-stds": remove_keys_,
|
||||
}
|
||||
|
||||
for key in list(converted_state_dict.keys()):
|
||||
new_key = key
|
||||
for replace_key, rename_key in VAE_KEYS_RENAME_DICT.items():
|
||||
new_key = new_key.replace(replace_key, rename_key)
|
||||
converted_state_dict[new_key] = converted_state_dict.pop(key)
|
||||
|
||||
for key in list(converted_state_dict.keys()):
|
||||
for special_key, handler_fn_inplace in VAE_SPECIAL_KEYS_REMAP.items():
|
||||
if special_key not in key:
|
||||
continue
|
||||
handler_fn_inplace(key, converted_state_dict)
|
||||
|
||||
return converted_state_dict
|
||||
|
||||
|
||||
def convert_autoencoder_dc_checkpoint_to_diffusers(checkpoint, **kwargs):
|
||||
converted_state_dict = {key: checkpoint.pop(key) for key in list(checkpoint.keys())}
|
||||
|
||||
def remap_qkv_(key: str, state_dict):
|
||||
qkv = state_dict.pop(key)
|
||||
q, k, v = torch.chunk(qkv, 3, dim=0)
|
||||
parent_module, _, _ = key.rpartition(".qkv.conv.weight")
|
||||
state_dict[f"{parent_module}.to_q.weight"] = q.squeeze()
|
||||
state_dict[f"{parent_module}.to_k.weight"] = k.squeeze()
|
||||
state_dict[f"{parent_module}.to_v.weight"] = v.squeeze()
|
||||
|
||||
def remap_proj_conv_(key: str, state_dict):
|
||||
parent_module, _, _ = key.rpartition(".proj.conv.weight")
|
||||
state_dict[f"{parent_module}.to_out.weight"] = state_dict.pop(key).squeeze()
|
||||
|
||||
AE_KEYS_RENAME_DICT = {
|
||||
# common
|
||||
"main.": "",
|
||||
"op_list.": "",
|
||||
"context_module": "attn",
|
||||
"local_module": "conv_out",
|
||||
# NOTE: The below two lines work because scales in the available configs only have a tuple length of 1
|
||||
# If there were more scales, there would be more layers, so a loop would be better to handle this
|
||||
"aggreg.0.0": "to_qkv_multiscale.0.proj_in",
|
||||
"aggreg.0.1": "to_qkv_multiscale.0.proj_out",
|
||||
"depth_conv.conv": "conv_depth",
|
||||
"inverted_conv.conv": "conv_inverted",
|
||||
"point_conv.conv": "conv_point",
|
||||
"point_conv.norm": "norm",
|
||||
"conv.conv.": "conv.",
|
||||
"conv1.conv": "conv1",
|
||||
"conv2.conv": "conv2",
|
||||
"conv2.norm": "norm",
|
||||
"proj.norm": "norm_out",
|
||||
# encoder
|
||||
"encoder.project_in.conv": "encoder.conv_in",
|
||||
"encoder.project_out.0.conv": "encoder.conv_out",
|
||||
"encoder.stages": "encoder.down_blocks",
|
||||
# decoder
|
||||
"decoder.project_in.conv": "decoder.conv_in",
|
||||
"decoder.project_out.0": "decoder.norm_out",
|
||||
"decoder.project_out.2.conv": "decoder.conv_out",
|
||||
"decoder.stages": "decoder.up_blocks",
|
||||
}
|
||||
|
||||
AE_F32C32_F64C128_F128C512_KEYS = {
|
||||
"encoder.project_in.conv": "encoder.conv_in.conv",
|
||||
"decoder.project_out.2.conv": "decoder.conv_out.conv",
|
||||
}
|
||||
|
||||
AE_SPECIAL_KEYS_REMAP = {
|
||||
"qkv.conv.weight": remap_qkv_,
|
||||
"proj.conv.weight": remap_proj_conv_,
|
||||
}
|
||||
if "encoder.project_in.conv.bias" not in converted_state_dict:
|
||||
AE_KEYS_RENAME_DICT.update(AE_F32C32_F64C128_F128C512_KEYS)
|
||||
|
||||
for key in list(converted_state_dict.keys()):
|
||||
new_key = key[:]
|
||||
for replace_key, rename_key in AE_KEYS_RENAME_DICT.items():
|
||||
new_key = new_key.replace(replace_key, rename_key)
|
||||
converted_state_dict[new_key] = converted_state_dict.pop(key)
|
||||
|
||||
for key in list(converted_state_dict.keys()):
|
||||
for special_key, handler_fn_inplace in AE_SPECIAL_KEYS_REMAP.items():
|
||||
if special_key not in key:
|
||||
continue
|
||||
handler_fn_inplace(key, converted_state_dict)
|
||||
|
||||
return converted_state_dict
|
||||
|
||||
@@ -492,9 +492,6 @@ class UNet2DConditionLoadersMixin:
|
||||
)
|
||||
state_dict = {k: v for k, v in state_dict.items() if isinstance(v, torch.Tensor)}
|
||||
else:
|
||||
deprecation_message = "Using the `save_attn_procs()` method has been deprecated and will be removed in a future version. Please use `save_lora_adapter()`."
|
||||
deprecate("save_attn_procs", "0.40.0", deprecation_message)
|
||||
|
||||
if not USE_PEFT_BACKEND:
|
||||
raise ValueError("PEFT backend is required for saving LoRAs using the `save_attn_procs()` method.")
|
||||
|
||||
|
||||
@@ -27,11 +27,9 @@ _import_structure = {}
|
||||
if is_torch_available():
|
||||
_import_structure["adapter"] = ["MultiAdapter", "T2IAdapter"]
|
||||
_import_structure["autoencoders.autoencoder_asym_kl"] = ["AsymmetricAutoencoderKL"]
|
||||
_import_structure["autoencoders.autoencoder_dc"] = ["AutoencoderDC"]
|
||||
_import_structure["autoencoders.autoencoder_kl"] = ["AutoencoderKL"]
|
||||
_import_structure["autoencoders.autoencoder_kl_allegro"] = ["AutoencoderKLAllegro"]
|
||||
_import_structure["autoencoders.autoencoder_kl_cogvideox"] = ["AutoencoderKLCogVideoX"]
|
||||
_import_structure["autoencoders.autoencoder_kl_ltx"] = ["AutoencoderKLLTXVideo"]
|
||||
_import_structure["autoencoders.autoencoder_kl_mochi"] = ["AutoencoderKLMochi"]
|
||||
_import_structure["autoencoders.autoencoder_kl_temporal_decoder"] = ["AutoencoderKLTemporalDecoder"]
|
||||
_import_structure["autoencoders.autoencoder_oobleck"] = ["AutoencoderOobleck"]
|
||||
@@ -46,7 +44,6 @@ if is_torch_available():
|
||||
]
|
||||
_import_structure["controlnets.controlnet_sd3"] = ["SD3ControlNetModel", "SD3MultiControlNetModel"]
|
||||
_import_structure["controlnets.controlnet_sparsectrl"] = ["SparseControlNetModel"]
|
||||
_import_structure["controlnets.controlnet_union"] = ["ControlNetUnionModel"]
|
||||
_import_structure["controlnets.controlnet_xs"] = ["ControlNetXSAdapter", "UNetControlNetXSModel"]
|
||||
_import_structure["controlnets.multicontrolnet"] = ["MultiControlNetModel"]
|
||||
_import_structure["embeddings"] = ["ImageProjection"]
|
||||
@@ -60,14 +57,12 @@ if is_torch_available():
|
||||
_import_structure["transformers.lumina_nextdit2d"] = ["LuminaNextDiT2DModel"]
|
||||
_import_structure["transformers.pixart_transformer_2d"] = ["PixArtTransformer2DModel"]
|
||||
_import_structure["transformers.prior_transformer"] = ["PriorTransformer"]
|
||||
_import_structure["transformers.sana_transformer"] = ["SanaTransformer2DModel"]
|
||||
_import_structure["transformers.stable_audio_transformer"] = ["StableAudioDiTModel"]
|
||||
_import_structure["transformers.t5_film_transformer"] = ["T5FilmDecoder"]
|
||||
_import_structure["transformers.transformer_2d"] = ["Transformer2DModel"]
|
||||
_import_structure["transformers.transformer_allegro"] = ["AllegroTransformer3DModel"]
|
||||
_import_structure["transformers.transformer_cogview3plus"] = ["CogView3PlusTransformer2DModel"]
|
||||
_import_structure["transformers.transformer_flux"] = ["FluxTransformer2DModel"]
|
||||
_import_structure["transformers.transformer_ltx"] = ["LTXVideoTransformer3DModel"]
|
||||
_import_structure["transformers.transformer_mochi"] = ["MochiTransformer3DModel"]
|
||||
_import_structure["transformers.transformer_sd3"] = ["SD3Transformer2DModel"]
|
||||
_import_structure["transformers.transformer_temporal"] = ["TransformerTemporalModel"]
|
||||
@@ -93,11 +88,9 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
from .adapter import MultiAdapter, T2IAdapter
|
||||
from .autoencoders import (
|
||||
AsymmetricAutoencoderKL,
|
||||
AutoencoderDC,
|
||||
AutoencoderKL,
|
||||
AutoencoderKLAllegro,
|
||||
AutoencoderKLCogVideoX,
|
||||
AutoencoderKLLTXVideo,
|
||||
AutoencoderKLMochi,
|
||||
AutoencoderKLTemporalDecoder,
|
||||
AutoencoderOobleck,
|
||||
@@ -107,7 +100,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
)
|
||||
from .controlnets import (
|
||||
ControlNetModel,
|
||||
ControlNetUnionModel,
|
||||
ControlNetXSAdapter,
|
||||
FluxControlNetModel,
|
||||
FluxMultiControlNetModel,
|
||||
@@ -131,12 +123,10 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
FluxTransformer2DModel,
|
||||
HunyuanDiT2DModel,
|
||||
LatteTransformer3DModel,
|
||||
LTXVideoTransformer3DModel,
|
||||
LuminaNextDiT2DModel,
|
||||
MochiTransformer3DModel,
|
||||
PixArtTransformer2DModel,
|
||||
PriorTransformer,
|
||||
SanaTransformer2DModel,
|
||||
SD3Transformer2DModel,
|
||||
StableAudioDiTModel,
|
||||
T5FilmDecoder,
|
||||
|
||||
@@ -18,7 +18,7 @@ import torch.nn.functional as F
|
||||
from torch import nn
|
||||
|
||||
from ..utils import deprecate
|
||||
from ..utils.import_utils import is_torch_npu_available, is_torch_version
|
||||
from ..utils.import_utils import is_torch_npu_available
|
||||
|
||||
|
||||
if is_torch_npu_available():
|
||||
@@ -79,10 +79,10 @@ class GELU(nn.Module):
|
||||
self.approximate = approximate
|
||||
|
||||
def gelu(self, gate: torch.Tensor) -> torch.Tensor:
|
||||
if gate.device.type == "mps" and is_torch_version("<", "2.0.0"):
|
||||
# fp16 gelu not supported on mps before torch 2.0
|
||||
return F.gelu(gate.to(dtype=torch.float32), approximate=self.approximate).to(dtype=gate.dtype)
|
||||
return F.gelu(gate, approximate=self.approximate)
|
||||
if gate.device.type != "mps":
|
||||
return F.gelu(gate, approximate=self.approximate)
|
||||
# mps: gelu is not implemented for float16
|
||||
return F.gelu(gate.to(dtype=torch.float32), approximate=self.approximate).to(dtype=gate.dtype)
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = self.proj(hidden_states)
|
||||
@@ -105,10 +105,10 @@ class GEGLU(nn.Module):
|
||||
self.proj = nn.Linear(dim_in, dim_out * 2, bias=bias)
|
||||
|
||||
def gelu(self, gate: torch.Tensor) -> torch.Tensor:
|
||||
if gate.device.type == "mps" and is_torch_version("<", "2.0.0"):
|
||||
# fp16 gelu not supported on mps before torch 2.0
|
||||
return F.gelu(gate.to(dtype=torch.float32)).to(dtype=gate.dtype)
|
||||
return F.gelu(gate)
|
||||
if gate.device.type != "mps":
|
||||
return F.gelu(gate)
|
||||
# mps: gelu is not implemented for float16
|
||||
return F.gelu(gate.to(dtype=torch.float32)).to(dtype=gate.dtype)
|
||||
|
||||
def forward(self, hidden_states, *args, **kwargs):
|
||||
if len(args) > 0 or kwargs.get("scale", None) is not None:
|
||||
|
||||
@@ -216,8 +216,8 @@ class FlaxAttention(nn.Module):
|
||||
hidden_states = jax_memory_efficient_attention(
|
||||
query_states, key_states, value_states, query_chunk_size=query_chunk_size, key_chunk_size=4096 * 4
|
||||
)
|
||||
|
||||
hidden_states = hidden_states.transpose(1, 0, 2)
|
||||
hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
|
||||
else:
|
||||
# compute attentions
|
||||
if self.split_head_dim:
|
||||
|
||||
@@ -20,8 +20,8 @@ import torch.nn.functional as F
|
||||
from torch import nn
|
||||
|
||||
from ..image_processor import IPAdapterMaskProcessor
|
||||
from ..utils import deprecate, is_torch_xla_available, logging
|
||||
from ..utils.import_utils import is_torch_npu_available, is_torch_xla_version, is_xformers_available
|
||||
from ..utils import deprecate, logging
|
||||
from ..utils.import_utils import is_torch_npu_available, is_xformers_available
|
||||
from ..utils.torch_utils import is_torch_version, maybe_allow_in_graph
|
||||
|
||||
|
||||
@@ -36,15 +36,6 @@ if is_xformers_available():
|
||||
else:
|
||||
xformers = None
|
||||
|
||||
if is_torch_xla_available():
|
||||
# flash attention pallas kernel is introduced in the torch_xla 2.3 release.
|
||||
if is_torch_xla_version(">", "2.2"):
|
||||
from torch_xla.experimental.custom_kernel import flash_attention
|
||||
from torch_xla.runtime import is_spmd
|
||||
XLA_AVAILABLE = True
|
||||
else:
|
||||
XLA_AVAILABLE = False
|
||||
|
||||
|
||||
@maybe_allow_in_graph
|
||||
class Attention(nn.Module):
|
||||
@@ -199,16 +190,12 @@ class Attention(nn.Module):
|
||||
self.norm_q = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps)
|
||||
self.norm_k = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps)
|
||||
elif qk_norm == "layer_norm_across_heads":
|
||||
# Lumina applies qk norm across all heads
|
||||
# Lumina applys qk norm across all heads
|
||||
self.norm_q = nn.LayerNorm(dim_head * heads, eps=eps)
|
||||
self.norm_k = nn.LayerNorm(dim_head * kv_heads, eps=eps)
|
||||
elif qk_norm == "rms_norm":
|
||||
self.norm_q = RMSNorm(dim_head, eps=eps)
|
||||
self.norm_k = RMSNorm(dim_head, eps=eps)
|
||||
elif qk_norm == "rms_norm_across_heads":
|
||||
# LTX applies qk norm across all heads
|
||||
self.norm_q = RMSNorm(dim_head * heads, eps=eps)
|
||||
self.norm_k = RMSNorm(dim_head * kv_heads, eps=eps)
|
||||
elif qk_norm == "l2":
|
||||
self.norm_q = LpNorm(p=2, dim=-1, eps=eps)
|
||||
self.norm_k = LpNorm(p=2, dim=-1, eps=eps)
|
||||
@@ -288,33 +275,6 @@ class Attention(nn.Module):
|
||||
)
|
||||
self.set_processor(processor)
|
||||
|
||||
def set_use_xla_flash_attention(
|
||||
self, use_xla_flash_attention: bool, partition_spec: Optional[Tuple[Optional[str], ...]] = None
|
||||
) -> None:
|
||||
r"""
|
||||
Set whether to use xla flash attention from `torch_xla` or not.
|
||||
|
||||
Args:
|
||||
use_xla_flash_attention (`bool`):
|
||||
Whether to use pallas flash attention kernel from `torch_xla` or not.
|
||||
partition_spec (`Tuple[]`, *optional*):
|
||||
Specify the partition specification if using SPMD. Otherwise None.
|
||||
"""
|
||||
if use_xla_flash_attention:
|
||||
if not is_torch_xla_available:
|
||||
raise "torch_xla is not available"
|
||||
elif is_torch_xla_version("<", "2.3"):
|
||||
raise "flash attention pallas kernel is supported from torch_xla version 2.3"
|
||||
elif is_spmd() and is_torch_xla_version("<", "2.4"):
|
||||
raise "flash attention pallas kernel using SPMD is supported from torch_xla version 2.4"
|
||||
else:
|
||||
processor = XLAFlashAttnProcessor2_0(partition_spec)
|
||||
else:
|
||||
processor = (
|
||||
AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor()
|
||||
)
|
||||
self.set_processor(processor)
|
||||
|
||||
def set_use_npu_flash_attention(self, use_npu_flash_attention: bool) -> None:
|
||||
r"""
|
||||
Set whether to use npu flash attention from `torch_npu` or not.
|
||||
@@ -362,14 +322,6 @@ class Attention(nn.Module):
|
||||
self.processor,
|
||||
(IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0, IPAdapterXFormersAttnProcessor),
|
||||
)
|
||||
is_joint_processor = hasattr(self, "processor") and isinstance(
|
||||
self.processor,
|
||||
(
|
||||
JointAttnProcessor2_0,
|
||||
XFormersJointAttnProcessor,
|
||||
),
|
||||
)
|
||||
|
||||
if use_memory_efficient_attention_xformers:
|
||||
if is_added_kv_processor and is_custom_diffusion:
|
||||
raise NotImplementedError(
|
||||
@@ -432,8 +384,6 @@ class Attention(nn.Module):
|
||||
processor.to(
|
||||
device=self.processor.to_k_ip[0].weight.device, dtype=self.processor.to_k_ip[0].weight.dtype
|
||||
)
|
||||
elif is_joint_processor:
|
||||
processor = XFormersJointAttnProcessor(attention_op=attention_op)
|
||||
else:
|
||||
processor = XFormersAttnProcessor(attention_op=attention_op)
|
||||
else:
|
||||
@@ -802,98 +752,6 @@ class Attention(nn.Module):
|
||||
self.fused_projections = fuse
|
||||
|
||||
|
||||
class SanaMultiscaleAttentionProjection(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
in_channels: int,
|
||||
num_attention_heads: int,
|
||||
kernel_size: int,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
channels = 3 * in_channels
|
||||
self.proj_in = nn.Conv2d(
|
||||
channels,
|
||||
channels,
|
||||
kernel_size,
|
||||
padding=kernel_size // 2,
|
||||
groups=channels,
|
||||
bias=False,
|
||||
)
|
||||
self.proj_out = nn.Conv2d(channels, channels, 1, 1, 0, groups=3 * num_attention_heads, bias=False)
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
||||
hidden_states = self.proj_in(hidden_states)
|
||||
hidden_states = self.proj_out(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
|
||||
class SanaMultiscaleLinearAttention(nn.Module):
|
||||
r"""Lightweight multi-scale linear attention"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
in_channels: int,
|
||||
out_channels: int,
|
||||
num_attention_heads: Optional[int] = None,
|
||||
attention_head_dim: int = 8,
|
||||
mult: float = 1.0,
|
||||
norm_type: str = "batch_norm",
|
||||
kernel_sizes: Tuple[int, ...] = (5,),
|
||||
eps: float = 1e-15,
|
||||
residual_connection: bool = False,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
# To prevent circular import
|
||||
from .normalization import get_normalization
|
||||
|
||||
self.eps = eps
|
||||
self.attention_head_dim = attention_head_dim
|
||||
self.norm_type = norm_type
|
||||
self.residual_connection = residual_connection
|
||||
|
||||
num_attention_heads = (
|
||||
int(in_channels // attention_head_dim * mult) if num_attention_heads is None else num_attention_heads
|
||||
)
|
||||
inner_dim = num_attention_heads * attention_head_dim
|
||||
|
||||
self.to_q = nn.Linear(in_channels, inner_dim, bias=False)
|
||||
self.to_k = nn.Linear(in_channels, inner_dim, bias=False)
|
||||
self.to_v = nn.Linear(in_channels, inner_dim, bias=False)
|
||||
|
||||
self.to_qkv_multiscale = nn.ModuleList()
|
||||
for kernel_size in kernel_sizes:
|
||||
self.to_qkv_multiscale.append(
|
||||
SanaMultiscaleAttentionProjection(inner_dim, num_attention_heads, kernel_size)
|
||||
)
|
||||
|
||||
self.nonlinearity = nn.ReLU()
|
||||
self.to_out = nn.Linear(inner_dim * (1 + len(kernel_sizes)), out_channels, bias=False)
|
||||
self.norm_out = get_normalization(norm_type, num_features=out_channels)
|
||||
|
||||
self.processor = SanaMultiscaleAttnProcessor2_0()
|
||||
|
||||
def apply_linear_attention(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor) -> torch.Tensor:
|
||||
value = F.pad(value, (0, 0, 0, 1), mode="constant", value=1) # Adds padding
|
||||
scores = torch.matmul(value, key.transpose(-1, -2))
|
||||
hidden_states = torch.matmul(scores, query)
|
||||
|
||||
hidden_states = hidden_states.to(dtype=torch.float32)
|
||||
hidden_states = hidden_states[:, :, :-1] / (hidden_states[:, :, -1:] + self.eps)
|
||||
return hidden_states
|
||||
|
||||
def apply_quadratic_attention(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor) -> torch.Tensor:
|
||||
scores = torch.matmul(key.transpose(-1, -2), query)
|
||||
scores = scores.to(dtype=torch.float32)
|
||||
scores = scores / (torch.sum(scores, dim=2, keepdim=True) + self.eps)
|
||||
hidden_states = torch.matmul(value, scores)
|
||||
return hidden_states
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
||||
return self.processor(self, hidden_states)
|
||||
|
||||
|
||||
class AttnProcessor:
|
||||
r"""
|
||||
Default processor for performing attention-related computations.
|
||||
@@ -1313,7 +1171,6 @@ class PAGJointAttnProcessor2_0:
|
||||
attn: Attention,
|
||||
hidden_states: torch.FloatTensor,
|
||||
encoder_hidden_states: torch.FloatTensor = None,
|
||||
attention_mask: Optional[torch.FloatTensor] = None,
|
||||
) -> torch.FloatTensor:
|
||||
residual = hidden_states
|
||||
|
||||
@@ -1699,91 +1556,6 @@ class FusedJointAttnProcessor2_0:
|
||||
return hidden_states, encoder_hidden_states
|
||||
|
||||
|
||||
class XFormersJointAttnProcessor:
|
||||
r"""
|
||||
Processor for implementing memory efficient attention using xFormers.
|
||||
|
||||
Args:
|
||||
attention_op (`Callable`, *optional*, defaults to `None`):
|
||||
The base
|
||||
[operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to
|
||||
use as the attention operator. It is recommended to set to `None`, and allow xFormers to choose the best
|
||||
operator.
|
||||
"""
|
||||
|
||||
def __init__(self, attention_op: Optional[Callable] = None):
|
||||
self.attention_op = attention_op
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
attn: Attention,
|
||||
hidden_states: torch.FloatTensor,
|
||||
encoder_hidden_states: torch.FloatTensor = None,
|
||||
attention_mask: Optional[torch.FloatTensor] = None,
|
||||
*args,
|
||||
**kwargs,
|
||||
) -> torch.FloatTensor:
|
||||
residual = hidden_states
|
||||
|
||||
# `sample` projections.
|
||||
query = attn.to_q(hidden_states)
|
||||
key = attn.to_k(hidden_states)
|
||||
value = attn.to_v(hidden_states)
|
||||
|
||||
query = attn.head_to_batch_dim(query).contiguous()
|
||||
key = attn.head_to_batch_dim(key).contiguous()
|
||||
value = attn.head_to_batch_dim(value).contiguous()
|
||||
|
||||
if attn.norm_q is not None:
|
||||
query = attn.norm_q(query)
|
||||
if attn.norm_k is not None:
|
||||
key = attn.norm_k(key)
|
||||
|
||||
# `context` projections.
|
||||
if encoder_hidden_states is not None:
|
||||
encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
|
||||
encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
|
||||
encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
|
||||
|
||||
encoder_hidden_states_query_proj = attn.head_to_batch_dim(encoder_hidden_states_query_proj).contiguous()
|
||||
encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj).contiguous()
|
||||
encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj).contiguous()
|
||||
|
||||
if attn.norm_added_q is not None:
|
||||
encoder_hidden_states_query_proj = attn.norm_added_q(encoder_hidden_states_query_proj)
|
||||
if attn.norm_added_k is not None:
|
||||
encoder_hidden_states_key_proj = attn.norm_added_k(encoder_hidden_states_key_proj)
|
||||
|
||||
query = torch.cat([query, encoder_hidden_states_query_proj], dim=1)
|
||||
key = torch.cat([key, encoder_hidden_states_key_proj], dim=1)
|
||||
value = torch.cat([value, encoder_hidden_states_value_proj], dim=1)
|
||||
|
||||
hidden_states = xformers.ops.memory_efficient_attention(
|
||||
query, key, value, attn_bias=attention_mask, op=self.attention_op, scale=attn.scale
|
||||
)
|
||||
hidden_states = hidden_states.to(query.dtype)
|
||||
hidden_states = attn.batch_to_head_dim(hidden_states)
|
||||
|
||||
if encoder_hidden_states is not None:
|
||||
# Split the attention outputs.
|
||||
hidden_states, encoder_hidden_states = (
|
||||
hidden_states[:, : residual.shape[1]],
|
||||
hidden_states[:, residual.shape[1] :],
|
||||
)
|
||||
if not attn.context_pre_only:
|
||||
encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
|
||||
|
||||
# linear proj
|
||||
hidden_states = attn.to_out[0](hidden_states)
|
||||
# dropout
|
||||
hidden_states = attn.to_out[1](hidden_states)
|
||||
|
||||
if encoder_hidden_states is not None:
|
||||
return hidden_states, encoder_hidden_states
|
||||
else:
|
||||
return hidden_states
|
||||
|
||||
|
||||
class AllegroAttnProcessor2_0:
|
||||
r"""
|
||||
Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). This is
|
||||
@@ -2135,9 +1907,7 @@ class FluxAttnProcessor2_0:
|
||||
query = apply_rotary_emb(query, image_rotary_emb)
|
||||
key = apply_rotary_emb(key, image_rotary_emb)
|
||||
|
||||
hidden_states = F.scaled_dot_product_attention(
|
||||
query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
|
||||
)
|
||||
hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False)
|
||||
hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
|
||||
hidden_states = hidden_states.to(query.dtype)
|
||||
|
||||
@@ -2980,122 +2750,6 @@ class AttnProcessor2_0:
|
||||
return hidden_states
|
||||
|
||||
|
||||
class XLAFlashAttnProcessor2_0:
|
||||
r"""
|
||||
Processor for implementing scaled dot-product attention with pallas flash attention kernel if using `torch_xla`.
|
||||
"""
|
||||
|
||||
def __init__(self, partition_spec: Optional[Tuple[Optional[str], ...]] = None):
|
||||
if not hasattr(F, "scaled_dot_product_attention"):
|
||||
raise ImportError(
|
||||
"XLAFlashAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
|
||||
)
|
||||
if is_torch_xla_version("<", "2.3"):
|
||||
raise ImportError("XLA flash attention requires torch_xla version >= 2.3.")
|
||||
if is_spmd() and is_torch_xla_version("<", "2.4"):
|
||||
raise ImportError("SPMD support for XLA flash attention needs torch_xla version >= 2.4.")
|
||||
self.partition_spec = partition_spec
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
attn: Attention,
|
||||
hidden_states: torch.Tensor,
|
||||
encoder_hidden_states: Optional[torch.Tensor] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
temb: Optional[torch.Tensor] = None,
|
||||
*args,
|
||||
**kwargs,
|
||||
) -> torch.Tensor:
|
||||
residual = hidden_states
|
||||
if attn.spatial_norm is not None:
|
||||
hidden_states = attn.spatial_norm(hidden_states, temb)
|
||||
|
||||
input_ndim = hidden_states.ndim
|
||||
|
||||
if input_ndim == 4:
|
||||
batch_size, channel, height, width = hidden_states.shape
|
||||
hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
|
||||
|
||||
batch_size, sequence_length, _ = (
|
||||
hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
|
||||
)
|
||||
|
||||
if attention_mask is not None:
|
||||
attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
|
||||
# scaled_dot_product_attention expects attention_mask shape to be
|
||||
# (batch, heads, source_length, target_length)
|
||||
attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
|
||||
|
||||
if attn.group_norm is not None:
|
||||
hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
|
||||
|
||||
query = attn.to_q(hidden_states)
|
||||
|
||||
if encoder_hidden_states is None:
|
||||
encoder_hidden_states = hidden_states
|
||||
elif attn.norm_cross:
|
||||
encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
|
||||
|
||||
key = attn.to_k(encoder_hidden_states)
|
||||
value = attn.to_v(encoder_hidden_states)
|
||||
|
||||
inner_dim = key.shape[-1]
|
||||
head_dim = inner_dim // attn.heads
|
||||
|
||||
query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
||||
|
||||
key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
||||
value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
||||
|
||||
if attn.norm_q is not None:
|
||||
query = attn.norm_q(query)
|
||||
if attn.norm_k is not None:
|
||||
key = attn.norm_k(key)
|
||||
|
||||
# the output of sdp = (batch, num_heads, seq_len, head_dim)
|
||||
# TODO: add support for attn.scale when we move to Torch 2.1
|
||||
if all(tensor.shape[2] >= 4096 for tensor in [query, key, value]):
|
||||
if attention_mask is not None:
|
||||
attention_mask = attention_mask.view(batch_size, 1, 1, attention_mask.shape[-1])
|
||||
# Convert mask to float and replace 0s with -inf and 1s with 0
|
||||
attention_mask = (
|
||||
attention_mask.float()
|
||||
.masked_fill(attention_mask == 0, float("-inf"))
|
||||
.masked_fill(attention_mask == 1, float(0.0))
|
||||
)
|
||||
|
||||
# Apply attention mask to key
|
||||
key = key + attention_mask
|
||||
query /= math.sqrt(query.shape[3])
|
||||
partition_spec = self.partition_spec if is_spmd() else None
|
||||
hidden_states = flash_attention(query, key, value, causal=False, partition_spec=partition_spec)
|
||||
else:
|
||||
logger.warning(
|
||||
"Unable to use the flash attention pallas kernel API call due to QKV sequence length < 4096."
|
||||
)
|
||||
hidden_states = F.scaled_dot_product_attention(
|
||||
query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
|
||||
)
|
||||
|
||||
hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
|
||||
hidden_states = hidden_states.to(query.dtype)
|
||||
|
||||
# linear proj
|
||||
hidden_states = attn.to_out[0](hidden_states)
|
||||
# dropout
|
||||
hidden_states = attn.to_out[1](hidden_states)
|
||||
|
||||
if input_ndim == 4:
|
||||
hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
|
||||
|
||||
if attn.residual_connection:
|
||||
hidden_states = hidden_states + residual
|
||||
|
||||
hidden_states = hidden_states / attn.rescale_output_factor
|
||||
|
||||
return hidden_states
|
||||
|
||||
|
||||
class MochiVaeAttnProcessor2_0:
|
||||
r"""
|
||||
Attention processor used in Mochi VAE.
|
||||
@@ -5350,66 +5004,6 @@ class PAGCFGIdentitySelfAttnProcessor2_0:
|
||||
return hidden_states
|
||||
|
||||
|
||||
class SanaMultiscaleAttnProcessor2_0:
|
||||
r"""
|
||||
Processor for implementing multiscale quadratic attention.
|
||||
"""
|
||||
|
||||
def __call__(self, attn: SanaMultiscaleLinearAttention, hidden_states: torch.Tensor) -> torch.Tensor:
|
||||
height, width = hidden_states.shape[-2:]
|
||||
if height * width > attn.attention_head_dim:
|
||||
use_linear_attention = True
|
||||
else:
|
||||
use_linear_attention = False
|
||||
|
||||
residual = hidden_states
|
||||
|
||||
batch_size, _, height, width = list(hidden_states.size())
|
||||
original_dtype = hidden_states.dtype
|
||||
|
||||
hidden_states = hidden_states.movedim(1, -1)
|
||||
query = attn.to_q(hidden_states)
|
||||
key = attn.to_k(hidden_states)
|
||||
value = attn.to_v(hidden_states)
|
||||
hidden_states = torch.cat([query, key, value], dim=3)
|
||||
hidden_states = hidden_states.movedim(-1, 1)
|
||||
|
||||
multi_scale_qkv = [hidden_states]
|
||||
for block in attn.to_qkv_multiscale:
|
||||
multi_scale_qkv.append(block(hidden_states))
|
||||
|
||||
hidden_states = torch.cat(multi_scale_qkv, dim=1)
|
||||
|
||||
if use_linear_attention:
|
||||
# for linear attention upcast hidden_states to float32
|
||||
hidden_states = hidden_states.to(dtype=torch.float32)
|
||||
|
||||
hidden_states = hidden_states.reshape(batch_size, -1, 3 * attn.attention_head_dim, height * width)
|
||||
|
||||
query, key, value = hidden_states.chunk(3, dim=2)
|
||||
query = attn.nonlinearity(query)
|
||||
key = attn.nonlinearity(key)
|
||||
|
||||
if use_linear_attention:
|
||||
hidden_states = attn.apply_linear_attention(query, key, value)
|
||||
hidden_states = hidden_states.to(dtype=original_dtype)
|
||||
else:
|
||||
hidden_states = attn.apply_quadratic_attention(query, key, value)
|
||||
|
||||
hidden_states = torch.reshape(hidden_states, (batch_size, -1, height, width))
|
||||
hidden_states = attn.to_out(hidden_states.movedim(1, -1)).movedim(-1, 1)
|
||||
|
||||
if attn.norm_type == "rms_norm":
|
||||
hidden_states = attn.norm_out(hidden_states.movedim(1, -1)).movedim(-1, 1)
|
||||
else:
|
||||
hidden_states = attn.norm_out(hidden_states)
|
||||
|
||||
if attn.residual_connection:
|
||||
hidden_states = hidden_states + residual
|
||||
|
||||
return hidden_states
|
||||
|
||||
|
||||
class LoRAAttnProcessor:
|
||||
def __init__(self):
|
||||
pass
|
||||
@@ -5441,165 +5035,6 @@ class FluxSingleAttnProcessor2_0(FluxAttnProcessor2_0):
|
||||
super().__init__()
|
||||
|
||||
|
||||
class SanaLinearAttnProcessor2_0:
|
||||
r"""
|
||||
Processor for implementing scaled dot-product linear attention.
|
||||
"""
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
attn: Attention,
|
||||
hidden_states: torch.Tensor,
|
||||
encoder_hidden_states: Optional[torch.Tensor] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
) -> torch.Tensor:
|
||||
original_dtype = hidden_states.dtype
|
||||
|
||||
if encoder_hidden_states is None:
|
||||
encoder_hidden_states = hidden_states
|
||||
|
||||
query = attn.to_q(hidden_states)
|
||||
key = attn.to_k(encoder_hidden_states)
|
||||
value = attn.to_v(encoder_hidden_states)
|
||||
|
||||
query = query.transpose(1, 2).unflatten(1, (attn.heads, -1))
|
||||
key = key.transpose(1, 2).unflatten(1, (attn.heads, -1)).transpose(2, 3)
|
||||
value = value.transpose(1, 2).unflatten(1, (attn.heads, -1))
|
||||
|
||||
query = F.relu(query)
|
||||
key = F.relu(key)
|
||||
|
||||
query, key, value = query.float(), key.float(), value.float()
|
||||
|
||||
value = F.pad(value, (0, 0, 0, 1), mode="constant", value=1.0)
|
||||
scores = torch.matmul(value, key)
|
||||
hidden_states = torch.matmul(scores, query)
|
||||
|
||||
hidden_states = hidden_states[:, :, :-1] / (hidden_states[:, :, -1:] + 1e-15)
|
||||
hidden_states = hidden_states.flatten(1, 2).transpose(1, 2)
|
||||
hidden_states = hidden_states.to(original_dtype)
|
||||
|
||||
hidden_states = attn.to_out[0](hidden_states)
|
||||
hidden_states = attn.to_out[1](hidden_states)
|
||||
|
||||
if original_dtype == torch.float16:
|
||||
hidden_states = hidden_states.clip(-65504, 65504)
|
||||
|
||||
return hidden_states
|
||||
|
||||
|
||||
class PAGCFGSanaLinearAttnProcessor2_0:
|
||||
r"""
|
||||
Processor for implementing scaled dot-product linear attention.
|
||||
"""
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
attn: Attention,
|
||||
hidden_states: torch.Tensor,
|
||||
encoder_hidden_states: Optional[torch.Tensor] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
) -> torch.Tensor:
|
||||
original_dtype = hidden_states.dtype
|
||||
|
||||
hidden_states_uncond, hidden_states_org, hidden_states_ptb = hidden_states.chunk(3)
|
||||
hidden_states_org = torch.cat([hidden_states_uncond, hidden_states_org])
|
||||
|
||||
query = attn.to_q(hidden_states_org)
|
||||
key = attn.to_k(hidden_states_org)
|
||||
value = attn.to_v(hidden_states_org)
|
||||
|
||||
query = query.transpose(1, 2).unflatten(1, (attn.heads, -1))
|
||||
key = key.transpose(1, 2).unflatten(1, (attn.heads, -1)).transpose(2, 3)
|
||||
value = value.transpose(1, 2).unflatten(1, (attn.heads, -1))
|
||||
|
||||
query = F.relu(query)
|
||||
key = F.relu(key)
|
||||
|
||||
query, key, value = query.float(), key.float(), value.float()
|
||||
|
||||
value = F.pad(value, (0, 0, 0, 1), mode="constant", value=1.0)
|
||||
scores = torch.matmul(value, key)
|
||||
hidden_states_org = torch.matmul(scores, query)
|
||||
|
||||
hidden_states_org = hidden_states_org[:, :, :-1] / (hidden_states_org[:, :, -1:] + 1e-15)
|
||||
hidden_states_org = hidden_states_org.flatten(1, 2).transpose(1, 2)
|
||||
hidden_states_org = hidden_states_org.to(original_dtype)
|
||||
|
||||
hidden_states_org = attn.to_out[0](hidden_states_org)
|
||||
hidden_states_org = attn.to_out[1](hidden_states_org)
|
||||
|
||||
# perturbed path (identity attention)
|
||||
hidden_states_ptb = attn.to_v(hidden_states_ptb).to(original_dtype)
|
||||
|
||||
hidden_states_ptb = attn.to_out[0](hidden_states_ptb)
|
||||
hidden_states_ptb = attn.to_out[1](hidden_states_ptb)
|
||||
|
||||
hidden_states = torch.cat([hidden_states_org, hidden_states_ptb])
|
||||
|
||||
if original_dtype == torch.float16:
|
||||
hidden_states = hidden_states.clip(-65504, 65504)
|
||||
|
||||
return hidden_states
|
||||
|
||||
|
||||
class PAGIdentitySanaLinearAttnProcessor2_0:
|
||||
r"""
|
||||
Processor for implementing scaled dot-product linear attention.
|
||||
"""
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
attn: Attention,
|
||||
hidden_states: torch.Tensor,
|
||||
encoder_hidden_states: Optional[torch.Tensor] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
) -> torch.Tensor:
|
||||
original_dtype = hidden_states.dtype
|
||||
|
||||
hidden_states_org, hidden_states_ptb = hidden_states.chunk(2)
|
||||
|
||||
query = attn.to_q(hidden_states_org)
|
||||
key = attn.to_k(hidden_states_org)
|
||||
value = attn.to_v(hidden_states_org)
|
||||
|
||||
query = query.transpose(1, 2).unflatten(1, (attn.heads, -1))
|
||||
key = key.transpose(1, 2).unflatten(1, (attn.heads, -1)).transpose(2, 3)
|
||||
value = value.transpose(1, 2).unflatten(1, (attn.heads, -1))
|
||||
|
||||
query = F.relu(query)
|
||||
key = F.relu(key)
|
||||
|
||||
query, key, value = query.float(), key.float(), value.float()
|
||||
|
||||
value = F.pad(value, (0, 0, 0, 1), mode="constant", value=1.0)
|
||||
scores = torch.matmul(value, key)
|
||||
hidden_states_org = torch.matmul(scores, query)
|
||||
|
||||
if hidden_states_org.dtype in [torch.float16, torch.bfloat16]:
|
||||
hidden_states_org = hidden_states_org.float()
|
||||
|
||||
hidden_states_org = hidden_states_org[:, :, :-1] / (hidden_states_org[:, :, -1:] + 1e-15)
|
||||
hidden_states_org = hidden_states_org.flatten(1, 2).transpose(1, 2)
|
||||
hidden_states_org = hidden_states_org.to(original_dtype)
|
||||
|
||||
hidden_states_org = attn.to_out[0](hidden_states_org)
|
||||
hidden_states_org = attn.to_out[1](hidden_states_org)
|
||||
|
||||
# perturbed path (identity attention)
|
||||
hidden_states_ptb = attn.to_v(hidden_states_ptb).to(original_dtype)
|
||||
|
||||
hidden_states_ptb = attn.to_out[0](hidden_states_ptb)
|
||||
hidden_states_ptb = attn.to_out[1](hidden_states_ptb)
|
||||
|
||||
hidden_states = torch.cat([hidden_states_org, hidden_states_ptb])
|
||||
|
||||
if original_dtype == torch.float16:
|
||||
hidden_states = hidden_states.clip(-65504, 65504)
|
||||
|
||||
return hidden_states
|
||||
|
||||
|
||||
ADDED_KV_ATTENTION_PROCESSORS = (
|
||||
AttnAddedKVProcessor,
|
||||
SlicedAttnAddedKVProcessor,
|
||||
@@ -5636,7 +5071,6 @@ AttentionProcessor = Union[
|
||||
FusedCogVideoXAttnProcessor2_0,
|
||||
XFormersAttnAddedKVProcessor,
|
||||
XFormersAttnProcessor,
|
||||
XLAFlashAttnProcessor2_0,
|
||||
AttnProcessorNPU,
|
||||
AttnProcessor2_0,
|
||||
MochiVaeAttnProcessor2_0,
|
||||
@@ -5652,12 +5086,6 @@ AttentionProcessor = Union[
|
||||
CustomDiffusionAttnProcessor2_0,
|
||||
SlicedAttnProcessor,
|
||||
SlicedAttnAddedKVProcessor,
|
||||
SanaLinearAttnProcessor2_0,
|
||||
PAGCFGSanaLinearAttnProcessor2_0,
|
||||
PAGIdentitySanaLinearAttnProcessor2_0,
|
||||
SanaMultiscaleLinearAttention,
|
||||
SanaMultiscaleAttnProcessor2_0,
|
||||
SanaMultiscaleAttentionProjection,
|
||||
IPAdapterAttnProcessor,
|
||||
IPAdapterAttnProcessor2_0,
|
||||
IPAdapterXFormersAttnProcessor,
|
||||
|
||||
@@ -1,9 +1,7 @@
|
||||
from .autoencoder_asym_kl import AsymmetricAutoencoderKL
|
||||
from .autoencoder_dc import AutoencoderDC
|
||||
from .autoencoder_kl import AutoencoderKL
|
||||
from .autoencoder_kl_allegro import AutoencoderKLAllegro
|
||||
from .autoencoder_kl_cogvideox import AutoencoderKLCogVideoX
|
||||
from .autoencoder_kl_ltx import AutoencoderKLLTXVideo
|
||||
from .autoencoder_kl_mochi import AutoencoderKLMochi
|
||||
from .autoencoder_kl_temporal_decoder import AutoencoderKLTemporalDecoder
|
||||
from .autoencoder_oobleck import AutoencoderOobleck
|
||||
|
||||
@@ -1,620 +0,0 @@
|
||||
# Copyright 2024 MIT, Tsinghua University, NVIDIA CORPORATION and The HuggingFace Team.
|
||||
# All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from ...configuration_utils import ConfigMixin, register_to_config
|
||||
from ...loaders import FromOriginalModelMixin
|
||||
from ...utils.accelerate_utils import apply_forward_hook
|
||||
from ..activations import get_activation
|
||||
from ..attention_processor import SanaMultiscaleLinearAttention
|
||||
from ..modeling_utils import ModelMixin
|
||||
from ..normalization import RMSNorm, get_normalization
|
||||
from ..transformers.sana_transformer import GLUMBConv
|
||||
from .vae import DecoderOutput, EncoderOutput
|
||||
|
||||
|
||||
class ResBlock(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
in_channels: int,
|
||||
out_channels: int,
|
||||
norm_type: str = "batch_norm",
|
||||
act_fn: str = "relu6",
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
self.norm_type = norm_type
|
||||
|
||||
self.nonlinearity = get_activation(act_fn) if act_fn is not None else nn.Identity()
|
||||
self.conv1 = nn.Conv2d(in_channels, in_channels, 3, 1, 1)
|
||||
self.conv2 = nn.Conv2d(in_channels, out_channels, 3, 1, 1, bias=False)
|
||||
self.norm = get_normalization(norm_type, out_channels)
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
||||
residual = hidden_states
|
||||
hidden_states = self.conv1(hidden_states)
|
||||
hidden_states = self.nonlinearity(hidden_states)
|
||||
hidden_states = self.conv2(hidden_states)
|
||||
|
||||
if self.norm_type == "rms_norm":
|
||||
# move channel to the last dimension so we apply RMSnorm across channel dimension
|
||||
hidden_states = self.norm(hidden_states.movedim(1, -1)).movedim(-1, 1)
|
||||
else:
|
||||
hidden_states = self.norm(hidden_states)
|
||||
|
||||
return hidden_states + residual
|
||||
|
||||
|
||||
class EfficientViTBlock(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
in_channels: int,
|
||||
mult: float = 1.0,
|
||||
attention_head_dim: int = 32,
|
||||
qkv_multiscales: Tuple[int, ...] = (5,),
|
||||
norm_type: str = "batch_norm",
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
self.attn = SanaMultiscaleLinearAttention(
|
||||
in_channels=in_channels,
|
||||
out_channels=in_channels,
|
||||
mult=mult,
|
||||
attention_head_dim=attention_head_dim,
|
||||
norm_type=norm_type,
|
||||
kernel_sizes=qkv_multiscales,
|
||||
residual_connection=True,
|
||||
)
|
||||
|
||||
self.conv_out = GLUMBConv(
|
||||
in_channels=in_channels,
|
||||
out_channels=in_channels,
|
||||
norm_type="rms_norm",
|
||||
)
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
x = self.attn(x)
|
||||
x = self.conv_out(x)
|
||||
return x
|
||||
|
||||
|
||||
def get_block(
|
||||
block_type: str,
|
||||
in_channels: int,
|
||||
out_channels: int,
|
||||
attention_head_dim: int,
|
||||
norm_type: str,
|
||||
act_fn: str,
|
||||
qkv_mutliscales: Tuple[int] = (),
|
||||
):
|
||||
if block_type == "ResBlock":
|
||||
block = ResBlock(in_channels, out_channels, norm_type, act_fn)
|
||||
|
||||
elif block_type == "EfficientViTBlock":
|
||||
block = EfficientViTBlock(
|
||||
in_channels, attention_head_dim=attention_head_dim, norm_type=norm_type, qkv_multiscales=qkv_mutliscales
|
||||
)
|
||||
|
||||
else:
|
||||
raise ValueError(f"Block with {block_type=} is not supported.")
|
||||
|
||||
return block
|
||||
|
||||
|
||||
class DCDownBlock2d(nn.Module):
|
||||
def __init__(self, in_channels: int, out_channels: int, downsample: bool = False, shortcut: bool = True) -> None:
|
||||
super().__init__()
|
||||
|
||||
self.downsample = downsample
|
||||
self.factor = 2
|
||||
self.stride = 1 if downsample else 2
|
||||
self.group_size = in_channels * self.factor**2 // out_channels
|
||||
self.shortcut = shortcut
|
||||
|
||||
out_ratio = self.factor**2
|
||||
if downsample:
|
||||
assert out_channels % out_ratio == 0
|
||||
out_channels = out_channels // out_ratio
|
||||
|
||||
self.conv = nn.Conv2d(
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size=3,
|
||||
stride=self.stride,
|
||||
padding=1,
|
||||
)
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
||||
x = self.conv(hidden_states)
|
||||
if self.downsample:
|
||||
x = F.pixel_unshuffle(x, self.factor)
|
||||
|
||||
if self.shortcut:
|
||||
y = F.pixel_unshuffle(hidden_states, self.factor)
|
||||
y = y.unflatten(1, (-1, self.group_size))
|
||||
y = y.mean(dim=2)
|
||||
hidden_states = x + y
|
||||
else:
|
||||
hidden_states = x
|
||||
|
||||
return hidden_states
|
||||
|
||||
|
||||
class DCUpBlock2d(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
in_channels: int,
|
||||
out_channels: int,
|
||||
interpolate: bool = False,
|
||||
shortcut: bool = True,
|
||||
interpolation_mode: str = "nearest",
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
self.interpolate = interpolate
|
||||
self.interpolation_mode = interpolation_mode
|
||||
self.shortcut = shortcut
|
||||
self.factor = 2
|
||||
self.repeats = out_channels * self.factor**2 // in_channels
|
||||
|
||||
out_ratio = self.factor**2
|
||||
|
||||
if not interpolate:
|
||||
out_channels = out_channels * out_ratio
|
||||
|
||||
self.conv = nn.Conv2d(in_channels, out_channels, 3, 1, 1)
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
||||
if self.interpolate:
|
||||
x = F.interpolate(hidden_states, scale_factor=self.factor, mode=self.interpolation_mode)
|
||||
x = self.conv(x)
|
||||
else:
|
||||
x = self.conv(hidden_states)
|
||||
x = F.pixel_shuffle(x, self.factor)
|
||||
|
||||
if self.shortcut:
|
||||
y = hidden_states.repeat_interleave(self.repeats, dim=1)
|
||||
y = F.pixel_shuffle(y, self.factor)
|
||||
hidden_states = x + y
|
||||
else:
|
||||
hidden_states = x
|
||||
|
||||
return hidden_states
|
||||
|
||||
|
||||
class Encoder(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
in_channels: int,
|
||||
latent_channels: int,
|
||||
attention_head_dim: int = 32,
|
||||
block_type: Union[str, Tuple[str]] = "ResBlock",
|
||||
block_out_channels: Tuple[int] = (128, 256, 512, 512, 1024, 1024),
|
||||
layers_per_block: Tuple[int] = (2, 2, 2, 2, 2, 2),
|
||||
qkv_multiscales: Tuple[Tuple[int, ...], ...] = ((), (), (), (5,), (5,), (5,)),
|
||||
downsample_block_type: str = "pixel_unshuffle",
|
||||
out_shortcut: bool = True,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
num_blocks = len(block_out_channels)
|
||||
|
||||
if isinstance(block_type, str):
|
||||
block_type = (block_type,) * num_blocks
|
||||
|
||||
if layers_per_block[0] > 0:
|
||||
self.conv_in = nn.Conv2d(
|
||||
in_channels,
|
||||
block_out_channels[0] if layers_per_block[0] > 0 else block_out_channels[1],
|
||||
kernel_size=3,
|
||||
stride=1,
|
||||
padding=1,
|
||||
)
|
||||
else:
|
||||
self.conv_in = DCDownBlock2d(
|
||||
in_channels=in_channels,
|
||||
out_channels=block_out_channels[0] if layers_per_block[0] > 0 else block_out_channels[1],
|
||||
downsample=downsample_block_type == "pixel_unshuffle",
|
||||
shortcut=False,
|
||||
)
|
||||
|
||||
down_blocks = []
|
||||
for i, (out_channel, num_layers) in enumerate(zip(block_out_channels, layers_per_block)):
|
||||
down_block_list = []
|
||||
|
||||
for _ in range(num_layers):
|
||||
block = get_block(
|
||||
block_type[i],
|
||||
out_channel,
|
||||
out_channel,
|
||||
attention_head_dim=attention_head_dim,
|
||||
norm_type="rms_norm",
|
||||
act_fn="silu",
|
||||
qkv_mutliscales=qkv_multiscales[i],
|
||||
)
|
||||
down_block_list.append(block)
|
||||
|
||||
if i < num_blocks - 1 and num_layers > 0:
|
||||
downsample_block = DCDownBlock2d(
|
||||
in_channels=out_channel,
|
||||
out_channels=block_out_channels[i + 1],
|
||||
downsample=downsample_block_type == "pixel_unshuffle",
|
||||
shortcut=True,
|
||||
)
|
||||
down_block_list.append(downsample_block)
|
||||
|
||||
down_blocks.append(nn.Sequential(*down_block_list))
|
||||
|
||||
self.down_blocks = nn.ModuleList(down_blocks)
|
||||
|
||||
self.conv_out = nn.Conv2d(block_out_channels[-1], latent_channels, 3, 1, 1)
|
||||
|
||||
self.out_shortcut = out_shortcut
|
||||
if out_shortcut:
|
||||
self.out_shortcut_average_group_size = block_out_channels[-1] // latent_channels
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
||||
hidden_states = self.conv_in(hidden_states)
|
||||
for down_block in self.down_blocks:
|
||||
hidden_states = down_block(hidden_states)
|
||||
|
||||
if self.out_shortcut:
|
||||
x = hidden_states.unflatten(1, (-1, self.out_shortcut_average_group_size))
|
||||
x = x.mean(dim=2)
|
||||
hidden_states = self.conv_out(hidden_states) + x
|
||||
else:
|
||||
hidden_states = self.conv_out(hidden_states)
|
||||
|
||||
return hidden_states
|
||||
|
||||
|
||||
class Decoder(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
in_channels: int,
|
||||
latent_channels: int,
|
||||
attention_head_dim: int = 32,
|
||||
block_type: Union[str, Tuple[str]] = "ResBlock",
|
||||
block_out_channels: Tuple[int] = (128, 256, 512, 512, 1024, 1024),
|
||||
layers_per_block: Tuple[int] = (2, 2, 2, 2, 2, 2),
|
||||
qkv_multiscales: Tuple[Tuple[int, ...], ...] = ((), (), (), (5,), (5,), (5,)),
|
||||
norm_type: Union[str, Tuple[str]] = "rms_norm",
|
||||
act_fn: Union[str, Tuple[str]] = "silu",
|
||||
upsample_block_type: str = "pixel_shuffle",
|
||||
in_shortcut: bool = True,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
num_blocks = len(block_out_channels)
|
||||
|
||||
if isinstance(block_type, str):
|
||||
block_type = (block_type,) * num_blocks
|
||||
if isinstance(norm_type, str):
|
||||
norm_type = (norm_type,) * num_blocks
|
||||
if isinstance(act_fn, str):
|
||||
act_fn = (act_fn,) * num_blocks
|
||||
|
||||
self.conv_in = nn.Conv2d(latent_channels, block_out_channels[-1], 3, 1, 1)
|
||||
|
||||
self.in_shortcut = in_shortcut
|
||||
if in_shortcut:
|
||||
self.in_shortcut_repeats = block_out_channels[-1] // latent_channels
|
||||
|
||||
up_blocks = []
|
||||
for i, (out_channel, num_layers) in reversed(list(enumerate(zip(block_out_channels, layers_per_block)))):
|
||||
up_block_list = []
|
||||
|
||||
if i < num_blocks - 1 and num_layers > 0:
|
||||
upsample_block = DCUpBlock2d(
|
||||
block_out_channels[i + 1],
|
||||
out_channel,
|
||||
interpolate=upsample_block_type == "interpolate",
|
||||
shortcut=True,
|
||||
)
|
||||
up_block_list.append(upsample_block)
|
||||
|
||||
for _ in range(num_layers):
|
||||
block = get_block(
|
||||
block_type[i],
|
||||
out_channel,
|
||||
out_channel,
|
||||
attention_head_dim=attention_head_dim,
|
||||
norm_type=norm_type[i],
|
||||
act_fn=act_fn[i],
|
||||
qkv_mutliscales=qkv_multiscales[i],
|
||||
)
|
||||
up_block_list.append(block)
|
||||
|
||||
up_blocks.insert(0, nn.Sequential(*up_block_list))
|
||||
|
||||
self.up_blocks = nn.ModuleList(up_blocks)
|
||||
|
||||
channels = block_out_channels[0] if layers_per_block[0] > 0 else block_out_channels[1]
|
||||
|
||||
self.norm_out = RMSNorm(channels, 1e-5, elementwise_affine=True, bias=True)
|
||||
self.conv_act = nn.ReLU()
|
||||
self.conv_out = None
|
||||
|
||||
if layers_per_block[0] > 0:
|
||||
self.conv_out = nn.Conv2d(channels, in_channels, 3, 1, 1)
|
||||
else:
|
||||
self.conv_out = DCUpBlock2d(
|
||||
channels, in_channels, interpolate=upsample_block_type == "interpolate", shortcut=False
|
||||
)
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
||||
if self.in_shortcut:
|
||||
x = hidden_states.repeat_interleave(self.in_shortcut_repeats, dim=1)
|
||||
hidden_states = self.conv_in(hidden_states) + x
|
||||
else:
|
||||
hidden_states = self.conv_in(hidden_states)
|
||||
|
||||
for up_block in reversed(self.up_blocks):
|
||||
hidden_states = up_block(hidden_states)
|
||||
|
||||
hidden_states = self.norm_out(hidden_states.movedim(1, -1)).movedim(-1, 1)
|
||||
hidden_states = self.conv_act(hidden_states)
|
||||
hidden_states = self.conv_out(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
|
||||
class AutoencoderDC(ModelMixin, ConfigMixin, FromOriginalModelMixin):
|
||||
r"""
|
||||
An Autoencoder model introduced in [DCAE](https://arxiv.org/abs/2410.10733) and used in
|
||||
[SANA](https://arxiv.org/abs/2410.10629).
|
||||
|
||||
This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
|
||||
for all models (such as downloading or saving).
|
||||
|
||||
Args:
|
||||
in_channels (`int`, defaults to `3`):
|
||||
The number of input channels in samples.
|
||||
latent_channels (`int`, defaults to `32`):
|
||||
The number of channels in the latent space representation.
|
||||
encoder_block_types (`Union[str, Tuple[str]]`, defaults to `"ResBlock"`):
|
||||
The type(s) of block to use in the encoder.
|
||||
decoder_block_types (`Union[str, Tuple[str]]`, defaults to `"ResBlock"`):
|
||||
The type(s) of block to use in the decoder.
|
||||
encoder_block_out_channels (`Tuple[int, ...]`, defaults to `(128, 256, 512, 512, 1024, 1024)`):
|
||||
The number of output channels for each block in the encoder.
|
||||
decoder_block_out_channels (`Tuple[int, ...]`, defaults to `(128, 256, 512, 512, 1024, 1024)`):
|
||||
The number of output channels for each block in the decoder.
|
||||
encoder_layers_per_block (`Tuple[int]`, defaults to `(2, 2, 2, 3, 3, 3)`):
|
||||
The number of layers per block in the encoder.
|
||||
decoder_layers_per_block (`Tuple[int]`, defaults to `(3, 3, 3, 3, 3, 3)`):
|
||||
The number of layers per block in the decoder.
|
||||
encoder_qkv_multiscales (`Tuple[Tuple[int, ...], ...]`, defaults to `((), (), (), (5,), (5,), (5,))`):
|
||||
Multi-scale configurations for the encoder's QKV (query-key-value) transformations.
|
||||
decoder_qkv_multiscales (`Tuple[Tuple[int, ...], ...]`, defaults to `((), (), (), (5,), (5,), (5,))`):
|
||||
Multi-scale configurations for the decoder's QKV (query-key-value) transformations.
|
||||
upsample_block_type (`str`, defaults to `"pixel_shuffle"`):
|
||||
The type of block to use for upsampling in the decoder.
|
||||
downsample_block_type (`str`, defaults to `"pixel_unshuffle"`):
|
||||
The type of block to use for downsampling in the encoder.
|
||||
decoder_norm_types (`Union[str, Tuple[str]]`, defaults to `"rms_norm"`):
|
||||
The normalization type(s) to use in the decoder.
|
||||
decoder_act_fns (`Union[str, Tuple[str]]`, defaults to `"silu"`):
|
||||
The activation function(s) to use in the decoder.
|
||||
scaling_factor (`float`, defaults to `1.0`):
|
||||
The multiplicative inverse of the root mean square of the latent features. This is used to scale the latent
|
||||
space to have unit variance when training the diffusion model. The latents are scaled with the formula `z =
|
||||
z * scaling_factor` before being passed to the diffusion model. When decoding, the latents are scaled back
|
||||
to the original scale with the formula: `z = 1 / scaling_factor * z`.
|
||||
"""
|
||||
|
||||
_supports_gradient_checkpointing = False
|
||||
|
||||
@register_to_config
|
||||
def __init__(
|
||||
self,
|
||||
in_channels: int = 3,
|
||||
latent_channels: int = 32,
|
||||
attention_head_dim: int = 32,
|
||||
encoder_block_types: Union[str, Tuple[str]] = "ResBlock",
|
||||
decoder_block_types: Union[str, Tuple[str]] = "ResBlock",
|
||||
encoder_block_out_channels: Tuple[int, ...] = (128, 256, 512, 512, 1024, 1024),
|
||||
decoder_block_out_channels: Tuple[int, ...] = (128, 256, 512, 512, 1024, 1024),
|
||||
encoder_layers_per_block: Tuple[int] = (2, 2, 2, 3, 3, 3),
|
||||
decoder_layers_per_block: Tuple[int] = (3, 3, 3, 3, 3, 3),
|
||||
encoder_qkv_multiscales: Tuple[Tuple[int, ...], ...] = ((), (), (), (5,), (5,), (5,)),
|
||||
decoder_qkv_multiscales: Tuple[Tuple[int, ...], ...] = ((), (), (), (5,), (5,), (5,)),
|
||||
upsample_block_type: str = "pixel_shuffle",
|
||||
downsample_block_type: str = "pixel_unshuffle",
|
||||
decoder_norm_types: Union[str, Tuple[str]] = "rms_norm",
|
||||
decoder_act_fns: Union[str, Tuple[str]] = "silu",
|
||||
scaling_factor: float = 1.0,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
self.encoder = Encoder(
|
||||
in_channels=in_channels,
|
||||
latent_channels=latent_channels,
|
||||
attention_head_dim=attention_head_dim,
|
||||
block_type=encoder_block_types,
|
||||
block_out_channels=encoder_block_out_channels,
|
||||
layers_per_block=encoder_layers_per_block,
|
||||
qkv_multiscales=encoder_qkv_multiscales,
|
||||
downsample_block_type=downsample_block_type,
|
||||
)
|
||||
self.decoder = Decoder(
|
||||
in_channels=in_channels,
|
||||
latent_channels=latent_channels,
|
||||
attention_head_dim=attention_head_dim,
|
||||
block_type=decoder_block_types,
|
||||
block_out_channels=decoder_block_out_channels,
|
||||
layers_per_block=decoder_layers_per_block,
|
||||
qkv_multiscales=decoder_qkv_multiscales,
|
||||
norm_type=decoder_norm_types,
|
||||
act_fn=decoder_act_fns,
|
||||
upsample_block_type=upsample_block_type,
|
||||
)
|
||||
|
||||
self.spatial_compression_ratio = 2 ** (len(encoder_block_out_channels) - 1)
|
||||
self.temporal_compression_ratio = 1
|
||||
|
||||
# When decoding a batch of video latents at a time, one can save memory by slicing across the batch dimension
|
||||
# to perform decoding of a single video latent at a time.
|
||||
self.use_slicing = False
|
||||
|
||||
# When decoding spatially large video latents, the memory requirement is very high. By breaking the video latent
|
||||
# frames spatially into smaller tiles and performing multiple forward passes for decoding, and then blending the
|
||||
# intermediate tiles together, the memory requirement can be lowered.
|
||||
self.use_tiling = False
|
||||
|
||||
# The minimal tile height and width for spatial tiling to be used
|
||||
self.tile_sample_min_height = 512
|
||||
self.tile_sample_min_width = 512
|
||||
|
||||
# The minimal distance between two spatial tiles
|
||||
self.tile_sample_stride_height = 448
|
||||
self.tile_sample_stride_width = 448
|
||||
|
||||
def enable_tiling(
|
||||
self,
|
||||
tile_sample_min_height: Optional[int] = None,
|
||||
tile_sample_min_width: Optional[int] = None,
|
||||
tile_sample_stride_height: Optional[float] = None,
|
||||
tile_sample_stride_width: Optional[float] = None,
|
||||
) -> None:
|
||||
r"""
|
||||
Enable tiled AE decoding. When this option is enabled, the AE will split the input tensor into tiles to compute
|
||||
decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
|
||||
processing larger images.
|
||||
|
||||
Args:
|
||||
tile_sample_min_height (`int`, *optional*):
|
||||
The minimum height required for a sample to be separated into tiles across the height dimension.
|
||||
tile_sample_min_width (`int`, *optional*):
|
||||
The minimum width required for a sample to be separated into tiles across the width dimension.
|
||||
tile_sample_stride_height (`int`, *optional*):
|
||||
The minimum amount of overlap between two consecutive vertical tiles. This is to ensure that there are
|
||||
no tiling artifacts produced across the height dimension.
|
||||
tile_sample_stride_width (`int`, *optional*):
|
||||
The stride between two consecutive horizontal tiles. This is to ensure that there are no tiling
|
||||
artifacts produced across the width dimension.
|
||||
"""
|
||||
self.use_tiling = True
|
||||
self.tile_sample_min_height = tile_sample_min_height or self.tile_sample_min_height
|
||||
self.tile_sample_min_width = tile_sample_min_width or self.tile_sample_min_width
|
||||
self.tile_sample_stride_height = tile_sample_stride_height or self.tile_sample_stride_height
|
||||
self.tile_sample_stride_width = tile_sample_stride_width or self.tile_sample_stride_width
|
||||
|
||||
def disable_tiling(self) -> None:
|
||||
r"""
|
||||
Disable tiled AE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
|
||||
decoding in one step.
|
||||
"""
|
||||
self.use_tiling = False
|
||||
|
||||
def enable_slicing(self) -> None:
|
||||
r"""
|
||||
Enable sliced AE decoding. When this option is enabled, the AE will split the input tensor in slices to compute
|
||||
decoding in several steps. This is useful to save some memory and allow larger batch sizes.
|
||||
"""
|
||||
self.use_slicing = True
|
||||
|
||||
def disable_slicing(self) -> None:
|
||||
r"""
|
||||
Disable sliced AE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
|
||||
decoding in one step.
|
||||
"""
|
||||
self.use_slicing = False
|
||||
|
||||
def _encode(self, x: torch.Tensor) -> torch.Tensor:
|
||||
batch_size, num_channels, height, width = x.shape
|
||||
|
||||
if self.use_tiling and (width > self.tile_sample_min_width or height > self.tile_sample_min_height):
|
||||
return self.tiled_encode(x, return_dict=False)[0]
|
||||
|
||||
encoded = self.encoder(x)
|
||||
|
||||
return encoded
|
||||
|
||||
@apply_forward_hook
|
||||
def encode(self, x: torch.Tensor, return_dict: bool = True) -> Union[EncoderOutput, Tuple[torch.Tensor]]:
|
||||
r"""
|
||||
Encode a batch of images into latents.
|
||||
|
||||
Args:
|
||||
x (`torch.Tensor`): Input batch of images.
|
||||
return_dict (`bool`, defaults to `True`):
|
||||
Whether to return a [`~models.vae.EncoderOutput`] instead of a plain tuple.
|
||||
|
||||
Returns:
|
||||
The latent representations of the encoded videos. If `return_dict` is True, a
|
||||
[`~models.vae.EncoderOutput`] is returned, otherwise a plain `tuple` is returned.
|
||||
"""
|
||||
if self.use_slicing and x.shape[0] > 1:
|
||||
encoded_slices = [self._encode(x_slice) for x_slice in x.split(1)]
|
||||
encoded = torch.cat(encoded_slices)
|
||||
else:
|
||||
encoded = self._encode(x)
|
||||
|
||||
if not return_dict:
|
||||
return (encoded,)
|
||||
return EncoderOutput(latent=encoded)
|
||||
|
||||
def _decode(self, z: torch.Tensor) -> torch.Tensor:
|
||||
batch_size, num_channels, height, width = z.shape
|
||||
|
||||
if self.use_tiling and (width > self.tile_latent_min_width or height > self.tile_latent_min_height):
|
||||
return self.tiled_decode(z, return_dict=False)[0]
|
||||
|
||||
decoded = self.decoder(z)
|
||||
|
||||
return decoded
|
||||
|
||||
@apply_forward_hook
|
||||
def decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, Tuple[torch.Tensor]]:
|
||||
r"""
|
||||
Decode a batch of images.
|
||||
|
||||
Args:
|
||||
z (`torch.Tensor`): Input batch of latent vectors.
|
||||
return_dict (`bool`, defaults to `True`):
|
||||
Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
|
||||
|
||||
Returns:
|
||||
[`~models.vae.DecoderOutput`] or `tuple`:
|
||||
If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
|
||||
returned.
|
||||
"""
|
||||
if self.use_slicing and z.size(0) > 1:
|
||||
decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)]
|
||||
decoded = torch.cat(decoded_slices)
|
||||
else:
|
||||
decoded = self._decode(z)
|
||||
|
||||
if not return_dict:
|
||||
return (decoded,)
|
||||
return DecoderOutput(sample=decoded)
|
||||
|
||||
def tiled_encode(self, x: torch.Tensor, return_dict: bool = True) -> torch.Tensor:
|
||||
raise NotImplementedError("`tiled_encode` has not been implemented for AutoencoderDC.")
|
||||
|
||||
def tiled_decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
|
||||
raise NotImplementedError("`tiled_decode` has not been implemented for AutoencoderDC.")
|
||||
|
||||
def forward(self, sample: torch.Tensor, return_dict: bool = True) -> torch.Tensor:
|
||||
encoded = self.encode(sample, return_dict=False)[0]
|
||||
decoded = self.decode(encoded, return_dict=False)[0]
|
||||
if not return_dict:
|
||||
return (decoded,)
|
||||
return DecoderOutput(sample=decoded)
|
||||
@@ -17,7 +17,6 @@ import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from ...configuration_utils import ConfigMixin, register_to_config
|
||||
from ...loaders import PeftAdapterMixin
|
||||
from ...loaders.single_file_model import FromOriginalModelMixin
|
||||
from ...utils import deprecate
|
||||
from ...utils.accelerate_utils import apply_forward_hook
|
||||
@@ -35,7 +34,7 @@ from ..modeling_utils import ModelMixin
|
||||
from .vae import Decoder, DecoderOutput, DiagonalGaussianDistribution, Encoder
|
||||
|
||||
|
||||
class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin, PeftAdapterMixin):
|
||||
class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin):
|
||||
r"""
|
||||
A VAE model with KL loss for encoding images into latents and decoding latent representations into images.
|
||||
|
||||
|
||||
@@ -433,7 +433,7 @@ class CogVideoXDownBlock3D(nn.Module):
|
||||
hidden_states,
|
||||
temb,
|
||||
zq,
|
||||
conv_cache.get(conv_cache_key),
|
||||
conv_cache=conv_cache.get(conv_cache_key),
|
||||
)
|
||||
else:
|
||||
hidden_states, new_conv_cache[conv_cache_key] = resnet(
|
||||
@@ -531,7 +531,7 @@ class CogVideoXMidBlock3D(nn.Module):
|
||||
return create_forward
|
||||
|
||||
hidden_states, new_conv_cache[conv_cache_key] = torch.utils.checkpoint.checkpoint(
|
||||
create_custom_forward(resnet), hidden_states, temb, zq, conv_cache.get(conv_cache_key)
|
||||
create_custom_forward(resnet), hidden_states, temb, zq, conv_cache=conv_cache.get(conv_cache_key)
|
||||
)
|
||||
else:
|
||||
hidden_states, new_conv_cache[conv_cache_key] = resnet(
|
||||
@@ -649,7 +649,7 @@ class CogVideoXUpBlock3D(nn.Module):
|
||||
hidden_states,
|
||||
temb,
|
||||
zq,
|
||||
conv_cache.get(conv_cache_key),
|
||||
conv_cache=conv_cache.get(conv_cache_key),
|
||||
)
|
||||
else:
|
||||
hidden_states, new_conv_cache[conv_cache_key] = resnet(
|
||||
@@ -789,7 +789,7 @@ class CogVideoXEncoder3D(nn.Module):
|
||||
hidden_states,
|
||||
temb,
|
||||
None,
|
||||
conv_cache.get(conv_cache_key),
|
||||
conv_cache=conv_cache.get(conv_cache_key),
|
||||
)
|
||||
|
||||
# 2. Mid
|
||||
@@ -798,14 +798,14 @@ class CogVideoXEncoder3D(nn.Module):
|
||||
hidden_states,
|
||||
temb,
|
||||
None,
|
||||
conv_cache.get("mid_block"),
|
||||
conv_cache=conv_cache.get("mid_block"),
|
||||
)
|
||||
else:
|
||||
# 1. Down
|
||||
for i, down_block in enumerate(self.down_blocks):
|
||||
conv_cache_key = f"down_block_{i}"
|
||||
hidden_states, new_conv_cache[conv_cache_key] = down_block(
|
||||
hidden_states, temb, None, conv_cache.get(conv_cache_key)
|
||||
hidden_states, temb, None, conv_cache=conv_cache.get(conv_cache_key)
|
||||
)
|
||||
|
||||
# 2. Mid
|
||||
@@ -953,7 +953,7 @@ class CogVideoXDecoder3D(nn.Module):
|
||||
hidden_states,
|
||||
temb,
|
||||
sample,
|
||||
conv_cache.get("mid_block"),
|
||||
conv_cache=conv_cache.get("mid_block"),
|
||||
)
|
||||
|
||||
# 2. Up
|
||||
@@ -964,7 +964,7 @@ class CogVideoXDecoder3D(nn.Module):
|
||||
hidden_states,
|
||||
temb,
|
||||
sample,
|
||||
conv_cache.get(conv_cache_key),
|
||||
conv_cache=conv_cache.get(conv_cache_key),
|
||||
)
|
||||
else:
|
||||
# 1. Mid
|
||||
@@ -1476,7 +1476,7 @@ class AutoencoderKLCogVideoX(ModelMixin, ConfigMixin, FromOriginalModelMixin):
|
||||
z = posterior.sample(generator=generator)
|
||||
else:
|
||||
z = posterior.mode()
|
||||
dec = self.decode(z).sample
|
||||
dec = self.decode(z)
|
||||
if not return_dict:
|
||||
return (dec,)
|
||||
return DecoderOutput(sample=dec)
|
||||
return dec
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -437,8 +437,7 @@ class FourierFeatures(nn.Module):
|
||||
|
||||
def forward(self, inputs: torch.Tensor) -> torch.Tensor:
|
||||
r"""Forward method of the `FourierFeatures` class."""
|
||||
original_dtype = inputs.dtype
|
||||
inputs = inputs.to(torch.float32)
|
||||
|
||||
num_channels = inputs.shape[1]
|
||||
num_freqs = (self.stop - self.start) // self.step
|
||||
|
||||
@@ -451,7 +450,7 @@ class FourierFeatures(nn.Module):
|
||||
# Scale channels by frequency.
|
||||
h = w * h
|
||||
|
||||
return torch.cat([inputs, torch.sin(h), torch.cos(h)], dim=1).to(original_dtype)
|
||||
return torch.cat([inputs, torch.sin(h), torch.cos(h)], dim=1)
|
||||
|
||||
|
||||
class MochiEncoder3D(nn.Module):
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import itertools
|
||||
from typing import Dict, Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
@@ -95,7 +94,7 @@ class TemporalDecoder(nn.Module):
|
||||
|
||||
sample = self.conv_in(sample)
|
||||
|
||||
upscale_dtype = next(itertools.chain(self.up_blocks.parameters(), self.up_blocks.buffers())).dtype
|
||||
upscale_dtype = next(iter(self.up_blocks.parameters())).dtype
|
||||
if torch.is_grad_enabled() and self.gradient_checkpointing:
|
||||
|
||||
def create_custom_forward(module):
|
||||
@@ -229,6 +228,14 @@ class AutoencoderKLTemporalDecoder(ModelMixin, ConfigMixin):
|
||||
|
||||
self.quant_conv = nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1)
|
||||
|
||||
sample_size = (
|
||||
self.config.sample_size[0]
|
||||
if isinstance(self.config.sample_size, (list, tuple))
|
||||
else self.config.sample_size
|
||||
)
|
||||
self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1)))
|
||||
self.tile_overlap_factor = 0.25
|
||||
|
||||
def _set_gradient_checkpointing(self, module, value=False):
|
||||
if isinstance(module, (Encoder, TemporalDecoder)):
|
||||
module.gradient_checkpointing = value
|
||||
|
||||
@@ -310,9 +310,7 @@ class AutoencoderTiny(ModelMixin, ConfigMixin):
|
||||
self, x: torch.Tensor, generator: Optional[torch.Generator] = None, return_dict: bool = True
|
||||
) -> Union[DecoderOutput, Tuple[torch.Tensor]]:
|
||||
if self.use_slicing and x.shape[0] > 1:
|
||||
output = [
|
||||
self._tiled_decode(x_slice) if self.use_tiling else self.decoder(x_slice) for x_slice in x.split(1)
|
||||
]
|
||||
output = [self._tiled_decode(x_slice) if self.use_tiling else self.decoder(x) for x_slice in x.split(1)]
|
||||
output = torch.cat(output)
|
||||
else:
|
||||
output = self._tiled_decode(x) if self.use_tiling else self.decoder(x)
|
||||
@@ -343,7 +341,7 @@ class AutoencoderTiny(ModelMixin, ConfigMixin):
|
||||
# as if we were loading the latents from an RGBA uint8 image.
|
||||
unscaled_enc = self.unscale_latents(scaled_enc / 255.0)
|
||||
|
||||
dec = self.decode(unscaled_enc).sample
|
||||
dec = self.decode(unscaled_enc)
|
||||
|
||||
if not return_dict:
|
||||
return (dec,)
|
||||
|
||||
@@ -30,19 +30,6 @@ from ..unets.unet_2d_blocks import (
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class EncoderOutput(BaseOutput):
|
||||
r"""
|
||||
Output of encoding method.
|
||||
|
||||
Args:
|
||||
latent (`torch.Tensor` of shape `(batch_size, num_channels, latent_height, latent_width)`):
|
||||
The encoded latent.
|
||||
"""
|
||||
|
||||
latent: torch.Tensor
|
||||
|
||||
|
||||
@dataclass
|
||||
class DecoderOutput(BaseOutput):
|
||||
r"""
|
||||
|
||||
@@ -11,10 +11,9 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from typing import Optional, Tuple, Union
|
||||
|
||||
from ..utils import deprecate
|
||||
from .controlnets.controlnet import ( # noqa
|
||||
BaseOutput,
|
||||
ControlNetConditioningEmbedding,
|
||||
ControlNetModel,
|
||||
ControlNetOutput,
|
||||
@@ -25,91 +24,19 @@ from .controlnets.controlnet import ( # noqa
|
||||
class ControlNetOutput(ControlNetOutput):
|
||||
def __init__(self, *args, **kwargs):
|
||||
deprecation_message = "Importing `ControlNetOutput` from `diffusers.models.controlnet` is deprecated and this will be removed in a future version. Please use `from diffusers.models.controlnets.controlnet import ControlNetOutput`, instead."
|
||||
deprecate("diffusers.models.controlnet.ControlNetOutput", "0.34", deprecation_message)
|
||||
deprecate("ControlNetOutput", "0.34", deprecation_message)
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
|
||||
class ControlNetModel(ControlNetModel):
|
||||
def __init__(
|
||||
self,
|
||||
in_channels: int = 4,
|
||||
conditioning_channels: int = 3,
|
||||
flip_sin_to_cos: bool = True,
|
||||
freq_shift: int = 0,
|
||||
down_block_types: Tuple[str, ...] = (
|
||||
"CrossAttnDownBlock2D",
|
||||
"CrossAttnDownBlock2D",
|
||||
"CrossAttnDownBlock2D",
|
||||
"DownBlock2D",
|
||||
),
|
||||
mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
|
||||
only_cross_attention: Union[bool, Tuple[bool]] = False,
|
||||
block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
|
||||
layers_per_block: int = 2,
|
||||
downsample_padding: int = 1,
|
||||
mid_block_scale_factor: float = 1,
|
||||
act_fn: str = "silu",
|
||||
norm_num_groups: Optional[int] = 32,
|
||||
norm_eps: float = 1e-5,
|
||||
cross_attention_dim: int = 1280,
|
||||
transformer_layers_per_block: Union[int, Tuple[int, ...]] = 1,
|
||||
encoder_hid_dim: Optional[int] = None,
|
||||
encoder_hid_dim_type: Optional[str] = None,
|
||||
attention_head_dim: Union[int, Tuple[int, ...]] = 8,
|
||||
num_attention_heads: Optional[Union[int, Tuple[int, ...]]] = None,
|
||||
use_linear_projection: bool = False,
|
||||
class_embed_type: Optional[str] = None,
|
||||
addition_embed_type: Optional[str] = None,
|
||||
addition_time_embed_dim: Optional[int] = None,
|
||||
num_class_embeds: Optional[int] = None,
|
||||
upcast_attention: bool = False,
|
||||
resnet_time_scale_shift: str = "default",
|
||||
projection_class_embeddings_input_dim: Optional[int] = None,
|
||||
controlnet_conditioning_channel_order: str = "rgb",
|
||||
conditioning_embedding_out_channels: Optional[Tuple[int, ...]] = (16, 32, 96, 256),
|
||||
global_pool_conditions: bool = False,
|
||||
addition_embed_type_num_heads: int = 64,
|
||||
):
|
||||
def __init__(self, *args, **kwargs):
|
||||
deprecation_message = "Importing `ControlNetModel` from `diffusers.models.controlnet` is deprecated and this will be removed in a future version. Please use `from diffusers.models.controlnets.controlnet import ControlNetModel`, instead."
|
||||
deprecate("diffusers.models.controlnet.ControlNetModel", "0.34", deprecation_message)
|
||||
super().__init__(
|
||||
in_channels=in_channels,
|
||||
conditioning_channels=conditioning_channels,
|
||||
flip_sin_to_cos=flip_sin_to_cos,
|
||||
freq_shift=freq_shift,
|
||||
down_block_types=down_block_types,
|
||||
mid_block_type=mid_block_type,
|
||||
only_cross_attention=only_cross_attention,
|
||||
block_out_channels=block_out_channels,
|
||||
layers_per_block=layers_per_block,
|
||||
downsample_padding=downsample_padding,
|
||||
mid_block_scale_factor=mid_block_scale_factor,
|
||||
act_fn=act_fn,
|
||||
norm_num_groups=norm_num_groups,
|
||||
norm_eps=norm_eps,
|
||||
cross_attention_dim=cross_attention_dim,
|
||||
transformer_layers_per_block=transformer_layers_per_block,
|
||||
encoder_hid_dim=encoder_hid_dim,
|
||||
encoder_hid_dim_type=encoder_hid_dim_type,
|
||||
attention_head_dim=attention_head_dim,
|
||||
num_attention_heads=num_attention_heads,
|
||||
use_linear_projection=use_linear_projection,
|
||||
class_embed_type=class_embed_type,
|
||||
addition_embed_type=addition_embed_type,
|
||||
addition_time_embed_dim=addition_time_embed_dim,
|
||||
num_class_embeds=num_class_embeds,
|
||||
upcast_attention=upcast_attention,
|
||||
resnet_time_scale_shift=resnet_time_scale_shift,
|
||||
projection_class_embeddings_input_dim=projection_class_embeddings_input_dim,
|
||||
controlnet_conditioning_channel_order=controlnet_conditioning_channel_order,
|
||||
conditioning_embedding_out_channels=conditioning_embedding_out_channels,
|
||||
global_pool_conditions=global_pool_conditions,
|
||||
addition_embed_type_num_heads=addition_embed_type_num_heads,
|
||||
)
|
||||
deprecate("ControlNetModel", "0.34", deprecation_message)
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
|
||||
class ControlNetConditioningEmbedding(ControlNetConditioningEmbedding):
|
||||
def __init__(self, *args, **kwargs):
|
||||
deprecation_message = "Importing `ControlNetConditioningEmbedding` from `diffusers.models.controlnet` is deprecated and this will be removed in a future version. Please use `from diffusers.models.controlnets.controlnet import ControlNetConditioningEmbedding`, instead."
|
||||
deprecate("diffusers.models.controlnet.ControlNetConditioningEmbedding", "0.34", deprecation_message)
|
||||
deprecate("ControlNetConditioningEmbedding", "0.34", deprecation_message)
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
@@ -13,8 +13,6 @@
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
from typing import List
|
||||
|
||||
from ..utils import deprecate, logging
|
||||
from .controlnets.controlnet_flux import FluxControlNetModel, FluxControlNetOutput, FluxMultiControlNetModel
|
||||
|
||||
@@ -25,46 +23,19 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
||||
class FluxControlNetOutput(FluxControlNetOutput):
|
||||
def __init__(self, *args, **kwargs):
|
||||
deprecation_message = "Importing `FluxControlNetOutput` from `diffusers.models.controlnet_flux` is deprecated and this will be removed in a future version. Please use `from diffusers.models.controlnets.controlnet_flux import FluxControlNetOutput`, instead."
|
||||
deprecate("diffusers.models.controlnet_flux.FluxControlNetOutput", "0.34", deprecation_message)
|
||||
deprecate("FluxControlNetOutput", "0.34", deprecation_message)
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
|
||||
class FluxControlNetModel(FluxControlNetModel):
|
||||
def __init__(
|
||||
self,
|
||||
patch_size: int = 1,
|
||||
in_channels: int = 64,
|
||||
num_layers: int = 19,
|
||||
num_single_layers: int = 38,
|
||||
attention_head_dim: int = 128,
|
||||
num_attention_heads: int = 24,
|
||||
joint_attention_dim: int = 4096,
|
||||
pooled_projection_dim: int = 768,
|
||||
guidance_embeds: bool = False,
|
||||
axes_dims_rope: List[int] = [16, 56, 56],
|
||||
num_mode: int = None,
|
||||
conditioning_embedding_channels: int = None,
|
||||
):
|
||||
def __init__(self, *args, **kwargs):
|
||||
deprecation_message = "Importing `FluxControlNetModel` from `diffusers.models.controlnet_flux` is deprecated and this will be removed in a future version. Please use `from diffusers.models.controlnets.controlnet_flux import FluxControlNetModel`, instead."
|
||||
deprecate("diffusers.models.controlnet_flux.FluxControlNetModel", "0.34", deprecation_message)
|
||||
super().__init__(
|
||||
patch_size=patch_size,
|
||||
in_channels=in_channels,
|
||||
num_layers=num_layers,
|
||||
num_single_layers=num_single_layers,
|
||||
attention_head_dim=attention_head_dim,
|
||||
num_attention_heads=num_attention_heads,
|
||||
joint_attention_dim=joint_attention_dim,
|
||||
pooled_projection_dim=pooled_projection_dim,
|
||||
guidance_embeds=guidance_embeds,
|
||||
axes_dims_rope=axes_dims_rope,
|
||||
num_mode=num_mode,
|
||||
conditioning_embedding_channels=conditioning_embedding_channels,
|
||||
)
|
||||
deprecate("FluxControlNetModel", "0.34", deprecation_message)
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
|
||||
class FluxMultiControlNetModel(FluxMultiControlNetModel):
|
||||
def __init__(self, *args, **kwargs):
|
||||
deprecation_message = "Importing `FluxMultiControlNetModel` from `diffusers.models.controlnet_flux` is deprecated and this will be removed in a future version. Please use `from diffusers.models.controlnets.controlnet_flux import FluxMultiControlNetModel`, instead."
|
||||
deprecate("diffusers.models.controlnet_flux.FluxMultiControlNetModel", "0.34", deprecation_message)
|
||||
deprecate("FluxMultiControlNetModel", "0.34", deprecation_message)
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
@@ -23,46 +23,19 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
||||
class SD3ControlNetOutput(SD3ControlNetOutput):
|
||||
def __init__(self, *args, **kwargs):
|
||||
deprecation_message = "Importing `SD3ControlNetOutput` from `diffusers.models.controlnet_sd3` is deprecated and this will be removed in a future version. Please use `from diffusers.models.controlnets.controlnet_sd3 import SD3ControlNetOutput`, instead."
|
||||
deprecate("diffusers.models.controlnet_sd3.SD3ControlNetOutput", "0.34", deprecation_message)
|
||||
deprecate("SD3ControlNetOutput", "0.34", deprecation_message)
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
|
||||
class SD3ControlNetModel(SD3ControlNetModel):
|
||||
def __init__(
|
||||
self,
|
||||
sample_size: int = 128,
|
||||
patch_size: int = 2,
|
||||
in_channels: int = 16,
|
||||
num_layers: int = 18,
|
||||
attention_head_dim: int = 64,
|
||||
num_attention_heads: int = 18,
|
||||
joint_attention_dim: int = 4096,
|
||||
caption_projection_dim: int = 1152,
|
||||
pooled_projection_dim: int = 2048,
|
||||
out_channels: int = 16,
|
||||
pos_embed_max_size: int = 96,
|
||||
extra_conditioning_channels: int = 0,
|
||||
):
|
||||
def __init__(self, *args, **kwargs):
|
||||
deprecation_message = "Importing `SD3ControlNetModel` from `diffusers.models.controlnet_sd3` is deprecated and this will be removed in a future version. Please use `from diffusers.models.controlnets.controlnet_sd3 import SD3ControlNetModel`, instead."
|
||||
deprecate("diffusers.models.controlnet_sd3.SD3ControlNetModel", "0.34", deprecation_message)
|
||||
super().__init__(
|
||||
sample_size=sample_size,
|
||||
patch_size=patch_size,
|
||||
in_channels=in_channels,
|
||||
num_layers=num_layers,
|
||||
attention_head_dim=attention_head_dim,
|
||||
num_attention_heads=num_attention_heads,
|
||||
joint_attention_dim=joint_attention_dim,
|
||||
caption_projection_dim=caption_projection_dim,
|
||||
pooled_projection_dim=pooled_projection_dim,
|
||||
out_channels=out_channels,
|
||||
pos_embed_max_size=pos_embed_max_size,
|
||||
extra_conditioning_channels=extra_conditioning_channels,
|
||||
)
|
||||
deprecate("SD3ControlNetModel", "0.34", deprecation_message)
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
|
||||
class SD3MultiControlNetModel(SD3MultiControlNetModel):
|
||||
def __init__(self, *args, **kwargs):
|
||||
deprecation_message = "Importing `SD3MultiControlNetModel` from `diffusers.models.controlnet_sd3` is deprecated and this will be removed in a future version. Please use `from diffusers.models.controlnets.controlnet_sd3 import SD3MultiControlNetModel`, instead."
|
||||
deprecate("diffusers.models.controlnet_sd3.SD3MultiControlNetModel", "0.34", deprecation_message)
|
||||
deprecate("SD3MultiControlNetModel", "0.34", deprecation_message)
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
@@ -13,8 +13,6 @@
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
from typing import Optional, Tuple, Union
|
||||
|
||||
from ..utils import deprecate, logging
|
||||
from .controlnets.controlnet_sparsectrl import ( # noqa
|
||||
SparseControlNetConditioningEmbedding,
|
||||
@@ -30,87 +28,19 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
||||
class SparseControlNetOutput(SparseControlNetOutput):
|
||||
def __init__(self, *args, **kwargs):
|
||||
deprecation_message = "Importing `SparseControlNetOutput` from `diffusers.models.controlnet_sparsectrl` is deprecated and this will be removed in a future version. Please use `from diffusers.models.controlnets.controlnet_sparsectrl import SparseControlNetOutput`, instead."
|
||||
deprecate("diffusers.models.controlnet_sparsectrl.SparseControlNetOutput", "0.34", deprecation_message)
|
||||
deprecate("SparseControlNetOutput", "0.34", deprecation_message)
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
|
||||
class SparseControlNetConditioningEmbedding(SparseControlNetConditioningEmbedding):
|
||||
def __init__(self, *args, **kwargs):
|
||||
deprecation_message = "Importing `SparseControlNetConditioningEmbedding` from `diffusers.models.controlnet_sparsectrl` is deprecated and this will be removed in a future version. Please use `from diffusers.models.controlnets.controlnet_sparsectrl import SparseControlNetConditioningEmbedding`, instead."
|
||||
deprecate(
|
||||
"diffusers.models.controlnet_sparsectrl.SparseControlNetConditioningEmbedding", "0.34", deprecation_message
|
||||
)
|
||||
deprecate("SparseControlNetConditioningEmbedding", "0.34", deprecation_message)
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
|
||||
class SparseControlNetModel(SparseControlNetModel):
|
||||
def __init__(
|
||||
self,
|
||||
in_channels: int = 4,
|
||||
conditioning_channels: int = 4,
|
||||
flip_sin_to_cos: bool = True,
|
||||
freq_shift: int = 0,
|
||||
down_block_types: Tuple[str, ...] = (
|
||||
"CrossAttnDownBlockMotion",
|
||||
"CrossAttnDownBlockMotion",
|
||||
"CrossAttnDownBlockMotion",
|
||||
"DownBlockMotion",
|
||||
),
|
||||
only_cross_attention: Union[bool, Tuple[bool]] = False,
|
||||
block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
|
||||
layers_per_block: int = 2,
|
||||
downsample_padding: int = 1,
|
||||
mid_block_scale_factor: float = 1,
|
||||
act_fn: str = "silu",
|
||||
norm_num_groups: Optional[int] = 32,
|
||||
norm_eps: float = 1e-5,
|
||||
cross_attention_dim: int = 768,
|
||||
transformer_layers_per_block: Union[int, Tuple[int, ...]] = 1,
|
||||
transformer_layers_per_mid_block: Optional[Union[int, Tuple[int]]] = None,
|
||||
temporal_transformer_layers_per_block: Union[int, Tuple[int, ...]] = 1,
|
||||
attention_head_dim: Union[int, Tuple[int, ...]] = 8,
|
||||
num_attention_heads: Optional[Union[int, Tuple[int, ...]]] = None,
|
||||
use_linear_projection: bool = False,
|
||||
upcast_attention: bool = False,
|
||||
resnet_time_scale_shift: str = "default",
|
||||
conditioning_embedding_out_channels: Optional[Tuple[int, ...]] = (16, 32, 96, 256),
|
||||
global_pool_conditions: bool = False,
|
||||
controlnet_conditioning_channel_order: str = "rgb",
|
||||
motion_max_seq_length: int = 32,
|
||||
motion_num_attention_heads: int = 8,
|
||||
concat_conditioning_mask: bool = True,
|
||||
use_simplified_condition_embedding: bool = True,
|
||||
):
|
||||
def __init__(self, *args, **kwargs):
|
||||
deprecation_message = "Importing `SparseControlNetModel` from `diffusers.models.controlnet_sparsectrl` is deprecated and this will be removed in a future version. Please use `from diffusers.models.controlnets.controlnet_sparsectrl import SparseControlNetModel`, instead."
|
||||
deprecate("diffusers.models.controlnet_sparsectrl.SparseControlNetModel", "0.34", deprecation_message)
|
||||
super().__init__(
|
||||
in_channels=in_channels,
|
||||
conditioning_channels=conditioning_channels,
|
||||
flip_sin_to_cos=flip_sin_to_cos,
|
||||
freq_shift=freq_shift,
|
||||
down_block_types=down_block_types,
|
||||
only_cross_attention=only_cross_attention,
|
||||
block_out_channels=block_out_channels,
|
||||
layers_per_block=layers_per_block,
|
||||
downsample_padding=downsample_padding,
|
||||
mid_block_scale_factor=mid_block_scale_factor,
|
||||
act_fn=act_fn,
|
||||
norm_num_groups=norm_num_groups,
|
||||
norm_eps=norm_eps,
|
||||
cross_attention_dim=cross_attention_dim,
|
||||
transformer_layers_per_block=transformer_layers_per_block,
|
||||
transformer_layers_per_mid_block=transformer_layers_per_mid_block,
|
||||
temporal_transformer_layers_per_block=temporal_transformer_layers_per_block,
|
||||
attention_head_dim=attention_head_dim,
|
||||
num_attention_heads=num_attention_heads,
|
||||
use_linear_projection=use_linear_projection,
|
||||
upcast_attention=upcast_attention,
|
||||
resnet_time_scale_shift=resnet_time_scale_shift,
|
||||
conditioning_embedding_out_channels=conditioning_embedding_out_channels,
|
||||
global_pool_conditions=global_pool_conditions,
|
||||
controlnet_conditioning_channel_order=controlnet_conditioning_channel_order,
|
||||
motion_max_seq_length=motion_max_seq_length,
|
||||
motion_num_attention_heads=motion_num_attention_heads,
|
||||
concat_conditioning_mask=concat_conditioning_mask,
|
||||
use_simplified_condition_embedding=use_simplified_condition_embedding,
|
||||
)
|
||||
deprecate("SparseControlNetModel", "0.34", deprecation_message)
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
@@ -15,7 +15,6 @@ if is_torch_available():
|
||||
SparseControlNetModel,
|
||||
SparseControlNetOutput,
|
||||
)
|
||||
from .controlnet_union import ControlNetUnionModel
|
||||
from .controlnet_xs import ControlNetXSAdapter, ControlNetXSOutput, UNetControlNetXSModel
|
||||
from .multicontrolnet import MultiControlNetModel
|
||||
|
||||
|
||||
@@ -22,8 +22,8 @@ from ...configuration_utils import ConfigMixin, register_to_config
|
||||
from ...loaders import PeftAdapterMixin
|
||||
from ...models.attention_processor import AttentionProcessor
|
||||
from ...models.modeling_utils import ModelMixin
|
||||
from ...utils import USE_PEFT_BACKEND, BaseOutput, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
|
||||
from ..controlnets.controlnet import ControlNetConditioningEmbedding, zero_module
|
||||
from ...utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
|
||||
from ..controlnet import BaseOutput, ControlNetConditioningEmbedding, zero_module
|
||||
from ..embeddings import CombinedTimestepGuidanceTextProjEmbeddings, CombinedTimestepTextProjEmbeddings, FluxPosEmbed
|
||||
from ..modeling_outputs import Transformer2DModelOutput
|
||||
from ..transformers.transformer_flux import FluxSingleTransformerBlock, FluxTransformerBlock
|
||||
@@ -192,13 +192,13 @@ class FluxControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
||||
num_attention_heads: int = 24,
|
||||
load_weights_from_transformer=True,
|
||||
):
|
||||
config = dict(transformer.config)
|
||||
config = transformer.config
|
||||
config["num_layers"] = num_layers
|
||||
config["num_single_layers"] = num_single_layers
|
||||
config["attention_head_dim"] = attention_head_dim
|
||||
config["num_attention_heads"] = num_attention_heads
|
||||
|
||||
controlnet = cls.from_config(config)
|
||||
controlnet = cls(**config)
|
||||
|
||||
if load_weights_from_transformer:
|
||||
controlnet.pos_embed.load_state_dict(transformer.pos_embed.state_dict())
|
||||
|
||||
@@ -18,7 +18,7 @@ import torch
|
||||
from torch import nn
|
||||
|
||||
from ...configuration_utils import ConfigMixin, register_to_config
|
||||
from ...utils import BaseOutput, logging
|
||||
from ...utils import logging
|
||||
from ..attention_processor import AttentionProcessor
|
||||
from ..embeddings import (
|
||||
HunyuanCombinedTimestepTextSizeStyleEmbedding,
|
||||
@@ -27,7 +27,7 @@ from ..embeddings import (
|
||||
)
|
||||
from ..modeling_utils import ModelMixin
|
||||
from ..transformers.hunyuan_transformer_2d import HunyuanDiTBlock
|
||||
from .controlnet import Tuple, zero_module
|
||||
from .controlnet import BaseOutput, Tuple, zero_module
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user