Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 795f2c90fd | |||
| 84e2337807 |
@@ -12,96 +12,110 @@ env:
|
||||
PYTEST_TIMEOUT: 600
|
||||
RUN_SLOW: yes
|
||||
RUN_NIGHTLY: yes
|
||||
PIPELINE_USAGE_CUTOFF: 5000
|
||||
SLACK_API_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
|
||||
jobs:
|
||||
setup_torch_cuda_pipeline_matrix:
|
||||
name: Setup Torch Pipelines Matrix
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 2
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.8"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
pip install -e .
|
||||
pip install huggingface_hub
|
||||
- name: Fetch Pipeline Matrix
|
||||
id: fetch_pipeline_matrix
|
||||
run: |
|
||||
matrix=$(python utils/fetch_torch_cuda_pipeline_test_matrix.py)
|
||||
echo $matrix
|
||||
echo "pipeline_test_matrix=$matrix" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Pipeline Tests Artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: test-pipelines.json
|
||||
path: reports
|
||||
|
||||
run_nightly_tests_for_torch_pipelines:
|
||||
name: Torch Pipelines CUDA Nightly Tests
|
||||
needs: setup_torch_cuda_pipeline_matrix
|
||||
run_nightly_tests:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
|
||||
runs-on: [single-gpu, nvidia-gpu, t4, ci]
|
||||
config:
|
||||
- name: Nightly PyTorch CUDA tests on Ubuntu
|
||||
framework: pytorch
|
||||
runner: docker-gpu
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
report: torch_cuda
|
||||
- name: Nightly Flax TPU tests on Ubuntu
|
||||
framework: flax
|
||||
runner: docker-tpu
|
||||
image: diffusers/diffusers-flax-tpu
|
||||
report: flax_tpu
|
||||
- name: Nightly ONNXRuntime CUDA tests on Ubuntu
|
||||
framework: onnxruntime
|
||||
runner: docker-gpu
|
||||
image: diffusers/diffusers-onnxruntime-cuda
|
||||
report: onnx_cuda
|
||||
|
||||
name: ${{ matrix.config.name }}
|
||||
|
||||
runs-on: ${{ matrix.config.runner }}
|
||||
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
|
||||
image: ${{ matrix.config.image }}
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ ${{ matrix.config.runner == 'docker-tpu' && '--privileged' || '--gpus 0'}}
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: nvidia-smi
|
||||
|
||||
if: ${{ matrix.config.runner == 'docker-gpu' }}
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt-get update && apt-get install libsndfile1-dev libgl1 -y
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
|
||||
python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers
|
||||
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate
|
||||
python -m uv pip install pytest-reportlog
|
||||
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
python utils/print_env.py
|
||||
|
||||
- name: Nightly PyTorch CUDA checkpoint (pipelines) tests
|
||||
|
||||
- name: Run nightly PyTorch CUDA tests
|
||||
if: ${{ matrix.config.framework == 'pytorch' }}
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
|
||||
--report-log=tests_pipeline_${{ matrix.module }}_cuda.log \
|
||||
tests/pipelines/${{ matrix.module }}
|
||||
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
--report-log=${{ matrix.config.report }}.log \
|
||||
tests/
|
||||
|
||||
- name: Run nightly Flax TPU tests
|
||||
if: ${{ matrix.config.framework == 'flax' }}
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pytest -n 0 \
|
||||
-s -v -k "Flax" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
--report-log=${{ matrix.config.report }}.log \
|
||||
tests/
|
||||
|
||||
- name: Run nightly ONNXRuntime CUDA tests
|
||||
if: ${{ matrix.config.framework == 'onnxruntime' }}
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "Onnx" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
--report-log=${{ matrix.config.report }}.log \
|
||||
tests/
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: |
|
||||
cat reports/tests_pipeline_${{ matrix.module }}_cuda_stats.txt
|
||||
cat reports/tests_pipeline_${{ matrix.module }}_cuda_failures_short.txt
|
||||
run: cat reports/tests_${{ matrix.config.report }}_failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: pipeline_${{ matrix.module }}_test_reports
|
||||
name: ${{ matrix.config.report }}_test_reports
|
||||
path: reports
|
||||
|
||||
- name: Generate Report and Notify Channel
|
||||
@@ -110,248 +124,6 @@ jobs:
|
||||
pip install slack_sdk tabulate
|
||||
python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
run_nightly_tests_for_other_torch_modules:
|
||||
name: Torch Non-Pipelines CUDA Nightly Tests
|
||||
runs-on: docker-gpu
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
strategy:
|
||||
matrix:
|
||||
module: [models, schedulers, others, examples]
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt-get update && apt-get install libsndfile1-dev libgl1 -y
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
|
||||
python -m uv pip install pytest-reportlog
|
||||
|
||||
- name: Environment
|
||||
run: python utils/print_env.py
|
||||
|
||||
- name: Run nightly PyTorch CUDA tests for non-pipeline modules
|
||||
if: ${{ matrix.module != 'examples'}}
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
--make-reports=tests_torch_${{ matrix.module }}_cuda \
|
||||
--report-log=tests_torch_${{ matrix.module }}_cuda.log \
|
||||
tests/${{ matrix.module }}
|
||||
|
||||
- name: Run nightly example tests with Torch
|
||||
if: ${{ matrix.module == 'examples' }}
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
python -m uv pip install peft@git+https://github.com/huggingface/peft.git
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v --make-reports=examples_torch_cuda \
|
||||
--report-log=examples_torch_cuda.log \
|
||||
examples/
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: |
|
||||
cat reports/tests_torch_${{ matrix.module }}_cuda_stats.txt
|
||||
cat reports/tests_torch_${{ matrix.module }}_cuda_failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: torch_${{ matrix.module }}_cuda_test_reports
|
||||
path: reports
|
||||
|
||||
- name: Generate Report and Notify Channel
|
||||
if: always()
|
||||
run: |
|
||||
pip install slack_sdk tabulate
|
||||
python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
run_lora_nightly_tests:
|
||||
name: Nightly LoRA Tests with PEFT and TORCH
|
||||
runs-on: docker-gpu
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt-get update && apt-get install libsndfile1-dev libgl1 -y
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
|
||||
python -m uv pip install peft@git+https://github.com/huggingface/peft.git
|
||||
python -m uv pip install pytest-reportlog
|
||||
|
||||
- name: Environment
|
||||
run: python utils/print_env.py
|
||||
|
||||
- name: Run nightly LoRA tests with PEFT and Torch
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
--make-reports=tests_torch_lora_cuda \
|
||||
--report-log=tests_torch_lora_cuda.log \
|
||||
tests/lora
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: |
|
||||
cat reports/tests_torch_lora_cuda_stats.txt
|
||||
cat reports/tests_torch_lora_cuda_failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: torch_lora_cuda_test_reports
|
||||
path: reports
|
||||
|
||||
- name: Generate Report and Notify Channel
|
||||
if: always()
|
||||
run: |
|
||||
pip install slack_sdk tabulate
|
||||
python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
run_flax_tpu_tests:
|
||||
name: Nightly Flax TPU Tests
|
||||
runs-on: docker-tpu
|
||||
container:
|
||||
image: diffusers/diffusers-flax-tpu
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --privileged
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt-get update && apt-get install libsndfile1-dev libgl1 -y
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
|
||||
python -m uv pip install pytest-reportlog
|
||||
|
||||
- name: Environment
|
||||
run: python utils/print_env.py
|
||||
|
||||
- name: Run nightly Flax TPU tests
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
run: |
|
||||
python -m pytest -n 0 \
|
||||
-s -v -k "Flax" \
|
||||
--make-reports=tests_flax_tpu \
|
||||
--report-log=tests_flax_tpu.log \
|
||||
tests/
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: |
|
||||
cat reports/tests_flax_tpu_stats.txt
|
||||
cat reports/tests_flax_tpu_failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: flax_tpu_test_reports
|
||||
path: reports
|
||||
|
||||
- name: Generate Report and Notify Channel
|
||||
if: always()
|
||||
run: |
|
||||
pip install slack_sdk tabulate
|
||||
python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
run_nightly_onnx_tests:
|
||||
name: Nightly ONNXRuntime CUDA tests on Ubuntu
|
||||
runs-on: docker-gpu
|
||||
container:
|
||||
image: diffusers/diffusers-onnxruntime-cuda
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
|
||||
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
|
||||
python -m uv pip install pytest-reportlog
|
||||
|
||||
- name: Environment
|
||||
run: python utils/print_env.py
|
||||
|
||||
- name: Run nightly ONNXRuntime CUDA tests
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
run: |
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "Onnx" \
|
||||
--make-reports=tests_onnx_cuda \
|
||||
--report-log=tests_onnx_cuda.log \
|
||||
tests/
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: |
|
||||
cat reports/tests_onnx_cuda_stats.txt
|
||||
cat reports/tests_onnx_cuda_failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: ${{ matrix.config.report }}_test_reports
|
||||
path: reports
|
||||
|
||||
- name: Generate Report and Notify Channel
|
||||
if: always()
|
||||
run: |
|
||||
pip install slack_sdk tabulate
|
||||
python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
run_nightly_tests_apple_m1:
|
||||
name: Nightly PyTorch MPS tests on MacOS
|
||||
runs-on: [ self-hosted, apple-m1 ]
|
||||
|
||||
@@ -35,10 +35,6 @@ jobs:
|
||||
run: |
|
||||
ruff check examples tests src utils scripts
|
||||
ruff format examples tests src utils scripts --check
|
||||
- name: Check if failure
|
||||
if: ${{ failure() }}
|
||||
run: |
|
||||
echo "Quality check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make style && make quality'" >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
check_repository_consistency:
|
||||
needs: check_code_quality
|
||||
@@ -58,10 +54,6 @@ jobs:
|
||||
python utils/check_copies.py
|
||||
python utils/check_dummies.py
|
||||
make deps_table_check_updated
|
||||
- name: Check if failure
|
||||
if: ${{ failure() }}
|
||||
run: |
|
||||
echo "Repo consistency check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make fix-copies'" >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
run_fast_tests:
|
||||
needs: [check_code_quality, check_repository_consistency]
|
||||
|
||||
@@ -43,10 +43,6 @@ jobs:
|
||||
run: |
|
||||
ruff check examples tests src utils scripts
|
||||
ruff format examples tests src utils scripts --check
|
||||
- name: Check if failure
|
||||
if: ${{ failure() }}
|
||||
run: |
|
||||
echo "Quality check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make style && make quality'" >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
check_repository_consistency:
|
||||
needs: check_code_quality
|
||||
@@ -66,10 +62,6 @@ jobs:
|
||||
python utils/check_copies.py
|
||||
python utils/check_dummies.py
|
||||
make deps_table_check_updated
|
||||
- name: Check if failure
|
||||
if: ${{ failure() }}
|
||||
run: |
|
||||
echo "Repo consistency check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make fix-copies'" >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
run_fast_tests:
|
||||
needs: [check_code_quality, check_repository_consistency]
|
||||
|
||||
@@ -88,7 +88,7 @@ accelerate config default
|
||||
|
||||
Or if your environment doesn't support an interactive shell, like a notebook, you can use:
|
||||
|
||||
```py
|
||||
```bash
|
||||
from accelerate.utils import write_basic_config
|
||||
|
||||
write_basic_config()
|
||||
|
||||
@@ -54,7 +54,7 @@ accelerate config default
|
||||
|
||||
Or if your environment doesn't support an interactive shell, like a notebook, you can use:
|
||||
|
||||
```py
|
||||
```bash
|
||||
from accelerate.utils import write_basic_config
|
||||
|
||||
write_basic_config()
|
||||
@@ -84,7 +84,7 @@ Many of the basic parameters are described in the [DreamBooth](dreambooth#script
|
||||
- `--freeze_model`: freezes the key and value parameters in the cross-attention layer; the default is `crossattn_kv`, but you can set it to `crossattn` to train all the parameters in the cross-attention layer
|
||||
- `--concepts_list`: to learn multiple concepts, provide a path to a JSON file containing the concepts
|
||||
- `--modifier_token`: a special word used to represent the learned concept
|
||||
- `--initializer_token`: a special word used to initialize the embeddings of the `modifier_token`
|
||||
- `--initializer_token`:
|
||||
|
||||
### Prior preservation loss
|
||||
|
||||
|
||||
@@ -67,7 +67,7 @@ accelerate config default
|
||||
|
||||
Or if your environment doesn't support an interactive shell, like a notebook, you can use:
|
||||
|
||||
```py
|
||||
```bash
|
||||
from accelerate.utils import write_basic_config
|
||||
|
||||
write_basic_config()
|
||||
@@ -180,7 +180,7 @@ elif args.pretrained_model_name_or_path:
|
||||
revision=args.revision,
|
||||
use_fast=False,
|
||||
)
|
||||
|
||||
|
||||
# Load scheduler and models
|
||||
noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
|
||||
text_encoder = text_encoder_cls.from_pretrained(
|
||||
|
||||
@@ -51,7 +51,7 @@ accelerate config default
|
||||
|
||||
Or if your environment doesn't support an interactive shell, like a notebook, you can use:
|
||||
|
||||
```py
|
||||
```bash
|
||||
from accelerate.utils import write_basic_config
|
||||
|
||||
write_basic_config()
|
||||
@@ -89,7 +89,7 @@ The dataset preprocessing code and training loop are found in the [`main()`](htt
|
||||
|
||||
As with the script parameters, a walkthrough of the training script is provided in the [Text-to-image](text2image#training-script) training guide. Instead, this guide takes a look at the InstructPix2Pix relevant parts of the script.
|
||||
|
||||
The script begins by modifying the [number of input channels](https://github.com/huggingface/diffusers/blob/64603389da01082055a901f2883c4810d1144edb/examples/instruct_pix2pix/train_instruct_pix2pix.py#L445) in the first convolutional layer of the UNet to account for InstructPix2Pix's additional conditioning image:
|
||||
The script begins by modifing the [number of input channels](https://github.com/huggingface/diffusers/blob/64603389da01082055a901f2883c4810d1144edb/examples/instruct_pix2pix/train_instruct_pix2pix.py#L445) in the first convolutional layer of the UNet to account for InstructPix2Pix's additional conditioning image:
|
||||
|
||||
```py
|
||||
in_channels = 8
|
||||
|
||||
@@ -59,7 +59,7 @@ accelerate config default
|
||||
|
||||
Or if your environment doesn't support an interactive shell, like a notebook, you can use:
|
||||
|
||||
```py
|
||||
```bash
|
||||
from accelerate.utils import write_basic_config
|
||||
|
||||
write_basic_config()
|
||||
@@ -235,7 +235,7 @@ accelerate launch --mixed_precision="fp16" train_text_to_image_prior.py \
|
||||
--validation_prompts="A robot pokemon, 4k photo" \
|
||||
--report_to="wandb" \
|
||||
--push_to_hub \
|
||||
--output_dir="kandi2-prior-pokemon-model"
|
||||
--output_dir="kandi2-prior-pokemon-model"
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
@@ -259,7 +259,7 @@ accelerate launch --mixed_precision="fp16" train_text_to_image_decoder.py \
|
||||
--validation_prompts="A robot pokemon, 4k photo" \
|
||||
--report_to="wandb" \
|
||||
--push_to_hub \
|
||||
--output_dir="kandi2-decoder-pokemon-model"
|
||||
--output_dir="kandi2-decoder-pokemon-model"
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
|
||||
@@ -53,7 +53,7 @@ accelerate config default
|
||||
|
||||
Or if your environment doesn't support an interactive shell, like a notebook, you can use:
|
||||
|
||||
```py
|
||||
```bash
|
||||
from accelerate.utils import write_basic_config
|
||||
|
||||
write_basic_config()
|
||||
@@ -252,4 +252,4 @@ The SDXL training script is discussed in more detail in the [SDXL training](sdxl
|
||||
Congratulations on distilling a LCM model! To learn more about LCM, the following may be helpful:
|
||||
|
||||
- Learn how to use [LCMs for inference](../using-diffusers/lcm) for text-to-image, image-to-image, and with LoRA checkpoints.
|
||||
- Read the [SDXL in 4 steps with Latent Consistency LoRAs](https://huggingface.co/blog/lcm_lora) blog post to learn more about SDXL LCM-LoRA's for super fast inference, quality comparisons, benchmarks, and more.
|
||||
- Read the [SDXL in 4 steps with Latent Consistency LoRAs](https://huggingface.co/blog/lcm_lora) blog post to learn more about SDXL LCM-LoRA's for super fast inference, quality comparisons, benchmarks, and more.
|
||||
@@ -59,7 +59,7 @@ accelerate config default
|
||||
|
||||
Or if your environment doesn't support an interactive shell, like a notebook, you can use:
|
||||
|
||||
```py
|
||||
```bash
|
||||
from accelerate.utils import write_basic_config
|
||||
|
||||
write_basic_config()
|
||||
|
||||
@@ -53,7 +53,7 @@ accelerate config default
|
||||
|
||||
Or if your environment doesn't support an interactive shell, like a notebook, you can use:
|
||||
|
||||
```py
|
||||
```bash
|
||||
from accelerate.utils import write_basic_config
|
||||
|
||||
write_basic_config()
|
||||
|
||||
@@ -69,7 +69,7 @@ accelerate config default
|
||||
|
||||
Or if your environment doesn't support an interactive shell, like a notebook, you can use:
|
||||
|
||||
```py
|
||||
```bash
|
||||
from accelerate.utils import write_basic_config
|
||||
|
||||
write_basic_config()
|
||||
|
||||
@@ -67,7 +67,7 @@ accelerate config default
|
||||
|
||||
Or if your environment doesn't support an interactive shell, like a notebook, you can use:
|
||||
|
||||
```py
|
||||
```bash
|
||||
from accelerate.utils import write_basic_config
|
||||
|
||||
write_basic_config()
|
||||
|
||||
@@ -51,7 +51,7 @@ accelerate config default
|
||||
|
||||
Or if your environment doesn't support an interactive shell like a notebook, you can use:
|
||||
|
||||
```py
|
||||
```bash
|
||||
from accelerate.utils import write_basic_config
|
||||
|
||||
write_basic_config()
|
||||
|
||||
@@ -53,7 +53,7 @@ accelerate config default
|
||||
|
||||
Or if your environment doesn't support an interactive shell, like a notebook, you can use:
|
||||
|
||||
```py
|
||||
```bash
|
||||
from accelerate.utils import write_basic_config
|
||||
|
||||
write_basic_config()
|
||||
@@ -173,7 +173,7 @@ pipeline = AutoPipelineForText2Image.from_pretrained("path/to/saved/model", torc
|
||||
|
||||
caption = "A cute bird pokemon holding a shield"
|
||||
images = pipeline(
|
||||
caption,
|
||||
caption,
|
||||
width=1024,
|
||||
height=1536,
|
||||
prior_timesteps=DEFAULT_STAGE_C_TIMESTEPS,
|
||||
|
||||
+1
-1
@@ -42,7 +42,7 @@ Training examples show how to pretrain or fine-tune diffusion models for a varie
|
||||
| [**Dreambooth**](./dreambooth) | ✅ | - | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_dreambooth_training.ipynb)
|
||||
| [**ControlNet**](./controlnet) | ✅ | ✅ | -
|
||||
| [**InstructPix2Pix**](./instruct_pix2pix) | ✅ | ✅ | -
|
||||
| [**Reinforcement Learning for Control**](./reinforcement_learning) | - | - | coming soon.
|
||||
| [**Reinforcement Learning for Control**](https://github.com/huggingface/diffusers/blob/main/examples/reinforcement_learning/run_diffusers_locomotion.py) | - | - | coming soon.
|
||||
|
||||
## Community
|
||||
|
||||
|
||||
@@ -308,6 +308,6 @@ accelerate launch train_dreambooth_lora_sdxl_advanced.py \
|
||||
Check out [these recommended practices](https://huggingface.co/blog/sdxl_lora_advanced_script#additional-good-practices)
|
||||
|
||||
## Running on Colab Notebook
|
||||
Check out [this notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/SDXL_Dreambooth_LoRA_advanced_example.ipynb).
|
||||
Check out [this notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/SDXL_DreamBooth_LoRA_advanced_example.ipynb).
|
||||
to train using the advanced features (including pivotal tuning), and [this notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/SDXL_DreamBooth_LoRA_.ipynb) to train on a free colab, using some of the advanced features (excluding pivotal tuning)
|
||||
|
||||
|
||||
@@ -656,6 +656,7 @@ def parse_args(input_args=None):
|
||||
)
|
||||
parser.add_argument(
|
||||
"--use_dora",
|
||||
type=bool,
|
||||
action="store_true",
|
||||
default=False,
|
||||
help=(
|
||||
|
||||
@@ -935,7 +935,7 @@ image = pipe(prompt, generator=generator, num_inference_steps=50).images[0]
|
||||
### Checkpoint Merger Pipeline
|
||||
Based on the AUTOMATIC1111/webui for checkpoint merging. This is a custom pipeline that merges upto 3 pretrained model checkpoints as long as they are in the HuggingFace model_index.json format.
|
||||
|
||||
The checkpoint merging is currently memory intensive as it modifies the weights of a DiffusionPipeline object in place. Expect at least 13GB RAM Usage on Kaggle GPU kernels and
|
||||
The checkpoint merging is currently memory intensive as it modifies the weights of a DiffusionPipeline object in place. Expect atleast 13GB RAM Usage on Kaggle GPU kernels and
|
||||
on colab you might run out of the 12GB memory even while merging two checkpoints.
|
||||
|
||||
Usage:-
|
||||
|
||||
@@ -103,7 +103,7 @@ class CheckpointMergerPipeline(DiffusionPipeline):
|
||||
print(f"Combining with alpha={alpha}, interpolation mode={interp}")
|
||||
|
||||
checkpoint_count = len(pretrained_model_name_or_path_list)
|
||||
# Ignore result from model_index_json comparison of the two checkpoints
|
||||
# Ignore result from model_index_json comparision of the two checkpoints
|
||||
force = kwargs.pop("force", False)
|
||||
|
||||
# If less than 2 checkpoints, nothing to merge. If more than 3, not supported for now.
|
||||
@@ -217,7 +217,7 @@ class CheckpointMergerPipeline(DiffusionPipeline):
|
||||
]
|
||||
checkpoint_path_2 = files[0] if len(files) > 0 else None
|
||||
# For an attr if both checkpoint_path_1 and 2 are None, ignore.
|
||||
# If at least one is present, deal with it according to interp method, of course only if the state_dict keys match.
|
||||
# If atleast one is present, deal with it according to interp method, of course only if the state_dict keys match.
|
||||
if checkpoint_path_1 is None and checkpoint_path_2 is None:
|
||||
print(f"Skipping {attr}: not present in 2nd or 3d model")
|
||||
continue
|
||||
|
||||
@@ -726,7 +726,7 @@ class LatentConsistencyModelWalkPipeline(
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
`._callback_tensor_inputs` attribute of your pipeline class.
|
||||
`._callback_tensor_inputs` attribute of your pipeine class.
|
||||
embedding_interpolation_type (`str`, *optional*, defaults to `"lerp"`):
|
||||
The type of interpolation to use for interpolating between text embeddings. Choose between `"lerp"` and `"slerp"`.
|
||||
latent_interpolation_type (`str`, *optional*, defaults to `"slerp"`):
|
||||
@@ -779,7 +779,7 @@ class LatentConsistencyModelWalkPipeline(
|
||||
else:
|
||||
batch_size = prompt_embeds.shape[0]
|
||||
if batch_size < 2:
|
||||
raise ValueError(f"`prompt` must have length of at least 2 but found {batch_size}")
|
||||
raise ValueError(f"`prompt` must have length of atleast 2 but found {batch_size}")
|
||||
if num_images_per_prompt != 1:
|
||||
raise ValueError("`num_images_per_prompt` must be `1` as no other value is supported yet")
|
||||
if prompt_embeds is not None:
|
||||
@@ -883,7 +883,7 @@ class LatentConsistencyModelWalkPipeline(
|
||||
) as batch_progress_bar:
|
||||
for batch_index in range(0, bs, process_batch_size):
|
||||
batch_inference_latents = inference_latents[batch_index : batch_index + process_batch_size]
|
||||
batch_inference_embeddings = inference_embeddings[
|
||||
batch_inference_embedddings = inference_embeddings[
|
||||
batch_index : batch_index + process_batch_size
|
||||
]
|
||||
|
||||
@@ -892,7 +892,7 @@ class LatentConsistencyModelWalkPipeline(
|
||||
)
|
||||
timesteps = self.scheduler.timesteps
|
||||
|
||||
current_bs = batch_inference_embeddings.shape[0]
|
||||
current_bs = batch_inference_embedddings.shape[0]
|
||||
w = torch.tensor(self.guidance_scale - 1).repeat(current_bs)
|
||||
w_embedding = self.get_guidance_scale_embedding(
|
||||
w, embedding_dim=self.unet.config.time_cond_proj_dim
|
||||
@@ -901,14 +901,14 @@ class LatentConsistencyModelWalkPipeline(
|
||||
# 10. Perform inference for current batch
|
||||
with self.progress_bar(total=num_inference_steps) as progress_bar:
|
||||
for index, t in enumerate(timesteps):
|
||||
batch_inference_latents = batch_inference_latents.to(batch_inference_embeddings.dtype)
|
||||
batch_inference_latents = batch_inference_latents.to(batch_inference_embedddings.dtype)
|
||||
|
||||
# model prediction (v-prediction, eps, x)
|
||||
model_pred = self.unet(
|
||||
batch_inference_latents,
|
||||
t,
|
||||
timestep_cond=w_embedding,
|
||||
encoder_hidden_states=batch_inference_embeddings,
|
||||
encoder_hidden_states=batch_inference_embedddings,
|
||||
cross_attention_kwargs=self.cross_attention_kwargs,
|
||||
return_dict=False,
|
||||
)[0]
|
||||
@@ -924,8 +924,8 @@ class LatentConsistencyModelWalkPipeline(
|
||||
callback_outputs = callback_on_step_end(self, index, t, callback_kwargs)
|
||||
|
||||
batch_inference_latents = callback_outputs.pop("latents", batch_inference_latents)
|
||||
batch_inference_embeddings = callback_outputs.pop(
|
||||
"prompt_embeds", batch_inference_embeddings
|
||||
batch_inference_embedddings = callback_outputs.pop(
|
||||
"prompt_embeds", batch_inference_embedddings
|
||||
)
|
||||
w_embedding = callback_outputs.pop("w_embedding", w_embedding)
|
||||
denoised = callback_outputs.pop("denoised", denoised)
|
||||
@@ -939,7 +939,7 @@ class LatentConsistencyModelWalkPipeline(
|
||||
step_idx = index // getattr(self.scheduler, "order", 1)
|
||||
callback(step_idx, t, batch_inference_latents)
|
||||
|
||||
denoised = denoised.to(batch_inference_embeddings.dtype)
|
||||
denoised = denoised.to(batch_inference_embedddings.dtype)
|
||||
|
||||
# Note: This is not supported because you would get black images in your latent walk if
|
||||
# NSFW concept is detected
|
||||
|
||||
@@ -530,7 +530,7 @@ class LLMGroundedDiffusionPipeline(
|
||||
)
|
||||
|
||||
if len(phrases) != len(boxes):
|
||||
raise ValueError(
|
||||
ValueError(
|
||||
"length of `phrases` and `boxes` has to be same, but"
|
||||
f" got: `phrases` {len(phrases)} != `boxes` {len(boxes)}"
|
||||
)
|
||||
|
||||
@@ -164,7 +164,7 @@ def get_prompts_tokens_with_weights(clip_tokenizer: CLIPTokenizer, prompt: str):
|
||||
text_tokens (list)
|
||||
A list contains token ids
|
||||
text_weight (list)
|
||||
A list contains the correspondent weight of token ids
|
||||
A list contains the correspodent weight of token ids
|
||||
|
||||
Example:
|
||||
import torch
|
||||
@@ -1028,7 +1028,7 @@ class SDXLLongPromptWeightingPipeline(
|
||||
# because `num_inference_steps` might be even given that every timestep
|
||||
# (except the highest one) is duplicated. If `num_inference_steps` is even it would
|
||||
# mean that we cut the timesteps in the middle of the denoising step
|
||||
# (between 1st and 2nd derivative) which leads to incorrect results. By adding 1
|
||||
# (between 1st and 2nd devirative) which leads to incorrect results. By adding 1
|
||||
# we ensure that the denoising process always ends after the 2nd derivate step of the scheduler
|
||||
num_inference_steps = num_inference_steps + 1
|
||||
|
||||
@@ -1531,7 +1531,7 @@ class SDXLLongPromptWeightingPipeline(
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
`._callback_tensor_inputs` attribute of your pipeline class.
|
||||
`._callback_tensor_inputs` attribute of your pipeine class.
|
||||
|
||||
Examples:
|
||||
|
||||
@@ -2131,7 +2131,7 @@ class SDXLLongPromptWeightingPipeline(
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
# Override to properly handle the loading and unloading of the additional text encoder.
|
||||
# Overrride to properly handle the loading and unloading of the additional text encoder.
|
||||
def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs):
|
||||
# We could have accessed the unet config from `lora_state_dict()` too. We pass
|
||||
# it here explicitly to be able to tell that it's coming from an SDXL
|
||||
|
||||
@@ -196,7 +196,7 @@ class StableDiffusionTilingPipeline(DiffusionPipeline, StableDiffusionExtrasMixi
|
||||
guidance_scale_tiles: specific weights for classifier-free guidance in each tile.
|
||||
guidance_scale_tiles: specific weights for classifier-free guidance in each tile. If None, the value provided in guidance_scale will be used.
|
||||
seed_tiles: specific seeds for the initialization latents in each tile. These will override the latents generated for the whole canvas using the standard seed parameter.
|
||||
seed_tiles_mode: either "full" "exclusive". If "full", all the latents affected by the tile be overriden. If "exclusive", only the latents that are affected exclusively by this tile (and no other tiles) will be overriden.
|
||||
seed_tiles_mode: either "full" "exclusive". If "full", all the latents affected by the tile be overriden. If "exclusive", only the latents that are affected exclusively by this tile (and no other tiles) will be overrriden.
|
||||
seed_reroll_regions: a list of tuples in the form (start row, end row, start column, end column, seed) defining regions in pixel space for which the latents will be overriden using the given seed. Takes priority over seed_tiles.
|
||||
cpu_vae: the decoder from latent space to pixel space can require too mucho GPU RAM for large images. If you find out of memory errors at the end of the generation process, try setting this parameter to True to run the decoder in CPU. Slower, but should run without memory issues.
|
||||
|
||||
@@ -325,7 +325,7 @@ class StableDiffusionTilingPipeline(DiffusionPipeline, StableDiffusionExtrasMixi
|
||||
if accepts_eta:
|
||||
extra_step_kwargs["eta"] = eta
|
||||
|
||||
# Mask for tile weights strength
|
||||
# Mask for tile weights strenght
|
||||
tile_weights = self._gaussian_weights(tile_width, tile_height, batch_size)
|
||||
|
||||
# Diffusion timesteps
|
||||
|
||||
@@ -832,7 +832,7 @@ class AnimateDiffControlNetPipeline(
|
||||
clip_skip (`int`, *optional*):
|
||||
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
|
||||
the output of the pre-final layer will be used for computing the prompt embeddings.
|
||||
callback_on_step_end (`Callable`, *optional*):
|
||||
allback_on_step_end (`Callable`, *optional*):
|
||||
A function that calls at the end of each denoising steps during the inference. The function is called
|
||||
with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
|
||||
callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
|
||||
@@ -840,7 +840,7 @@ class AnimateDiffControlNetPipeline(
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
`._callback_tensor_inputs` attribute of your pipeline class.
|
||||
`._callback_tensor_inputs` attribute of your pipeine class.
|
||||
|
||||
Examples:
|
||||
|
||||
|
||||
@@ -1280,7 +1280,7 @@ class DemoFusionSDXLPipeline(
|
||||
|
||||
return output_images
|
||||
|
||||
# Override to properly handle the loading and unloading of the additional text encoder.
|
||||
# Overrride to properly handle the loading and unloading of the additional text encoder.
|
||||
def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs):
|
||||
# We could have accessed the unet config from `lora_state_dict()` too. We pass
|
||||
# it here explicitly to be able to tell that it's coming from an SDXL
|
||||
|
||||
@@ -887,7 +887,7 @@ class StyleAlignedSDXLPipeline(
|
||||
# because `num_inference_steps` might be even given that every timestep
|
||||
# (except the highest one) is duplicated. If `num_inference_steps` is even it would
|
||||
# mean that we cut the timesteps in the middle of the denoising step
|
||||
# (between 1st and 2nd derivative) which leads to incorrect results. By adding 1
|
||||
# (between 1st and 2nd devirative) which leads to incorrect results. By adding 1
|
||||
# we ensure that the denoising process always ends after the 2nd derivate step of the scheduler
|
||||
num_inference_steps = num_inference_steps + 1
|
||||
|
||||
|
||||
@@ -26,7 +26,7 @@ from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInver
|
||||
from diffusers.models import AutoencoderKL, UNet2DConditionModel
|
||||
from diffusers.models.lora import adjust_lora_scale_text_encoder
|
||||
from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
|
||||
from diffusers.pipelines.stable_diffusion_ldm3d.pipeline_stable_diffusion_ldm3d import LDM3DPipelineOutput
|
||||
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_ldm3d import LDM3DPipelineOutput
|
||||
from diffusers.schedulers import DDPMScheduler, KarrasDiffusionSchedulers
|
||||
from diffusers.utils import (
|
||||
USE_PEFT_BACKEND,
|
||||
|
||||
@@ -1073,7 +1073,7 @@ class StableDiffusionXLControlNetAdapterInpaintPipeline(
|
||||
# because `num_inference_steps` might be even given that every timestep
|
||||
# (except the highest one) is duplicated. If `num_inference_steps` is even it would
|
||||
# mean that we cut the timesteps in the middle of the denoising step
|
||||
# (between 1st and 2nd derivative) which leads to incorrect results. By adding 1
|
||||
# (between 1st and 2nd devirative) which leads to incorrect results. By adding 1
|
||||
# we ensure that the denoising process always ends after the 2nd derivate step of the scheduler
|
||||
num_inference_steps = num_inference_steps + 1
|
||||
|
||||
|
||||
@@ -701,7 +701,7 @@ class StableDiffusionXLInstantIDPipeline(StableDiffusionXLControlNetPipeline):
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
`._callback_tensor_inputs` attribute of your pipeline class.
|
||||
`._callback_tensor_inputs` attribute of your pipeine class.
|
||||
|
||||
Examples:
|
||||
|
||||
|
||||
@@ -125,11 +125,7 @@ def log_validation(vae, unet, controlnet, args, accelerator, weight_dtype, step,
|
||||
)
|
||||
|
||||
image_logs = []
|
||||
inference_ctx = (
|
||||
contextlib.nullcontext()
|
||||
if (is_final_validation or torch.backends.mps.is_available())
|
||||
else torch.autocast("cuda")
|
||||
)
|
||||
inference_ctx = contextlib.nullcontext() if is_final_validation else torch.autocast("cuda")
|
||||
|
||||
for validation_prompt, validation_image in zip(validation_prompts, validation_images):
|
||||
validation_image = Image.open(validation_image).convert("RGB")
|
||||
@@ -796,12 +792,6 @@ def main(args):
|
||||
|
||||
logging_dir = Path(args.output_dir, args.logging_dir)
|
||||
|
||||
if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
|
||||
# due to pytorch#99272, MPS does not yet support bfloat16.
|
||||
raise ValueError(
|
||||
"Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
|
||||
)
|
||||
|
||||
accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
|
||||
|
||||
accelerator = Accelerator(
|
||||
|
||||
@@ -259,17 +259,13 @@ The authors found that by using DoRA, both the learning capacity and training st
|
||||
> This is also aligned with some of the quantitative analysis shown in the paper.
|
||||
|
||||
**Usage**
|
||||
1. To use DoRA you need to upgrade the installation of `peft`:
|
||||
1. To use DoRA you need to install `peft` from main:
|
||||
```bash
|
||||
pip install-U peft
|
||||
pip install git+https://github.com/huggingface/peft.git
|
||||
```
|
||||
2. Enable DoRA training by adding this flag
|
||||
```bash
|
||||
--use_dora
|
||||
```
|
||||
**Inference**
|
||||
The inference is the same as if you train a regular LoRA 🤗
|
||||
|
||||
## Format compatibility
|
||||
|
||||
You can pass `--output_kohya_format` to additionally generate a state dictionary which should be compatible with other platforms and tools such as Automatic 1111, Comfy, Kohya, etc. The `output_dir` will contain a file named "pytorch_lora_weights_kohya.safetensors".
|
||||
The inference is the same as if you train a regular LoRA 🤗
|
||||
@@ -14,6 +14,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
|
||||
import argparse
|
||||
import contextlib
|
||||
import gc
|
||||
import itertools
|
||||
import json
|
||||
@@ -40,7 +41,6 @@ from peft import LoraConfig, set_peft_model_state_dict
|
||||
from peft.utils import get_peft_model_state_dict
|
||||
from PIL import Image
|
||||
from PIL.ImageOps import exif_transpose
|
||||
from safetensors.torch import load_file, save_file
|
||||
from torch.utils.data import Dataset
|
||||
from torchvision import transforms
|
||||
from torchvision.transforms.functional import crop
|
||||
@@ -62,9 +62,7 @@ from diffusers.optimization import get_scheduler
|
||||
from diffusers.training_utils import _set_state_dict_into_text_encoder, cast_training_params, compute_snr
|
||||
from diffusers.utils import (
|
||||
check_min_version,
|
||||
convert_all_state_dict_to_peft,
|
||||
convert_state_dict_to_diffusers,
|
||||
convert_state_dict_to_kohya,
|
||||
convert_unet_state_dict_to_peft,
|
||||
is_wandb_available,
|
||||
)
|
||||
@@ -207,18 +205,11 @@ def log_validation(
|
||||
generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
|
||||
# Currently the context determination is a bit hand-wavy. We can improve it in the future if there's a better
|
||||
# way to condition it. Reference: https://github.com/huggingface/diffusers/pull/7126#issuecomment-1968523051
|
||||
enable_autocast = True
|
||||
if torch.backends.mps.is_available() or (
|
||||
accelerator.mixed_precision == "fp16" or accelerator.mixed_precision == "bf16"
|
||||
):
|
||||
enable_autocast = False
|
||||
if "playground" in args.pretrained_model_name_or_path:
|
||||
enable_autocast = False
|
||||
inference_ctx = (
|
||||
contextlib.nullcontext() if "playground" in args.pretrained_model_name_or_path else torch.cuda.amp.autocast()
|
||||
)
|
||||
|
||||
with torch.autocast(
|
||||
accelerator.device.type,
|
||||
enabled=enable_autocast,
|
||||
):
|
||||
with inference_ctx:
|
||||
images = [pipeline(**pipeline_args, generator=generator).images[0] for _ in range(args.num_validation_images)]
|
||||
|
||||
for tracker in accelerator.trackers:
|
||||
@@ -236,8 +227,7 @@ def log_validation(
|
||||
)
|
||||
|
||||
del pipeline
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.empty_cache()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
return images
|
||||
|
||||
@@ -406,11 +396,6 @@ def parse_args(input_args=None):
|
||||
default="lora-dreambooth-model",
|
||||
help="The output directory where the model predictions and checkpoints will be written.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output_kohya_format",
|
||||
action="store_true",
|
||||
help="Flag to additionally generate final state dict in the Kohya format so that it becomes compatible with A111, Comfy, Kohya, etc.",
|
||||
)
|
||||
parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
|
||||
parser.add_argument(
|
||||
"--resolution",
|
||||
@@ -974,12 +959,6 @@ def main(args):
|
||||
if args.do_edm_style_training and args.snr_gamma is not None:
|
||||
raise ValueError("Min-SNR formulation is not supported when conducting EDM-style training.")
|
||||
|
||||
if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
|
||||
# due to pytorch#99272, MPS does not yet support bfloat16.
|
||||
raise ValueError(
|
||||
"Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
|
||||
)
|
||||
|
||||
logging_dir = Path(args.output_dir, args.logging_dir)
|
||||
|
||||
accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
|
||||
@@ -1022,8 +1001,7 @@ def main(args):
|
||||
cur_class_images = len(list(class_images_dir.iterdir()))
|
||||
|
||||
if cur_class_images < args.num_class_images:
|
||||
has_supported_fp16_accelerator = torch.cuda.is_available() or torch.backends.mps.is_available()
|
||||
torch_dtype = torch.float16 if has_supported_fp16_accelerator else torch.float32
|
||||
torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32
|
||||
if args.prior_generation_precision == "fp32":
|
||||
torch_dtype = torch.float32
|
||||
elif args.prior_generation_precision == "fp16":
|
||||
@@ -1148,12 +1126,6 @@ def main(args):
|
||||
elif accelerator.mixed_precision == "bf16":
|
||||
weight_dtype = torch.bfloat16
|
||||
|
||||
if torch.backends.mps.is_available() and weight_dtype == torch.bfloat16:
|
||||
# due to pytorch#99272, MPS does not yet support bfloat16.
|
||||
raise ValueError(
|
||||
"Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
|
||||
)
|
||||
|
||||
# Move unet, vae and text_encoder to device and cast to weight_dtype
|
||||
unet.to(accelerator.device, dtype=weight_dtype)
|
||||
|
||||
@@ -1298,7 +1270,7 @@ def main(args):
|
||||
|
||||
# Enable TF32 for faster training on Ampere GPUs,
|
||||
# cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
|
||||
if args.allow_tf32 and torch.cuda.is_available():
|
||||
if args.allow_tf32:
|
||||
torch.backends.cuda.matmul.allow_tf32 = True
|
||||
|
||||
if args.scale_lr:
|
||||
@@ -1475,8 +1447,7 @@ def main(args):
|
||||
if not args.train_text_encoder and not train_dataset.custom_instance_prompts:
|
||||
del tokenizers, text_encoders
|
||||
gc.collect()
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.empty_cache()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
# If custom instance prompts are NOT provided (i.e. the instance prompt is used for all images),
|
||||
# pack the statically computed variables appropriately here. This is so that we don't
|
||||
@@ -1919,11 +1890,6 @@ def main(args):
|
||||
text_encoder_lora_layers=text_encoder_lora_layers,
|
||||
text_encoder_2_lora_layers=text_encoder_2_lora_layers,
|
||||
)
|
||||
if args.output_kohya_format:
|
||||
lora_state_dict = load_file(f"{args.output_dir}/pytorch_lora_weights.safetensors")
|
||||
peft_state_dict = convert_all_state_dict_to_peft(lora_state_dict)
|
||||
kohya_state_dict = convert_state_dict_to_kohya(peft_state_dict)
|
||||
save_file(kohya_state_dict, f"{args.output_dir}/pytorch_lora_weights_kohya.safetensors")
|
||||
|
||||
# Final inference
|
||||
# Load previous pipeline
|
||||
|
||||
@@ -71,7 +71,12 @@ TORCH_DTYPE_MAPPING = {"fp32": torch.float32, "fp16": torch.float16, "bf16": tor
|
||||
|
||||
|
||||
def log_validation(
|
||||
pipeline, args, accelerator, generator, global_step, is_final_validation=False, enable_autocast=True
|
||||
pipeline,
|
||||
args,
|
||||
accelerator,
|
||||
generator,
|
||||
global_step,
|
||||
is_final_validation=False,
|
||||
):
|
||||
logger.info(
|
||||
f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
|
||||
@@ -91,7 +96,7 @@ def log_validation(
|
||||
else Image.open(image_url_or_path).convert("RGB")
|
||||
)(args.val_image_url_or_path)
|
||||
|
||||
with torch.autocast(accelerator.device.type, enabled=enable_autocast):
|
||||
with torch.autocast(str(accelerator.device).replace(":0", ""), enabled=accelerator.mixed_precision == "fp16"):
|
||||
edited_images = []
|
||||
# Run inference
|
||||
for val_img_idx in range(args.num_validation_images):
|
||||
@@ -492,13 +497,6 @@ def main():
|
||||
),
|
||||
)
|
||||
logging_dir = os.path.join(args.output_dir, args.logging_dir)
|
||||
|
||||
if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
|
||||
# due to pytorch#99272, MPS does not yet support bfloat16.
|
||||
raise ValueError(
|
||||
"Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
|
||||
)
|
||||
|
||||
accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
|
||||
accelerator = Accelerator(
|
||||
gradient_accumulation_steps=args.gradient_accumulation_steps,
|
||||
@@ -983,13 +981,6 @@ def main():
|
||||
if accelerator.is_main_process:
|
||||
accelerator.init_trackers("instruct-pix2pix-xl", config=vars(args))
|
||||
|
||||
# Some configurations require autocast to be disabled.
|
||||
enable_autocast = True
|
||||
if torch.backends.mps.is_available() or (
|
||||
accelerator.mixed_precision == "fp16" or accelerator.mixed_precision == "bf16"
|
||||
):
|
||||
enable_autocast = False
|
||||
|
||||
# Train!
|
||||
total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
|
||||
|
||||
@@ -1202,7 +1193,6 @@ def main():
|
||||
generator,
|
||||
global_step,
|
||||
is_final_validation=False,
|
||||
enable_autocast=enable_autocast,
|
||||
)
|
||||
|
||||
if args.use_ema:
|
||||
@@ -1252,7 +1242,6 @@ def main():
|
||||
generator,
|
||||
global_step,
|
||||
is_final_validation=True,
|
||||
enable_autocast=enable_autocast,
|
||||
)
|
||||
|
||||
accelerator.end_training()
|
||||
|
||||
@@ -1,121 +0,0 @@
|
||||
This project is an attempt to check if it's possible to apply to [ORPO](https://arxiv.org/abs/2403.07691) on a text-conditioned diffusion model to align it on preference data WITHOUT a reference model. The implementation is based on https://github.com/huggingface/trl/pull/1435/.
|
||||
|
||||
> [!WARNING]
|
||||
> We assume that MSE in the diffusion formulation approximates the log-probs as required by ORPO (hat-tip to [@kashif](https://github.com/kashif) for the idea). So, please consider this to be extremely experimental.
|
||||
|
||||
## Training
|
||||
|
||||
Here's training command you can use on a 40GB A100 to validate things on a [small preference
|
||||
dataset](https://hf.co/datasets/kashif/pickascore):
|
||||
|
||||
```bash
|
||||
accelerate launch train_diffusion_orpo_sdxl_lora.py \
|
||||
--pretrained_model_name_or_path=stabilityai/stable-diffusion-xl-base-1.0 \
|
||||
--pretrained_vae_model_name_or_path=madebyollin/sdxl-vae-fp16-fix \
|
||||
--output_dir="diffusion-sdxl-orpo" \
|
||||
--mixed_precision="fp16" \
|
||||
--dataset_name=kashif/pickascore \
|
||||
--train_batch_size=8 \
|
||||
--gradient_accumulation_steps=2 \
|
||||
--gradient_checkpointing \
|
||||
--use_8bit_adam \
|
||||
--rank=8 \
|
||||
--learning_rate=1e-5 \
|
||||
--report_to="wandb" \
|
||||
--lr_scheduler="constant" \
|
||||
--lr_warmup_steps=0 \
|
||||
--max_train_steps=2000 \
|
||||
--checkpointing_steps=500 \
|
||||
--run_validation --validation_steps=50 \
|
||||
--seed="0" \
|
||||
--report_to="wandb" \
|
||||
--push_to_hub
|
||||
```
|
||||
|
||||
We also provide a simple script to scale up the training on the [yuvalkirstain/pickapic_v2](https://huggingface.co/datasets/yuvalkirstain/pickapic_v2) dataset:
|
||||
|
||||
```bash
|
||||
accelerate launch --multi_gpu train_diffusion_orpo_sdxl_lora_wds.py \
|
||||
--pretrained_model_name_or_path=stabilityai/stable-diffusion-xl-base-1.0 \
|
||||
--pretrained_vae_model_name_or_path=madebyollin/sdxl-vae-fp16-fix \
|
||||
--dataset_path="pipe:aws s3 cp s3://diffusion-preference-opt/{00000..00644}.tar -" \
|
||||
--output_dir="diffusion-sdxl-orpo-wds" \
|
||||
--mixed_precision="fp16" \
|
||||
--gradient_accumulation_steps=1 \
|
||||
--gradient_checkpointing \
|
||||
--use_8bit_adam \
|
||||
--rank=8 \
|
||||
--dataloader_num_workers=8 \
|
||||
--learning_rate=3e-5 \
|
||||
--report_to="wandb" \
|
||||
--lr_scheduler="constant" \
|
||||
--lr_warmup_steps=0 \
|
||||
--max_train_steps=50000 \
|
||||
--checkpointing_steps=2000 \
|
||||
--run_validation --validation_steps=500 \
|
||||
--seed="0" \
|
||||
--report_to="wandb" \
|
||||
--push_to_hub
|
||||
```
|
||||
|
||||
We tested the above on a node of 8 H100s but it should also work on A100s. It requires the `webdataset` library for faster dataloading. Note that we kept the dataset shards on an S3 bucket but it should be also possible to have them stored locally.
|
||||
|
||||
You can use the code below to convert the original dataset into `webdataset` shards:
|
||||
|
||||
```python
|
||||
import os
|
||||
import io
|
||||
import ray
|
||||
import webdataset as wds
|
||||
from datasets import Dataset
|
||||
from PIL import Image
|
||||
|
||||
ray.init(num_cpus=8)
|
||||
|
||||
|
||||
def convert_to_image(im_bytes):
|
||||
return Image.open(io.BytesIO(im_bytes)).convert("RGB")
|
||||
|
||||
def main():
|
||||
dataset_path = "/pickapic_v2/data"
|
||||
wds_shards_path = "/pickapic_v2_webdataset"
|
||||
# get all .parquet files in the dataset path
|
||||
dataset_files = [
|
||||
os.path.join(dataset_path, f)
|
||||
for f in os.listdir(dataset_path)
|
||||
if f.endswith(".parquet")
|
||||
]
|
||||
|
||||
@ray.remote
|
||||
def create_shard(path):
|
||||
# get basename of the file
|
||||
basename = os.path.basename(path)
|
||||
# get the shard number data-00123-of-01034.parquet -> 00123
|
||||
shard_num = basename.split("-")[1]
|
||||
dataset = Dataset.from_parquet(path)
|
||||
# create a webdataset shard
|
||||
shard = wds.TarWriter(os.path.join(wds_shards_path, f"{shard_num}.tar"))
|
||||
|
||||
for i, example in enumerate(dataset):
|
||||
wds_example = {
|
||||
"__key__": str(i),
|
||||
"original_prompt.txt": example["caption"],
|
||||
"jpg_0.jpg": convert_to_image(example["jpg_0"]),
|
||||
"jpg_1.jpg": convert_to_image(example["jpg_1"]),
|
||||
"label_0.txt": str(example["label_0"]),
|
||||
"label_1.txt": str(example["label_1"])
|
||||
}
|
||||
shard.write(wds_example)
|
||||
shard.close()
|
||||
|
||||
futures = [create_shard.remote(path) for path in dataset_files]
|
||||
ray.get(futures)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
```
|
||||
|
||||
## Inference
|
||||
|
||||
Refer to [sayakpaul/diffusion-sdxl-orpo](https://huggingface.co/sayakpaul/diffusion-sdxl-orpo) for an experimental checkpoint.
|
||||
@@ -1,7 +0,0 @@
|
||||
datasets
|
||||
accelerate
|
||||
transformers
|
||||
torchvision
|
||||
wandb
|
||||
peft
|
||||
webdataset
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1005,7 +1005,7 @@ class PromptDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
`._callback_tensor_inputs` attribute of your pipeline class.
|
||||
`._callback_tensor_inputs` attribute of your pipeine class.
|
||||
|
||||
Examples:
|
||||
|
||||
|
||||
@@ -501,12 +501,6 @@ def main(args):
|
||||
|
||||
logging_dir = Path(args.output_dir, args.logging_dir)
|
||||
|
||||
if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
|
||||
# due to pytorch#99272, MPS does not yet support bfloat16.
|
||||
raise ValueError(
|
||||
"Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
|
||||
)
|
||||
|
||||
accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
|
||||
kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
|
||||
accelerator = Accelerator(
|
||||
@@ -979,13 +973,6 @@ def main(args):
|
||||
if accelerator.is_main_process:
|
||||
accelerator.init_trackers("text2image-fine-tune", config=vars(args))
|
||||
|
||||
# Some configurations require autocast to be disabled.
|
||||
enable_autocast = True
|
||||
if torch.backends.mps.is_available() or (
|
||||
accelerator.mixed_precision == "fp16" or accelerator.mixed_precision == "bf16"
|
||||
):
|
||||
enable_autocast = False
|
||||
|
||||
# Train!
|
||||
total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
|
||||
|
||||
@@ -1212,10 +1199,7 @@ def main(args):
|
||||
generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
|
||||
pipeline_args = {"prompt": args.validation_prompt}
|
||||
|
||||
with torch.autocast(
|
||||
accelerator.device.type,
|
||||
enabled=enable_autocast,
|
||||
):
|
||||
with torch.cuda.amp.autocast():
|
||||
images = [
|
||||
pipeline(**pipeline_args, generator=generator).images[0]
|
||||
for _ in range(args.num_validation_images)
|
||||
|
||||
@@ -590,12 +590,6 @@ def main(args):
|
||||
|
||||
accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
|
||||
|
||||
if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
|
||||
# due to pytorch#99272, MPS does not yet support bfloat16.
|
||||
raise ValueError(
|
||||
"Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
|
||||
)
|
||||
|
||||
accelerator = Accelerator(
|
||||
gradient_accumulation_steps=args.gradient_accumulation_steps,
|
||||
mixed_precision=args.mixed_precision,
|
||||
@@ -986,13 +980,6 @@ def main(args):
|
||||
model = model._orig_mod if is_compiled_module(model) else model
|
||||
return model
|
||||
|
||||
# Some configurations require autocast to be disabled.
|
||||
enable_autocast = True
|
||||
if torch.backends.mps.is_available() or (
|
||||
accelerator.mixed_precision == "fp16" or accelerator.mixed_precision == "bf16"
|
||||
):
|
||||
enable_autocast = False
|
||||
|
||||
# Train!
|
||||
total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
|
||||
|
||||
@@ -1226,10 +1213,7 @@ def main(args):
|
||||
generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
|
||||
pipeline_args = {"prompt": args.validation_prompt}
|
||||
|
||||
with torch.autocast(
|
||||
accelerator.device.type,
|
||||
enabled=enable_autocast,
|
||||
):
|
||||
with torch.cuda.amp.autocast():
|
||||
images = [
|
||||
pipeline(**pipeline_args, generator=generator, num_inference_steps=25).images[0]
|
||||
for _ in range(args.num_validation_images)
|
||||
@@ -1284,7 +1268,7 @@ def main(args):
|
||||
if args.validation_prompt and args.num_validation_images > 0:
|
||||
pipeline = pipeline.to(accelerator.device)
|
||||
generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
|
||||
with torch.autocast(accelerator.device.type, enabled=enable_autocast):
|
||||
with torch.cuda.amp.autocast():
|
||||
images = [
|
||||
pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0]
|
||||
for _ in range(args.num_validation_images)
|
||||
|
||||
@@ -72,11 +72,7 @@ To create the package for PyPI.
|
||||
9. Upload the final version to the actual PyPI:
|
||||
twine upload dist/* -r pypi
|
||||
|
||||
10. Prepare the release notes and publish them on GitHub once everything is looking hunky-dory. You can use the following
|
||||
Space to fetch all the commits applicable for the release: https://huggingface.co/spaces/lysandre/github-release. Repo should
|
||||
be `huggingface/diffusers`. `tag` should be the previous release tag (v0.26.1, for example), and `branch` should be
|
||||
the latest release branch (v0.27.0-release, for example). It denotes all commits that have happened on branch
|
||||
v0.27.0-release after the tag v0.26.1 was created.
|
||||
10. Prepare the release notes and publish them on GitHub once everything is looking hunky-dory.
|
||||
|
||||
11. Run `make post-release` (or, for a patch release, `make post-patch`). If you were on a branch for the release,
|
||||
you need to go back to main before executing this.
|
||||
@@ -85,8 +81,9 @@ To create the package for PyPI.
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from distutils.core import Command
|
||||
|
||||
from setuptools import Command, find_packages, setup
|
||||
from setuptools import find_packages, setup
|
||||
|
||||
|
||||
# IMPORTANT:
|
||||
@@ -166,7 +163,7 @@ def deps_list(*pkgs):
|
||||
|
||||
class DepsTableUpdateCommand(Command):
|
||||
"""
|
||||
A custom command that updates the dependency table.
|
||||
A custom distutils command that updates the dependency table.
|
||||
usage: python setup.py deps_table_update
|
||||
"""
|
||||
|
||||
|
||||
@@ -36,7 +36,6 @@ from ..utils import (
|
||||
get_adapter_name,
|
||||
get_peft_kwargs,
|
||||
is_accelerate_available,
|
||||
is_peft_version,
|
||||
is_transformers_available,
|
||||
logging,
|
||||
recurse_remove_peft_layers,
|
||||
@@ -114,7 +113,7 @@ class LoraLoaderMixin:
|
||||
# First, ensure that the checkpoint is a compatible one and can be successfully loaded.
|
||||
state_dict, network_alphas = self.lora_state_dict(pretrained_model_name_or_path_or_dict, **kwargs)
|
||||
|
||||
is_correct_format = all("lora" in key or "dora_scale" in key for key in state_dict.keys())
|
||||
is_correct_format = all("lora" in key for key in state_dict.keys())
|
||||
if not is_correct_format:
|
||||
raise ValueError("Invalid LoRA checkpoint.")
|
||||
|
||||
@@ -452,15 +451,6 @@ class LoraLoaderMixin:
|
||||
rank[key] = val.shape[1]
|
||||
|
||||
lora_config_kwargs = get_peft_kwargs(rank, network_alphas, state_dict, is_unet=True)
|
||||
if "use_dora" in lora_config_kwargs:
|
||||
if lora_config_kwargs["use_dora"]:
|
||||
if is_peft_version("<", "0.9.0"):
|
||||
raise ValueError(
|
||||
"You need `peft` 0.9.0 at least to use DoRA-enabled LoRAs. Please upgrade your installation of `peft`."
|
||||
)
|
||||
else:
|
||||
if is_peft_version("<", "0.9.0"):
|
||||
lora_config_kwargs.pop("use_dora")
|
||||
lora_config = LoraConfig(**lora_config_kwargs)
|
||||
|
||||
# adapter_name
|
||||
@@ -582,15 +572,6 @@ class LoraLoaderMixin:
|
||||
}
|
||||
|
||||
lora_config_kwargs = get_peft_kwargs(rank, network_alphas, text_encoder_lora_state_dict, is_unet=False)
|
||||
if "use_dora" in lora_config_kwargs:
|
||||
if lora_config_kwargs["use_dora"]:
|
||||
if is_peft_version("<", "0.9.0"):
|
||||
raise ValueError(
|
||||
"You need `peft` 0.9.0 at least to use DoRA-enabled LoRAs. Please upgrade your installation of `peft`."
|
||||
)
|
||||
else:
|
||||
if is_peft_version("<", "0.9.0"):
|
||||
lora_config_kwargs.pop("use_dora")
|
||||
lora_config = LoraConfig(**lora_config_kwargs)
|
||||
|
||||
# adapter_name
|
||||
@@ -673,13 +654,6 @@ class LoraLoaderMixin:
|
||||
rank[key] = val.shape[1]
|
||||
|
||||
lora_config_kwargs = get_peft_kwargs(rank, network_alphas, state_dict)
|
||||
if "use_dora" in lora_config_kwargs:
|
||||
if lora_config_kwargs["use_dora"] and is_peft_version("<", "0.9.0"):
|
||||
raise ValueError(
|
||||
"You need `peft` 0.9.0 at least to use DoRA-enabled LoRAs. Please upgrade your installation of `peft`."
|
||||
)
|
||||
else:
|
||||
lora_config_kwargs.pop("use_dora")
|
||||
lora_config = LoraConfig(**lora_config_kwargs)
|
||||
|
||||
# adapter_name
|
||||
@@ -1269,7 +1243,7 @@ class StableDiffusionXLLoraLoaderMixin(LoraLoaderMixin):
|
||||
unet_config=self.unet.config,
|
||||
**kwargs,
|
||||
)
|
||||
is_correct_format = all("lora" in key or "dora_scale" in key for key in state_dict.keys())
|
||||
is_correct_format = all("lora" in key for key in state_dict.keys())
|
||||
if not is_correct_format:
|
||||
raise ValueError("Invalid LoRA checkpoint.")
|
||||
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
|
||||
import re
|
||||
|
||||
from ..utils import is_peft_version, logging
|
||||
from ..utils import logging
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
@@ -128,15 +128,6 @@ def _convert_kohya_lora_to_diffusers(state_dict, unet_name="unet", text_encoder_
|
||||
te_state_dict = {}
|
||||
te2_state_dict = {}
|
||||
network_alphas = {}
|
||||
is_unet_dora_lora = any("dora_scale" in k and "lora_unet_" in k for k in state_dict)
|
||||
is_te_dora_lora = any("dora_scale" in k and ("lora_te_" in k or "lora_te1_" in k) for k in state_dict)
|
||||
is_te2_dora_lora = any("dora_scale" in k and "lora_te2_" in k for k in state_dict)
|
||||
|
||||
if is_unet_dora_lora or is_te_dora_lora or is_te2_dora_lora:
|
||||
if is_peft_version("<", "0.9.0"):
|
||||
raise ValueError(
|
||||
"You need `peft` 0.9.0 at least to use DoRA-enabled LoRAs. Please upgrade your installation of `peft`."
|
||||
)
|
||||
|
||||
# every down weight has a corresponding up weight and potentially an alpha weight
|
||||
lora_keys = [k for k in state_dict.keys() if k.endswith("lora_down.weight")]
|
||||
@@ -207,12 +198,6 @@ def _convert_kohya_lora_to_diffusers(state_dict, unet_name="unet", text_encoder_
|
||||
unet_state_dict[diffusers_name] = state_dict.pop(key)
|
||||
unet_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
|
||||
|
||||
if is_unet_dora_lora:
|
||||
dora_scale_key_to_replace = "_lora.down." if "_lora.down." in diffusers_name else ".lora.down."
|
||||
unet_state_dict[
|
||||
diffusers_name.replace(dora_scale_key_to_replace, ".lora_magnitude_vector.")
|
||||
] = state_dict.pop(key.replace("lora_down.weight", "dora_scale"))
|
||||
|
||||
elif lora_name.startswith(("lora_te_", "lora_te1_", "lora_te2_")):
|
||||
if lora_name.startswith(("lora_te_", "lora_te1_")):
|
||||
key_to_replace = "lora_te_" if lora_name.startswith("lora_te_") else "lora_te1_"
|
||||
@@ -244,19 +229,6 @@ def _convert_kohya_lora_to_diffusers(state_dict, unet_name="unet", text_encoder_
|
||||
te2_state_dict[diffusers_name] = state_dict.pop(key)
|
||||
te2_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
|
||||
|
||||
if (is_te_dora_lora or is_te2_dora_lora) and lora_name.startswith(("lora_te_", "lora_te1_", "lora_te2_")):
|
||||
dora_scale_key_to_replace_te = (
|
||||
"_lora.down." if "_lora.down." in diffusers_name else ".lora_linear_layer."
|
||||
)
|
||||
if lora_name.startswith(("lora_te_", "lora_te1_")):
|
||||
te_state_dict[
|
||||
diffusers_name.replace(dora_scale_key_to_replace_te, ".lora_magnitude_vector.")
|
||||
] = state_dict.pop(key.replace("lora_down.weight", "dora_scale"))
|
||||
elif lora_name.startswith("lora_te2_"):
|
||||
te2_state_dict[
|
||||
diffusers_name.replace(dora_scale_key_to_replace_te, ".lora_magnitude_vector.")
|
||||
] = state_dict.pop(key.replace("lora_down.weight", "dora_scale"))
|
||||
|
||||
# Rename the alphas so that they can be mapped appropriately.
|
||||
if lora_name_alpha in state_dict:
|
||||
alpha = state_dict.pop(lora_name_alpha).item()
|
||||
|
||||
@@ -50,8 +50,6 @@ if is_transformers_available():
|
||||
if is_accelerate_available():
|
||||
from accelerate import init_empty_weights
|
||||
|
||||
from ..models.modeling_utils import load_model_dict_into_meta
|
||||
|
||||
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
CONFIG_URLS = {
|
||||
@@ -979,6 +977,8 @@ def create_diffusers_controlnet_model_from_ldm(
|
||||
controlnet = ControlNetModel(**diffusers_config)
|
||||
|
||||
if is_accelerate_available():
|
||||
from ..models.modeling_utils import load_model_dict_into_meta
|
||||
|
||||
unexpected_keys = load_model_dict_into_meta(
|
||||
controlnet, diffusers_format_controlnet_checkpoint, dtype=torch_dtype
|
||||
)
|
||||
@@ -1155,6 +1155,8 @@ def create_text_encoder_from_ldm_clip_checkpoint(config_name, checkpoint, local_
|
||||
text_model_dict[diffusers_key] = checkpoint[key]
|
||||
|
||||
if is_accelerate_available():
|
||||
from ..models.modeling_utils import load_model_dict_into_meta
|
||||
|
||||
unexpected_keys = load_model_dict_into_meta(text_model, text_model_dict, dtype=torch_dtype)
|
||||
if text_model._keys_to_ignore_on_load_unexpected is not None:
|
||||
for pat in text_model._keys_to_ignore_on_load_unexpected:
|
||||
@@ -1248,6 +1250,8 @@ def create_text_encoder_from_open_clip_checkpoint(
|
||||
text_model_dict[diffusers_key] = checkpoint[key]
|
||||
|
||||
if is_accelerate_available():
|
||||
from ..models.modeling_utils import load_model_dict_into_meta
|
||||
|
||||
unexpected_keys = load_model_dict_into_meta(text_model, text_model_dict, dtype=torch_dtype)
|
||||
if text_model._keys_to_ignore_on_load_unexpected is not None:
|
||||
for pat in text_model._keys_to_ignore_on_load_unexpected:
|
||||
@@ -1313,6 +1317,8 @@ def create_diffusers_unet_model_from_ldm(
|
||||
unet = UNet2DConditionModel(**unet_config)
|
||||
|
||||
if is_accelerate_available():
|
||||
from ..models.modeling_utils import load_model_dict_into_meta
|
||||
|
||||
unexpected_keys = load_model_dict_into_meta(unet, diffusers_format_unet_checkpoint, dtype=torch_dtype)
|
||||
if unet._keys_to_ignore_on_load_unexpected is not None:
|
||||
for pat in unet._keys_to_ignore_on_load_unexpected:
|
||||
@@ -1373,6 +1379,8 @@ def create_diffusers_vae_model_from_ldm(
|
||||
vae = AutoencoderKL(**vae_config)
|
||||
|
||||
if is_accelerate_available():
|
||||
from ..models.modeling_utils import load_model_dict_into_meta
|
||||
|
||||
unexpected_keys = load_model_dict_into_meta(vae, diffusers_format_vae_checkpoint, dtype=torch_dtype)
|
||||
if vae._keys_to_ignore_on_load_unexpected is not None:
|
||||
for pat in vae._keys_to_ignore_on_load_unexpected:
|
||||
|
||||
@@ -63,8 +63,7 @@ class ConsistencyDecoderVAE(ModelMixin, ConfigMixin):
|
||||
... "runwayml/stable-diffusion-v1-5", vae=vae, torch_dtype=torch.float16
|
||||
... ).to("cuda")
|
||||
|
||||
>>> image = pipe("horse", generator=torch.manual_seed(0)).images[0]
|
||||
>>> image
|
||||
>>> pipe("horse", generator=torch.manual_seed(0)).images
|
||||
```
|
||||
"""
|
||||
|
||||
@@ -73,7 +72,6 @@ class ConsistencyDecoderVAE(ModelMixin, ConfigMixin):
|
||||
self,
|
||||
scaling_factor: float = 0.18215,
|
||||
latent_channels: int = 4,
|
||||
sample_size: int = 32,
|
||||
encoder_act_fn: str = "silu",
|
||||
encoder_block_out_channels: Tuple[int, ...] = (128, 256, 512, 512),
|
||||
encoder_double_z: bool = True,
|
||||
@@ -155,16 +153,6 @@ class ConsistencyDecoderVAE(ModelMixin, ConfigMixin):
|
||||
self.use_slicing = False
|
||||
self.use_tiling = False
|
||||
|
||||
# only relevant if vae tiling is enabled
|
||||
self.tile_sample_min_size = self.config.sample_size
|
||||
sample_size = (
|
||||
self.config.sample_size[0]
|
||||
if isinstance(self.config.sample_size, (list, tuple))
|
||||
else self.config.sample_size
|
||||
)
|
||||
self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1)))
|
||||
self.tile_overlap_factor = 0.25
|
||||
|
||||
# Copied from diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL.enable_tiling
|
||||
def enable_tiling(self, use_tiling: bool = True):
|
||||
r"""
|
||||
@@ -284,7 +272,7 @@ class ConsistencyDecoderVAE(ModelMixin, ConfigMixin):
|
||||
Args:
|
||||
x (`torch.FloatTensor`): Input batch of images.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether to return a [`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] instead of a plain
|
||||
Whether to return a [`~models.consistecy_decoder_vae.ConsistencyDecoderOoutput`] instead of a plain
|
||||
tuple.
|
||||
|
||||
Returns:
|
||||
@@ -317,19 +305,6 @@ class ConsistencyDecoderVAE(ModelMixin, ConfigMixin):
|
||||
return_dict: bool = True,
|
||||
num_inference_steps: int = 2,
|
||||
) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]:
|
||||
"""
|
||||
Decodes the input latent vector `z` using the consistency decoder VAE model.
|
||||
|
||||
Args:
|
||||
z (torch.FloatTensor): The input latent vector.
|
||||
generator (Optional[torch.Generator]): The random number generator. Default is None.
|
||||
return_dict (bool): Whether to return the output as a dictionary. Default is True.
|
||||
num_inference_steps (int): The number of inference steps. Default is 2.
|
||||
|
||||
Returns:
|
||||
Union[DecoderOutput, Tuple[torch.FloatTensor]]: The decoded output.
|
||||
|
||||
"""
|
||||
z = (z * self.config.scaling_factor - self.means) / self.stds
|
||||
|
||||
scale_factor = 2 ** (len(self.config.block_out_channels) - 1)
|
||||
@@ -370,9 +345,7 @@ class ConsistencyDecoderVAE(ModelMixin, ConfigMixin):
|
||||
b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent)
|
||||
return b
|
||||
|
||||
def tiled_encode(
|
||||
self, x: torch.FloatTensor, return_dict: bool = True
|
||||
) -> Union[ConsistencyDecoderVAEOutput, Tuple]:
|
||||
def tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True) -> ConsistencyDecoderVAEOutput:
|
||||
r"""Encode a batch of images using a tiled encoder.
|
||||
|
||||
When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
|
||||
|
||||
@@ -663,7 +663,7 @@ class AnimateDiffPipeline(
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
`._callback_tensor_inputs` attribute of your pipeline class.
|
||||
`._callback_tensor_inputs` attribute of your pipeine class.
|
||||
|
||||
Examples:
|
||||
|
||||
@@ -792,7 +792,7 @@ class AnimateDiffPipeline(
|
||||
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
|
||||
|
||||
# 8. Denoising loop
|
||||
with self.progress_bar(total=self._num_timesteps) as progress_bar:
|
||||
with self.progress_bar(total=num_inference_steps) as progress_bar:
|
||||
for i, t in enumerate(timesteps):
|
||||
# expand the latents if we are doing classifier free guidance
|
||||
latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
|
||||
|
||||
@@ -823,7 +823,7 @@ class AnimateDiffVideoToVideoPipeline(
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
`._callback_tensor_inputs` attribute of your pipeline class.
|
||||
`._callback_tensor_inputs` attribute of your pipeine class.
|
||||
|
||||
Examples:
|
||||
|
||||
@@ -944,7 +944,7 @@ class AnimateDiffVideoToVideoPipeline(
|
||||
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
|
||||
|
||||
# 8. Denoising loop
|
||||
with self.progress_bar(total=self._num_timesteps) as progress_bar:
|
||||
with self.progress_bar(total=num_inference_steps) as progress_bar:
|
||||
for i, t in enumerate(timesteps):
|
||||
# expand the latents if we are doing classifier free guidance
|
||||
latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
|
||||
|
||||
@@ -661,9 +661,9 @@ class StableDiffusionControlNetPipeline(
|
||||
raise ValueError(
|
||||
f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets."
|
||||
)
|
||||
else:
|
||||
for image_ in image:
|
||||
self.check_image(image_, prompt, prompt_embeds)
|
||||
|
||||
for image_ in image:
|
||||
self.check_image(image_, prompt, prompt_embeds)
|
||||
else:
|
||||
assert False
|
||||
|
||||
@@ -1002,7 +1002,7 @@ class StableDiffusionControlNetPipeline(
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
`._callback_tensor_inputs` attribute of your pipeline class.
|
||||
`._callback_tensor_inputs` attribute of your pipeine class.
|
||||
|
||||
Examples:
|
||||
|
||||
|
||||
@@ -1012,7 +1012,7 @@ class StableDiffusionControlNetImg2ImgPipeline(
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
`._callback_tensor_inputs` attribute of your pipeline class.
|
||||
`._callback_tensor_inputs` attribute of your pipeine class.
|
||||
|
||||
Examples:
|
||||
|
||||
|
||||
@@ -1241,7 +1241,7 @@ class StableDiffusionControlNetInpaintPipeline(
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
`._callback_tensor_inputs` attribute of your pipeline class.
|
||||
`._callback_tensor_inputs` attribute of your pipeine class.
|
||||
|
||||
Examples:
|
||||
|
||||
|
||||
@@ -1022,7 +1022,7 @@ class StableDiffusionXLControlNetInpaintPipeline(
|
||||
# because `num_inference_steps` might be even given that every timestep
|
||||
# (except the highest one) is duplicated. If `num_inference_steps` is even it would
|
||||
# mean that we cut the timesteps in the middle of the denoising step
|
||||
# (between 1st and 2nd derivative) which leads to incorrect results. By adding 1
|
||||
# (between 1st and 2nd devirative) which leads to incorrect results. By adding 1
|
||||
# we ensure that the denoising process always ends after the 2nd derivate step of the scheduler
|
||||
num_inference_steps = num_inference_steps + 1
|
||||
|
||||
@@ -1313,7 +1313,7 @@ class StableDiffusionXLControlNetInpaintPipeline(
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
`._callback_tensor_inputs` attribute of your pipeline class.
|
||||
`._callback_tensor_inputs` attribute of your pipeine class.
|
||||
|
||||
Examples:
|
||||
|
||||
@@ -1601,7 +1601,10 @@ class StableDiffusionXLControlNetInpaintPipeline(
|
||||
1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
|
||||
for s, e in zip(control_guidance_start, control_guidance_end)
|
||||
]
|
||||
controlnet_keep.append(keeps if isinstance(controlnet, MultiControlNetModel) else keeps[0])
|
||||
if isinstance(self.controlnet, MultiControlNetModel):
|
||||
controlnet_keep.append(keeps)
|
||||
else:
|
||||
controlnet_keep.append(keeps[0])
|
||||
|
||||
# 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
|
||||
height, width = latents.shape[-2:]
|
||||
|
||||
@@ -1102,7 +1102,7 @@ class StableDiffusionXLControlNetPipeline(
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
`._callback_tensor_inputs` attribute of your pipeline class.
|
||||
`._callback_tensor_inputs` attribute of your pipeine class.
|
||||
|
||||
Examples:
|
||||
|
||||
|
||||
@@ -1251,7 +1251,7 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
`._callback_tensor_inputs` attribute of your pipeline class.
|
||||
`._callback_tensor_inputs` attribute of your pipeine class.
|
||||
|
||||
Examples:
|
||||
|
||||
|
||||
@@ -146,40 +146,39 @@ class FreeInitMixin:
|
||||
):
|
||||
if free_init_iteration == 0:
|
||||
self._free_init_initial_noise = latents.detach().clone()
|
||||
else:
|
||||
latent_shape = latents.shape
|
||||
return latents, self.scheduler.timesteps
|
||||
|
||||
free_init_filter_shape = (1, *latent_shape[1:])
|
||||
free_init_freq_filter = self._get_free_init_freq_filter(
|
||||
shape=free_init_filter_shape,
|
||||
device=device,
|
||||
filter_type=self._free_init_method,
|
||||
order=self._free_init_order,
|
||||
spatial_stop_frequency=self._free_init_spatial_stop_frequency,
|
||||
temporal_stop_frequency=self._free_init_temporal_stop_frequency,
|
||||
)
|
||||
latent_shape = latents.shape
|
||||
|
||||
current_diffuse_timestep = self.scheduler.config.num_train_timesteps - 1
|
||||
diffuse_timesteps = torch.full((latent_shape[0],), current_diffuse_timestep).long()
|
||||
free_init_filter_shape = (1, *latent_shape[1:])
|
||||
free_init_freq_filter = self._get_free_init_freq_filter(
|
||||
shape=free_init_filter_shape,
|
||||
device=device,
|
||||
filter_type=self._free_init_method,
|
||||
order=self._free_init_order,
|
||||
spatial_stop_frequency=self._free_init_spatial_stop_frequency,
|
||||
temporal_stop_frequency=self._free_init_temporal_stop_frequency,
|
||||
)
|
||||
|
||||
z_t = self.scheduler.add_noise(
|
||||
original_samples=latents, noise=self._free_init_initial_noise, timesteps=diffuse_timesteps.to(device)
|
||||
).to(dtype=torch.float32)
|
||||
current_diffuse_timestep = self.scheduler.config.num_train_timesteps - 1
|
||||
diffuse_timesteps = torch.full((latent_shape[0],), current_diffuse_timestep).long()
|
||||
|
||||
z_rand = randn_tensor(
|
||||
shape=latent_shape,
|
||||
generator=generator,
|
||||
device=device,
|
||||
dtype=torch.float32,
|
||||
)
|
||||
latents = self._apply_freq_filter(z_t, z_rand, low_pass_filter=free_init_freq_filter)
|
||||
latents = latents.to(dtype)
|
||||
z_t = self.scheduler.add_noise(
|
||||
original_samples=latents, noise=self._free_init_initial_noise, timesteps=diffuse_timesteps.to(device)
|
||||
).to(dtype=torch.float32)
|
||||
|
||||
z_rand = randn_tensor(
|
||||
shape=latent_shape,
|
||||
generator=generator,
|
||||
device=device,
|
||||
dtype=torch.float32,
|
||||
)
|
||||
latents = self._apply_freq_filter(z_t, z_rand, low_pass_filter=free_init_freq_filter)
|
||||
latents = latents.to(dtype)
|
||||
|
||||
# Coarse-to-Fine Sampling for faster inference (can lead to lower quality)
|
||||
if self._free_init_use_fast_sampling:
|
||||
num_inference_steps = max(
|
||||
1, int(num_inference_steps / self._free_init_num_iters * (free_init_iteration + 1))
|
||||
)
|
||||
num_inference_steps = int(num_inference_steps / self._free_init_num_iters * (free_init_iteration + 1))
|
||||
self.scheduler.set_timesteps(num_inference_steps, device=device)
|
||||
|
||||
return latents, self.scheduler.timesteps
|
||||
|
||||
@@ -178,7 +178,7 @@ class KandinskyV22CombinedPipeline(DiffusionPipeline):
|
||||
def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
|
||||
self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
|
||||
|
||||
def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
|
||||
def enable_sequential_cpu_offload(self, gpu_id=0):
|
||||
r"""
|
||||
Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
|
||||
text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
|
||||
@@ -186,8 +186,8 @@ class KandinskyV22CombinedPipeline(DiffusionPipeline):
|
||||
Note that offloading happens on a submodule basis. Memory savings are higher than with
|
||||
`enable_model_cpu_offload`, but performance is lower.
|
||||
"""
|
||||
self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
|
||||
self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
|
||||
self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
|
||||
self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
|
||||
|
||||
def progress_bar(self, iterable=None, total=None):
|
||||
self.prior_pipe.progress_bar(iterable=iterable, total=total)
|
||||
@@ -405,17 +405,17 @@ class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline):
|
||||
def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
|
||||
self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
|
||||
|
||||
def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
|
||||
def enable_model_cpu_offload(self, gpu_id=0):
|
||||
r"""
|
||||
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
|
||||
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
|
||||
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
|
||||
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
|
||||
"""
|
||||
self.prior_pipe.enable_model_cpu_offload(gpu_id=gpu_id, device=device)
|
||||
self.decoder_pipe.enable_model_cpu_offload(gpu_id=gpu_id, device=device)
|
||||
self.prior_pipe.enable_model_cpu_offload()
|
||||
self.decoder_pipe.enable_model_cpu_offload()
|
||||
|
||||
def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
|
||||
def enable_sequential_cpu_offload(self, gpu_id=0):
|
||||
r"""
|
||||
Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
|
||||
text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
|
||||
@@ -423,8 +423,8 @@ class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline):
|
||||
Note that offloading happens on a submodule basis. Memory savings are higher than with
|
||||
`enable_model_cpu_offload`, but performance is lower.
|
||||
"""
|
||||
self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
|
||||
self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
|
||||
self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
|
||||
self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
|
||||
|
||||
def progress_bar(self, iterable=None, total=None):
|
||||
self.prior_pipe.progress_bar(iterable=iterable, total=total)
|
||||
@@ -653,7 +653,7 @@ class KandinskyV22InpaintCombinedPipeline(DiffusionPipeline):
|
||||
def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
|
||||
self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
|
||||
|
||||
def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
|
||||
def enable_sequential_cpu_offload(self, gpu_id=0):
|
||||
r"""
|
||||
Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
|
||||
text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
|
||||
@@ -661,8 +661,8 @@ class KandinskyV22InpaintCombinedPipeline(DiffusionPipeline):
|
||||
Note that offloading happens on a submodule basis. Memory savings are higher than with
|
||||
`enable_model_cpu_offload`, but performance is lower.
|
||||
"""
|
||||
self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
|
||||
self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
|
||||
self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
|
||||
self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
|
||||
|
||||
def progress_bar(self, iterable=None, total=None):
|
||||
self.prior_pipe.progress_bar(iterable=iterable, total=total)
|
||||
|
||||
@@ -13,12 +13,14 @@
|
||||
# limitations under the License.
|
||||
|
||||
import inspect
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Callable, Dict, List, Optional, Union
|
||||
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import PIL
|
||||
import torch
|
||||
import torch.fft as fft
|
||||
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
|
||||
|
||||
from ...image_processor import PipelineImageInput, VaeImageProcessor
|
||||
@@ -128,6 +130,71 @@ def prepare_mask_coef_by_statistics(num_frames: int, cond_frame: int, motion_sca
|
||||
return coef
|
||||
|
||||
|
||||
def _get_freeinit_freq_filter(
|
||||
shape: Tuple[int, ...],
|
||||
device: Union[str, torch.dtype],
|
||||
filter_type: str,
|
||||
order: float,
|
||||
spatial_stop_frequency: float,
|
||||
temporal_stop_frequency: float,
|
||||
) -> torch.Tensor:
|
||||
r"""Returns the FreeInit filter based on filter type and other input conditions."""
|
||||
|
||||
time, height, width = shape[-3], shape[-2], shape[-1]
|
||||
mask = torch.zeros(shape)
|
||||
|
||||
if spatial_stop_frequency == 0 or temporal_stop_frequency == 0:
|
||||
return mask
|
||||
|
||||
if filter_type == "butterworth":
|
||||
|
||||
def retrieve_mask(x):
|
||||
return 1 / (1 + (x / spatial_stop_frequency**2) ** order)
|
||||
elif filter_type == "gaussian":
|
||||
|
||||
def retrieve_mask(x):
|
||||
return math.exp(-1 / (2 * spatial_stop_frequency**2) * x)
|
||||
elif filter_type == "ideal":
|
||||
|
||||
def retrieve_mask(x):
|
||||
return 1 if x <= spatial_stop_frequency * 2 else 0
|
||||
else:
|
||||
raise NotImplementedError("`filter_type` must be one of gaussian, butterworth or ideal")
|
||||
|
||||
for t in range(time):
|
||||
for h in range(height):
|
||||
for w in range(width):
|
||||
d_square = (
|
||||
((spatial_stop_frequency / temporal_stop_frequency) * (2 * t / time - 1)) ** 2
|
||||
+ (2 * h / height - 1) ** 2
|
||||
+ (2 * w / width - 1) ** 2
|
||||
)
|
||||
mask[..., t, h, w] = retrieve_mask(d_square)
|
||||
|
||||
return mask.to(device)
|
||||
|
||||
|
||||
def _freq_mix_3d(x: torch.Tensor, noise: torch.Tensor, LPF: torch.Tensor) -> torch.Tensor:
|
||||
r"""Noise reinitialization."""
|
||||
# FFT
|
||||
x_freq = fft.fftn(x, dim=(-3, -2, -1))
|
||||
x_freq = fft.fftshift(x_freq, dim=(-3, -2, -1))
|
||||
noise_freq = fft.fftn(noise, dim=(-3, -2, -1))
|
||||
noise_freq = fft.fftshift(noise_freq, dim=(-3, -2, -1))
|
||||
|
||||
# frequency mix
|
||||
HPF = 1 - LPF
|
||||
x_freq_low = x_freq * LPF
|
||||
noise_freq_high = noise_freq * HPF
|
||||
x_freq_mixed = x_freq_low + noise_freq_high # mix in freq domain
|
||||
|
||||
# IFFT
|
||||
x_freq_mixed = fft.ifftshift(x_freq_mixed, dim=(-3, -2, -1))
|
||||
x_mixed = fft.ifftn(x_freq_mixed, dim=(-3, -2, -1)).real
|
||||
|
||||
return x_mixed
|
||||
|
||||
|
||||
@dataclass
|
||||
class PIAPipelineOutput(BaseOutput):
|
||||
r"""
|
||||
@@ -135,9 +202,9 @@ class PIAPipelineOutput(BaseOutput):
|
||||
|
||||
Args:
|
||||
frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
|
||||
Nested list of length `batch_size` with denoised PIL image sequences of length `num_frames`,
|
||||
NumPy array of shape `(batch_size, num_frames, channels, height, width,
|
||||
Torch tensor of shape `(batch_size, num_frames, channels, height, width)`.
|
||||
Nested list of length `batch_size` with denoised PIL image sequences of length `num_frames`,
|
||||
NumPy array of shape `(batch_size, num_frames, channels, height, width,
|
||||
Torch tensor of shape `(batch_size, num_frames, channels, height, width)`.
|
||||
"""
|
||||
|
||||
frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]]
|
||||
@@ -721,8 +788,7 @@ class PIAPipeline(
|
||||
The input image to be used for video generation.
|
||||
prompt (`str` or `List[str]`, *optional*):
|
||||
The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
|
||||
strength (`float`, *optional*, defaults to 1.0):
|
||||
Indicates extent to transform the reference `image`. Must be between 0 and 1.
|
||||
strength (`float`, *optional*, defaults to 1.0): Indicates extent to transform the reference `image`. Must be between 0 and 1.
|
||||
height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
|
||||
The height in pixels of the generated video.
|
||||
width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
|
||||
@@ -789,7 +855,7 @@ class PIAPipeline(
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
`._callback_tensor_inputs` attribute of your pipeline class.
|
||||
`._callback_tensor_inputs` attribute of your pipeine class.
|
||||
|
||||
Examples:
|
||||
|
||||
@@ -913,10 +979,8 @@ class PIAPipeline(
|
||||
latents, free_init_iter, num_inference_steps, device, latents.dtype, generator
|
||||
)
|
||||
|
||||
self._num_timesteps = len(timesteps)
|
||||
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
|
||||
|
||||
with self.progress_bar(total=self._num_timesteps) as progress_bar:
|
||||
with self.progress_bar(total=num_inference_steps) as progress_bar:
|
||||
for i, t in enumerate(timesteps):
|
||||
# expand the latents if we are doing classifier free guidance
|
||||
latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
|
||||
|
||||
@@ -371,7 +371,9 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
|
||||
if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"):
|
||||
return False
|
||||
|
||||
return hasattr(module, "_hf_hook") and isinstance(module._hf_hook, accelerate.hooks.AlignDevicesHook)
|
||||
return hasattr(module, "_hf_hook") and not isinstance(
|
||||
module._hf_hook, (accelerate.hooks.CpuOffload, accelerate.hooks.AlignDevicesHook)
|
||||
)
|
||||
|
||||
def module_is_offloaded(module):
|
||||
if not is_accelerate_available() or is_accelerate_version("<", "0.17.0.dev0"):
|
||||
@@ -937,16 +939,6 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
|
||||
return torch.device(module._hf_hook.execution_device)
|
||||
return self.device
|
||||
|
||||
def remove_all_hooks(self):
|
||||
r"""
|
||||
Removes all hooks that were added when using `enable_sequential_cpu_offload` or `enable_model_cpu_offload`.
|
||||
"""
|
||||
for _, model in self.components.items():
|
||||
if isinstance(model, torch.nn.Module) and hasattr(model, "_hf_hook"):
|
||||
is_sequential_cpu_offload = isinstance(getattr(model, "_hf_hook"), accelerate.hooks.AlignDevicesHook)
|
||||
accelerate.hooks.remove_hook_from_module(model, recurse=is_sequential_cpu_offload)
|
||||
self._all_hooks = []
|
||||
|
||||
def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
|
||||
r"""
|
||||
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
|
||||
@@ -971,8 +963,6 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
|
||||
else:
|
||||
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
|
||||
|
||||
self.remove_all_hooks()
|
||||
|
||||
torch_device = torch.device(device)
|
||||
device_index = torch_device.index
|
||||
|
||||
@@ -989,13 +979,15 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
|
||||
device = torch.device(f"{device_type}:{self._offload_gpu_id}")
|
||||
self._offload_device = device
|
||||
|
||||
self.to("cpu", silence_dtype_warnings=True)
|
||||
device_mod = getattr(torch, device.type, None)
|
||||
if hasattr(device_mod, "empty_cache") and device_mod.is_available():
|
||||
device_mod.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
|
||||
if self.device.type != "cpu":
|
||||
self.to("cpu", silence_dtype_warnings=True)
|
||||
device_mod = getattr(torch, self.device.type, None)
|
||||
if hasattr(device_mod, "empty_cache") and device_mod.is_available():
|
||||
device_mod.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
|
||||
|
||||
all_model_components = {k: v for k, v in self.components.items() if isinstance(v, torch.nn.Module)}
|
||||
|
||||
self._all_hooks = []
|
||||
hook = None
|
||||
for model_str in self.model_cpu_offload_seq.split("->"):
|
||||
model = all_model_components.pop(model_str, None)
|
||||
@@ -1029,6 +1021,11 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
|
||||
# `enable_model_cpu_offload` has not be called, so silently do nothing
|
||||
return
|
||||
|
||||
for hook in self._all_hooks:
|
||||
# offload model and remove hook from model
|
||||
hook.offload()
|
||||
hook.remove()
|
||||
|
||||
# make sure the model is in the same state as before calling it
|
||||
self.enable_model_cpu_offload(device=getattr(self, "_offload_device", "cuda"))
|
||||
|
||||
@@ -1051,7 +1048,6 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
|
||||
from accelerate import cpu_offload
|
||||
else:
|
||||
raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher")
|
||||
self.remove_all_hooks()
|
||||
|
||||
torch_device = torch.device(device)
|
||||
device_index = torch_device.index
|
||||
|
||||
@@ -117,25 +117,25 @@ class StableCascadeCombinedPipeline(DiffusionPipeline):
|
||||
def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
|
||||
self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
|
||||
|
||||
def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
|
||||
def enable_model_cpu_offload(self, gpu_id=0):
|
||||
r"""
|
||||
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
|
||||
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
|
||||
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
|
||||
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
|
||||
"""
|
||||
self.prior_pipe.enable_model_cpu_offload(gpu_id=gpu_id, device=device)
|
||||
self.decoder_pipe.enable_model_cpu_offload(gpu_id=gpu_id, device=device)
|
||||
self.prior_pipe.enable_model_cpu_offload(gpu_id=gpu_id)
|
||||
self.decoder_pipe.enable_model_cpu_offload(gpu_id=gpu_id)
|
||||
|
||||
def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
|
||||
def enable_sequential_cpu_offload(self, gpu_id=0):
|
||||
r"""
|
||||
Offloads all models (`unet`, `text_encoder`, `vae`, and `safety checker` state dicts) to CPU using 🤗
|
||||
Accelerate, significantly reducing memory usage. Models are moved to a `torch.device('meta')` and loaded on a
|
||||
GPU only when their specific submodule's `forward` method is called. Offloading happens on a submodule basis.
|
||||
Memory savings are higher than using `enable_model_cpu_offload`, but performance is lower.
|
||||
"""
|
||||
self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
|
||||
self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
|
||||
self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
|
||||
self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
|
||||
|
||||
def progress_bar(self, iterable=None, total=None):
|
||||
self.prior_pipe.progress_bar(iterable=iterable, total=total)
|
||||
@@ -242,7 +242,7 @@ class StableCascadeCombinedPipeline(DiffusionPipeline):
|
||||
prior_callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `prior_callback_on_step_end` function. The tensors specified in the
|
||||
list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in
|
||||
the `._callback_tensor_inputs` attribute of your pipeline class.
|
||||
the `._callback_tensor_inputs` attribute of your pipeine class.
|
||||
callback_on_step_end (`Callable`, *optional*):
|
||||
A function that calls at the end of each denoising steps during the inference. The function is called
|
||||
with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
|
||||
@@ -251,7 +251,7 @@ class StableCascadeCombinedPipeline(DiffusionPipeline):
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
`._callback_tensor_inputs` attribute of your pipeline class.
|
||||
`._callback_tensor_inputs` attribute of your pipeine class.
|
||||
|
||||
Examples:
|
||||
|
||||
|
||||
@@ -469,7 +469,7 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline, StableDiffusionMixin):
|
||||
)
|
||||
|
||||
if len(gligen_phrases) != len(gligen_boxes):
|
||||
raise ValueError(
|
||||
ValueError(
|
||||
"length of `gligen_phrases` and `gligen_boxes` has to be same, but"
|
||||
f" got: `gligen_phrases` {len(gligen_phrases)} != `gligen_boxes` {len(gligen_boxes)}"
|
||||
)
|
||||
|
||||
+19
-235
@@ -13,7 +13,7 @@
|
||||
|
||||
import copy
|
||||
import inspect
|
||||
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
||||
from typing import Any, Callable, Dict, List, Optional, Union
|
||||
|
||||
import torch
|
||||
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
|
||||
@@ -59,66 +59,6 @@ EXAMPLE_DOC_STRING = """
|
||||
"""
|
||||
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
|
||||
def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
|
||||
"""
|
||||
Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
|
||||
Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
|
||||
"""
|
||||
std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
|
||||
std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
|
||||
# rescale the results from guidance (fixes overexposure)
|
||||
noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
|
||||
# mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
|
||||
noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
|
||||
return noise_cfg
|
||||
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
|
||||
def retrieve_timesteps(
|
||||
scheduler,
|
||||
num_inference_steps: Optional[int] = None,
|
||||
device: Optional[Union[str, torch.device]] = None,
|
||||
timesteps: Optional[List[int]] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
|
||||
custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
|
||||
|
||||
Args:
|
||||
scheduler (`SchedulerMixin`):
|
||||
The scheduler to get timesteps from.
|
||||
num_inference_steps (`int`):
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used,
|
||||
`timesteps` must be `None`.
|
||||
device (`str` or `torch.device`, *optional*):
|
||||
The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
|
||||
timesteps (`List[int]`, *optional*):
|
||||
Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
|
||||
timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
|
||||
must be `None`.
|
||||
|
||||
Returns:
|
||||
`Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
|
||||
second element is the number of inference steps.
|
||||
"""
|
||||
if timesteps is not None:
|
||||
accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
|
||||
if not accepts_timesteps:
|
||||
raise ValueError(
|
||||
f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
|
||||
f" timestep schedules. Please check whether you are using the correct scheduler."
|
||||
)
|
||||
scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
|
||||
timesteps = scheduler.timesteps
|
||||
num_inference_steps = len(timesteps)
|
||||
else:
|
||||
scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
|
||||
timesteps = scheduler.timesteps
|
||||
return timesteps, num_inference_steps
|
||||
|
||||
|
||||
class StableDiffusionPanoramaPipeline(
|
||||
DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin
|
||||
):
|
||||
@@ -157,7 +97,6 @@ class StableDiffusionPanoramaPipeline(
|
||||
model_cpu_offload_seq = "text_encoder->unet->vae"
|
||||
_optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
|
||||
_exclude_from_cpu_offload = ["safety_checker"]
|
||||
_callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -522,23 +461,10 @@ class StableDiffusionPanoramaPipeline(
|
||||
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
|
||||
return image
|
||||
|
||||
def decode_latents_with_padding(self, latents: torch.Tensor, padding: int = 8) -> torch.Tensor:
|
||||
"""
|
||||
Decode the given latents with padding for circular inference.
|
||||
|
||||
Args:
|
||||
latents (torch.Tensor): The input latents to decode.
|
||||
padding (int, optional): The number of latents to add on each side for padding. Defaults to 8.
|
||||
|
||||
Returns:
|
||||
torch.Tensor: The decoded image with padding removed.
|
||||
|
||||
Notes:
|
||||
- The padding is added to remove boundary artifacts and improve the output quality.
|
||||
- This would slightly increase the memory usage.
|
||||
- The padding pixels are then removed from the decoded image.
|
||||
|
||||
"""
|
||||
def decode_latents_with_padding(self, latents, padding=8):
|
||||
# Add padding to latents for circular inference
|
||||
# padding is the number of latents to add on each side
|
||||
# it would slightly increase the memory usage, but remove the boundary artifacts
|
||||
latents = 1 / self.vae.config.scaling_factor * latents
|
||||
latents_left = latents[..., :padding]
|
||||
latents_right = latents[..., -padding:]
|
||||
@@ -654,62 +580,9 @@ class StableDiffusionPanoramaPipeline(
|
||||
latents = latents * self.scheduler.init_noise_sigma
|
||||
return latents
|
||||
|
||||
# Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
|
||||
def get_guidance_scale_embedding(
|
||||
self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
|
||||
) -> torch.FloatTensor:
|
||||
"""
|
||||
See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
|
||||
|
||||
Args:
|
||||
w (`torch.Tensor`):
|
||||
Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
|
||||
embedding_dim (`int`, *optional*, defaults to 512):
|
||||
Dimension of the embeddings to generate.
|
||||
dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
|
||||
Data type of the generated embeddings.
|
||||
|
||||
Returns:
|
||||
`torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
|
||||
"""
|
||||
assert len(w.shape) == 1
|
||||
w = w * 1000.0
|
||||
|
||||
half_dim = embedding_dim // 2
|
||||
emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
|
||||
emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
|
||||
emb = w.to(dtype)[:, None] * emb[None, :]
|
||||
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
|
||||
if embedding_dim % 2 == 1: # zero pad
|
||||
emb = torch.nn.functional.pad(emb, (0, 1))
|
||||
assert emb.shape == (w.shape[0], embedding_dim)
|
||||
return emb
|
||||
|
||||
def get_views(
|
||||
self,
|
||||
panorama_height: int,
|
||||
panorama_width: int,
|
||||
window_size: int = 64,
|
||||
stride: int = 8,
|
||||
circular_padding: bool = False,
|
||||
) -> List[Tuple[int, int, int, int]]:
|
||||
"""
|
||||
Generates a list of views based on the given parameters.
|
||||
Here, we define the mappings F_i (see Eq. 7 in the MultiDiffusion paper https://arxiv.org/abs/2302.08113).
|
||||
If panorama's height/width < window_size, num_blocks of height/width should return 1.
|
||||
|
||||
Args:
|
||||
panorama_height (int): The height of the panorama.
|
||||
panorama_width (int): The width of the panorama.
|
||||
window_size (int, optional): The size of the window. Defaults to 64.
|
||||
stride (int, optional): The stride value. Defaults to 8.
|
||||
circular_padding (bool, optional): Whether to apply circular padding. Defaults to False.
|
||||
|
||||
Returns:
|
||||
List[Tuple[int, int, int, int]]: A list of tuples representing the views. Each tuple contains
|
||||
four integers representing the start and end coordinates of the window in the panorama.
|
||||
|
||||
"""
|
||||
def get_views(self, panorama_height, panorama_width, window_size=64, stride=8, circular_padding=False):
|
||||
# Here, we define the mappings F_i (see Eq. 7 in the MultiDiffusion paper https://arxiv.org/abs/2302.08113)
|
||||
# if panorama's height/width < window_size, num_blocks of height/width should return 1
|
||||
panorama_height /= 8
|
||||
panorama_width /= 8
|
||||
num_blocks_height = (panorama_height - window_size) // stride + 1 if panorama_height > window_size else 1
|
||||
@@ -727,34 +600,6 @@ class StableDiffusionPanoramaPipeline(
|
||||
views.append((h_start, h_end, w_start, w_end))
|
||||
return views
|
||||
|
||||
@property
|
||||
def guidance_scale(self):
|
||||
return self._guidance_scale
|
||||
|
||||
@property
|
||||
def guidance_rescale(self):
|
||||
return self._guidance_rescale
|
||||
|
||||
@property
|
||||
def cross_attention_kwargs(self):
|
||||
return self._cross_attention_kwargs
|
||||
|
||||
@property
|
||||
def clip_skip(self):
|
||||
return self._clip_skip
|
||||
|
||||
@property
|
||||
def do_classifier_free_guidance(self):
|
||||
return False
|
||||
|
||||
@property
|
||||
def num_timesteps(self):
|
||||
return self._num_timesteps
|
||||
|
||||
@property
|
||||
def interrupt(self):
|
||||
return self._interrupt
|
||||
|
||||
@torch.no_grad()
|
||||
@replace_example_docstring(EXAMPLE_DOC_STRING)
|
||||
def __call__(
|
||||
@@ -763,7 +608,6 @@ class StableDiffusionPanoramaPipeline(
|
||||
height: Optional[int] = 512,
|
||||
width: Optional[int] = 2048,
|
||||
num_inference_steps: int = 50,
|
||||
timesteps: List[int] = None,
|
||||
guidance_scale: float = 7.5,
|
||||
view_batch_size: int = 1,
|
||||
negative_prompt: Optional[Union[str, List[str]]] = None,
|
||||
@@ -777,13 +621,11 @@ class StableDiffusionPanoramaPipeline(
|
||||
ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
|
||||
output_type: Optional[str] = "pil",
|
||||
return_dict: bool = True,
|
||||
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
|
||||
callback_steps: Optional[int] = 1,
|
||||
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
|
||||
guidance_rescale: float = 0.0,
|
||||
circular_padding: bool = False,
|
||||
clip_skip: Optional[int] = None,
|
||||
callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
|
||||
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
|
||||
**kwargs: Any,
|
||||
):
|
||||
r"""
|
||||
The call function to the pipeline for generation.
|
||||
@@ -799,9 +641,6 @@ class StableDiffusionPanoramaPipeline(
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
expense of slower inference.
|
||||
timesteps (`List[int]`, *optional*):
|
||||
The timesteps at which to generate the images. If not specified, then the default
|
||||
timestep spacing strategy of the scheduler is used.
|
||||
guidance_scale (`float`, *optional*, defaults to 7.5):
|
||||
A higher guidance scale value encourages the model to generate images closely linked to the text
|
||||
`prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
|
||||
@@ -841,12 +680,16 @@ class StableDiffusionPanoramaPipeline(
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
|
||||
plain tuple.
|
||||
callback (`Callable`, *optional*):
|
||||
A function that calls every `callback_steps` steps during inference. The function is called with the
|
||||
following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
|
||||
callback_steps (`int`, *optional*, defaults to 1):
|
||||
The frequency at which the `callback` function is called. If not specified, the callback is called at
|
||||
every step.
|
||||
cross_attention_kwargs (`dict`, *optional*):
|
||||
A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
|
||||
`self.processor` in
|
||||
[diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
|
||||
guidance_rescale (`float`, *optional*, defaults to 0.0):
|
||||
A rescaling factor for the guidance embeddings. A value of 0.0 means no rescaling is applied.
|
||||
circular_padding (`bool`, *optional*, defaults to `False`):
|
||||
If set to `True`, circular padding is applied to ensure there are no stitching artifacts. Circular
|
||||
padding allows the model to seamlessly generate a transition from the rightmost part of the image to
|
||||
@@ -854,15 +697,6 @@ class StableDiffusionPanoramaPipeline(
|
||||
clip_skip (`int`, *optional*):
|
||||
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
|
||||
the output of the pre-final layer will be used for computing the prompt embeddings.
|
||||
callback_on_step_end (`Callable`, *optional*):
|
||||
A function that calls at the end of each denoising steps during the inference. The function is called
|
||||
with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
|
||||
callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
|
||||
`callback_on_step_end_tensor_inputs`.
|
||||
callback_on_step_end_tensor_inputs (`List[str]`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
`._callback_tensor_inputs` attribute of your pipeline class.
|
||||
Examples:
|
||||
|
||||
Returns:
|
||||
@@ -872,22 +706,6 @@ class StableDiffusionPanoramaPipeline(
|
||||
second element is a list of `bool`s indicating whether the corresponding generated image contains
|
||||
"not-safe-for-work" (nsfw) content.
|
||||
"""
|
||||
callback = kwargs.pop("callback", None)
|
||||
callback_steps = kwargs.pop("callback_steps", None)
|
||||
|
||||
if callback is not None:
|
||||
deprecate(
|
||||
"callback",
|
||||
"1.0.0",
|
||||
"Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
|
||||
)
|
||||
if callback_steps is not None:
|
||||
deprecate(
|
||||
"callback_steps",
|
||||
"1.0.0",
|
||||
"Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
|
||||
)
|
||||
|
||||
# 0. Default height and width to unet
|
||||
height = height or self.unet.config.sample_size * self.vae_scale_factor
|
||||
width = width or self.unet.config.sample_size * self.vae_scale_factor
|
||||
@@ -903,15 +721,8 @@ class StableDiffusionPanoramaPipeline(
|
||||
negative_prompt_embeds,
|
||||
ip_adapter_image,
|
||||
ip_adapter_image_embeds,
|
||||
callback_on_step_end_tensor_inputs,
|
||||
)
|
||||
|
||||
self._guidance_scale = guidance_scale
|
||||
self._guidance_rescale = guidance_rescale
|
||||
self._clip_skip = clip_skip
|
||||
self._cross_attention_kwargs = cross_attention_kwargs
|
||||
self._interrupt = False
|
||||
|
||||
# 2. Define call parameters
|
||||
if prompt is not None and isinstance(prompt, str):
|
||||
batch_size = 1
|
||||
@@ -957,7 +768,8 @@ class StableDiffusionPanoramaPipeline(
|
||||
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
|
||||
|
||||
# 4. Prepare timesteps
|
||||
timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
|
||||
self.scheduler.set_timesteps(num_inference_steps, device=device)
|
||||
timesteps = self.scheduler.timesteps
|
||||
|
||||
# 5. Prepare latent variables
|
||||
num_channels_latents = self.unet.config.in_channels
|
||||
@@ -990,23 +802,12 @@ class StableDiffusionPanoramaPipeline(
|
||||
else None
|
||||
)
|
||||
|
||||
# 7.2 Optionally get Guidance Scale Embedding
|
||||
timestep_cond = None
|
||||
if self.unet.config.time_cond_proj_dim is not None:
|
||||
guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
|
||||
timestep_cond = self.get_guidance_scale_embedding(
|
||||
guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
|
||||
).to(device=device, dtype=latents.dtype)
|
||||
|
||||
# 8. Denoising loop
|
||||
# Each denoising step also includes refinement of the latents with respect to the
|
||||
# views.
|
||||
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
|
||||
self._num_timesteps = len(timesteps)
|
||||
with self.progress_bar(total=num_inference_steps) as progress_bar:
|
||||
for i, t in enumerate(timesteps):
|
||||
if self.interrupt:
|
||||
continue
|
||||
count.zero_()
|
||||
value.zero_()
|
||||
|
||||
@@ -1062,7 +863,6 @@ class StableDiffusionPanoramaPipeline(
|
||||
latent_model_input,
|
||||
t,
|
||||
encoder_hidden_states=prompt_embeds_input,
|
||||
timestep_cond=timestep_cond,
|
||||
cross_attention_kwargs=cross_attention_kwargs,
|
||||
added_cond_kwargs=added_cond_kwargs,
|
||||
).sample
|
||||
@@ -1072,12 +872,6 @@ class StableDiffusionPanoramaPipeline(
|
||||
noise_pred_uncond, noise_pred_text = noise_pred[::2], noise_pred[1::2]
|
||||
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
|
||||
|
||||
if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
|
||||
# Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
|
||||
noise_pred = rescale_noise_cfg(
|
||||
noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale
|
||||
)
|
||||
|
||||
# compute the previous noisy sample x_t -> x_t-1
|
||||
latents_denoised_batch = self.scheduler.step(
|
||||
noise_pred, t, latents_for_view, **extra_step_kwargs
|
||||
@@ -1107,16 +901,6 @@ class StableDiffusionPanoramaPipeline(
|
||||
# take the MultiDiffusion step. Eq. 5 in MultiDiffusion paper: https://arxiv.org/abs/2302.08113
|
||||
latents = torch.where(count > 0, value / count, value)
|
||||
|
||||
if callback_on_step_end is not None:
|
||||
callback_kwargs = {}
|
||||
for k in callback_on_step_end_tensor_inputs:
|
||||
callback_kwargs[k] = locals()[k]
|
||||
callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
|
||||
|
||||
latents = callback_outputs.pop("latents", latents)
|
||||
prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
|
||||
negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
|
||||
|
||||
# call the callback, if provided
|
||||
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
|
||||
progress_bar.update()
|
||||
@@ -1124,7 +908,7 @@ class StableDiffusionPanoramaPipeline(
|
||||
step_idx = i // getattr(self.scheduler, "order", 1)
|
||||
callback(step_idx, t, latents)
|
||||
|
||||
if output_type != "latent":
|
||||
if not output_type == "latent":
|
||||
if circular_padding:
|
||||
image = self.decode_latents_with_padding(latents)
|
||||
else:
|
||||
|
||||
@@ -1193,16 +1193,7 @@ class StableDiffusionXLPipeline(
|
||||
noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
|
||||
|
||||
# compute the previous noisy sample x_t -> x_t-1
|
||||
latents_dtype = latents.dtype
|
||||
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
|
||||
if latents.dtype != latents_dtype:
|
||||
if torch.backends.mps.is_available():
|
||||
# some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
|
||||
latents = latents.to(latents_dtype)
|
||||
else:
|
||||
raise ValueError(
|
||||
"For the given accelerator, there seems to be an unexpected problem in type-casting. Please file an issue on the PyTorch GitHub repository. See also: https://github.com/huggingface/diffusers/pull/7446/."
|
||||
)
|
||||
|
||||
if callback_on_step_end is not None:
|
||||
callback_kwargs = {}
|
||||
@@ -1237,14 +1228,6 @@ class StableDiffusionXLPipeline(
|
||||
if needs_upcasting:
|
||||
self.upcast_vae()
|
||||
latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
|
||||
elif latents.dtype != self.vae.dtype:
|
||||
if torch.backends.mps.is_available():
|
||||
# some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
|
||||
self.vae = self.vae.to(latents.dtype)
|
||||
else:
|
||||
raise ValueError(
|
||||
"For the given accelerator, there seems to be an unexpected problem in type-casting. Please file an issue on the PyTorch GitHub repository. See also: https://github.com/huggingface/diffusers/pull/7446/."
|
||||
)
|
||||
|
||||
# unscale/denormalize the latents
|
||||
# denormalize with the mean and std if available and not None
|
||||
|
||||
+1
-18
@@ -647,7 +647,7 @@ class StableDiffusionXLImg2ImgPipeline(
|
||||
# because `num_inference_steps` might be even given that every timestep
|
||||
# (except the highest one) is duplicated. If `num_inference_steps` is even it would
|
||||
# mean that we cut the timesteps in the middle of the denoising step
|
||||
# (between 1st and 2nd derivative) which leads to incorrect results. By adding 1
|
||||
# (between 1st and 2nd devirative) which leads to incorrect results. By adding 1
|
||||
# we ensure that the denoising process always ends after the 2nd derivate step of the scheduler
|
||||
num_inference_steps = num_inference_steps + 1
|
||||
|
||||
@@ -1370,16 +1370,7 @@ class StableDiffusionXLImg2ImgPipeline(
|
||||
noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
|
||||
|
||||
# compute the previous noisy sample x_t -> x_t-1
|
||||
latents_dtype = latents.dtype
|
||||
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
|
||||
if latents.dtype != latents_dtype:
|
||||
if torch.backends.mps.is_available():
|
||||
# some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
|
||||
latents = latents.to(latents_dtype)
|
||||
else:
|
||||
raise ValueError(
|
||||
"For the given accelerator, there seems to be an unexpected problem in type-casting. Please file an issue on the PyTorch GitHub repository. See also: https://github.com/huggingface/diffusers/pull/7446/."
|
||||
)
|
||||
|
||||
if callback_on_step_end is not None:
|
||||
callback_kwargs = {}
|
||||
@@ -1414,14 +1405,6 @@ class StableDiffusionXLImg2ImgPipeline(
|
||||
if needs_upcasting:
|
||||
self.upcast_vae()
|
||||
latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
|
||||
elif latents.dtype != self.vae.dtype:
|
||||
if torch.backends.mps.is_available():
|
||||
# some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
|
||||
self.vae = self.vae.to(latents.dtype)
|
||||
else:
|
||||
raise ValueError(
|
||||
"For the given accelerator, there seems to be an unexpected problem in type-casting. Please file an issue on the PyTorch GitHub repository. See also: https://github.com/huggingface/diffusers/pull/7446/."
|
||||
)
|
||||
|
||||
# unscale/denormalize the latents
|
||||
# denormalize with the mean and std if available and not None
|
||||
|
||||
+1
-18
@@ -1027,7 +1027,7 @@ class StableDiffusionXLInpaintPipeline(
|
||||
# because `num_inference_steps` might be even given that every timestep
|
||||
# (except the highest one) is duplicated. If `num_inference_steps` is even it would
|
||||
# mean that we cut the timesteps in the middle of the denoising step
|
||||
# (between 1st and 2nd derivative) which leads to incorrect results. By adding 1
|
||||
# (between 1st and 2nd devirative) which leads to incorrect results. By adding 1
|
||||
# we ensure that the denoising process always ends after the 2nd derivate step of the scheduler
|
||||
num_inference_steps = num_inference_steps + 1
|
||||
|
||||
@@ -1720,16 +1720,7 @@ class StableDiffusionXLInpaintPipeline(
|
||||
noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
|
||||
|
||||
# compute the previous noisy sample x_t -> x_t-1
|
||||
latents_dtype = latents.dtype
|
||||
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
|
||||
if latents.dtype != latents_dtype:
|
||||
if torch.backends.mps.is_available():
|
||||
# some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
|
||||
latents = latents.to(latents_dtype)
|
||||
else:
|
||||
raise ValueError(
|
||||
"For the given accelerator, there seems to be an unexpected problem in type-casting. Please file an issue on the PyTorch GitHub repository. See also: https://github.com/huggingface/diffusers/pull/7446/."
|
||||
)
|
||||
|
||||
if num_channels_unet == 4:
|
||||
init_latents_proper = image_latents
|
||||
@@ -1781,14 +1772,6 @@ class StableDiffusionXLInpaintPipeline(
|
||||
if needs_upcasting:
|
||||
self.upcast_vae()
|
||||
latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
|
||||
elif latents.dtype != self.vae.dtype:
|
||||
if torch.backends.mps.is_available():
|
||||
# some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
|
||||
self.vae = self.vae.to(latents.dtype)
|
||||
else:
|
||||
raise ValueError(
|
||||
"For the given accelerator, there seems to be an unexpected problem in type-casting. Please file an issue on the PyTorch GitHub repository. See also: https://github.com/huggingface/diffusers/pull/7446/."
|
||||
)
|
||||
|
||||
# unscale/denormalize the latents
|
||||
# denormalize with the mean and std if available and not None
|
||||
|
||||
-17
@@ -918,16 +918,7 @@ class StableDiffusionXLInstructPix2PixPipeline(
|
||||
noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
|
||||
|
||||
# compute the previous noisy sample x_t -> x_t-1
|
||||
latents_dtype = latents.dtype
|
||||
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
|
||||
if latents.dtype != latents_dtype:
|
||||
if torch.backends.mps.is_available():
|
||||
# some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
|
||||
latents = latents.to(latents_dtype)
|
||||
else:
|
||||
raise ValueError(
|
||||
"For the given accelerator, there seems to be an unexpected problem in type-casting. Please file an issue on the PyTorch GitHub repository. See also: https://github.com/huggingface/diffusers/pull/7446/."
|
||||
)
|
||||
|
||||
# call the callback, if provided
|
||||
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
|
||||
@@ -946,14 +937,6 @@ class StableDiffusionXLInstructPix2PixPipeline(
|
||||
if needs_upcasting:
|
||||
self.upcast_vae()
|
||||
latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
|
||||
elif latents.dtype != self.vae.dtype:
|
||||
if torch.backends.mps.is_available():
|
||||
# some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
|
||||
self.vae = self.vae.to(latents.dtype)
|
||||
else:
|
||||
raise ValueError(
|
||||
"For the given accelerator, there seems to be an unexpected problem in type-casting. Please file an issue on the PyTorch GitHub repository. See also: https://github.com/huggingface/diffusers/pull/7446/."
|
||||
)
|
||||
|
||||
# unscale/denormalize the latents
|
||||
# denormalize with the mean and std if available and not None
|
||||
|
||||
@@ -112,25 +112,25 @@ class WuerstchenCombinedPipeline(DiffusionPipeline):
|
||||
def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
|
||||
self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
|
||||
|
||||
def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
|
||||
def enable_model_cpu_offload(self, gpu_id=0):
|
||||
r"""
|
||||
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
|
||||
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
|
||||
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
|
||||
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
|
||||
"""
|
||||
self.prior_pipe.enable_model_cpu_offload(gpu_id=gpu_id, device=device)
|
||||
self.decoder_pipe.enable_model_cpu_offload(gpu_id=gpu_id, device=device)
|
||||
self.prior_pipe.enable_model_cpu_offload(gpu_id=gpu_id)
|
||||
self.decoder_pipe.enable_model_cpu_offload(gpu_id=gpu_id)
|
||||
|
||||
def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
|
||||
def enable_sequential_cpu_offload(self, gpu_id=0):
|
||||
r"""
|
||||
Offloads all models (`unet`, `text_encoder`, `vae`, and `safety checker` state dicts) to CPU using 🤗
|
||||
Accelerate, significantly reducing memory usage. Models are moved to a `torch.device('meta')` and loaded on a
|
||||
GPU only when their specific submodule's `forward` method is called. Offloading happens on a submodule basis.
|
||||
Memory savings are higher than using `enable_model_cpu_offload`, but performance is lower.
|
||||
"""
|
||||
self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
|
||||
self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
|
||||
self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
|
||||
self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
|
||||
|
||||
def progress_bar(self, iterable=None, total=None):
|
||||
self.prior_pipe.progress_bar(iterable=iterable, total=total)
|
||||
|
||||
@@ -69,7 +69,6 @@ from .import_utils import (
|
||||
is_note_seq_available,
|
||||
is_onnx_available,
|
||||
is_peft_available,
|
||||
is_peft_version,
|
||||
is_scipy_available,
|
||||
is_tensorboard_available,
|
||||
is_torch_available,
|
||||
|
||||
@@ -628,20 +628,6 @@ def is_accelerate_version(operation: str, version: str):
|
||||
return compare_versions(parse(_accelerate_version), operation, version)
|
||||
|
||||
|
||||
def is_peft_version(operation: str, version: str):
|
||||
"""
|
||||
Args:
|
||||
Compares the current PEFT version to a given reference with an operation.
|
||||
operation (`str`):
|
||||
A string representation of an operator, such as `">"` or `"<="`
|
||||
version (`str`):
|
||||
A version string
|
||||
"""
|
||||
if not _peft_version:
|
||||
return False
|
||||
return compare_versions(parse(_peft_version), operation, version)
|
||||
|
||||
|
||||
def is_k_diffusion_version(operation: str, version: str):
|
||||
"""
|
||||
Args:
|
||||
|
||||
@@ -171,7 +171,6 @@ def get_peft_kwargs(rank_dict, network_alpha_dict, peft_state_dict, is_unet=True
|
||||
|
||||
# layer names without the Diffusers specific
|
||||
target_modules = list({name.split(".lora")[0] for name in peft_state_dict.keys()})
|
||||
use_dora = any("lora_magnitude_vector" in k for k in peft_state_dict)
|
||||
|
||||
lora_config_kwargs = {
|
||||
"r": r,
|
||||
@@ -179,7 +178,6 @@ def get_peft_kwargs(rank_dict, network_alpha_dict, peft_state_dict, is_unet=True
|
||||
"rank_pattern": rank_pattern,
|
||||
"alpha_pattern": alpha_pattern,
|
||||
"target_modules": target_modules,
|
||||
"use_dora": use_dora,
|
||||
}
|
||||
return lora_config_kwargs
|
||||
|
||||
|
||||
@@ -47,7 +47,6 @@ UNET_TO_DIFFUSERS = {
|
||||
".to_v_lora.up": ".to_v.lora_B",
|
||||
".lora.up": ".lora_B",
|
||||
".lora.down": ".lora_A",
|
||||
".to_out.lora_magnitude_vector": ".to_out.0.lora_magnitude_vector",
|
||||
}
|
||||
|
||||
|
||||
@@ -105,10 +104,6 @@ DIFFUSERS_OLD_TO_DIFFUSERS = {
|
||||
".to_v_lora.down": ".v_proj.lora_linear_layer.down",
|
||||
".to_out_lora.up": ".out_proj.lora_linear_layer.up",
|
||||
".to_out_lora.down": ".out_proj.lora_linear_layer.down",
|
||||
".to_k.lora_magnitude_vector": ".k_proj.lora_magnitude_vector",
|
||||
".to_v.lora_magnitude_vector": ".v_proj.lora_magnitude_vector",
|
||||
".to_q.lora_magnitude_vector": ".q_proj.lora_magnitude_vector",
|
||||
".to_out.lora_magnitude_vector": ".out_proj.lora_magnitude_vector",
|
||||
}
|
||||
|
||||
PEFT_TO_KOHYA_SS = {
|
||||
@@ -320,9 +315,6 @@ def convert_state_dict_to_kohya(state_dict, original_type=None, **kwargs):
|
||||
kohya_key = kohya_key.replace("text_encoder.", "lora_te1.")
|
||||
elif "unet" in kohya_key:
|
||||
kohya_key = kohya_key.replace("unet", "lora_unet")
|
||||
elif "lora_magnitude_vector" in kohya_key:
|
||||
kohya_key = kohya_key.replace("lora_magnitude_vector", "dora_scale")
|
||||
|
||||
kohya_key = kohya_key.replace(".", "_", kohya_key.count(".") - 2)
|
||||
kohya_key = kohya_key.replace(peft_adapter_name, "") # Kohya doesn't take names
|
||||
kohya_ss_state_dict[kohya_key] = weight
|
||||
|
||||
@@ -14,6 +14,7 @@ import time
|
||||
import unittest
|
||||
import urllib.parse
|
||||
from contextlib import contextmanager
|
||||
from distutils.util import strtobool
|
||||
from io import BytesIO, StringIO
|
||||
from pathlib import Path
|
||||
from typing import Callable, Dict, List, Optional, Union
|
||||
@@ -141,22 +142,6 @@ def get_tests_dir(append_path=None):
|
||||
return tests_dir
|
||||
|
||||
|
||||
# Taken from the following PR:
|
||||
# https://github.com/huggingface/accelerate/pull/1964
|
||||
def str_to_bool(value) -> int:
|
||||
"""
|
||||
Converts a string representation of truth to `True` (1) or `False` (0).
|
||||
True values are `y`, `yes`, `t`, `true`, `on`, and `1`; False value are `n`, `no`, `f`, `false`, `off`, and `0`;
|
||||
"""
|
||||
value = value.lower()
|
||||
if value in ("y", "yes", "t", "true", "on", "1"):
|
||||
return 1
|
||||
elif value in ("n", "no", "f", "false", "off", "0"):
|
||||
return 0
|
||||
else:
|
||||
raise ValueError(f"invalid truth value {value}")
|
||||
|
||||
|
||||
def parse_flag_from_env(key, default=False):
|
||||
try:
|
||||
value = os.environ[key]
|
||||
@@ -166,7 +151,7 @@ def parse_flag_from_env(key, default=False):
|
||||
else:
|
||||
# KEY is set, convert it to True or False.
|
||||
try:
|
||||
_value = str_to_bool(value)
|
||||
_value = strtobool(value)
|
||||
except ValueError:
|
||||
# More values are supported, but let's keep the message simple.
|
||||
raise ValueError(f"If set, {key} must be yes or no.")
|
||||
@@ -339,15 +324,10 @@ def deprecate_after_peft_backend(test_case):
|
||||
return unittest.skipUnless(not USE_PEFT_BACKEND, "test skipped in favor of PEFT backend")(test_case)
|
||||
|
||||
|
||||
def get_python_version():
|
||||
sys_info = sys.version_info
|
||||
major, minor = sys_info.major, sys_info.minor
|
||||
return major, minor
|
||||
|
||||
|
||||
def require_python39_or_higher(test_case):
|
||||
def python39_available():
|
||||
major, minor = get_python_version()
|
||||
sys_info = sys.version_info
|
||||
major, minor = sys_info.major, sys_info.minor
|
||||
return major == 3 and minor >= 9
|
||||
|
||||
return unittest.skipUnless(python39_available(), "test requires Python 3.9 or higher")(test_case)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -81,11 +81,6 @@ class StableDiffusionLoRATests(PeftLoraLoaderMixinTests, unittest.TestCase):
|
||||
"latent_channels": 4,
|
||||
}
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
def tearDown(self):
|
||||
super().tearDown()
|
||||
gc.collect()
|
||||
@@ -155,11 +150,6 @@ class StableDiffusionLoRATests(PeftLoraLoaderMixinTests, unittest.TestCase):
|
||||
@require_torch_gpu
|
||||
@require_peft_backend
|
||||
class LoraIntegrationTests(unittest.TestCase):
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
def tearDown(self):
|
||||
super().tearDown()
|
||||
gc.collect()
|
||||
|
||||
@@ -90,11 +90,6 @@ class StableDiffusionXLLoRATests(PeftLoraLoaderMixinTests, unittest.TestCase):
|
||||
"sample_size": 128,
|
||||
}
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
def tearDown(self):
|
||||
super().tearDown()
|
||||
gc.collect()
|
||||
@@ -105,11 +100,6 @@ class StableDiffusionXLLoRATests(PeftLoraLoaderMixinTests, unittest.TestCase):
|
||||
@require_torch_gpu
|
||||
@require_peft_backend
|
||||
class LoraSDXLIntegrationTests(unittest.TestCase):
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
def tearDown(self):
|
||||
super().tearDown()
|
||||
gc.collect()
|
||||
@@ -640,21 +630,3 @@ class LoraSDXLIntegrationTests(unittest.TestCase):
|
||||
expected_slice_scale = np.array([0.5456, 0.5466, 0.5487, 0.5458, 0.5469, 0.5454, 0.5446, 0.5479, 0.5487])
|
||||
max_diff = numpy_cosine_similarity_distance(expected_slice_scale, predicted_slice)
|
||||
assert max_diff < 1e-3
|
||||
|
||||
@nightly
|
||||
def test_integration_logits_for_dora_lora(self):
|
||||
pipeline = StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
|
||||
pipeline.load_lora_weights("hf-internal-testing/dora-trained-on-kohya")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
|
||||
images = pipeline(
|
||||
"photo of ohwx dog",
|
||||
num_inference_steps=10,
|
||||
generator=torch.manual_seed(0),
|
||||
output_type="np",
|
||||
).images
|
||||
|
||||
predicted_slice = images[0, -3:, -3:, -1].flatten()
|
||||
expected_slice_scale = np.array([0.3932, 0.3742, 0.4429, 0.3737, 0.3504, 0.433, 0.3948, 0.3769, 0.4516])
|
||||
max_diff = numpy_cosine_similarity_distance(expected_slice_scale, predicted_slice)
|
||||
assert max_diff < 1e-3
|
||||
|
||||
+3
-41
@@ -31,7 +31,6 @@ from diffusers.utils.testing_utils import (
|
||||
floats_tensor,
|
||||
require_peft_backend,
|
||||
require_peft_version_greater,
|
||||
skip_mps,
|
||||
torch_device,
|
||||
)
|
||||
|
||||
@@ -73,8 +72,8 @@ class PeftLoraLoaderMixinTests:
|
||||
unet_kwargs = None
|
||||
vae_kwargs = None
|
||||
|
||||
def get_dummy_components(self, scheduler_cls=None, use_dora=False):
|
||||
scheduler_cls = self.scheduler_cls if scheduler_cls is None else scheduler_cls
|
||||
def get_dummy_components(self, scheduler_cls=None):
|
||||
scheduler_cls = self.scheduler_cls if scheduler_cls is None else LCMScheduler
|
||||
rank = 4
|
||||
|
||||
torch.manual_seed(0)
|
||||
@@ -97,15 +96,10 @@ class PeftLoraLoaderMixinTests:
|
||||
lora_alpha=rank,
|
||||
target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
|
||||
init_lora_weights=False,
|
||||
use_dora=use_dora,
|
||||
)
|
||||
|
||||
unet_lora_config = LoraConfig(
|
||||
r=rank,
|
||||
lora_alpha=rank,
|
||||
target_modules=["to_q", "to_k", "to_v", "to_out.0"],
|
||||
init_lora_weights=False,
|
||||
use_dora=use_dora,
|
||||
r=rank, lora_alpha=rank, target_modules=["to_q", "to_k", "to_v", "to_out.0"], init_lora_weights=False
|
||||
)
|
||||
|
||||
if self.has_two_text_encoders:
|
||||
@@ -924,7 +918,6 @@ class PeftLoraLoaderMixinTests:
|
||||
"output with no lora and output with lora disabled should give same results",
|
||||
)
|
||||
|
||||
@skip_mps
|
||||
def test_lora_fuse_nan(self):
|
||||
for scheduler_cls in [DDIMScheduler, LCMScheduler]:
|
||||
components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls)
|
||||
@@ -1081,37 +1074,6 @@ class PeftLoraLoaderMixinTests:
|
||||
"Fused lora should not change the output",
|
||||
)
|
||||
|
||||
@require_peft_version_greater(peft_version="0.9.0")
|
||||
def test_simple_inference_with_dora(self):
|
||||
for scheduler_cls in [DDIMScheduler, LCMScheduler]:
|
||||
components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls, use_dora=True)
|
||||
pipe = self.pipeline_class(**components)
|
||||
pipe = pipe.to(torch_device)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
_, _, inputs = self.get_dummy_inputs(with_generator=False)
|
||||
|
||||
output_no_dora_lora = pipe(**inputs, generator=torch.manual_seed(0)).images
|
||||
self.assertTrue(output_no_dora_lora.shape == (1, 64, 64, 3))
|
||||
|
||||
pipe.text_encoder.add_adapter(text_lora_config)
|
||||
pipe.unet.add_adapter(unet_lora_config)
|
||||
|
||||
self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder")
|
||||
self.assertTrue(check_if_lora_correctly_set(pipe.unet), "Lora not correctly set in Unet")
|
||||
|
||||
if self.has_two_text_encoders:
|
||||
pipe.text_encoder_2.add_adapter(text_lora_config)
|
||||
self.assertTrue(
|
||||
check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
|
||||
)
|
||||
|
||||
output_dora_lora = pipe(**inputs, generator=torch.manual_seed(0)).images
|
||||
|
||||
self.assertFalse(
|
||||
np.allclose(output_dora_lora, output_no_dora_lora, atol=1e-3, rtol=1e-3),
|
||||
"DoRA lora should change the output",
|
||||
)
|
||||
|
||||
@unittest.skip("This is failing for now - need to investigate")
|
||||
def test_simple_inference_with_text_unet_lora_unfused_torch_compile(self):
|
||||
"""
|
||||
|
||||
@@ -392,7 +392,7 @@ class ConsistencyDecoderVAETests(ModelTesterMixin, unittest.TestCase):
|
||||
...
|
||||
|
||||
|
||||
class AutoencoderKLTemporalDecoderFastTests(ModelTesterMixin, unittest.TestCase):
|
||||
class AutoncoderKLTemporalDecoderFastTests(ModelTesterMixin, unittest.TestCase):
|
||||
model_class = AutoencoderKLTemporalDecoder
|
||||
main_input_name = "sample"
|
||||
base_precision = 1e-2
|
||||
@@ -1017,12 +1017,6 @@ class AsymmetricAutoencoderKLIntegrationTests(unittest.TestCase):
|
||||
|
||||
@slow
|
||||
class ConsistencyDecoderVAEIntegrationTests(unittest.TestCase):
|
||||
def setUp(self):
|
||||
# clean up the VRAM before each test
|
||||
super().setUp()
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
def tearDown(self):
|
||||
# clean up the VRAM after each test
|
||||
super().tearDown()
|
||||
@@ -1122,36 +1116,3 @@ class ConsistencyDecoderVAEIntegrationTests(unittest.TestCase):
|
||||
)
|
||||
|
||||
assert torch_all_close(actual_output, expected_output, atol=5e-3)
|
||||
|
||||
def test_vae_tiling(self):
|
||||
vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder", torch_dtype=torch.float16)
|
||||
pipe = StableDiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", vae=vae, safety_checker=None, torch_dtype=torch.float16
|
||||
)
|
||||
pipe.to(torch_device)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
out_1 = pipe(
|
||||
"horse",
|
||||
num_inference_steps=2,
|
||||
output_type="pt",
|
||||
generator=torch.Generator("cpu").manual_seed(0),
|
||||
).images[0]
|
||||
|
||||
# make sure tiled vae decode yields the same result
|
||||
pipe.enable_vae_tiling()
|
||||
out_2 = pipe(
|
||||
"horse",
|
||||
num_inference_steps=2,
|
||||
output_type="pt",
|
||||
generator=torch.Generator("cpu").manual_seed(0),
|
||||
).images[0]
|
||||
|
||||
assert torch_all_close(out_1, out_2, atol=5e-3)
|
||||
|
||||
# test that tiled decode works with various shapes
|
||||
shapes = [(1, 4, 73, 97), (1, 4, 97, 73), (1, 4, 49, 65), (1, 4, 65, 49)]
|
||||
with torch.no_grad():
|
||||
for shape in shapes:
|
||||
image = torch.zeros(shape, device=torch_device)
|
||||
pipe.vae.decode(image)
|
||||
|
||||
@@ -34,7 +34,6 @@ from diffusers.training_utils import EMAModel
|
||||
from diffusers.utils import is_xformers_available, logging
|
||||
from diffusers.utils.testing_utils import (
|
||||
CaptureLogger,
|
||||
get_python_version,
|
||||
require_python39_or_higher,
|
||||
require_torch_2,
|
||||
require_torch_accelerator_with_training,
|
||||
@@ -432,10 +431,6 @@ class ModelTesterMixin:
|
||||
|
||||
@require_python39_or_higher
|
||||
@require_torch_2
|
||||
@unittest.skipIf(
|
||||
get_python_version == (3, 12),
|
||||
reason="Torch Dynamo isn't yet supported for Python 3.12.",
|
||||
)
|
||||
def test_from_save_pretrained_dynamo(self):
|
||||
init_dict, _ = self.prepare_init_args_and_inputs_for_common()
|
||||
inputs = [init_dict, self.model_class]
|
||||
|
||||
@@ -15,12 +15,12 @@
|
||||
|
||||
import os
|
||||
import unittest
|
||||
from distutils.util import strtobool
|
||||
|
||||
import pytest
|
||||
|
||||
from diffusers import __version__
|
||||
from diffusers.utils import deprecate
|
||||
from diffusers.utils.testing_utils import str_to_bool
|
||||
|
||||
|
||||
# Used to test the hub
|
||||
@@ -191,7 +191,7 @@ def parse_flag_from_env(key, default=False):
|
||||
else:
|
||||
# KEY is set, convert it to True or False.
|
||||
try:
|
||||
_value = str_to_bool(value)
|
||||
_value = strtobool(value)
|
||||
except ValueError:
|
||||
# More values are supported, but let's keep the message simple.
|
||||
raise ValueError(f"If set, {key} must be yes or no.")
|
||||
|
||||
@@ -303,12 +303,6 @@ class AnimateDiffPipelineFastTests(
|
||||
@slow
|
||||
@require_torch_gpu
|
||||
class AnimateDiffPipelineSlowTests(unittest.TestCase):
|
||||
def setUp(self):
|
||||
# clean up the VRAM before each test
|
||||
super().setUp()
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
def tearDown(self):
|
||||
# clean up the VRAM after each test
|
||||
super().tearDown()
|
||||
|
||||
@@ -371,11 +371,6 @@ class AudioLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
|
||||
@nightly
|
||||
class AudioLDMPipelineSlowTests(unittest.TestCase):
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
def tearDown(self):
|
||||
super().tearDown()
|
||||
gc.collect()
|
||||
@@ -416,11 +411,6 @@ class AudioLDMPipelineSlowTests(unittest.TestCase):
|
||||
|
||||
@nightly
|
||||
class AudioLDMPipelineNightlyTests(unittest.TestCase):
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
def tearDown(self):
|
||||
super().tearDown()
|
||||
gc.collect()
|
||||
|
||||
@@ -493,11 +493,6 @@ class AudioLDM2PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
|
||||
@nightly
|
||||
class AudioLDM2PipelineSlowTests(unittest.TestCase):
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
def tearDown(self):
|
||||
super().tearDown()
|
||||
gc.collect()
|
||||
|
||||
@@ -170,11 +170,6 @@ class ConsistencyModelPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
@nightly
|
||||
@require_torch_gpu
|
||||
class ConsistencyModelPipelineSlowTests(unittest.TestCase):
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
def tearDown(self):
|
||||
super().tearDown()
|
||||
gc.collect()
|
||||
|
||||
@@ -35,7 +35,6 @@ from diffusers.pipelines.controlnet.pipeline_controlnet import MultiControlNetMo
|
||||
from diffusers.utils.import_utils import is_xformers_available
|
||||
from diffusers.utils.testing_utils import (
|
||||
enable_full_determinism,
|
||||
get_python_version,
|
||||
load_image,
|
||||
load_numpy,
|
||||
numpy_cosine_similarity_distance,
|
||||
@@ -494,15 +493,6 @@ class StableDiffusionMultiControlNetPipelineFastTests(
|
||||
|
||||
assert np.abs(image - output_1.images).max() < 1e-3
|
||||
|
||||
# multiple prompts, multiple image conditioning
|
||||
inputs = self.get_dummy_inputs(device)
|
||||
inputs["prompt"] = [inputs["prompt"], inputs["prompt"], inputs["prompt"], inputs["prompt"]]
|
||||
inputs["image"] = [inputs["image"], inputs["image"], inputs["image"], inputs["image"]]
|
||||
output_2 = sd_pipe(**inputs)
|
||||
image = output_2.images
|
||||
|
||||
assert image.shape == (4, 64, 64, 3)
|
||||
|
||||
|
||||
class StableDiffusionMultiControlNetOneModelPipelineFastTests(
|
||||
IPAdapterTesterMixin, PipelineTesterMixin, PipelineKarrasSchedulerTesterMixin, unittest.TestCase
|
||||
@@ -684,11 +674,6 @@ class StableDiffusionMultiControlNetOneModelPipelineFastTests(
|
||||
@slow
|
||||
@require_torch_gpu
|
||||
class ControlNetPipelineSlowTests(unittest.TestCase):
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
def tearDown(self):
|
||||
super().tearDown()
|
||||
gc.collect()
|
||||
@@ -1007,10 +992,6 @@ class ControlNetPipelineSlowTests(unittest.TestCase):
|
||||
|
||||
@require_python39_or_higher
|
||||
@require_torch_2
|
||||
@unittest.skipIf(
|
||||
get_python_version == (3, 12),
|
||||
reason="Torch Dynamo isn't yet supported for Python 3.12.",
|
||||
)
|
||||
def test_stable_diffusion_compile(self):
|
||||
run_test_in_subprocess(test_case=self, target_func=_test_stable_diffusion_compile, inputs=None)
|
||||
|
||||
@@ -1140,11 +1121,6 @@ class ControlNetPipelineSlowTests(unittest.TestCase):
|
||||
@slow
|
||||
@require_torch_gpu
|
||||
class StableDiffusionMultiControlNetPipelineSlowTests(unittest.TestCase):
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
def tearDown(self):
|
||||
super().tearDown()
|
||||
gc.collect()
|
||||
|
||||
@@ -382,11 +382,6 @@ class StableDiffusionMultiControlNetPipelineFastTests(
|
||||
@slow
|
||||
@require_torch_gpu
|
||||
class ControlNetImg2ImgPipelineSlowTests(unittest.TestCase):
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
def tearDown(self):
|
||||
super().tearDown()
|
||||
gc.collect()
|
||||
|
||||
@@ -445,11 +445,6 @@ class MultiControlNetInpaintPipelineFastTests(
|
||||
@slow
|
||||
@require_torch_gpu
|
||||
class ControlNetInpaintPipelineSlowTests(unittest.TestCase):
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
def tearDown(self):
|
||||
super().tearDown()
|
||||
gc.collect()
|
||||
|
||||
@@ -884,11 +884,6 @@ class StableDiffusionXLMultiControlNetOneModelPipelineFastTests(
|
||||
@slow
|
||||
@require_torch_gpu
|
||||
class ControlNetSDXLPipelineSlowTests(unittest.TestCase):
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
def tearDown(self):
|
||||
super().tearDown()
|
||||
gc.collect()
|
||||
|
||||
@@ -118,12 +118,6 @@ class DanceDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
@nightly
|
||||
@require_torch_gpu
|
||||
class PipelineIntegrationTests(unittest.TestCase):
|
||||
def setUp(self):
|
||||
# clean up the VRAM before each test
|
||||
super().setUp()
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
def tearDown(self):
|
||||
# clean up the VRAM after each test
|
||||
super().tearDown()
|
||||
|
||||
@@ -115,10 +115,6 @@ class IFImg2ImgPipelineSlowTests(unittest.TestCase):
|
||||
pipe.unet.set_attn_processor(AttnAddedKVProcessor())
|
||||
pipe.enable_model_cpu_offload()
|
||||
|
||||
torch.cuda.reset_max_memory_allocated()
|
||||
torch.cuda.empty_cache()
|
||||
torch.cuda.reset_peak_memory_stats()
|
||||
|
||||
image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device)
|
||||
generator = torch.Generator(device="cpu").manual_seed(0)
|
||||
output = pipe(
|
||||
|
||||
@@ -113,10 +113,6 @@ class IFImg2ImgSuperResolutionPipelineSlowTests(unittest.TestCase):
|
||||
pipe.unet.set_attn_processor(AttnAddedKVProcessor())
|
||||
pipe.enable_model_cpu_offload()
|
||||
|
||||
torch.cuda.reset_max_memory_allocated()
|
||||
torch.cuda.empty_cache()
|
||||
torch.cuda.reset_peak_memory_stats()
|
||||
|
||||
generator = torch.Generator(device="cpu").manual_seed(0)
|
||||
|
||||
original_image = floats_tensor((1, 3, 256, 256), rng=random.Random(0)).to(torch_device)
|
||||
|
||||
@@ -111,6 +111,7 @@ class IFInpaintingPipelineSlowTests(unittest.TestCase):
|
||||
pipe.unet.set_attn_processor(AttnAddedKVProcessor())
|
||||
pipe.enable_model_cpu_offload()
|
||||
|
||||
# Super resolution test
|
||||
torch.cuda.empty_cache()
|
||||
torch.cuda.reset_max_memory_allocated()
|
||||
torch.cuda.reset_peak_memory_stats()
|
||||
|
||||
@@ -109,11 +109,6 @@ class DiTPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
@nightly
|
||||
@require_torch_gpu
|
||||
class DiTPipelineIntegrationTests(unittest.TestCase):
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
def tearDown(self):
|
||||
super().tearDown()
|
||||
gc.collect()
|
||||
|
||||
@@ -229,12 +229,6 @@ class I2VGenXLPipelineFastTests(SDFunctionTesterMixin, PipelineTesterMixin, unit
|
||||
@slow
|
||||
@require_torch_gpu
|
||||
class I2VGenXLPipelineSlowTests(unittest.TestCase):
|
||||
def setUp(self):
|
||||
# clean up the VRAM before each test
|
||||
super().setUp()
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
def tearDown(self):
|
||||
# clean up the VRAM after each test
|
||||
super().tearDown()
|
||||
|
||||
@@ -365,12 +365,6 @@ class KandinskyImg2ImgPipelineIntegrationTests(unittest.TestCase):
|
||||
@nightly
|
||||
@require_torch_gpu
|
||||
class KandinskyImg2ImgPipelineNightlyTests(unittest.TestCase):
|
||||
def setUp(self):
|
||||
# clean up the VRAM before each test
|
||||
super().setUp()
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
def tearDown(self):
|
||||
# clean up the VRAM after each test
|
||||
super().tearDown()
|
||||
|
||||
@@ -27,6 +27,7 @@ from diffusers.utils.testing_utils import (
|
||||
load_numpy,
|
||||
require_torch_gpu,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
|
||||
from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
|
||||
@@ -257,7 +258,7 @@ class KandinskyV22PipelineIntegrationTests(unittest.TestCase):
|
||||
image_emb, zero_image_emb = pipe_prior(
|
||||
prompt,
|
||||
generator=generator,
|
||||
num_inference_steps=3,
|
||||
num_inference_steps=5,
|
||||
negative_prompt="",
|
||||
).to_tuple()
|
||||
|
||||
@@ -266,7 +267,7 @@ class KandinskyV22PipelineIntegrationTests(unittest.TestCase):
|
||||
image_embeds=image_emb,
|
||||
negative_image_embeds=zero_image_emb,
|
||||
generator=generator,
|
||||
num_inference_steps=3,
|
||||
num_inference_steps=100,
|
||||
output_type="np",
|
||||
)
|
||||
|
||||
|
||||
@@ -34,6 +34,7 @@ from diffusers.utils.testing_utils import (
|
||||
load_numpy,
|
||||
nightly,
|
||||
require_torch_gpu,
|
||||
torch_device,
|
||||
)
|
||||
|
||||
from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
|
||||
@@ -269,7 +270,7 @@ class KandinskyV22ControlnetPipelineIntegrationTests(unittest.TestCase):
|
||||
image_emb, zero_image_emb = pipe_prior(
|
||||
prompt,
|
||||
generator=generator,
|
||||
num_inference_steps=2,
|
||||
num_inference_steps=5,
|
||||
negative_prompt="",
|
||||
).to_tuple()
|
||||
|
||||
@@ -279,7 +280,7 @@ class KandinskyV22ControlnetPipelineIntegrationTests(unittest.TestCase):
|
||||
negative_image_embeds=zero_image_emb,
|
||||
hint=hint,
|
||||
generator=generator,
|
||||
num_inference_steps=2,
|
||||
num_inference_steps=100,
|
||||
output_type="np",
|
||||
)
|
||||
|
||||
|
||||
@@ -35,6 +35,7 @@ from diffusers.utils.testing_utils import (
|
||||
load_numpy,
|
||||
nightly,
|
||||
require_torch_gpu,
|
||||
torch_device,
|
||||
)
|
||||
|
||||
from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
|
||||
@@ -286,7 +287,6 @@ class KandinskyV22ControlnetImg2ImgPipelineIntegrationTests(unittest.TestCase):
|
||||
strength=0.85,
|
||||
generator=generator,
|
||||
negative_prompt="",
|
||||
num_inference_steps=5,
|
||||
).to_tuple()
|
||||
|
||||
output = pipeline(
|
||||
@@ -295,7 +295,7 @@ class KandinskyV22ControlnetImg2ImgPipelineIntegrationTests(unittest.TestCase):
|
||||
negative_image_embeds=zero_image_emb,
|
||||
hint=hint,
|
||||
generator=generator,
|
||||
num_inference_steps=5,
|
||||
num_inference_steps=100,
|
||||
height=512,
|
||||
width=512,
|
||||
strength=0.5,
|
||||
|
||||
@@ -35,6 +35,7 @@ from diffusers.utils.testing_utils import (
|
||||
load_numpy,
|
||||
require_torch_gpu,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
|
||||
from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
|
||||
@@ -287,7 +288,7 @@ class KandinskyV22Img2ImgPipelineIntegrationTests(unittest.TestCase):
|
||||
image_embeds=image_emb,
|
||||
negative_image_embeds=zero_image_emb,
|
||||
generator=generator,
|
||||
num_inference_steps=5,
|
||||
num_inference_steps=100,
|
||||
height=768,
|
||||
width=768,
|
||||
strength=0.2,
|
||||
|
||||
@@ -334,7 +334,7 @@ class KandinskyV22InpaintPipelineIntegrationTests(unittest.TestCase):
|
||||
image_emb, zero_image_emb = pipe_prior(
|
||||
prompt,
|
||||
generator=generator,
|
||||
num_inference_steps=2,
|
||||
num_inference_steps=5,
|
||||
negative_prompt="",
|
||||
).to_tuple()
|
||||
|
||||
@@ -344,7 +344,7 @@ class KandinskyV22InpaintPipelineIntegrationTests(unittest.TestCase):
|
||||
image_embeds=image_emb,
|
||||
negative_image_embeds=zero_image_emb,
|
||||
generator=generator,
|
||||
num_inference_steps=2,
|
||||
num_inference_steps=100,
|
||||
height=768,
|
||||
width=768,
|
||||
output_type="np",
|
||||
|
||||
@@ -192,7 +192,7 @@ class Kandinsky3PipelineIntegrationTests(unittest.TestCase):
|
||||
|
||||
generator = torch.Generator(device="cpu").manual_seed(0)
|
||||
|
||||
image = pipe(prompt, num_inference_steps=5, generator=generator).images[0]
|
||||
image = pipe(prompt, num_inference_steps=25, generator=generator).images[0]
|
||||
|
||||
assert image.size == (1024, 1024)
|
||||
|
||||
@@ -223,7 +223,7 @@ class Kandinsky3PipelineIntegrationTests(unittest.TestCase):
|
||||
image = image.resize((w, h), resample=Image.BICUBIC, reducing_gap=1)
|
||||
prompt = "A painting of the inside of a subway train with tiny raccoons."
|
||||
|
||||
image = pipe(prompt, image=image, strength=0.75, num_inference_steps=5, generator=generator).images[0]
|
||||
image = pipe(prompt, image=image, strength=0.75, num_inference_steps=25, generator=generator).images[0]
|
||||
|
||||
assert image.size == (512, 512)
|
||||
|
||||
|
||||
@@ -215,7 +215,7 @@ class Kandinsky3Img2ImgPipelineIntegrationTests(unittest.TestCase):
|
||||
image = image.resize((w, h), resample=Image.BICUBIC, reducing_gap=1)
|
||||
prompt = "A painting of the inside of a subway train with tiny raccoons."
|
||||
|
||||
image = pipe(prompt, image=image, strength=0.75, num_inference_steps=5, generator=generator).images[0]
|
||||
image = pipe(prompt, image=image, strength=0.75, num_inference_steps=25, generator=generator).images[0]
|
||||
|
||||
assert image.size == (512, 512)
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user