Compare commits
99 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| e3f6ba3fc1 | |||
| f442955c6e | |||
| ff9a387618 | |||
| fb8722e9ab | |||
| 512044c5ea | |||
| 03c3f69aa5 | |||
| f20aba3e87 | |||
| 6c85fcd899 | |||
| 085e9cba36 | |||
| 919ee1aee3 | |||
| 9cda45701c | |||
| c678e8a445 | |||
| ccf2c31188 | |||
| d1342d7464 | |||
| 7b10e4ae65 | |||
| 3c0531bc50 | |||
| a8e47978c6 | |||
| 50e18ee698 | |||
| 4b17fa2a2e | |||
| d45199a2f1 | |||
| 061163142d | |||
| 5780776c8a | |||
| f19421e27c | |||
| 9a0cc463ee | |||
| ef4e373a65 | |||
| 1b4af6b7ef | |||
| 69cdc25746 | |||
| cfd6ec7465 | |||
| ea77fdc4b4 | |||
| 1082c46afa | |||
| ba2ba9019f | |||
| fa4c0e5e2e | |||
| b793debd9d | |||
| 377057126c | |||
| 5937e11d85 | |||
| 9c1d4e3be1 | |||
| 7ea065c507 | |||
| 7a7a487396 | |||
| 4efb4db9d0 | |||
| 639fd12a20 | |||
| 69a9828f4d | |||
| 11d22e0e80 | |||
| 9a38fab5ae | |||
| cb8e61ed2f | |||
| 8e53cd959e | |||
| 359b605f4b | |||
| 6febc08bfc | |||
| 9a2eaed002 | |||
| 0c71189abe | |||
| 58d2b10a2e | |||
| 20e0740b88 | |||
| 9d313fc718 | |||
| f83dd5c984 | |||
| c052791b5f | |||
| 843e3f9346 | |||
| 255c5742aa | |||
| 4524d43279 | |||
| b6dc0b75f4 | |||
| d8854b8d54 | |||
| 966a2ff8df | |||
| 327e251b81 | |||
| dfa48831e2 | |||
| 94df8ef68a | |||
| 203dc520a7 | |||
| 56d4387270 | |||
| edcbe8038b | |||
| c02c4a6d27 | |||
| 6f3ac3050f | |||
| a6d9f6a1a9 | |||
| 284150449d | |||
| 3d2f8ae99b | |||
| 201da97dd0 | |||
| f36ba9f094 | |||
| 1c50a5f7e0 | |||
| 7ae6347e33 | |||
| 178d32dedd | |||
| ef1e628729 | |||
| 173e1b147d | |||
| e46e139f95 | |||
| 4423097b23 | |||
| 14725164be | |||
| 638cc035e5 | |||
| 60d1b81023 | |||
| 9db9be65f3 | |||
| d87134ada4 | |||
| 67a8ec8bf5 | |||
| cde02b061b | |||
| 5dc503aa28 | |||
| c6fbcf717b | |||
| b9e99654e1 | |||
| 478df933c3 | |||
| 18c8f10f20 | |||
| 7298bdd817 | |||
| 9c13f86579 | |||
| 5c5209720e | |||
| aa14f090f8 | |||
| c5d6e0b537 | |||
| 39831599f1 | |||
| b73c738392 |
@@ -25,7 +25,7 @@ jobs:
|
||||
group: aws-g6e-4xlarge
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
options: --shm-size "16gb" --ipc host --gpus 0
|
||||
options: --shm-size "16gb" --ipc host --gpus all
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
|
||||
@@ -79,14 +79,14 @@ jobs:
|
||||
|
||||
# Check secret is set
|
||||
- name: whoami
|
||||
run: huggingface-cli whoami
|
||||
run: hf auth whoami
|
||||
env:
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN_MIRROR_COMMUNITY_PIPELINES }}
|
||||
|
||||
# Push to HF! (under subfolder based on checkout ref)
|
||||
# https://huggingface.co/datasets/diffusers/community-pipelines-mirror
|
||||
- name: Mirror community pipeline to HF
|
||||
run: huggingface-cli upload diffusers/community-pipelines-mirror ./examples/community ${PATH_IN_REPO} --repo-type dataset
|
||||
run: hf upload diffusers/community-pipelines-mirror ./examples/community ${PATH_IN_REPO} --repo-type dataset
|
||||
env:
|
||||
PATH_IN_REPO: ${{ env.PATH_IN_REPO }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN_MIRROR_COMMUNITY_PIPELINES }}
|
||||
|
||||
@@ -61,7 +61,7 @@ jobs:
|
||||
group: aws-g4dn-2xlarge
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
options: --shm-size "16gb" --ipc host --gpus 0
|
||||
options: --shm-size "16gb" --ipc host --gpus all
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
@@ -107,7 +107,7 @@ jobs:
|
||||
group: aws-g4dn-2xlarge
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
options: --shm-size "16gb" --ipc host --gpus 0
|
||||
options: --shm-size "16gb" --ipc host --gpus all
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
@@ -178,7 +178,7 @@ jobs:
|
||||
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host
|
||||
options: --gpus all --shm-size "16gb" --ipc host
|
||||
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
@@ -222,7 +222,7 @@ jobs:
|
||||
group: aws-g6e-xlarge-plus
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
options: --shm-size "16gb" --ipc host --gpus 0
|
||||
options: --shm-size "16gb" --ipc host --gpus all
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
@@ -270,7 +270,7 @@ jobs:
|
||||
group: aws-g4dn-2xlarge
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-minimum-cuda
|
||||
options: --shm-size "16gb" --ipc host --gpus 0
|
||||
options: --shm-size "16gb" --ipc host --gpus all
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
@@ -333,7 +333,7 @@ jobs:
|
||||
additional_deps: ["peft"]
|
||||
- backend: "gguf"
|
||||
test_location: "gguf"
|
||||
additional_deps: ["peft"]
|
||||
additional_deps: ["peft", "kernels"]
|
||||
- backend: "torchao"
|
||||
test_location: "torchao"
|
||||
additional_deps: []
|
||||
@@ -344,7 +344,7 @@ jobs:
|
||||
group: aws-g6e-xlarge-plus
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
options: --shm-size "20gb" --ipc host --gpus 0
|
||||
options: --shm-size "20gb" --ipc host --gpus all
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
@@ -396,7 +396,7 @@ jobs:
|
||||
group: aws-g6e-xlarge-plus
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
options: --shm-size "20gb" --ipc host --gpus 0
|
||||
options: --shm-size "20gb" --ipc host --gpus all
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
|
||||
@@ -0,0 +1,141 @@
|
||||
name: Fast PR tests for Modular
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches: [main]
|
||||
paths:
|
||||
- "src/diffusers/modular_pipelines/**.py"
|
||||
- "src/diffusers/models/modeling_utils.py"
|
||||
- "src/diffusers/models/model_loading_utils.py"
|
||||
- "src/diffusers/pipelines/pipeline_utils.py"
|
||||
- "src/diffusers/pipeline_loading_utils.py"
|
||||
- "src/diffusers/loaders/lora_base.py"
|
||||
- "src/diffusers/loaders/lora_pipeline.py"
|
||||
- "src/diffusers/loaders/peft.py"
|
||||
- "tests/modular_pipelines/**.py"
|
||||
- ".github/**.yml"
|
||||
- "utils/**.py"
|
||||
- "setup.py"
|
||||
push:
|
||||
branches:
|
||||
- ci-*
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
DIFFUSERS_IS_CI: yes
|
||||
HF_HUB_ENABLE_HF_TRANSFER: 1
|
||||
OMP_NUM_THREADS: 4
|
||||
MKL_NUM_THREADS: 4
|
||||
PYTEST_TIMEOUT: 60
|
||||
|
||||
jobs:
|
||||
check_code_quality:
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.10"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install .[quality]
|
||||
- name: Check quality
|
||||
run: make quality
|
||||
- name: Check if failure
|
||||
if: ${{ failure() }}
|
||||
run: |
|
||||
echo "Quality check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make style && make quality'" >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
check_repository_consistency:
|
||||
needs: check_code_quality
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.10"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install .[quality]
|
||||
- name: Check repo consistency
|
||||
run: |
|
||||
python utils/check_copies.py
|
||||
python utils/check_dummies.py
|
||||
python utils/check_support_list.py
|
||||
make deps_table_check_updated
|
||||
- name: Check if failure
|
||||
if: ${{ failure() }}
|
||||
run: |
|
||||
echo "Repo consistency check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make fix-copies'" >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
run_fast_tests:
|
||||
needs: [check_code_quality, check_repository_consistency]
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
config:
|
||||
- name: Fast PyTorch Modular Pipeline CPU tests
|
||||
framework: pytorch_pipelines
|
||||
runner: aws-highmemory-32-plus
|
||||
image: diffusers/diffusers-pytorch-cpu
|
||||
report: torch_cpu_modular_pipelines
|
||||
|
||||
name: ${{ matrix.config.name }}
|
||||
|
||||
runs-on:
|
||||
group: ${{ matrix.config.runner }}
|
||||
|
||||
container:
|
||||
image: ${{ matrix.config.image }}
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps
|
||||
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python utils/print_env.py
|
||||
|
||||
- name: Run fast PyTorch Pipeline CPU tests
|
||||
if: ${{ matrix.config.framework == 'pytorch_pipelines' }}
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
tests/modular_pipelines
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: cat reports/tests_${{ matrix.config.report }}_failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: pr_${{ matrix.config.framework }}_${{ matrix.config.report }}_test_reports
|
||||
path: reports
|
||||
|
||||
|
||||
@@ -13,6 +13,7 @@ on:
|
||||
- "src/diffusers/loaders/peft.py"
|
||||
- "tests/pipelines/test_pipelines_common.py"
|
||||
- "tests/models/test_modeling_common.py"
|
||||
- "examples/**/*.py"
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
@@ -117,7 +118,7 @@ jobs:
|
||||
group: aws-g4dn-2xlarge
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
options: --shm-size "16gb" --ipc host --gpus 0
|
||||
options: --shm-size "16gb" --ipc host --gpus all
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
@@ -182,7 +183,7 @@ jobs:
|
||||
group: aws-g4dn-2xlarge
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
options: --shm-size "16gb" --ipc host --gpus 0
|
||||
options: --shm-size "16gb" --ipc host --gpus all
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
@@ -252,7 +253,7 @@ jobs:
|
||||
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host
|
||||
options: --gpus all --shm-size "16gb" --ipc host
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
|
||||
@@ -64,7 +64,7 @@ jobs:
|
||||
group: aws-g4dn-2xlarge
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
options: --shm-size "16gb" --ipc host --gpus 0
|
||||
options: --shm-size "16gb" --ipc host --gpus all
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
@@ -109,7 +109,7 @@ jobs:
|
||||
group: aws-g4dn-2xlarge
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
options: --shm-size "16gb" --ipc host --gpus 0
|
||||
options: --shm-size "16gb" --ipc host --gpus all
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
@@ -167,7 +167,7 @@ jobs:
|
||||
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host
|
||||
options: --gpus all --shm-size "16gb" --ipc host
|
||||
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
@@ -210,7 +210,7 @@ jobs:
|
||||
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-xformers-cuda
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host
|
||||
options: --gpus all --shm-size "16gb" --ipc host
|
||||
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
@@ -252,7 +252,7 @@ jobs:
|
||||
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host
|
||||
options: --gpus all --shm-size "16gb" --ipc host
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
|
||||
@@ -62,7 +62,7 @@ jobs:
|
||||
group: aws-g4dn-2xlarge
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
options: --shm-size "16gb" --ipc host --gpus 0
|
||||
options: --shm-size "16gb" --ipc host --gpus all
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
@@ -107,7 +107,7 @@ jobs:
|
||||
group: aws-g4dn-2xlarge
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
options: --shm-size "16gb" --ipc host --gpus 0
|
||||
options: --shm-size "16gb" --ipc host --gpus all
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
@@ -163,7 +163,7 @@ jobs:
|
||||
group: aws-g4dn-2xlarge
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-minimum-cuda
|
||||
options: --shm-size "16gb" --ipc host --gpus 0
|
||||
options: --shm-size "16gb" --ipc host --gpus all
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
@@ -222,7 +222,7 @@ jobs:
|
||||
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host
|
||||
options: --gpus all --shm-size "16gb" --ipc host
|
||||
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
@@ -265,7 +265,7 @@ jobs:
|
||||
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-xformers-cuda
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host
|
||||
options: --gpus all --shm-size "16gb" --ipc host
|
||||
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
@@ -307,7 +307,7 @@ jobs:
|
||||
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host
|
||||
options: --gpus all --shm-size "16gb" --ipc host
|
||||
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
|
||||
@@ -30,7 +30,7 @@ jobs:
|
||||
group: aws-g4dn-2xlarge
|
||||
container:
|
||||
image: ${{ github.event.inputs.docker_image }}
|
||||
options: --gpus 0 --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
options: --gpus all --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
|
||||
steps:
|
||||
- name: Validate test files input
|
||||
|
||||
@@ -31,7 +31,7 @@ jobs:
|
||||
group: "${{ github.event.inputs.runner_type }}"
|
||||
container:
|
||||
image: ${{ github.event.inputs.docker_image }}
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0 --privileged
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus all --privileged
|
||||
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
|
||||
@@ -31,7 +31,7 @@ pip install -r requirements.txt
|
||||
We need to be authenticated to access some of the checkpoints used during benchmarking:
|
||||
|
||||
```sh
|
||||
huggingface-cli login
|
||||
hf auth login
|
||||
```
|
||||
|
||||
We use an L40 GPU with 128GB RAM to run the benchmark CI. As such, the benchmarks are configured to run on NVIDIA GPUs. So, make sure you have access to a similar machine (or modify the benchmarking scripts accordingly).
|
||||
|
||||
+195
-174
@@ -1,36 +1,39 @@
|
||||
- sections:
|
||||
- title: Get started
|
||||
sections:
|
||||
- local: index
|
||||
title: 🧨 Diffusers
|
||||
title: Diffusers
|
||||
- local: installation
|
||||
title: Installation
|
||||
- local: quicktour
|
||||
title: Quicktour
|
||||
- local: stable_diffusion
|
||||
title: Effective and efficient diffusion
|
||||
- local: installation
|
||||
title: Installation
|
||||
title: Get started
|
||||
- sections:
|
||||
- local: tutorials/tutorial_overview
|
||||
title: Overview
|
||||
- local: using-diffusers/write_own_pipeline
|
||||
title: Understanding pipelines, models and schedulers
|
||||
- local: tutorials/autopipeline
|
||||
title: AutoPipeline
|
||||
- local: tutorials/basic_training
|
||||
title: Train a diffusion model
|
||||
title: Tutorials
|
||||
- sections:
|
||||
|
||||
- title: DiffusionPipeline
|
||||
isExpanded: false
|
||||
sections:
|
||||
- local: using-diffusers/loading
|
||||
title: Load pipelines
|
||||
- local: tutorials/autopipeline
|
||||
title: AutoPipeline
|
||||
- local: using-diffusers/custom_pipeline_overview
|
||||
title: Load community pipelines and components
|
||||
- local: using-diffusers/callback
|
||||
title: Pipeline callbacks
|
||||
- local: using-diffusers/reusing_seeds
|
||||
title: Reproducible pipelines
|
||||
- local: using-diffusers/schedulers
|
||||
title: Load schedulers and models
|
||||
- local: using-diffusers/scheduler_features
|
||||
title: Scheduler features
|
||||
- local: using-diffusers/other-formats
|
||||
title: Model files and layouts
|
||||
- local: using-diffusers/push_to_hub
|
||||
title: Push files to the Hub
|
||||
title: Load pipelines and adapters
|
||||
- sections:
|
||||
|
||||
- title: Adapters
|
||||
isExpanded: false
|
||||
sections:
|
||||
- local: tutorials/using_peft_for_inference
|
||||
title: LoRA
|
||||
- local: using-diffusers/ip_adapter
|
||||
@@ -43,25 +46,12 @@
|
||||
title: DreamBooth
|
||||
- local: using-diffusers/textual_inversion_inference
|
||||
title: Textual inversion
|
||||
title: Adapters
|
||||
|
||||
- title: Inference
|
||||
isExpanded: false
|
||||
- sections:
|
||||
- local: using-diffusers/unconditional_image_generation
|
||||
title: Unconditional image generation
|
||||
- local: using-diffusers/conditional_image_generation
|
||||
title: Text-to-image
|
||||
- local: using-diffusers/img2img
|
||||
title: Image-to-image
|
||||
- local: using-diffusers/inpaint
|
||||
title: Inpainting
|
||||
- local: using-diffusers/text-img2vid
|
||||
title: Video generation
|
||||
- local: using-diffusers/depth2img
|
||||
title: Depth-to-image
|
||||
title: Generative tasks
|
||||
- sections:
|
||||
- local: using-diffusers/overview_techniques
|
||||
title: Overview
|
||||
sections:
|
||||
- local: using-diffusers/weighted_prompts
|
||||
title: Prompt techniques
|
||||
- local: using-diffusers/create_a_server
|
||||
title: Create a server
|
||||
- local: using-diffusers/batched_inference
|
||||
@@ -76,14 +66,38 @@
|
||||
title: Reproducible pipelines
|
||||
- local: using-diffusers/image_quality
|
||||
title: Controlling image quality
|
||||
- local: using-diffusers/weighted_prompts
|
||||
title: Prompt techniques
|
||||
title: Inference techniques
|
||||
- sections:
|
||||
- local: advanced_inference/outpaint
|
||||
title: Outpainting
|
||||
title: Advanced inference
|
||||
- sections:
|
||||
|
||||
- title: Inference optimization
|
||||
isExpanded: false
|
||||
sections:
|
||||
- local: optimization/fp16
|
||||
title: Accelerate inference
|
||||
- local: optimization/cache
|
||||
title: Caching
|
||||
- local: optimization/memory
|
||||
title: Reduce memory usage
|
||||
- local: optimization/speed-memory-optims
|
||||
title: Compile and offloading quantized models
|
||||
- title: Community optimizations
|
||||
sections:
|
||||
- local: optimization/pruna
|
||||
title: Pruna
|
||||
- local: optimization/xformers
|
||||
title: xFormers
|
||||
- local: optimization/tome
|
||||
title: Token merging
|
||||
- local: optimization/deepcache
|
||||
title: DeepCache
|
||||
- local: optimization/tgate
|
||||
title: TGATE
|
||||
- local: optimization/xdit
|
||||
title: xDiT
|
||||
- local: optimization/para_attn
|
||||
title: ParaAttention
|
||||
|
||||
- title: Hybrid Inference
|
||||
isExpanded: false
|
||||
sections:
|
||||
- local: hybrid_inference/overview
|
||||
title: Overview
|
||||
- local: hybrid_inference/vae_decode
|
||||
@@ -92,8 +106,10 @@
|
||||
title: VAE Encode
|
||||
- local: hybrid_inference/api_reference
|
||||
title: API Reference
|
||||
title: Hybrid Inference
|
||||
- sections:
|
||||
|
||||
- title: Modular Diffusers
|
||||
isExpanded: false
|
||||
sections:
|
||||
- local: modular_diffusers/overview
|
||||
title: Overview
|
||||
- local: modular_diffusers/modular_pipeline
|
||||
@@ -112,8 +128,88 @@
|
||||
title: Auto Pipeline Blocks
|
||||
- local: modular_diffusers/end_to_end_guide
|
||||
title: End-to-End Example
|
||||
title: Modular Diffusers
|
||||
- sections:
|
||||
|
||||
- title: Training
|
||||
isExpanded: false
|
||||
sections:
|
||||
- local: training/overview
|
||||
title: Overview
|
||||
- local: training/create_dataset
|
||||
title: Create a dataset for training
|
||||
- local: training/adapt_a_model
|
||||
title: Adapt a model to a new task
|
||||
- local: tutorials/basic_training
|
||||
title: Train a diffusion model
|
||||
- title: Models
|
||||
sections:
|
||||
- local: training/unconditional_training
|
||||
title: Unconditional image generation
|
||||
- local: training/text2image
|
||||
title: Text-to-image
|
||||
- local: training/sdxl
|
||||
title: Stable Diffusion XL
|
||||
- local: training/kandinsky
|
||||
title: Kandinsky 2.2
|
||||
- local: training/wuerstchen
|
||||
title: Wuerstchen
|
||||
- local: training/controlnet
|
||||
title: ControlNet
|
||||
- local: training/t2i_adapters
|
||||
title: T2I-Adapters
|
||||
- local: training/instructpix2pix
|
||||
title: InstructPix2Pix
|
||||
- local: training/cogvideox
|
||||
title: CogVideoX
|
||||
- title: Methods
|
||||
sections:
|
||||
- local: training/text_inversion
|
||||
title: Textual Inversion
|
||||
- local: training/dreambooth
|
||||
title: DreamBooth
|
||||
- local: training/lora
|
||||
title: LoRA
|
||||
- local: training/custom_diffusion
|
||||
title: Custom Diffusion
|
||||
- local: training/lcm_distill
|
||||
title: Latent Consistency Distillation
|
||||
- local: training/ddpo
|
||||
title: Reinforcement learning training with DDPO
|
||||
|
||||
- title: Quantization
|
||||
isExpanded: false
|
||||
sections:
|
||||
- local: quantization/overview
|
||||
title: Getting started
|
||||
- local: quantization/bitsandbytes
|
||||
title: bitsandbytes
|
||||
- local: quantization/gguf
|
||||
title: gguf
|
||||
- local: quantization/torchao
|
||||
title: torchao
|
||||
- local: quantization/quanto
|
||||
title: quanto
|
||||
|
||||
- title: Model accelerators and hardware
|
||||
isExpanded: false
|
||||
sections:
|
||||
- local: using-diffusers/stable_diffusion_jax_how_to
|
||||
title: JAX/Flax
|
||||
- local: optimization/onnx
|
||||
title: ONNX
|
||||
- local: optimization/open_vino
|
||||
title: OpenVINO
|
||||
- local: optimization/coreml
|
||||
title: Core ML
|
||||
- local: optimization/mps
|
||||
title: Metal Performance Shaders (MPS)
|
||||
- local: optimization/habana
|
||||
title: Intel Gaudi
|
||||
- local: optimization/neuron
|
||||
title: AWS Neuron
|
||||
|
||||
- title: Specific pipeline examples
|
||||
isExpanded: false
|
||||
sections:
|
||||
- local: using-diffusers/consisid
|
||||
title: ConsisID
|
||||
- local: using-diffusers/sdxl
|
||||
@@ -138,106 +234,30 @@
|
||||
title: Stable Video Diffusion
|
||||
- local: using-diffusers/marigold_usage
|
||||
title: Marigold Computer Vision
|
||||
title: Specific pipeline examples
|
||||
- sections:
|
||||
- local: training/overview
|
||||
title: Overview
|
||||
- local: training/create_dataset
|
||||
title: Create a dataset for training
|
||||
- local: training/adapt_a_model
|
||||
title: Adapt a model to a new task
|
||||
- isExpanded: false
|
||||
|
||||
- title: Resources
|
||||
isExpanded: false
|
||||
sections:
|
||||
- title: Task recipes
|
||||
sections:
|
||||
- local: training/unconditional_training
|
||||
- local: using-diffusers/unconditional_image_generation
|
||||
title: Unconditional image generation
|
||||
- local: training/text2image
|
||||
- local: using-diffusers/conditional_image_generation
|
||||
title: Text-to-image
|
||||
- local: training/sdxl
|
||||
title: Stable Diffusion XL
|
||||
- local: training/kandinsky
|
||||
title: Kandinsky 2.2
|
||||
- local: training/wuerstchen
|
||||
title: Wuerstchen
|
||||
- local: training/controlnet
|
||||
title: ControlNet
|
||||
- local: training/t2i_adapters
|
||||
title: T2I-Adapters
|
||||
- local: training/instructpix2pix
|
||||
title: InstructPix2Pix
|
||||
- local: training/cogvideox
|
||||
title: CogVideoX
|
||||
title: Models
|
||||
- isExpanded: false
|
||||
sections:
|
||||
- local: training/text_inversion
|
||||
title: Textual Inversion
|
||||
- local: training/dreambooth
|
||||
title: DreamBooth
|
||||
- local: training/lora
|
||||
title: LoRA
|
||||
- local: training/custom_diffusion
|
||||
title: Custom Diffusion
|
||||
- local: training/lcm_distill
|
||||
title: Latent Consistency Distillation
|
||||
- local: training/ddpo
|
||||
title: Reinforcement learning training with DDPO
|
||||
title: Methods
|
||||
title: Training
|
||||
- sections:
|
||||
- local: quantization/overview
|
||||
title: Getting Started
|
||||
- local: quantization/bitsandbytes
|
||||
title: bitsandbytes
|
||||
- local: quantization/gguf
|
||||
title: gguf
|
||||
- local: quantization/torchao
|
||||
title: torchao
|
||||
- local: quantization/quanto
|
||||
title: quanto
|
||||
title: Quantization Methods
|
||||
- sections:
|
||||
- local: optimization/fp16
|
||||
title: Accelerate inference
|
||||
- local: optimization/cache
|
||||
title: Caching
|
||||
- local: optimization/memory
|
||||
title: Reduce memory usage
|
||||
- local: optimization/speed-memory-optims
|
||||
title: Compile and offloading quantized models
|
||||
- local: optimization/pruna
|
||||
title: Pruna
|
||||
- local: optimization/xformers
|
||||
title: xFormers
|
||||
- local: optimization/tome
|
||||
title: Token merging
|
||||
- local: optimization/deepcache
|
||||
title: DeepCache
|
||||
- local: optimization/tgate
|
||||
title: TGATE
|
||||
- local: optimization/xdit
|
||||
title: xDiT
|
||||
- local: optimization/para_attn
|
||||
title: ParaAttention
|
||||
- sections:
|
||||
- local: using-diffusers/stable_diffusion_jax_how_to
|
||||
title: JAX/Flax
|
||||
- local: optimization/onnx
|
||||
title: ONNX
|
||||
- local: optimization/open_vino
|
||||
title: OpenVINO
|
||||
- local: optimization/coreml
|
||||
title: Core ML
|
||||
title: Optimized model formats
|
||||
- sections:
|
||||
- local: optimization/mps
|
||||
title: Metal Performance Shaders (MPS)
|
||||
- local: optimization/habana
|
||||
title: Intel Gaudi
|
||||
- local: optimization/neuron
|
||||
title: AWS Neuron
|
||||
title: Optimized hardware
|
||||
title: Accelerate inference and reduce memory
|
||||
- sections:
|
||||
- local: using-diffusers/img2img
|
||||
title: Image-to-image
|
||||
- local: using-diffusers/inpaint
|
||||
title: Inpainting
|
||||
- local: advanced_inference/outpaint
|
||||
title: Outpainting
|
||||
- local: using-diffusers/text-img2vid
|
||||
title: Video generation
|
||||
- local: using-diffusers/depth2img
|
||||
title: Depth-to-image
|
||||
- local: using-diffusers/write_own_pipeline
|
||||
title: Understanding pipelines, models and schedulers
|
||||
- local: community_projects
|
||||
title: Projects built with Diffusers
|
||||
- local: conceptual/philosophy
|
||||
title: Philosophy
|
||||
- local: using-diffusers/controlling_generation
|
||||
@@ -248,13 +268,11 @@
|
||||
title: Diffusers' Ethical Guidelines
|
||||
- local: conceptual/evaluation
|
||||
title: Evaluating Diffusion Models
|
||||
title: Conceptual Guides
|
||||
- sections:
|
||||
- local: community_projects
|
||||
title: Projects built with Diffusers
|
||||
title: Community Projects
|
||||
- sections:
|
||||
- isExpanded: false
|
||||
|
||||
- title: API
|
||||
isExpanded: false
|
||||
sections:
|
||||
- title: Main Classes
|
||||
sections:
|
||||
- local: api/configuration
|
||||
title: Configuration
|
||||
@@ -264,8 +282,7 @@
|
||||
title: Outputs
|
||||
- local: api/quantization
|
||||
title: Quantization
|
||||
title: Main Classes
|
||||
- isExpanded: false
|
||||
- title: Loaders
|
||||
sections:
|
||||
- local: api/loaders/ip_adapter
|
||||
title: IP-Adapter
|
||||
@@ -281,14 +298,14 @@
|
||||
title: SD3Transformer2D
|
||||
- local: api/loaders/peft
|
||||
title: PEFT
|
||||
title: Loaders
|
||||
- isExpanded: false
|
||||
- title: Models
|
||||
sections:
|
||||
- local: api/models/overview
|
||||
title: Overview
|
||||
- local: api/models/auto_model
|
||||
title: AutoModel
|
||||
- sections:
|
||||
- title: ControlNets
|
||||
sections:
|
||||
- local: api/models/controlnet
|
||||
title: ControlNetModel
|
||||
- local: api/models/controlnet_union
|
||||
@@ -303,8 +320,8 @@
|
||||
title: SD3ControlNetModel
|
||||
- local: api/models/controlnet_sparsectrl
|
||||
title: SparseControlNetModel
|
||||
title: ControlNets
|
||||
- sections:
|
||||
- title: Transformers
|
||||
sections:
|
||||
- local: api/models/allegro_transformer3d
|
||||
title: AllegroTransformer3DModel
|
||||
- local: api/models/aura_flow_transformer2d
|
||||
@@ -349,10 +366,14 @@
|
||||
title: PixArtTransformer2DModel
|
||||
- local: api/models/prior_transformer
|
||||
title: PriorTransformer
|
||||
- local: api/models/qwenimage_transformer2d
|
||||
title: QwenImageTransformer2DModel
|
||||
- local: api/models/sana_transformer2d
|
||||
title: SanaTransformer2DModel
|
||||
- local: api/models/sd3_transformer2d
|
||||
title: SD3Transformer2DModel
|
||||
- local: api/models/skyreels_v2_transformer_3d
|
||||
title: SkyReelsV2Transformer3DModel
|
||||
- local: api/models/stable_audio_transformer
|
||||
title: StableAudioDiTModel
|
||||
- local: api/models/transformer2d
|
||||
@@ -361,8 +382,8 @@
|
||||
title: TransformerTemporalModel
|
||||
- local: api/models/wan_transformer_3d
|
||||
title: WanTransformer3DModel
|
||||
title: Transformers
|
||||
- sections:
|
||||
- title: UNets
|
||||
sections:
|
||||
- local: api/models/stable_cascade_unet
|
||||
title: StableCascadeUNet
|
||||
- local: api/models/unet
|
||||
@@ -377,8 +398,8 @@
|
||||
title: UNetMotionModel
|
||||
- local: api/models/uvit2d
|
||||
title: UViT2DModel
|
||||
title: UNets
|
||||
- sections:
|
||||
- title: VAEs
|
||||
sections:
|
||||
- local: api/models/asymmetricautoencoderkl
|
||||
title: AsymmetricAutoencoderKL
|
||||
- local: api/models/autoencoder_dc
|
||||
@@ -399,6 +420,8 @@
|
||||
title: AutoencoderKLMagvit
|
||||
- local: api/models/autoencoderkl_mochi
|
||||
title: AutoencoderKLMochi
|
||||
- local: api/models/autoencoderkl_qwenimage
|
||||
title: AutoencoderKLQwenImage
|
||||
- local: api/models/autoencoder_kl_wan
|
||||
title: AutoencoderKLWan
|
||||
- local: api/models/consistency_decoder_vae
|
||||
@@ -409,9 +432,7 @@
|
||||
title: Tiny AutoEncoder
|
||||
- local: api/models/vq
|
||||
title: VQModel
|
||||
title: VAEs
|
||||
title: Models
|
||||
- isExpanded: false
|
||||
- title: Pipelines
|
||||
sections:
|
||||
- local: api/pipelines/overview
|
||||
title: Overview
|
||||
@@ -537,6 +558,8 @@
|
||||
title: PixArt-α
|
||||
- local: api/pipelines/pixart_sigma
|
||||
title: PixArt-Σ
|
||||
- local: api/pipelines/qwenimage
|
||||
title: QwenImage
|
||||
- local: api/pipelines/sana
|
||||
title: Sana
|
||||
- local: api/pipelines/sana_sprint
|
||||
@@ -547,11 +570,14 @@
|
||||
title: Semantic Guidance
|
||||
- local: api/pipelines/shap_e
|
||||
title: Shap-E
|
||||
- local: api/pipelines/skyreels_v2
|
||||
title: SkyReels-V2
|
||||
- local: api/pipelines/stable_audio
|
||||
title: Stable Audio
|
||||
- local: api/pipelines/stable_cascade
|
||||
title: Stable Cascade
|
||||
- sections:
|
||||
- title: Stable Diffusion
|
||||
sections:
|
||||
- local: api/pipelines/stable_diffusion/overview
|
||||
title: Overview
|
||||
- local: api/pipelines/stable_diffusion/depth2img
|
||||
@@ -588,7 +614,6 @@
|
||||
title: T2I-Adapter
|
||||
- local: api/pipelines/stable_diffusion/text2img
|
||||
title: Text-to-image
|
||||
title: Stable Diffusion
|
||||
- local: api/pipelines/stable_unclip
|
||||
title: Stable unCLIP
|
||||
- local: api/pipelines/text_to_video
|
||||
@@ -607,8 +632,7 @@
|
||||
title: Wan
|
||||
- local: api/pipelines/wuerstchen
|
||||
title: Wuerstchen
|
||||
title: Pipelines
|
||||
- isExpanded: false
|
||||
- title: Schedulers
|
||||
sections:
|
||||
- local: api/schedulers/overview
|
||||
title: Overview
|
||||
@@ -678,8 +702,7 @@
|
||||
title: UniPCMultistepScheduler
|
||||
- local: api/schedulers/vq_diffusion
|
||||
title: VQDiffusionScheduler
|
||||
title: Schedulers
|
||||
- isExpanded: false
|
||||
- title: Internal classes
|
||||
sections:
|
||||
- local: api/internal_classes_overview
|
||||
title: Overview
|
||||
@@ -697,5 +720,3 @@
|
||||
title: VAE Image Processor
|
||||
- local: api/video_processor
|
||||
title: Video Processor
|
||||
title: Internal classes
|
||||
title: API
|
||||
|
||||
@@ -16,7 +16,7 @@ Schedulers from [`~schedulers.scheduling_utils.SchedulerMixin`] and models from
|
||||
|
||||
<Tip>
|
||||
|
||||
To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with `huggingface-cli login`.
|
||||
To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with `hf auth login`.
|
||||
|
||||
</Tip>
|
||||
|
||||
|
||||
@@ -26,9 +26,11 @@ LoRA is a fast and lightweight training method that inserts and trains a signifi
|
||||
- [`HunyuanVideoLoraLoaderMixin`] provides similar functions for [HunyuanVideo](https://huggingface.co/docs/diffusers/main/en/api/pipelines/hunyuan_video).
|
||||
- [`Lumina2LoraLoaderMixin`] provides similar functions for [Lumina2](https://huggingface.co/docs/diffusers/main/en/api/pipelines/lumina2).
|
||||
- [`WanLoraLoaderMixin`] provides similar functions for [Wan](https://huggingface.co/docs/diffusers/main/en/api/pipelines/wan).
|
||||
- [`SkyReelsV2LoraLoaderMixin`] provides similar functions for [SkyReels-V2](https://huggingface.co/docs/diffusers/main/en/api/pipelines/skyreels_v2).
|
||||
- [`CogView4LoraLoaderMixin`] provides similar functions for [CogView4](https://huggingface.co/docs/diffusers/main/en/api/pipelines/cogview4).
|
||||
- [`AmusedLoraLoaderMixin`] is for the [`AmusedPipeline`].
|
||||
- [`HiDreamImageLoraLoaderMixin`] provides similar functions for [HiDream Image](https://huggingface.co/docs/diffusers/main/en/api/pipelines/hidream)
|
||||
- [`QwenImageLoraLoaderMixin`] provides similar functions for [Qwen Image](https://huggingface.co/docs/diffusers/main/en/api/pipelines/qwen)
|
||||
- [`LoraBaseMixin`] provides a base class with several utility methods to fuse, unfuse, unload, LoRAs and more.
|
||||
|
||||
<Tip>
|
||||
@@ -92,6 +94,10 @@ To learn more about how to load LoRA weights, see the [LoRA](../../using-diffuse
|
||||
|
||||
[[autodoc]] loaders.lora_pipeline.WanLoraLoaderMixin
|
||||
|
||||
## SkyReelsV2LoraLoaderMixin
|
||||
|
||||
[[autodoc]] loaders.lora_pipeline.SkyReelsV2LoraLoaderMixin
|
||||
|
||||
## AmusedLoraLoaderMixin
|
||||
|
||||
[[autodoc]] loaders.lora_pipeline.AmusedLoraLoaderMixin
|
||||
@@ -100,6 +106,10 @@ To learn more about how to load LoRA weights, see the [LoRA](../../using-diffuse
|
||||
|
||||
[[autodoc]] loaders.lora_pipeline.HiDreamImageLoraLoaderMixin
|
||||
|
||||
## WanLoraLoaderMixin
|
||||
## QwenImageLoraLoaderMixin
|
||||
|
||||
[[autodoc]] loaders.lora_pipeline.WanLoraLoaderMixin
|
||||
[[autodoc]] loaders.lora_pipeline.QwenImageLoraLoaderMixin
|
||||
|
||||
## LoraBaseMixin
|
||||
|
||||
[[autodoc]] loaders.lora_base.LoraBaseMixin
|
||||
@@ -0,0 +1,35 @@
|
||||
<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License. -->
|
||||
|
||||
# AutoencoderKLQwenImage
|
||||
|
||||
The model can be loaded with the following code snippet.
|
||||
|
||||
```python
|
||||
from diffusers import AutoencoderKLQwenImage
|
||||
|
||||
vae = AutoencoderKLQwenImage.from_pretrained("Qwen/QwenImage-20B", subfolder="vae")
|
||||
```
|
||||
|
||||
## AutoencoderKLQwenImage
|
||||
|
||||
[[autodoc]] AutoencoderKLQwenImage
|
||||
- decode
|
||||
- encode
|
||||
- all
|
||||
|
||||
## AutoencoderKLOutput
|
||||
|
||||
[[autodoc]] models.autoencoders.autoencoder_kl.AutoencoderKLOutput
|
||||
|
||||
## DecoderOutput
|
||||
|
||||
[[autodoc]] models.autoencoders.vae.DecoderOutput
|
||||
@@ -0,0 +1,28 @@
|
||||
<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License. -->
|
||||
|
||||
# QwenImageTransformer2DModel
|
||||
|
||||
The model can be loaded with the following code snippet.
|
||||
|
||||
```python
|
||||
from diffusers import QwenImageTransformer2DModel
|
||||
|
||||
transformer = QwenImageTransformer2DModel.from_pretrained("Qwen/QwenImage-20B", subfolder="transformer", torch_dtype=torch.bfloat16)
|
||||
```
|
||||
|
||||
## QwenImageTransformer2DModel
|
||||
|
||||
[[autodoc]] QwenImageTransformer2DModel
|
||||
|
||||
## Transformer2DModelOutput
|
||||
|
||||
[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
|
||||
@@ -0,0 +1,30 @@
|
||||
<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License. -->
|
||||
|
||||
# SkyReelsV2Transformer3DModel
|
||||
|
||||
A Diffusion Transformer model for 3D video-like data was introduced in [SkyReels-V2](https://github.com/SkyworkAI/SkyReels-V2) by the Skywork AI.
|
||||
|
||||
The model can be loaded with the following code snippet.
|
||||
|
||||
```python
|
||||
from diffusers import SkyReelsV2Transformer3DModel
|
||||
|
||||
transformer = SkyReelsV2Transformer3DModel.from_pretrained("Skywork/SkyReels-V2-DF-1.3B-540P-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16)
|
||||
```
|
||||
|
||||
## SkyReelsV2Transformer3DModel
|
||||
|
||||
[[autodoc]] SkyReelsV2Transformer3DModel
|
||||
|
||||
## Transformer2DModelOutput
|
||||
|
||||
[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
|
||||
@@ -0,0 +1,92 @@
|
||||
<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License. -->
|
||||
|
||||
# QwenImage
|
||||
|
||||
Qwen-Image from the Qwen team is an image generation foundation model in the Qwen series that achieves significant advances in complex text rendering and precise image editing. Experiments show strong general capabilities in both image generation and editing, with exceptional performance in text rendering, especially for Chinese.
|
||||
|
||||
Check out the model card [here](https://huggingface.co/Qwen/Qwen-Image) to learn more.
|
||||
|
||||
<Tip>
|
||||
|
||||
Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
|
||||
|
||||
</Tip>
|
||||
|
||||
## LoRA for faster inference
|
||||
|
||||
Use a LoRA from `lightx2v/Qwen-Image-Lightning` to speed up inference by reducing the
|
||||
number of steps. Refer to the code snippet below:
|
||||
|
||||
<details>
|
||||
<summary>Code</summary>
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline, FlowMatchEulerDiscreteScheduler
|
||||
import torch
|
||||
import math
|
||||
|
||||
ckpt_id = "Qwen/Qwen-Image"
|
||||
|
||||
# From
|
||||
# https://github.com/ModelTC/Qwen-Image-Lightning/blob/342260e8f5468d2f24d084ce04f55e101007118b/generate_with_diffusers.py#L82C9-L97C10
|
||||
scheduler_config = {
|
||||
"base_image_seq_len": 256,
|
||||
"base_shift": math.log(3), # We use shift=3 in distillation
|
||||
"invert_sigmas": False,
|
||||
"max_image_seq_len": 8192,
|
||||
"max_shift": math.log(3), # We use shift=3 in distillation
|
||||
"num_train_timesteps": 1000,
|
||||
"shift": 1.0,
|
||||
"shift_terminal": None, # set shift_terminal to None
|
||||
"stochastic_sampling": False,
|
||||
"time_shift_type": "exponential",
|
||||
"use_beta_sigmas": False,
|
||||
"use_dynamic_shifting": True,
|
||||
"use_exponential_sigmas": False,
|
||||
"use_karras_sigmas": False,
|
||||
}
|
||||
scheduler = FlowMatchEulerDiscreteScheduler.from_config(scheduler_config)
|
||||
pipe = DiffusionPipeline.from_pretrained(
|
||||
ckpt_id, scheduler=scheduler, torch_dtype=torch.bfloat16
|
||||
).to("cuda")
|
||||
pipe.load_lora_weights(
|
||||
"lightx2v/Qwen-Image-Lightning", weight_name="Qwen-Image-Lightning-8steps-V1.0.safetensors"
|
||||
)
|
||||
|
||||
prompt = "a tiny astronaut hatching from an egg on the moon, Ultra HD, 4K, cinematic composition."
|
||||
negative_prompt = " "
|
||||
image = pipe(
|
||||
prompt=prompt,
|
||||
negative_prompt=negative_prompt,
|
||||
width=1024,
|
||||
height=1024,
|
||||
num_inference_steps=8,
|
||||
true_cfg_scale=1.0,
|
||||
generator=torch.manual_seed(0),
|
||||
).images[0]
|
||||
image.save("qwen_fewsteps.png")
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
## QwenImagePipeline
|
||||
|
||||
[[autodoc]] QwenImagePipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## QwenImagePipelineOutput
|
||||
|
||||
[[autodoc]] pipelines.qwenimage.pipeline_output.QwenImagePipelineOutput
|
||||
@@ -0,0 +1,367 @@
|
||||
<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License. -->
|
||||
|
||||
<div style="float: right;">
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
<a href="https://huggingface.co/docs/diffusers/main/en/tutorials/using_peft_for_inference" target="_blank" rel="noopener">
|
||||
<img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
|
||||
</a>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
# SkyReels-V2: Infinite-length Film Generative model
|
||||
|
||||
[SkyReels-V2](https://huggingface.co/papers/2504.13074) by the SkyReels Team.
|
||||
|
||||
*Recent advances in video generation have been driven by diffusion models and autoregressive frameworks, yet critical challenges persist in harmonizing prompt adherence, visual quality, motion dynamics, and duration: compromises in motion dynamics to enhance temporal visual quality, constrained video duration (5-10 seconds) to prioritize resolution, and inadequate shot-aware generation stemming from general-purpose MLLMs' inability to interpret cinematic grammar, such as shot composition, actor expressions, and camera motions. These intertwined limitations hinder realistic long-form synthesis and professional film-style generation. To address these limitations, we propose SkyReels-V2, an Infinite-length Film Generative Model, that synergizes Multi-modal Large Language Model (MLLM), Multi-stage Pretraining, Reinforcement Learning, and Diffusion Forcing Framework. Firstly, we design a comprehensive structural representation of video that combines the general descriptions by the Multi-modal LLM and the detailed shot language by sub-expert models. Aided with human annotation, we then train a unified Video Captioner, named SkyCaptioner-V1, to efficiently label the video data. Secondly, we establish progressive-resolution pretraining for the fundamental video generation, followed by a four-stage post-training enhancement: Initial concept-balanced Supervised Fine-Tuning (SFT) improves baseline quality; Motion-specific Reinforcement Learning (RL) training with human-annotated and synthetic distortion data addresses dynamic artifacts; Our diffusion forcing framework with non-decreasing noise schedules enables long-video synthesis in an efficient search space; Final high-quality SFT refines visual fidelity. All the code and models are available at [this https URL](https://github.com/SkyworkAI/SkyReels-V2).*
|
||||
|
||||
You can find all the original SkyReels-V2 checkpoints under the [Skywork](https://huggingface.co/collections/Skywork/skyreels-v2-6801b1b93df627d441d0d0d9) organization.
|
||||
|
||||
The following SkyReels-V2 models are supported in Diffusers:
|
||||
- [SkyReels-V2 DF 1.3B - 540P](https://huggingface.co/Skywork/SkyReels-V2-DF-1.3B-540P-Diffusers)
|
||||
- [SkyReels-V2 DF 14B - 540P](https://huggingface.co/Skywork/SkyReels-V2-DF-14B-540P-Diffusers)
|
||||
- [SkyReels-V2 DF 14B - 720P](https://huggingface.co/Skywork/SkyReels-V2-DF-14B-720P-Diffusers)
|
||||
- [SkyReels-V2 T2V 14B - 540P](https://huggingface.co/Skywork/SkyReels-V2-T2V-14B-540P-Diffusers)
|
||||
- [SkyReels-V2 T2V 14B - 720P](https://huggingface.co/Skywork/SkyReels-V2-T2V-14B-720P-Diffusers)
|
||||
- [SkyReels-V2 I2V 1.3B - 540P](https://huggingface.co/Skywork/SkyReels-V2-I2V-1.3B-540P-Diffusers)
|
||||
- [SkyReels-V2 I2V 14B - 540P](https://huggingface.co/Skywork/SkyReels-V2-I2V-14B-540P-Diffusers)
|
||||
- [SkyReels-V2 I2V 14B - 720P](https://huggingface.co/Skywork/SkyReels-V2-I2V-14B-720P-Diffusers)
|
||||
- [SkyReels-V2 FLF2V 1.3B - 540P](https://huggingface.co/Skywork/SkyReels-V2-FLF2V-1.3B-540P-Diffusers)
|
||||
|
||||
> [!TIP]
|
||||
> Click on the SkyReels-V2 models in the right sidebar for more examples of video generation.
|
||||
|
||||
### A _Visual_ Demonstration
|
||||
|
||||
An example with these parameters:
|
||||
base_num_frames=97, num_frames=97, num_inference_steps=30, ar_step=5, causal_block_size=5
|
||||
|
||||
vae_scale_factor_temporal -> 4
|
||||
num_latent_frames: (97-1)//vae_scale_factor_temporal+1 = 25 frames -> 5 blocks of 5 frames each
|
||||
|
||||
base_num_latent_frames = (97-1)//vae_scale_factor_temporal+1 = 25 → blocks = 25//5 = 5 blocks
|
||||
This 5 blocks means the maximum context length of the model is 25 frames in the latent space.
|
||||
|
||||
Asynchronous Processing Timeline:
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ Steps: 1 6 11 16 21 26 31 36 41 46 50 │
|
||||
│ Block 1: [■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■] │
|
||||
│ Block 2: [■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■] │
|
||||
│ Block 3: [■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■] │
|
||||
│ Block 4: [■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■] │
|
||||
│ Block 5: [■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■] │
|
||||
└─────────────────────────────────────────────────────────────────┘
|
||||
|
||||
For Long Videos (num_frames > base_num_frames):
|
||||
base_num_frames acts as the "sliding window size" for processing long videos.
|
||||
|
||||
Example: 257-frame video with base_num_frames=97, overlap_history=17
|
||||
┌──── Iteration 1 (frames 1-97) ────┐
|
||||
│ Processing window: 97 frames │ → 5 blocks, async processing
|
||||
│ Generates: frames 1-97 │
|
||||
└───────────────────────────────────┘
|
||||
┌────── Iteration 2 (frames 81-177) ──────┐
|
||||
│ Processing window: 97 frames │
|
||||
│ Overlap: 17 frames (81-97) from prev │ → 5 blocks, async processing
|
||||
│ Generates: frames 98-177 │
|
||||
└─────────────────────────────────────────┘
|
||||
┌────── Iteration 3 (frames 161-257) ──────┐
|
||||
│ Processing window: 97 frames │
|
||||
│ Overlap: 17 frames (161-177) from prev │ → 5 blocks, async processing
|
||||
│ Generates: frames 178-257 │
|
||||
└──────────────────────────────────────────┘
|
||||
|
||||
Each iteration independently runs the asynchronous processing with its own 5 blocks.
|
||||
base_num_frames controls:
|
||||
1. Memory usage (larger window = more VRAM)
|
||||
2. Model context length (must match training constraints)
|
||||
3. Number of blocks per iteration (base_num_latent_frames // causal_block_size)
|
||||
|
||||
Each block takes 30 steps to complete denoising.
|
||||
Block N starts at step: 1 + (N-1) x ar_step
|
||||
Total steps: 30 + (5-1) x 5 = 50 steps
|
||||
|
||||
|
||||
Synchronous mode (ar_step=0) would process all blocks/frames simultaneously:
|
||||
┌──────────────────────────────────────────────┐
|
||||
│ Steps: 1 ... 30 │
|
||||
│ All blocks: [■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■] │
|
||||
└──────────────────────────────────────────────┘
|
||||
Total steps: 30 steps
|
||||
|
||||
|
||||
An example on how the step matrix is constructed for asynchronous processing:
|
||||
Given the parameters: (num_inference_steps=30, flow_shift=8, num_frames=97, ar_step=5, causal_block_size=5)
|
||||
- num_latent_frames = (97 frames - 1) // (4 temporal downsampling) + 1 = 25
|
||||
- step_template = [999, 995, 991, 986, 980, 975, 969, 963, 956, 948,
|
||||
941, 932, 922, 912, 901, 888, 874, 859, 841, 822,
|
||||
799, 773, 743, 708, 666, 615, 551, 470, 363, 216]
|
||||
|
||||
The algorithm creates a 50x25 step_matrix where:
|
||||
- Row 1: [999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999]
|
||||
- Row 2: [995, 995, 995, 995, 995, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999]
|
||||
- Row 3: [991, 991, 991, 991, 991, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999]
|
||||
- ...
|
||||
- Row 7: [969, 969, 969, 969, 969, 995, 995, 995, 995, 995, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999]
|
||||
- ...
|
||||
- Row 21: [799, 799, 799, 799, 799, 888, 888, 888, 888, 888, 941, 941, 941, 941, 941, 975, 975, 975, 975, 975, 999, 999, 999, 999, 999]
|
||||
- ...
|
||||
- Row 35: [ 0, 0, 0, 0, 0, 216, 216, 216, 216, 216, 666, 666, 666, 666, 666, 822, 822, 822, 822, 822, 901, 901, 901, 901, 901]
|
||||
- ...
|
||||
- Row 42: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 551, 551, 551, 551, 551, 773, 773, 773, 773, 773]
|
||||
- ...
|
||||
- Row 50: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 216, 216, 216, 216, 216]
|
||||
|
||||
Detailed Row 6 Analysis:
|
||||
- step_matrix[5]: [ 975, 975, 975, 975, 975, 999, 999, 999, 999, 999, 999, ..., 999]
|
||||
- step_index[5]: [ 6, 6, 6, 6, 6, 1, 1, 1, 1, 1, 0, ..., 0]
|
||||
- step_update_mask[5]: [True,True,True,True,True,True,True,True,True,True,False, ...,False]
|
||||
- valid_interval[5]: (0, 25)
|
||||
|
||||
Key Pattern: Block i lags behind Block i-1 by exactly ar_step=5 timesteps, creating the
|
||||
staggered "diffusion forcing" effect where later blocks condition on cleaner earlier blocks.
|
||||
|
||||
### Text-to-Video Generation
|
||||
|
||||
The example below demonstrates how to generate a video from text.
|
||||
|
||||
<hfoptions id="T2V usage">
|
||||
<hfoption id="T2V memory">
|
||||
|
||||
Refer to the [Reduce memory usage](../../optimization/memory) guide for more details about the various memory saving techniques.
|
||||
|
||||
From the original repo:
|
||||
>You can use --ar_step 5 to enable asynchronous inference. When asynchronous inference, --causal_block_size 5 is recommended while it is not supposed to be set for synchronous generation... Asynchronous inference will take more steps to diffuse the whole sequence which means it will be SLOWER than synchronous mode. In our experiments, asynchronous inference may improve the instruction following and visual consistent performance.
|
||||
|
||||
```py
|
||||
# pip install ftfy
|
||||
import torch
|
||||
from diffusers import AutoModel, SkyReelsV2DiffusionForcingPipeline, UniPCMultistepScheduler
|
||||
from diffusers.utils import export_to_video
|
||||
|
||||
vae = AutoModel.from_pretrained("Skywork/SkyReels-V2-DF-14B-540P-Diffusers", subfolder="vae", torch_dtype=torch.float32)
|
||||
transformer = AutoModel.from_pretrained("Skywork/SkyReels-V2-DF-14B-540P-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16)
|
||||
|
||||
pipeline = SkyReelsV2DiffusionForcingPipeline.from_pretrained(
|
||||
"Skywork/SkyReels-V2-DF-14B-540P-Diffusers",
|
||||
vae=vae,
|
||||
transformer=transformer,
|
||||
torch_dtype=torch.bfloat16
|
||||
)
|
||||
flow_shift = 8.0 # 8.0 for T2V, 5.0 for I2V
|
||||
pipeline.scheduler = UniPCMultistepScheduler.from_config(pipeline.scheduler.config, flow_shift=flow_shift)
|
||||
pipeline = pipeline.to("cuda")
|
||||
|
||||
prompt = "A cat and a dog baking a cake together in a kitchen. The cat is carefully measuring flour, while the dog is stirring the batter with a wooden spoon. The kitchen is cozy, with sunlight streaming through the window."
|
||||
|
||||
output = pipeline(
|
||||
prompt=prompt,
|
||||
num_inference_steps=30,
|
||||
height=544, # 720 for 720P
|
||||
width=960, # 1280 for 720P
|
||||
num_frames=97,
|
||||
base_num_frames=97, # 121 for 720P
|
||||
ar_step=5, # Controls asynchronous inference (0 for synchronous mode)
|
||||
causal_block_size=5, # Number of frames in each block for asynchronous processing
|
||||
overlap_history=None, # Number of frames to overlap for smooth transitions in long videos; 17 for long video generations
|
||||
addnoise_condition=20, # Improves consistency in long video generation
|
||||
).frames[0]
|
||||
export_to_video(output, "T2V.mp4", fps=24, quality=8)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
### First-Last-Frame-to-Video Generation
|
||||
|
||||
The example below demonstrates how to use the image-to-video pipeline to generate a video using a text description, a starting frame, and an ending frame.
|
||||
|
||||
<hfoptions id="FLF2V usage">
|
||||
<hfoption id="usage">
|
||||
|
||||
```python
|
||||
import numpy as np
|
||||
import torch
|
||||
import torchvision.transforms.functional as TF
|
||||
from diffusers import AutoencoderKLWan, SkyReelsV2DiffusionForcingImageToVideoPipeline, UniPCMultistepScheduler
|
||||
from diffusers.utils import export_to_video, load_image
|
||||
|
||||
|
||||
model_id = "Skywork/SkyReels-V2-DF-14B-720P-Diffusers"
|
||||
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
|
||||
pipeline = SkyReelsV2DiffusionForcingImageToVideoPipeline.from_pretrained(
|
||||
model_id, vae=vae, torch_dtype=torch.bfloat16
|
||||
)
|
||||
flow_shift = 5.0 # 8.0 for T2V, 5.0 for I2V
|
||||
pipeline.scheduler = UniPCMultistepScheduler.from_config(pipeline.scheduler.config, flow_shift=flow_shift)
|
||||
pipeline.to("cuda")
|
||||
|
||||
first_frame = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_first_frame.png")
|
||||
last_frame = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_last_frame.png")
|
||||
|
||||
def aspect_ratio_resize(image, pipeline, max_area=720 * 1280):
|
||||
aspect_ratio = image.height / image.width
|
||||
mod_value = pipeline.vae_scale_factor_spatial * pipeline.transformer.config.patch_size[1]
|
||||
height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
|
||||
width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
|
||||
image = image.resize((width, height))
|
||||
return image, height, width
|
||||
|
||||
def center_crop_resize(image, height, width):
|
||||
# Calculate resize ratio to match first frame dimensions
|
||||
resize_ratio = max(width / image.width, height / image.height)
|
||||
|
||||
# Resize the image
|
||||
width = round(image.width * resize_ratio)
|
||||
height = round(image.height * resize_ratio)
|
||||
size = [width, height]
|
||||
image = TF.center_crop(image, size)
|
||||
|
||||
return image, height, width
|
||||
|
||||
first_frame, height, width = aspect_ratio_resize(first_frame, pipeline)
|
||||
if last_frame.size != first_frame.size:
|
||||
last_frame, _, _ = center_crop_resize(last_frame, height, width)
|
||||
|
||||
prompt = "CG animation style, a small blue bird takes off from the ground, flapping its wings. The bird's feathers are delicate, with a unique pattern on its chest. The background shows a blue sky with white clouds under bright sunshine. The camera follows the bird upward, capturing its flight and the vastness of the sky from a close-up, low-angle perspective."
|
||||
|
||||
output = pipeline(
|
||||
image=first_frame, last_image=last_frame, prompt=prompt, height=height, width=width, guidance_scale=5.0
|
||||
).frames[0]
|
||||
export_to_video(output, "output.mp4", fps=24, quality=8)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
|
||||
### Video-to-Video Generation
|
||||
|
||||
<hfoptions id="V2V usage">
|
||||
<hfoption id="usage">
|
||||
|
||||
`SkyReelsV2DiffusionForcingVideoToVideoPipeline` extends a given video.
|
||||
|
||||
```python
|
||||
import numpy as np
|
||||
import torch
|
||||
import torchvision.transforms.functional as TF
|
||||
from diffusers import AutoencoderKLWan, SkyReelsV2DiffusionForcingVideoToVideoPipeline, UniPCMultistepScheduler
|
||||
from diffusers.utils import export_to_video, load_video
|
||||
|
||||
|
||||
model_id = "Skywork/SkyReels-V2-DF-14B-540P-Diffusers"
|
||||
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
|
||||
pipeline = SkyReelsV2DiffusionForcingVideoToVideoPipeline.from_pretrained(
|
||||
model_id, vae=vae, torch_dtype=torch.bfloat16
|
||||
)
|
||||
flow_shift = 5.0 # 8.0 for T2V, 5.0 for I2V
|
||||
pipeline.scheduler = UniPCMultistepScheduler.from_config(pipeline.scheduler.config, flow_shift=flow_shift)
|
||||
pipeline.to("cuda")
|
||||
|
||||
video = load_video("input_video.mp4")
|
||||
|
||||
prompt = "CG animation style, a small blue bird takes off from the ground, flapping its wings. The bird's feathers are delicate, with a unique pattern on its chest. The background shows a blue sky with white clouds under bright sunshine. The camera follows the bird upward, capturing its flight and the vastness of the sky from a close-up, low-angle perspective."
|
||||
|
||||
output = pipeline(
|
||||
video=video, prompt=prompt, height=544, width=960, guidance_scale=5.0,
|
||||
num_inference_steps=30, num_frames=257, base_num_frames=97#, ar_step=5, causal_block_size=5,
|
||||
).frames[0]
|
||||
export_to_video(output, "output.mp4", fps=24, quality=8)
|
||||
# Total frames will be the number of frames of given video + 257
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
|
||||
## Notes
|
||||
|
||||
- SkyReels-V2 supports LoRAs with [`~loaders.SkyReelsV2LoraLoaderMixin.load_lora_weights`].
|
||||
|
||||
<details>
|
||||
<summary>Show example code</summary>
|
||||
|
||||
```py
|
||||
# pip install ftfy
|
||||
import torch
|
||||
from diffusers import AutoModel, SkyReelsV2DiffusionForcingPipeline
|
||||
from diffusers.utils import export_to_video
|
||||
|
||||
vae = AutoModel.from_pretrained(
|
||||
"Skywork/SkyReels-V2-DF-1.3B-540P-Diffusers", subfolder="vae", torch_dtype=torch.float32
|
||||
)
|
||||
pipeline = SkyReelsV2DiffusionForcingPipeline.from_pretrained(
|
||||
"Skywork/SkyReels-V2-DF-1.3B-540P-Diffusers", vae=vae, torch_dtype=torch.bfloat16
|
||||
)
|
||||
pipeline.to("cuda")
|
||||
|
||||
pipeline.load_lora_weights("benjamin-paine/steamboat-willie-1.3b", adapter_name="steamboat-willie")
|
||||
pipeline.set_adapters("steamboat-willie")
|
||||
|
||||
pipeline.enable_model_cpu_offload()
|
||||
|
||||
# use "steamboat willie style" to trigger the LoRA
|
||||
prompt = """
|
||||
steamboat willie style, golden era animation, The camera rushes from far to near in a low-angle shot,
|
||||
revealing a white ferret on a log. It plays, leaps into the water, and emerges, as the camera zooms in
|
||||
for a close-up. Water splashes berry bushes nearby, while moss, snow, and leaves blanket the ground.
|
||||
Birch trees and a light blue sky frame the scene, with ferns in the foreground. Side lighting casts dynamic
|
||||
shadows and warm highlights. Medium composition, front view, low angle, with depth of field.
|
||||
"""
|
||||
|
||||
output = pipeline(
|
||||
prompt=prompt,
|
||||
num_frames=97,
|
||||
guidance_scale=6.0,
|
||||
).frames[0]
|
||||
export_to_video(output, "output.mp4", fps=24)
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
|
||||
## SkyReelsV2DiffusionForcingPipeline
|
||||
|
||||
[[autodoc]] SkyReelsV2DiffusionForcingPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## SkyReelsV2DiffusionForcingImageToVideoPipeline
|
||||
|
||||
[[autodoc]] SkyReelsV2DiffusionForcingImageToVideoPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## SkyReelsV2DiffusionForcingVideoToVideoPipeline
|
||||
|
||||
[[autodoc]] SkyReelsV2DiffusionForcingVideoToVideoPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## SkyReelsV2Pipeline
|
||||
|
||||
[[autodoc]] SkyReelsV2Pipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## SkyReelsV2ImageToVideoPipeline
|
||||
|
||||
[[autodoc]] SkyReelsV2ImageToVideoPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## SkyReelsV2PipelineOutput
|
||||
|
||||
[[autodoc]] pipelines.skyreels_v2.pipeline_output.SkyReelsV2PipelineOutput
|
||||
@@ -31,7 +31,7 @@ _As the model is gated, before using it with diffusers you first need to go to t
|
||||
Use the command below to log in:
|
||||
|
||||
```bash
|
||||
huggingface-cli login
|
||||
hf auth login
|
||||
```
|
||||
|
||||
<Tip>
|
||||
|
||||
@@ -29,6 +29,7 @@
|
||||
You can find all the original Wan2.1 checkpoints under the [Wan-AI](https://huggingface.co/Wan-AI) organization.
|
||||
|
||||
The following Wan models are supported in Diffusers:
|
||||
|
||||
- [Wan 2.1 T2V 1.3B](https://huggingface.co/Wan-AI/Wan2.1-T2V-1.3B-Diffusers)
|
||||
- [Wan 2.1 T2V 14B](https://huggingface.co/Wan-AI/Wan2.1-T2V-14B-Diffusers)
|
||||
- [Wan 2.1 I2V 14B - 480P](https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-480P-Diffusers)
|
||||
@@ -36,6 +37,9 @@ The following Wan models are supported in Diffusers:
|
||||
- [Wan 2.1 FLF2V 14B - 720P](https://huggingface.co/Wan-AI/Wan2.1-FLF2V-14B-720P-diffusers)
|
||||
- [Wan 2.1 VACE 1.3B](https://huggingface.co/Wan-AI/Wan2.1-VACE-1.3B-diffusers)
|
||||
- [Wan 2.1 VACE 14B](https://huggingface.co/Wan-AI/Wan2.1-VACE-14B-diffusers)
|
||||
- [Wan 2.2 T2V 14B](https://huggingface.co/Wan-AI/Wan2.2-T2V-A14B-Diffusers)
|
||||
- [Wan 2.2 I2V 14B](https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B-Diffusers)
|
||||
- [Wan 2.2 TI2V 5B](https://huggingface.co/Wan-AI/Wan2.2-TI2V-5B-Diffusers)
|
||||
|
||||
> [!TIP]
|
||||
> Click on the Wan2.1 models in the right sidebar for more examples of video generation.
|
||||
@@ -327,6 +331,8 @@ The general rule of thumb to keep in mind when preparing inputs for the VACE pip
|
||||
|
||||
- Try lower `shift` values (`2.0` to `5.0`) for lower resolution videos and higher `shift` values (`7.0` to `12.0`) for higher resolution images.
|
||||
|
||||
- Wan 2.1 and 2.2 support using [LightX2V LoRAs](https://huggingface.co/Kijai/WanVideo_comfy/tree/main/Lightx2v) to speed up inference. Using them on Wan 2.2 is slightly more involed. Refer to [this code snippet](https://github.com/huggingface/diffusers/pull/12040#issuecomment-3144185272) to learn more.
|
||||
|
||||
## WanPipeline
|
||||
|
||||
[[autodoc]] WanPipeline
|
||||
|
||||
@@ -27,19 +27,19 @@ Learn how to quantize models in the [Quantization](../quantization/overview) gui
|
||||
|
||||
## BitsAndBytesConfig
|
||||
|
||||
[[autodoc]] BitsAndBytesConfig
|
||||
[[autodoc]] quantizers.quantization_config.BitsAndBytesConfig
|
||||
|
||||
## GGUFQuantizationConfig
|
||||
|
||||
[[autodoc]] GGUFQuantizationConfig
|
||||
[[autodoc]] quantizers.quantization_config.GGUFQuantizationConfig
|
||||
|
||||
## QuantoConfig
|
||||
|
||||
[[autodoc]] QuantoConfig
|
||||
[[autodoc]] quantizers.quantization_config.QuantoConfig
|
||||
|
||||
## TorchAoConfig
|
||||
|
||||
[[autodoc]] TorchAoConfig
|
||||
[[autodoc]] quantizers.quantization_config.TorchAoConfig
|
||||
|
||||
## DiffusersQuantizer
|
||||
|
||||
|
||||
+13
-26
@@ -12,37 +12,24 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
<p align="center">
|
||||
<br>
|
||||
<img src="https://raw.githubusercontent.com/huggingface/diffusers/77aadfee6a891ab9fcfb780f87c693f7a5beeb8e/docs/source/imgs/diffusers_library.jpg" width="400"/>
|
||||
<img src="https://raw.githubusercontent.com/huggingface/diffusers/77aadfee6a891ab9fcfb780f87c693f7a5beeb8e/docs/source/imgs/diffusers_library.jpg" width="400" style="border: none;"/>
|
||||
<br>
|
||||
</p>
|
||||
|
||||
# Diffusers
|
||||
|
||||
🤗 Diffusers is the go-to library for state-of-the-art pretrained diffusion models for generating images, audio, and even 3D structures of molecules. Whether you're looking for a simple inference solution or want to train your own diffusion model, 🤗 Diffusers is a modular toolbox that supports both. Our library is designed with a focus on [usability over performance](conceptual/philosophy#usability-over-performance), [simple over easy](conceptual/philosophy#simple-over-easy), and [customizability over abstractions](conceptual/philosophy#tweakable-contributorfriendly-over-abstraction).
|
||||
Diffusers is a library of state-of-the-art pretrained diffusion models for generating videos, images, and audio.
|
||||
|
||||
The library has three main components:
|
||||
The library revolves around the [`DiffusionPipeline`], an API designed for:
|
||||
|
||||
- State-of-the-art diffusion pipelines for inference with just a few lines of code. There are many pipelines in 🤗 Diffusers, check out the table in the pipeline [overview](api/pipelines/overview) for a complete list of available pipelines and the task they solve.
|
||||
- Interchangeable [noise schedulers](api/schedulers/overview) for balancing trade-offs between generation speed and quality.
|
||||
- Pretrained [models](api/models) that can be used as building blocks, and combined with schedulers, for creating your own end-to-end diffusion systems.
|
||||
- easy inference with only a few lines of code
|
||||
- flexibility to mix-and-match pipeline components (models, schedulers)
|
||||
- loading and using adapters like LoRA
|
||||
|
||||
<div class="mt-10">
|
||||
<div class="w-full flex flex-col space-y-4 md:space-y-0 md:grid md:grid-cols-2 md:gap-y-4 md:gap-x-5">
|
||||
<a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="./tutorials/tutorial_overview"
|
||||
><div class="w-full text-center bg-gradient-to-br from-blue-400 to-blue-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">Tutorials</div>
|
||||
<p class="text-gray-700">Learn the fundamental skills you need to start generating outputs, build your own diffusion system, and train a diffusion model. We recommend starting here if you're using 🤗 Diffusers for the first time!</p>
|
||||
</a>
|
||||
<a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="./using-diffusers/loading_overview"
|
||||
><div class="w-full text-center bg-gradient-to-br from-indigo-400 to-indigo-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">How-to guides</div>
|
||||
<p class="text-gray-700">Practical guides for helping you load pipelines, models, and schedulers. You'll also learn how to use pipelines for specific tasks, control how outputs are generated, optimize for inference speed, and different training techniques.</p>
|
||||
</a>
|
||||
<a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="./conceptual/philosophy"
|
||||
><div class="w-full text-center bg-gradient-to-br from-pink-400 to-pink-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">Conceptual guides</div>
|
||||
<p class="text-gray-700">Understand why the library was designed the way it was, and learn more about the ethical guidelines and safety implementations for using the library.</p>
|
||||
</a>
|
||||
<a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="./api/models/overview"
|
||||
><div class="w-full text-center bg-gradient-to-br from-purple-400 to-purple-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">Reference</div>
|
||||
<p class="text-gray-700">Technical descriptions of how 🤗 Diffusers classes and methods work.</p>
|
||||
</a>
|
||||
</div>
|
||||
</div>
|
||||
Diffusers also comes with optimizations - such as offloading and quantization - to ensure even the largest models are accessible on memory-constrained devices. If memory is not an issue, Diffusers supports torch.compile to boost inference speed.
|
||||
|
||||
Get started right away with a Diffusers model on the [Hub](https://huggingface.co/models?library=diffusers&sort=trending) today!
|
||||
|
||||
## Learn
|
||||
|
||||
If you're a beginner, we recommend starting with the [Hugging Face Diffusion Models Course](https://huggingface.co/learn/diffusion-course/unit0/1). You'll learn the theory behind diffusion models, and learn how to use the Diffusers library to generate images, fine-tune your own models, and more.
|
||||
|
||||
+75
-102
@@ -12,183 +12,156 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# Installation
|
||||
|
||||
🤗 Diffusers is tested on Python 3.8+, PyTorch 1.7.0+, and Flax. Follow the installation instructions below for the deep learning library you are using:
|
||||
Diffusers is tested on Python 3.8+, PyTorch 1.4+, and Flax 0.4.1+. Follow the installation instructions for the deep learning library you're using, [PyTorch](https://pytorch.org/get-started/locally/) or [Flax](https://flax.readthedocs.io/en/latest/).
|
||||
|
||||
- [PyTorch](https://pytorch.org/get-started/locally/) installation instructions
|
||||
- [Flax](https://flax.readthedocs.io/en/latest/) installation instructions
|
||||
|
||||
## Install with pip
|
||||
|
||||
You should install 🤗 Diffusers in a [virtual environment](https://docs.python.org/3/library/venv.html).
|
||||
If you're unfamiliar with Python virtual environments, take a look at this [guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).
|
||||
A virtual environment makes it easier to manage different projects and avoid compatibility issues between dependencies.
|
||||
|
||||
Create a virtual environment with Python or [uv](https://docs.astral.sh/uv/) (refer to [Installation](https://docs.astral.sh/uv/getting-started/installation/) for installation instructions), a fast Rust-based Python package and project manager.
|
||||
|
||||
<hfoptions id="install">
|
||||
<hfoption id="uv">
|
||||
Create a [virtual environment](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/) for easier management of separate projects and to avoid compatibility issues between dependencies. Use [uv](https://docs.astral.sh/uv/), a Rust-based Python package and project manager, to create a virtual environment and install Diffusers.
|
||||
|
||||
```bash
|
||||
uv venv my-env
|
||||
source my-env/bin/activate
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Python">
|
||||
Install Diffusers with one of the following methods.
|
||||
|
||||
<hfoptions id="install">
|
||||
<hfoption id="pip">
|
||||
|
||||
PyTorch only supports Python 3.8 - 3.11 on Windows.
|
||||
|
||||
```bash
|
||||
python -m venv my-env
|
||||
source my-env/bin/activate
|
||||
uv pip install diffusers["torch"] transformers
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
You should also install 🤗 Transformers because 🤗 Diffusers relies on its models.
|
||||
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
|
||||
PyTorch only supports Python 3.8 - 3.11 on Windows. Install Diffusers with uv.
|
||||
|
||||
```bash
|
||||
uv install diffusers["torch"] transformers
|
||||
```
|
||||
|
||||
You can also install Diffusers with pip.
|
||||
|
||||
```bash
|
||||
pip install diffusers["torch"] transformers
|
||||
```
|
||||
|
||||
</pt>
|
||||
<jax>
|
||||
|
||||
Install Diffusers with uv.
|
||||
Use the command below for Flax.
|
||||
|
||||
```bash
|
||||
uv pip install diffusers["flax"] transformers
|
||||
```
|
||||
|
||||
You can also install Diffusers with pip.
|
||||
|
||||
```bash
|
||||
pip install diffusers["flax"] transformers
|
||||
```
|
||||
|
||||
</jax>
|
||||
</frameworkcontent>
|
||||
|
||||
## Install with conda
|
||||
|
||||
After activating your virtual environment, with `conda` (maintained by the community):
|
||||
</hfoption>
|
||||
<hfoption id="conda">
|
||||
|
||||
```bash
|
||||
conda install -c conda-forge diffusers
|
||||
```
|
||||
|
||||
## Install from source
|
||||
</hfoption>
|
||||
<hfoption id="source">
|
||||
|
||||
Before installing 🤗 Diffusers from source, make sure you have PyTorch and 🤗 Accelerate installed.
|
||||
A source install installs the `main` version instead of the latest `stable` version. The `main` version is useful for staying updated with the latest changes but it may not always be stable. If you run into a problem, open an [Issue](https://github.com/huggingface/diffusers/issues/new/choose) and we will try to resolve it as soon as possible.
|
||||
|
||||
To install 🤗 Accelerate:
|
||||
Make sure [Accelerate](https://huggingface.co/docs/accelerate/index) is installed.
|
||||
|
||||
```bash
|
||||
pip install accelerate
|
||||
uv pip install accelerate
|
||||
```
|
||||
|
||||
Then install 🤗 Diffusers from source:
|
||||
Install Diffusers from source with the command below.
|
||||
|
||||
```bash
|
||||
pip install git+https://github.com/huggingface/diffusers
|
||||
uv pip install git+https://github.com/huggingface/diffusers
|
||||
```
|
||||
|
||||
This command installs the bleeding edge `main` version rather than the latest `stable` version.
|
||||
The `main` version is useful for staying up-to-date with the latest developments.
|
||||
For instance, if a bug has been fixed since the last official release but a new release hasn't been rolled out yet.
|
||||
However, this means the `main` version may not always be stable.
|
||||
We strive to keep the `main` version operational, and most issues are usually resolved within a few hours or a day.
|
||||
If you run into a problem, please open an [Issue](https://github.com/huggingface/diffusers/issues/new/choose) so we can fix it even sooner!
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## Editable install
|
||||
|
||||
You will need an editable install if you'd like to:
|
||||
An editable install is recommended for development workflows or if you're using the `main` version of the source code. A special link is created between the cloned repository and the Python library paths. This avoids reinstalling a package after every change.
|
||||
|
||||
* Use the `main` version of the source code.
|
||||
* Contribute to 🤗 Diffusers and need to test changes in the code.
|
||||
Clone the repository and install Diffusers with the following commands.
|
||||
|
||||
Clone the repository and install 🤗 Diffusers with the following commands:
|
||||
<hfoptions id="editable">
|
||||
<hfoption id="PyTorch">
|
||||
|
||||
```bash
|
||||
git clone https://github.com/huggingface/diffusers.git
|
||||
cd diffusers
|
||||
uv pip install -e ".[torch]"
|
||||
```
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
</hfoption>
|
||||
<hfoption id="Flax">
|
||||
|
||||
```bash
|
||||
pip install -e ".[torch]"
|
||||
git clone https://github.com/huggingface/diffusers.git
|
||||
cd diffusers
|
||||
uv pip install -e ".[flax]"
|
||||
```
|
||||
</pt>
|
||||
<jax>
|
||||
```bash
|
||||
pip install -e ".[flax]"
|
||||
```
|
||||
</jax>
|
||||
</frameworkcontent>
|
||||
|
||||
These commands will link the folder you cloned the repository to and your Python library paths.
|
||||
Python will now look inside the folder you cloned to in addition to the normal library paths.
|
||||
For example, if your Python packages are typically installed in `~/anaconda3/envs/main/lib/python3.10/site-packages/`, Python will also search the `~/diffusers/` folder you cloned to.
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
<Tip warning={true}>
|
||||
> [!WARNING]
|
||||
> You must keep the `diffusers` folder if you want to keep using the library with the editable install.
|
||||
|
||||
You must keep the `diffusers` folder if you want to keep using the library.
|
||||
|
||||
</Tip>
|
||||
|
||||
Now you can easily update your clone to the latest version of 🤗 Diffusers with the following command:
|
||||
Update your cloned repository to the latest version of Diffusers with the command below.
|
||||
|
||||
```bash
|
||||
cd ~/diffusers/
|
||||
git pull
|
||||
```
|
||||
|
||||
Your Python environment will find the `main` version of 🤗 Diffusers on the next run.
|
||||
|
||||
## Cache
|
||||
|
||||
Model weights and files are downloaded from the Hub to a cache which is usually your home directory. You can change the cache location by specifying the `HF_HOME` or `HUGGINFACE_HUB_CACHE` environment variables or configuring the `cache_dir` parameter in methods like [`~DiffusionPipeline.from_pretrained`].
|
||||
Model weights and files are downloaded from the Hub to a cache, which is usually your home directory. Change the cache location with the [HF_HOME](https://huggingface.co/docs/huggingface_hub/package_reference/environment_variables#hfhome) or [HF_HUB_CACHE](https://huggingface.co/docs/huggingface_hub/package_reference/environment_variables#hfhubcache) environment variables or configuring the `cache_dir` parameter in methods like [`~DiffusionPipeline.from_pretrained`].
|
||||
|
||||
Cached files allow you to run 🤗 Diffusers offline. To prevent 🤗 Diffusers from connecting to the internet, set the `HF_HUB_OFFLINE` environment variable to `1` and 🤗 Diffusers will only load previously downloaded files in the cache.
|
||||
<hfoptions id="cache">
|
||||
<hfoption id="env variable">
|
||||
|
||||
```bash
|
||||
export HF_HOME="/path/to/your/cache"
|
||||
export HF_HUB_CACHE="/path/to/your/hub/cache"
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="from_pretrained">
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"black-forest-labs/FLUX.1-dev",
|
||||
cache_dir="/path/to/your/cache"
|
||||
)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
Cached files allow you to use Diffusers offline. Set the [HF_HUB_OFFLINE](https://huggingface.co/docs/huggingface_hub/package_reference/environment_variables#hfhuboffline) environment variable to `1` to prevent Diffusers from connecting to the internet.
|
||||
|
||||
```shell
|
||||
export HF_HUB_OFFLINE=1
|
||||
```
|
||||
|
||||
For more details about managing and cleaning the cache, take a look at the [caching](https://huggingface.co/docs/huggingface_hub/guides/manage-cache) guide.
|
||||
For more details about managing and cleaning the cache, take a look at the [Understand caching](https://huggingface.co/docs/huggingface_hub/guides/manage-cache) guide.
|
||||
|
||||
## Telemetry logging
|
||||
|
||||
Our library gathers telemetry information during [`~DiffusionPipeline.from_pretrained`] requests.
|
||||
The data gathered includes the version of 🤗 Diffusers and PyTorch/Flax, the requested model or pipeline class,
|
||||
and the path to a pretrained checkpoint if it is hosted on the Hugging Face Hub.
|
||||
Diffusers gathers telemetry information during [`~DiffusionPipeline.from_pretrained`] requests.
|
||||
The data gathered includes the Diffusers and PyTorch/Flax version, the requested model or pipeline class,
|
||||
and the path to a pretrained checkpoint if it is hosted on the Hub.
|
||||
|
||||
This usage data helps us debug issues and prioritize new features.
|
||||
Telemetry is only sent when loading models and pipelines from the Hub,
|
||||
and it is not collected if you're loading local files.
|
||||
|
||||
We understand that not everyone wants to share additional information,and we respect your privacy.
|
||||
You can disable telemetry collection by setting the `HF_HUB_DISABLE_TELEMETRY` environment variable from your terminal:
|
||||
Opt-out and disable telemetry collection with the [HF_HUB_DISABLE_TELEMETRY](https://huggingface.co/docs/huggingface_hub/package_reference/environment_variables#hfhubdisabletelemetry) environment variable.
|
||||
|
||||
On Linux/MacOS:
|
||||
<hfoptions id="telemetry">
|
||||
<hfoption id="Linux/macOS">
|
||||
|
||||
```bash
|
||||
export HF_HUB_DISABLE_TELEMETRY=1
|
||||
```
|
||||
|
||||
On Windows:
|
||||
</hfoption>
|
||||
<hfoption id="Windows">
|
||||
|
||||
```bash
|
||||
set HF_HUB_DISABLE_TELEMETRY=1
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
@@ -239,6 +239,12 @@ The `step()` function is [called](https://github.com/huggingface/diffusers/blob/
|
||||
|
||||
In general, the `sigmas` should [stay on the CPU](https://github.com/huggingface/diffusers/blob/35a969d297cba69110d175ee79c59312b9f49e1e/src/diffusers/schedulers/scheduling_euler_discrete.py#L240) to avoid the communication sync and latency.
|
||||
|
||||
<Tip>
|
||||
|
||||
Refer to the [torch.compile and Diffusers: A Hands-On Guide to Peak Performance](https://pytorch.org/blog/torch-compile-and-diffusers-a-hands-on-guide-to-peak-performance/) blog post for maximizing performance with `torch.compile` for diffusion models.
|
||||
|
||||
</Tip>
|
||||
|
||||
### Benchmarks
|
||||
|
||||
Refer to the [diffusers/benchmarks](https://huggingface.co/datasets/diffusers/benchmarks) dataset to see inference latency and memory usage data for compiled pipelines.
|
||||
@@ -298,4 +304,6 @@ pipeline.fuse_qkv_projections()
|
||||
|
||||
- Read the [Presenting Flux Fast: Making Flux go brrr on H100s](https://pytorch.org/blog/presenting-flux-fast-making-flux-go-brrr-on-h100s/) blog post to learn more about how you can combine all of these optimizations with [TorchInductor](https://docs.pytorch.org/docs/stable/torch.compiler.html) and [AOTInductor](https://docs.pytorch.org/docs/stable/torch.compiler_aot_inductor.html) for a ~2.5x speedup using recipes from [flux-fast](https://github.com/huggingface/flux-fast).
|
||||
|
||||
These recipes support AMD hardware and [Flux.1 Kontext Dev](https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev).
|
||||
These recipes support AMD hardware and [Flux.1 Kontext Dev](https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev).
|
||||
- Read the [torch.compile and Diffusers: A Hands-On Guide to Peak Performance](https://pytorch.org/blog/torch-compile-and-diffusers-a-hands-on-guide-to-peak-performance/) blog post
|
||||
to maximize performance when using `torch.compile`.
|
||||
@@ -53,6 +53,16 @@ image = pipe(prompt, generator=torch.manual_seed(0)).images[0]
|
||||
image.save("flux-gguf.png")
|
||||
```
|
||||
|
||||
## Using Optimized CUDA Kernels with GGUF
|
||||
|
||||
Optimized CUDA kernels can accelerate GGUF quantized model inference by approximately 10%. This functionality requires a compatible GPU with `torch.cuda.get_device_capability` greater than 7 and the kernels library:
|
||||
|
||||
```shell
|
||||
pip install -U kernels
|
||||
```
|
||||
|
||||
Once installed, set `DIFFUSERS_GGUF_CUDA_KERNELS=true` to use optimized kernels when available. Note that CUDA kernels may introduce minor numerical differences compared to the original GGUF implementation, potentially causing subtle visual variations in generated images. To disable CUDA kernel usage, set the environment variable `DIFFUSERS_GGUF_CUDA_KERNELS=false`.
|
||||
|
||||
## Supported Quantization Types
|
||||
|
||||
- BF16
|
||||
@@ -67,3 +77,44 @@ image.save("flux-gguf.png")
|
||||
- Q5_K
|
||||
- Q6_K
|
||||
|
||||
## Convert to GGUF
|
||||
|
||||
Use the Space below to convert a Diffusers checkpoint into the GGUF format for inference.
|
||||
run conversion:
|
||||
|
||||
<iframe
|
||||
src="https://diffusers-internal-dev-diffusers-to-gguf.hf.space"
|
||||
frameborder="0"
|
||||
width="850"
|
||||
height="450"
|
||||
></iframe>
|
||||
|
||||
|
||||
```py
|
||||
import torch
|
||||
|
||||
from diffusers import FluxPipeline, FluxTransformer2DModel, GGUFQuantizationConfig
|
||||
|
||||
ckpt_path = (
|
||||
"https://huggingface.co/sayakpaul/different-lora-from-civitai/blob/main/flux_dev_diffusers-q4_0.gguf"
|
||||
)
|
||||
transformer = FluxTransformer2DModel.from_single_file(
|
||||
ckpt_path,
|
||||
quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
|
||||
config="black-forest-labs/FLUX.1-dev",
|
||||
subfolder="transformer",
|
||||
torch_dtype=torch.bfloat16,
|
||||
)
|
||||
pipe = FluxPipeline.from_pretrained(
|
||||
"black-forest-labs/FLUX.1-dev",
|
||||
transformer=transformer,
|
||||
torch_dtype=torch.bfloat16,
|
||||
)
|
||||
pipe.enable_model_cpu_offload()
|
||||
prompt = "A cat holding a sign that says hello world"
|
||||
image = pipe(prompt, generator=torch.manual_seed(0)).images[0]
|
||||
image.save("flux-gguf.png")
|
||||
```
|
||||
|
||||
When using Diffusers format GGUF checkpoints, it's a must to provide the model `config` path. If the
|
||||
model config resides in a `subfolder`, that needs to be specified, too.
|
||||
@@ -11,7 +11,7 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
-->
|
||||
|
||||
# Quantization
|
||||
# Getting started
|
||||
|
||||
Quantization focuses on representing data with fewer bits while also trying to preserve the precision of the original data. This often means converting a data type to represent the same information with fewer bits. For example, if your model weights are stored as 32-bit floating points and they're quantized to 16-bit floating points, this halves the model size which makes it easier to store and reduces memory usage. Lower precision can also speedup inference because it takes less time to perform calculations with fewer bits.
|
||||
|
||||
@@ -19,19 +19,25 @@ Diffusers supports multiple quantization backends to make large diffusion models
|
||||
|
||||
## Pipeline-level quantization
|
||||
|
||||
There are two ways you can use [`~quantizers.PipelineQuantizationConfig`] depending on the level of control you want over the quantization specifications of each model in the pipeline.
|
||||
There are two ways to use [`~quantizers.PipelineQuantizationConfig`] depending on how much customization you want to apply to the quantization configuration.
|
||||
|
||||
- for more basic and simple use cases, you only need to define the `quant_backend`, `quant_kwargs`, and `components_to_quantize`
|
||||
- for more granular quantization control, provide a `quant_mapping` that provides the quantization specifications for the individual model components
|
||||
- for basic use cases, define the `quant_backend`, `quant_kwargs`, and `components_to_quantize` arguments
|
||||
- for granular quantization control, define a `quant_mapping` that provides the quantization configuration for individual model components
|
||||
|
||||
### Simple quantization
|
||||
### Basic quantization
|
||||
|
||||
Initialize [`~quantizers.PipelineQuantizationConfig`] with the following parameters.
|
||||
|
||||
- `quant_backend` specifies which quantization backend to use. Currently supported backends include: `bitsandbytes_4bit`, `bitsandbytes_8bit`, `gguf`, `quanto`, and `torchao`.
|
||||
- `quant_kwargs` contains the specific quantization arguments to use.
|
||||
- `quant_kwargs` specifies the quantization arguments to use.
|
||||
|
||||
> [!TIP]
|
||||
> These `quant_kwargs` arguments are different for each backend. Refer to the [Quantization API](../api/quantization) docs to view the arguments for each backend.
|
||||
|
||||
- `components_to_quantize` specifies which components of the pipeline to quantize. Typically, you should quantize the most compute intensive components like the transformer. The text encoder is another component to consider quantizing if a pipeline has more than one such as [`FluxPipeline`]. The example below quantizes the T5 text encoder in [`FluxPipeline`] while keeping the CLIP model intact.
|
||||
|
||||
The example below loads the bitsandbytes backend with the following arguments from [`~quantizers.quantization_config.BitsAndBytesConfig`], `load_in_4bit`, `bnb_4bit_quant_type`, and `bnb_4bit_compute_dtype`.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline
|
||||
@@ -56,13 +62,13 @@ pipe = DiffusionPipeline.from_pretrained(
|
||||
image = pipe("photo of a cute dog").images[0]
|
||||
```
|
||||
|
||||
### quant_mapping
|
||||
### Advanced quantization
|
||||
|
||||
The `quant_mapping` argument provides more flexible options for how to quantize each individual component in a pipeline, like combining different quantization backends.
|
||||
The `quant_mapping` argument provides more options for how to quantize each individual component in a pipeline, like combining different quantization backends.
|
||||
|
||||
Initialize [`~quantizers.PipelineQuantizationConfig`] and pass a `quant_mapping` to it. The `quant_mapping` allows you to specify the quantization options for each component in the pipeline such as the transformer and text encoder.
|
||||
|
||||
The example below uses two quantization backends, [`~quantizers.QuantoConfig`] and [`transformers.BitsAndBytesConfig`], for the transformer and text encoder.
|
||||
The example below uses two quantization backends, [`~quantizers.quantization_config.QuantoConfig`] and [`transformers.BitsAndBytesConfig`], for the transformer and text encoder.
|
||||
|
||||
```py
|
||||
import torch
|
||||
@@ -85,7 +91,7 @@ pipeline_quant_config = PipelineQuantizationConfig(
|
||||
There is a separate bitsandbytes backend in [Transformers](https://huggingface.co/docs/transformers/main_classes/quantization#transformers.BitsAndBytesConfig). You need to import and use [`transformers.BitsAndBytesConfig`] for components that come from Transformers. For example, `text_encoder_2` in [`FluxPipeline`] is a [`~transformers.T5EncoderModel`] from Transformers so you need to use [`transformers.BitsAndBytesConfig`] instead of [`diffusers.BitsAndBytesConfig`].
|
||||
|
||||
> [!TIP]
|
||||
> Use the [simple quantization](#simple-quantization) method above if you don't want to manage these distinct imports or aren't sure where each pipeline component comes from.
|
||||
> Use the [basic quantization](#basic-quantization) method above if you don't want to manage these distinct imports or aren't sure where each pipeline component comes from.
|
||||
|
||||
```py
|
||||
import torch
|
||||
@@ -129,4 +135,4 @@ Check out the resources below to learn more about quantization.
|
||||
|
||||
- The Transformers quantization [Overview](https://huggingface.co/docs/transformers/quantization/overview#when-to-use-what) provides an overview of the pros and cons of different quantization backends.
|
||||
|
||||
- Read the [Exploring Quantization Backends in Diffusers](https://huggingface.co/blog/diffusers-quantization) blog post for a brief introduction to each quantization backend, how to choose a backend, and combining quantization with other memory optimizations.
|
||||
- Read the [Exploring Quantization Backends in Diffusers](https://huggingface.co/blog/diffusers-quantization) blog post for a brief introduction to each quantization backend, how to choose a backend, and combining quantization with other memory optimizations.
|
||||
|
||||
@@ -145,10 +145,10 @@ When running `accelerate config`, if you use torch.compile, there can be dramati
|
||||
If you would like to push your model to the Hub after training is completed with a neat model card, make sure you're logged in:
|
||||
|
||||
```bash
|
||||
huggingface-cli login
|
||||
hf auth login
|
||||
|
||||
# Alternatively, you could upload your model manually using:
|
||||
# huggingface-cli upload my-cool-account-name/my-cool-lora-name /path/to/awesome/lora
|
||||
# hf upload my-cool-account-name/my-cool-lora-name /path/to/awesome/lora
|
||||
```
|
||||
|
||||
Make sure your data is prepared as described in [Data Preparation](#data-preparation). When ready, you can begin training!
|
||||
|
||||
@@ -67,7 +67,7 @@ dataset = load_dataset(
|
||||
Then use the [`~datasets.Dataset.push_to_hub`] method to upload the dataset to the Hub:
|
||||
|
||||
```python
|
||||
# assuming you have ran the huggingface-cli login command in a terminal
|
||||
# assuming you have ran the hf auth login command in a terminal
|
||||
dataset.push_to_hub("name_of_your_dataset")
|
||||
|
||||
# if you want to push to a private repo, simply pass private=True:
|
||||
|
||||
@@ -42,7 +42,7 @@ We encourage you to share your model with the community, and in order to do that
|
||||
Or login in from the terminal:
|
||||
|
||||
```bash
|
||||
huggingface-cli login
|
||||
hf auth login
|
||||
```
|
||||
|
||||
Since the model checkpoints are quite large, install [Git-LFS](https://git-lfs.com/) to version these large files:
|
||||
|
||||
@@ -1,23 +0,0 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Overview
|
||||
|
||||
Welcome to 🧨 Diffusers! If you're new to diffusion models and generative AI, and want to learn more, then you've come to the right place. These beginner-friendly tutorials are designed to provide a gentle introduction to diffusion models and help you understand the library fundamentals - the core components and how 🧨 Diffusers is meant to be used.
|
||||
|
||||
You'll learn how to use a pipeline for inference to rapidly generate things, and then deconstruct that pipeline to really understand how to use the library as a modular toolbox for building your own diffusion systems. In the next lesson, you'll learn how to train your own diffusion model to generate what you want.
|
||||
|
||||
After completing the tutorials, you'll have gained the necessary skills to start exploring the library on your own and see how to use it for your own projects and applications.
|
||||
|
||||
Feel free to join our community on [Discord](https://discord.com/invite/JfAtkvEtRb) or the [forums](https://discuss.huggingface.co/c/discussion-related-to-httpsgithubcomhuggingfacediffusers/63) to connect and collaborate with other users and developers!
|
||||
|
||||
Let's start diffusing! 🧨
|
||||
@@ -319,6 +319,19 @@ If you expect to varied resolutions during inference with this feature, then mak
|
||||
|
||||
There are still scenarios where recompulation is unavoidable, such as when the hotswapped LoRA targets more layers than the initial adapter. Try to load the LoRA that targets the most layers *first*. For more details about this limitation, refer to the PEFT [hotswapping](https://huggingface.co/docs/peft/main/en/package_reference/hotswap#peft.utils.hotswap.hotswap_adapter) docs.
|
||||
|
||||
<details>
|
||||
<summary>Technical details of hotswapping</summary>
|
||||
|
||||
The [`~loaders.lora_base.LoraBaseMixin.enable_lora_hotswap`] method converts the LoRA scaling factor from floats to torch.tensors and pads the shape of the weights to the largest required shape to avoid reassigning the whole attribute when the data in the weights are replaced.
|
||||
|
||||
This is why the `max_rank` argument is important. The results are unchanged even when the values are padded with zeros. Computation may be slower though depending on the padding size.
|
||||
|
||||
Since no new LoRA attributes are added, each subsequent LoRA is only allowed to target the same layers, or subset of layers, the first LoRA targets. Choosing the LoRA loading order is important because if the LoRAs target disjoint layers, you may end up creating a dummy LoRA that targets the union of all target layers.
|
||||
|
||||
For more implementation details, take a look at the [`hotswap.py`](https://github.com/huggingface/peft/blob/92d65cafa51c829484ad3d95cf71d09de57ff066/src/peft/utils/hotswap.py) file.
|
||||
|
||||
</details>
|
||||
|
||||
## Merge
|
||||
|
||||
The weights from each LoRA can be merged together to produce a blend of multiple existing styles. There are several methods for merging LoRAs, each of which differ in *how* the weights are merged (may affect generation quality).
|
||||
@@ -673,4 +686,6 @@ Browse the [LoRA Studio](https://lorastudio.co/models) for different LoRAs to us
|
||||
height="450"
|
||||
></iframe>
|
||||
|
||||
You can find additional LoRAs in the [FLUX LoRA the Explorer](https://huggingface.co/spaces/multimodalart/flux-lora-the-explorer) and [LoRA the Explorer](https://huggingface.co/spaces/multimodalart/LoraTheExplorer) Spaces.
|
||||
You can find additional LoRAs in the [FLUX LoRA the Explorer](https://huggingface.co/spaces/multimodalart/flux-lora-the-explorer) and [LoRA the Explorer](https://huggingface.co/spaces/multimodalart/LoraTheExplorer) Spaces.
|
||||
|
||||
Check out the [Fast LoRA inference for Flux with Diffusers and PEFT](https://huggingface.co/blog/lora-fast) blog post to learn how to optimize LoRA inference with methods like FlashAttention-3 and fp8 quantization.
|
||||
@@ -1,18 +0,0 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Overview
|
||||
|
||||
The inference pipeline supports and enables a wide range of techniques that are divided into two categories:
|
||||
|
||||
* Pipeline functionality: these techniques modify the pipeline or extend it for other applications. For example, pipeline callbacks add new features to a pipeline and a pipeline can also be extended for distributed inference.
|
||||
* Improve inference quality: these techniques increase the visual quality of the generated images. For example, you can enhance your prompts with GPT2 to create better images with lower effort.
|
||||
@@ -37,7 +37,7 @@ Diffusers는 Stable Diffusion 추론을 위해 PyTorch `mps`를 사용해 Apple
|
||||
|
||||
|
||||
```python
|
||||
# `huggingface-cli login`에 로그인되어 있음을 확인
|
||||
# `hf auth login`에 로그인되어 있음을 확인
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipe = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
|
||||
|
||||
@@ -75,7 +75,7 @@ dataset = load_dataset(
|
||||
[push_to_hub(https://huggingface.co/docs/datasets/v2.13.1/en/package_reference/main_classes#datasets.Dataset.push_to_hub) 을 사용해서 Hub에 데이터셋을 업로드 합니다:
|
||||
|
||||
```python
|
||||
# 터미널에서 huggingface-cli login 커맨드를 이미 실행했다고 가정합니다
|
||||
# 터미널에서 hf auth login 커맨드를 이미 실행했다고 가정합니다
|
||||
dataset.push_to_hub("name_of_your_dataset")
|
||||
|
||||
# 개인 repo로 push 하고 싶다면, `private=True` 을 추가하세요:
|
||||
|
||||
@@ -39,7 +39,7 @@ specific language governing permissions and limitations under the License.
|
||||
모델을 저장하거나 커뮤니티와 공유하려면 Hugging Face 계정에 로그인하세요(아직 계정이 없는 경우 [생성](https://huggingface.co/join)하세요):
|
||||
|
||||
```bash
|
||||
huggingface-cli login
|
||||
hf auth login
|
||||
```
|
||||
|
||||
## Text-to-image
|
||||
|
||||
@@ -42,7 +42,7 @@ Unconditional 이미지 생성은 학습에 사용된 데이터셋과 유사한
|
||||
또는 터미널로 로그인할 수 있습니다:
|
||||
|
||||
```bash
|
||||
huggingface-cli login
|
||||
hf auth login
|
||||
```
|
||||
|
||||
모델 체크포인트가 상당히 크기 때문에 [Git-LFS](https://git-lfs.com/)에서 대용량 파일의 버전 관리를 할 수 있습니다.
|
||||
|
||||
@@ -42,7 +42,7 @@ Stable Diffusion 모델들은 학습 및 저장된 프레임워크와 다운로
|
||||
시작하기 전에 스크립트를 실행할 🤗 Diffusers의 로컬 클론(clone)이 있는지 확인하고 Hugging Face 계정에 로그인하여 pull request를 열고 변환된 모델을 허브에 푸시할 수 있도록 하세요.
|
||||
|
||||
```bash
|
||||
huggingface-cli login
|
||||
hf auth login
|
||||
```
|
||||
|
||||
스크립트를 사용하려면:
|
||||
|
||||
@@ -69,7 +69,7 @@ Note also that we use PEFT library as backend for LoRA training, make sure to ha
|
||||
|
||||
Lastly, we recommend logging into your HF account so that your trained LoRA is automatically uploaded to the hub:
|
||||
```bash
|
||||
huggingface-cli login
|
||||
hf auth login
|
||||
```
|
||||
This command will prompt you for a token. Copy-paste yours from your [settings/tokens](https://huggingface.co/settings/tokens),and press Enter.
|
||||
|
||||
|
||||
@@ -67,7 +67,7 @@ Note also that we use PEFT library as backend for LoRA training, make sure to ha
|
||||
|
||||
Lastly, we recommend logging into your HF account so that your trained LoRA is automatically uploaded to the hub:
|
||||
```bash
|
||||
huggingface-cli login
|
||||
hf auth login
|
||||
```
|
||||
This command will prompt you for a token. Copy-paste yours from your [settings/tokens](https://huggingface.co/settings/tokens),and press Enter.
|
||||
|
||||
|
||||
@@ -13,6 +13,20 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
|
||||
# /// script
|
||||
# dependencies = [
|
||||
# "diffusers @ git+https://github.com/huggingface/diffusers.git",
|
||||
# "torch>=2.0.0",
|
||||
# "accelerate>=0.31.0",
|
||||
# "transformers>=4.41.2",
|
||||
# "ftfy",
|
||||
# "tensorboard",
|
||||
# "Jinja2",
|
||||
# "peft>=0.11.1",
|
||||
# "sentencepiece",
|
||||
# ]
|
||||
# ///
|
||||
|
||||
import argparse
|
||||
import copy
|
||||
import itertools
|
||||
@@ -971,6 +985,7 @@ class DreamBoothDataset(Dataset):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
args,
|
||||
instance_data_root,
|
||||
instance_prompt,
|
||||
class_prompt,
|
||||
@@ -980,10 +995,8 @@ class DreamBoothDataset(Dataset):
|
||||
class_num=None,
|
||||
size=1024,
|
||||
repeats=1,
|
||||
center_crop=False,
|
||||
):
|
||||
self.size = size
|
||||
self.center_crop = center_crop
|
||||
|
||||
self.instance_prompt = instance_prompt
|
||||
self.custom_instance_prompts = None
|
||||
@@ -1058,7 +1071,7 @@ class DreamBoothDataset(Dataset):
|
||||
if interpolation is None:
|
||||
raise ValueError(f"Unsupported interpolation mode {interpolation=}.")
|
||||
train_resize = transforms.Resize(size, interpolation=interpolation)
|
||||
train_crop = transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size)
|
||||
train_crop = transforms.CenterCrop(size) if args.center_crop else transforms.RandomCrop(size)
|
||||
train_flip = transforms.RandomHorizontalFlip(p=1.0)
|
||||
train_transforms = transforms.Compose(
|
||||
[
|
||||
@@ -1075,11 +1088,11 @@ class DreamBoothDataset(Dataset):
|
||||
# flip
|
||||
image = train_flip(image)
|
||||
if args.center_crop:
|
||||
y1 = max(0, int(round((image.height - args.resolution) / 2.0)))
|
||||
x1 = max(0, int(round((image.width - args.resolution) / 2.0)))
|
||||
y1 = max(0, int(round((image.height - self.size) / 2.0)))
|
||||
x1 = max(0, int(round((image.width - self.size) / 2.0)))
|
||||
image = train_crop(image)
|
||||
else:
|
||||
y1, x1, h, w = train_crop.get_params(image, (args.resolution, args.resolution))
|
||||
y1, x1, h, w = train_crop.get_params(image, (self.size, self.size))
|
||||
image = crop(image, y1, x1, h, w)
|
||||
image = train_transforms(image)
|
||||
self.pixel_values.append(image)
|
||||
@@ -1102,7 +1115,7 @@ class DreamBoothDataset(Dataset):
|
||||
self.image_transforms = transforms.Compose(
|
||||
[
|
||||
transforms.Resize(size, interpolation=interpolation),
|
||||
transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
|
||||
transforms.CenterCrop(size) if args.center_crop else transforms.RandomCrop(size),
|
||||
transforms.ToTensor(),
|
||||
transforms.Normalize([0.5], [0.5]),
|
||||
]
|
||||
@@ -1322,7 +1335,7 @@ def main(args):
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
|
||||
@@ -1827,6 +1840,7 @@ def main(args):
|
||||
|
||||
# Dataset and DataLoaders creation:
|
||||
train_dataset = DreamBoothDataset(
|
||||
args=args,
|
||||
instance_data_root=args.instance_data_dir,
|
||||
instance_prompt=args.instance_prompt,
|
||||
train_text_encoder_ti=args.train_text_encoder_ti,
|
||||
@@ -1836,7 +1850,6 @@ def main(args):
|
||||
class_num=args.num_class_images,
|
||||
size=args.resolution,
|
||||
repeats=args.repeats,
|
||||
center_crop=args.center_crop,
|
||||
)
|
||||
|
||||
train_dataloader = torch.utils.data.DataLoader(
|
||||
|
||||
@@ -13,6 +13,20 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
|
||||
# /// script
|
||||
# dependencies = [
|
||||
# "diffusers @ git+https://github.com/huggingface/diffusers.git",
|
||||
# "torch>=2.0.0",
|
||||
# "accelerate>=0.31.0",
|
||||
# "transformers>=4.41.2",
|
||||
# "ftfy",
|
||||
# "tensorboard",
|
||||
# "Jinja2",
|
||||
# "peft>=0.11.1",
|
||||
# "sentencepiece",
|
||||
# ]
|
||||
# ///
|
||||
|
||||
import argparse
|
||||
import gc
|
||||
import hashlib
|
||||
@@ -1050,7 +1064,7 @@ def main(args):
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
logging_dir = Path(args.output_dir, args.logging_dir)
|
||||
|
||||
@@ -13,6 +13,20 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
|
||||
# /// script
|
||||
# dependencies = [
|
||||
# "diffusers @ git+https://github.com/huggingface/diffusers.git",
|
||||
# "torch>=2.0.0",
|
||||
# "accelerate>=0.31.0",
|
||||
# "transformers>=4.41.2",
|
||||
# "ftfy",
|
||||
# "tensorboard",
|
||||
# "Jinja2",
|
||||
# "peft>=0.11.1",
|
||||
# "sentencepiece",
|
||||
# ]
|
||||
# ///
|
||||
|
||||
import argparse
|
||||
import gc
|
||||
import itertools
|
||||
@@ -1292,7 +1306,7 @@ def main(args):
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
if args.do_edm_style_training and args.snr_gamma is not None:
|
||||
|
||||
@@ -125,10 +125,10 @@ When running `accelerate config`, if we specify torch compile mode to True there
|
||||
If you would like to push your model to the HF Hub after training is completed with a neat model card, make sure you're logged in:
|
||||
|
||||
```
|
||||
huggingface-cli login
|
||||
hf auth login
|
||||
|
||||
# Alternatively, you could upload your model manually using:
|
||||
# huggingface-cli upload my-cool-account-name/my-cool-lora-name /path/to/awesome/lora
|
||||
# hf upload my-cool-account-name/my-cool-lora-name /path/to/awesome/lora
|
||||
```
|
||||
|
||||
Make sure your data is prepared as described in [Data Preparation](#data-preparation). When ready, you can begin training!
|
||||
|
||||
@@ -962,7 +962,7 @@ def main(args):
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
|
||||
|
||||
@@ -984,7 +984,7 @@ def main(args):
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
|
||||
|
||||
@@ -10,7 +10,7 @@ To incorporate additional condition latents, we expand the input features of Cog
|
||||
> As the model is gated, before using it with diffusers you first need to go to the [CogView4 Hugging Face page](https://huggingface.co/THUDM/CogView4-6B), fill in the form and accept the gate. Once you are in, you need to log in so that your system knows you’ve accepted the gate. Use the command below to log in:
|
||||
|
||||
```bash
|
||||
huggingface-cli login
|
||||
hf auth login
|
||||
```
|
||||
|
||||
The example command below shows how to launch fine-tuning for pose conditions. The dataset ([`raulc0399/open_pose_controlnet`](https://huggingface.co/datasets/raulc0399/open_pose_controlnet)) being used here already has the pose conditions of the original images, so we don't have to compute them.
|
||||
|
||||
@@ -705,7 +705,7 @@ def main(args):
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
logging_out_dir = Path(args.output_dir, args.logging_dir)
|
||||
|
||||
@@ -3129,7 +3129,7 @@ from io import BytesIO
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
# load the pipeline
|
||||
# make sure you're logged in with `huggingface-cli login`
|
||||
# make sure you're logged in with `hf auth login`
|
||||
model_id_or_path = "stable-diffusion-v1-5/stable-diffusion-v1-5"
|
||||
# can also be used with dreamlike-art/dreamlike-photoreal-2.0
|
||||
pipe = DiffusionPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16, custom_pipeline="pipeline_fabric").to("cuda")
|
||||
|
||||
@@ -877,7 +877,7 @@ def main(args):
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
logging_dir = Path(args.output_dir, args.logging_dir)
|
||||
|
||||
@@ -709,7 +709,7 @@ def main(args):
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
logging_dir = Path(args.output_dir, args.logging_dir)
|
||||
|
||||
@@ -872,7 +872,7 @@ def main(args):
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
logging_dir = Path(args.output_dir, args.logging_dir)
|
||||
|
||||
@@ -842,7 +842,7 @@ def main(args):
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
logging_dir = Path(args.output_dir, args.logging_dir)
|
||||
|
||||
@@ -882,7 +882,7 @@ def main(args):
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
logging_dir = Path(args.output_dir, args.logging_dir)
|
||||
|
||||
@@ -359,7 +359,7 @@ wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/ma
|
||||
We encourage you to store or share your model with the community. To use huggingface hub, please login to your Hugging Face account, or ([create one](https://huggingface.co/docs/diffusers/main/en/training/hf.co/join) if you don’t have one already):
|
||||
|
||||
```sh
|
||||
huggingface-cli login
|
||||
hf auth login
|
||||
```
|
||||
|
||||
Make sure you have the `MODEL_DIR`,`OUTPUT_DIR` and `HUB_MODEL_ID` environment variables set. The `OUTPUT_DIR` and `HUB_MODEL_ID` variables specify where to save the model to on the Hub:
|
||||
|
||||
@@ -22,7 +22,7 @@ Here is a gpu memory consumption for reference, tested on a single A100 with 80G
|
||||
|
||||
> **Gated access**
|
||||
>
|
||||
> As the model is gated, before using it with diffusers you first need to go to the [FLUX.1 [dev] Hugging Face page](https://huggingface.co/black-forest-labs/FLUX.1-dev), fill in the form and accept the gate. Once you are in, you need to log in so that your system knows you’ve accepted the gate. Use the command below to log in: `huggingface-cli login`
|
||||
> As the model is gated, before using it with diffusers you first need to go to the [FLUX.1 [dev] Hugging Face page](https://huggingface.co/black-forest-labs/FLUX.1-dev), fill in the form and accept the gate. Once you are in, you need to log in so that your system knows you’ve accepted the gate. Use the command below to log in: `hf auth login`
|
||||
|
||||
|
||||
## Running locally with PyTorch
|
||||
@@ -88,7 +88,7 @@ wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/ma
|
||||
wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_2.png
|
||||
```
|
||||
|
||||
Then run `huggingface-cli login` to log into your Hugging Face account. This is needed to be able to push the trained ControlNet parameters to Hugging Face Hub.
|
||||
Then run `hf auth login` to log into your Hugging Face account. This is needed to be able to push the trained ControlNet parameters to Hugging Face Hub.
|
||||
|
||||
we can define the num_layers, num_single_layers, which determines the size of the control(default values are num_layers=4, num_single_layers=10)
|
||||
|
||||
|
||||
@@ -56,7 +56,7 @@ First download the SD3 model from [Hugging Face Hub](https://huggingface.co/stab
|
||||
> As the model is gated, before using it with diffusers you first need to go to the [Stable Diffusion 3 Medium Hugging Face page](https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers) or [Stable Diffusion 3.5 Large Hugging Face page](https://huggingface.co/stabilityai/stable-diffusion-3.5-medium), fill in the form and accept the gate. Once you are in, you need to log in so that your system knows you’ve accepted the gate. Use the command below to log in:
|
||||
|
||||
```bash
|
||||
huggingface-cli login
|
||||
hf auth login
|
||||
```
|
||||
|
||||
This will also allow us to push the trained model parameters to the Hugging Face Hub platform.
|
||||
|
||||
@@ -58,7 +58,7 @@ wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/ma
|
||||
wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_2.png
|
||||
```
|
||||
|
||||
Then run `huggingface-cli login` to log into your Hugging Face account. This is needed to be able to push the trained ControlNet parameters to Hugging Face Hub.
|
||||
Then run `hf auth login` to log into your Hugging Face account. This is needed to be able to push the trained ControlNet parameters to Hugging Face Hub.
|
||||
|
||||
```bash
|
||||
export MODEL_DIR="stabilityai/stable-diffusion-xl-base-1.0"
|
||||
|
||||
@@ -734,7 +734,7 @@ def main(args):
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
logging_dir = Path(args.output_dir, args.logging_dir)
|
||||
|
||||
@@ -665,7 +665,7 @@ def main():
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
logging.basicConfig(
|
||||
|
||||
@@ -814,7 +814,7 @@ def main(args):
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
logging_out_dir = Path(args.output_dir, args.logging_dir)
|
||||
|
||||
@@ -928,7 +928,7 @@ def main(args):
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
|
||||
|
||||
@@ -829,7 +829,7 @@ def main(args):
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
logging_dir = Path(args.output_dir, args.logging_dir)
|
||||
|
||||
@@ -663,7 +663,7 @@ def main(args):
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
logging_dir = Path(args.output_dir, args.logging_dir)
|
||||
|
||||
@@ -330,7 +330,7 @@ For this example we want to directly store the trained LoRA embeddings on the Hu
|
||||
we need to be logged in and add the `--push_to_hub` flag.
|
||||
|
||||
```bash
|
||||
huggingface-cli login
|
||||
hf auth login
|
||||
```
|
||||
|
||||
Now we can start training!
|
||||
|
||||
@@ -19,7 +19,7 @@ The `train_dreambooth_flux.py` script shows how to implement the training proced
|
||||
> As the model is gated, before using it with diffusers you first need to go to the [FLUX.1 [dev] Hugging Face page](https://huggingface.co/black-forest-labs/FLUX.1-dev), fill in the form and accept the gate. Once you are in, you need to log in so that your system knows you’ve accepted the gate. Use the command below to log in:
|
||||
|
||||
```bash
|
||||
huggingface-cli login
|
||||
hf auth login
|
||||
```
|
||||
|
||||
This will also allow us to push the trained model parameters to the Hugging Face Hub platform.
|
||||
|
||||
@@ -95,7 +95,7 @@ accelerate launch train_dreambooth_lora_hidream.py \
|
||||
For using `push_to_hub`, make you're logged into your Hugging Face account:
|
||||
|
||||
```bash
|
||||
huggingface-cli login
|
||||
hf auth login
|
||||
```
|
||||
|
||||
To better track our training experiments, we're using the following flags in the command above:
|
||||
|
||||
@@ -101,7 +101,7 @@ accelerate launch train_dreambooth_lora_lumina2.py \
|
||||
For using `push_to_hub`, make you're logged into your Hugging Face account:
|
||||
|
||||
```bash
|
||||
huggingface-cli login
|
||||
hf auth login
|
||||
```
|
||||
|
||||
To better track our training experiments, we're using the following flags in the command above:
|
||||
|
||||
@@ -0,0 +1,136 @@
|
||||
# DreamBooth training example for Qwen Image
|
||||
|
||||
[DreamBooth](https://huggingface.co/papers/2208.12242) is a method to personalize text2image models like stable diffusion given just a few (3~5) images of a subject.
|
||||
|
||||
The `train_dreambooth_lora_qwen_image.py` script shows how to implement the training procedure with [LoRA](https://huggingface.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora) and adapt it for [Qwen Image](https://huggingface.co/Qwen/Qwen-Image).
|
||||
|
||||
|
||||
This will also allow us to push the trained model parameters to the Hugging Face Hub platform.
|
||||
|
||||
## Running locally with PyTorch
|
||||
|
||||
### Installing the dependencies
|
||||
|
||||
Before running the scripts, make sure to install the library's training dependencies:
|
||||
|
||||
**Important**
|
||||
|
||||
To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/huggingface/diffusers
|
||||
cd diffusers
|
||||
pip install -e .
|
||||
```
|
||||
|
||||
Then cd in the `examples/dreambooth` folder and run
|
||||
```bash
|
||||
pip install -r requirements_sana.txt
|
||||
```
|
||||
|
||||
And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
|
||||
|
||||
```bash
|
||||
accelerate config
|
||||
```
|
||||
|
||||
Or for a default accelerate configuration without answering questions about your environment
|
||||
|
||||
```bash
|
||||
accelerate config default
|
||||
```
|
||||
|
||||
Or if your environment doesn't support an interactive shell (e.g., a notebook)
|
||||
|
||||
```python
|
||||
from accelerate.utils import write_basic_config
|
||||
write_basic_config()
|
||||
```
|
||||
|
||||
When running `accelerate config`, if we specify torch compile mode to True there can be dramatic speedups.
|
||||
Note also that we use PEFT library as backend for LoRA training, make sure to have `peft>=0.14.0` installed in your environment.
|
||||
|
||||
|
||||
### Dog toy example
|
||||
|
||||
Now let's get our dataset. For this example we will use some dog images: https://huggingface.co/datasets/diffusers/dog-example.
|
||||
|
||||
Let's first download it locally:
|
||||
|
||||
```python
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
local_dir = "./dog"
|
||||
snapshot_download(
|
||||
"diffusers/dog-example",
|
||||
local_dir=local_dir, repo_type="dataset",
|
||||
ignore_patterns=".gitattributes",
|
||||
)
|
||||
```
|
||||
|
||||
This will also allow us to push the trained LoRA parameters to the Hugging Face Hub platform.
|
||||
|
||||
Now, we can launch training using:
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="Qwen/Qwen-Image"
|
||||
export INSTANCE_DIR="dog"
|
||||
export OUTPUT_DIR="trained-sana-lora"
|
||||
|
||||
accelerate launch train_dreambooth_lora_sana.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--instance_data_dir=$INSTANCE_DIR \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
--mixed_precision="bf16" \
|
||||
--instance_prompt="a photo of sks dog" \
|
||||
--resolution=1024 \
|
||||
--train_batch_size=1 \
|
||||
--gradient_accumulation_steps=4 \
|
||||
--use_8bit_adam \
|
||||
--learning_rate=2e-4 \
|
||||
--report_to="wandb" \
|
||||
--lr_scheduler="constant" \
|
||||
--lr_warmup_steps=0 \
|
||||
--max_train_steps=500 \
|
||||
--validation_prompt="A photo of sks dog in a bucket" \
|
||||
--validation_epochs=25 \
|
||||
--seed="0" \
|
||||
--push_to_hub
|
||||
```
|
||||
|
||||
For using `push_to_hub`, make you're logged into your Hugging Face account:
|
||||
|
||||
```bash
|
||||
hf auth login
|
||||
```
|
||||
|
||||
To better track our training experiments, we're using the following flags in the command above:
|
||||
|
||||
* `report_to="wandb` will ensure the training runs are tracked on [Weights and Biases](https://wandb.ai/site). To use it, be sure to install `wandb` with `pip install wandb`. Don't forget to call `wandb login <your_api_key>` before training if you haven't done it before.
|
||||
* `validation_prompt` and `validation_epochs` to allow the script to do a few validation inference runs. This allows us to qualitatively check if the training is progressing as expected.
|
||||
|
||||
## Notes
|
||||
|
||||
Additionally, we welcome you to explore the following CLI arguments:
|
||||
|
||||
* `--lora_layers`: The transformer modules to apply LoRA training on. Please specify the layers in a comma separated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only.
|
||||
* `--max_sequence_length`: Maximum sequence length to use for text embeddings.
|
||||
|
||||
We provide several options for optimizing memory optimization:
|
||||
|
||||
* `--offload`: When enabled, we will offload the text encoder and VAE to CPU, when they are not used.
|
||||
* `cache_latents`: When enabled, we will pre-compute the latents from the input images with the VAE and remove the VAE from memory once done.
|
||||
* `--use_8bit_adam`: When enabled, we will use the 8bit version of AdamW provided by the `bitsandbytes` library.
|
||||
|
||||
Refer to the [official documentation](https://huggingface.co/docs/diffusers/main/en/api/pipelines/qwenimage) of the `QwenImagePipeline` to know more about the models available under the SANA family and their preferred dtypes during inference.
|
||||
|
||||
## Using quantization
|
||||
|
||||
You can quantize the base model with [`bitsandbytes`](https://huggingface.co/docs/bitsandbytes/index) to reduce memory usage. To do so, pass a JSON file path to `--bnb_quantization_config_path`. This file should hold the configuration to initialize `BitsAndBytesConfig`. Below is an example JSON file:
|
||||
|
||||
```json
|
||||
{
|
||||
"load_in_4bit": true,
|
||||
"bnb_4bit_quant_type": "nf4"
|
||||
}
|
||||
```
|
||||
@@ -101,7 +101,7 @@ accelerate launch train_dreambooth_lora_sana.py \
|
||||
For using `push_to_hub`, make you're logged into your Hugging Face account:
|
||||
|
||||
```bash
|
||||
huggingface-cli login
|
||||
hf auth login
|
||||
```
|
||||
|
||||
To better track our training experiments, we're using the following flags in the command above:
|
||||
|
||||
@@ -8,7 +8,7 @@ The `train_dreambooth_sd3.py` script shows how to implement the training procedu
|
||||
> As the model is gated, before using it with diffusers you first need to go to the [Stable Diffusion 3 Medium Hugging Face page](https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers), fill in the form and accept the gate. Once you are in, you need to log in so that your system knows you’ve accepted the gate. Use the command below to log in:
|
||||
|
||||
```bash
|
||||
huggingface-cli login
|
||||
hf auth login
|
||||
```
|
||||
|
||||
This will also allow us to push the trained model parameters to the Hugging Face Hub platform.
|
||||
|
||||
@@ -0,0 +1,248 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2025 HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
|
||||
import safetensors
|
||||
|
||||
from diffusers.loaders.lora_base import LORA_ADAPTER_METADATA_KEY
|
||||
|
||||
|
||||
sys.path.append("..")
|
||||
from test_examples_utils import ExamplesTestsAccelerate, run_command # noqa: E402
|
||||
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
logger = logging.getLogger()
|
||||
stream_handler = logging.StreamHandler(sys.stdout)
|
||||
logger.addHandler(stream_handler)
|
||||
|
||||
|
||||
class DreamBoothLoRAQwenImage(ExamplesTestsAccelerate):
|
||||
instance_data_dir = "docs/source/en/imgs"
|
||||
instance_prompt = "photo"
|
||||
pretrained_model_name_or_path = "hf-internal-testing/tiny-qwenimage-pipe"
|
||||
script_path = "examples/dreambooth/train_dreambooth_lora_qwen_image.py"
|
||||
transformer_layer_type = "transformer_blocks.0.attn.to_k"
|
||||
|
||||
def test_dreambooth_lora_qwen(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
test_args = f"""
|
||||
{self.script_path}
|
||||
--pretrained_model_name_or_path {self.pretrained_model_name_or_path}
|
||||
--instance_data_dir {self.instance_data_dir}
|
||||
--instance_prompt {self.instance_prompt}
|
||||
--resolution 64
|
||||
--train_batch_size 1
|
||||
--gradient_accumulation_steps 1
|
||||
--max_train_steps 2
|
||||
--learning_rate 5.0e-04
|
||||
--scale_lr
|
||||
--lr_scheduler constant
|
||||
--lr_warmup_steps 0
|
||||
--output_dir {tmpdir}
|
||||
""".split()
|
||||
|
||||
run_command(self._launch_args + test_args)
|
||||
# save_pretrained smoke test
|
||||
self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
|
||||
|
||||
# make sure the state_dict has the correct naming in the parameters.
|
||||
lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
|
||||
is_lora = all("lora" in k for k in lora_state_dict.keys())
|
||||
self.assertTrue(is_lora)
|
||||
|
||||
# when not training the text encoder, all the parameters in the state dict should start
|
||||
# with `"transformer"` in their names.
|
||||
starts_with_transformer = all(key.startswith("transformer") for key in lora_state_dict.keys())
|
||||
self.assertTrue(starts_with_transformer)
|
||||
|
||||
def test_dreambooth_lora_latent_caching(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
test_args = f"""
|
||||
{self.script_path}
|
||||
--pretrained_model_name_or_path {self.pretrained_model_name_or_path}
|
||||
--instance_data_dir {self.instance_data_dir}
|
||||
--instance_prompt {self.instance_prompt}
|
||||
--resolution 64
|
||||
--train_batch_size 1
|
||||
--gradient_accumulation_steps 1
|
||||
--max_train_steps 2
|
||||
--cache_latents
|
||||
--learning_rate 5.0e-04
|
||||
--scale_lr
|
||||
--lr_scheduler constant
|
||||
--lr_warmup_steps 0
|
||||
--output_dir {tmpdir}
|
||||
""".split()
|
||||
|
||||
run_command(self._launch_args + test_args)
|
||||
# save_pretrained smoke test
|
||||
self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
|
||||
|
||||
# make sure the state_dict has the correct naming in the parameters.
|
||||
lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
|
||||
is_lora = all("lora" in k for k in lora_state_dict.keys())
|
||||
self.assertTrue(is_lora)
|
||||
|
||||
# when not training the text encoder, all the parameters in the state dict should start
|
||||
# with `"transformer"` in their names.
|
||||
starts_with_transformer = all(key.startswith("transformer") for key in lora_state_dict.keys())
|
||||
self.assertTrue(starts_with_transformer)
|
||||
|
||||
def test_dreambooth_lora_layers(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
test_args = f"""
|
||||
{self.script_path}
|
||||
--pretrained_model_name_or_path {self.pretrained_model_name_or_path}
|
||||
--instance_data_dir {self.instance_data_dir}
|
||||
--instance_prompt {self.instance_prompt}
|
||||
--resolution 64
|
||||
--train_batch_size 1
|
||||
--gradient_accumulation_steps 1
|
||||
--max_train_steps 2
|
||||
--cache_latents
|
||||
--learning_rate 5.0e-04
|
||||
--scale_lr
|
||||
--lora_layers {self.transformer_layer_type}
|
||||
--lr_scheduler constant
|
||||
--lr_warmup_steps 0
|
||||
--output_dir {tmpdir}
|
||||
""".split()
|
||||
|
||||
run_command(self._launch_args + test_args)
|
||||
# save_pretrained smoke test
|
||||
self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
|
||||
|
||||
# make sure the state_dict has the correct naming in the parameters.
|
||||
lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
|
||||
is_lora = all("lora" in k for k in lora_state_dict.keys())
|
||||
self.assertTrue(is_lora)
|
||||
|
||||
# when not training the text encoder, all the parameters in the state dict should start
|
||||
# with `"transformer"` in their names. In this test, we only params of
|
||||
# transformer.transformer_blocks.0.attn.to_k should be in the state dict
|
||||
starts_with_transformer = all(
|
||||
key.startswith(f"transformer.{self.transformer_layer_type}") for key in lora_state_dict.keys()
|
||||
)
|
||||
self.assertTrue(starts_with_transformer)
|
||||
|
||||
def test_dreambooth_lora_qwen_checkpointing_checkpoints_total_limit(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
test_args = f"""
|
||||
{self.script_path}
|
||||
--pretrained_model_name_or_path={self.pretrained_model_name_or_path}
|
||||
--instance_data_dir={self.instance_data_dir}
|
||||
--output_dir={tmpdir}
|
||||
--instance_prompt={self.instance_prompt}
|
||||
--resolution=64
|
||||
--train_batch_size=1
|
||||
--gradient_accumulation_steps=1
|
||||
--max_train_steps=6
|
||||
--checkpoints_total_limit=2
|
||||
--checkpointing_steps=2
|
||||
""".split()
|
||||
|
||||
run_command(self._launch_args + test_args)
|
||||
|
||||
self.assertEqual(
|
||||
{x for x in os.listdir(tmpdir) if "checkpoint" in x},
|
||||
{"checkpoint-4", "checkpoint-6"},
|
||||
)
|
||||
|
||||
def test_dreambooth_lora_qwen_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
test_args = f"""
|
||||
{self.script_path}
|
||||
--pretrained_model_name_or_path={self.pretrained_model_name_or_path}
|
||||
--instance_data_dir={self.instance_data_dir}
|
||||
--output_dir={tmpdir}
|
||||
--instance_prompt={self.instance_prompt}
|
||||
--resolution=64
|
||||
--train_batch_size=1
|
||||
--gradient_accumulation_steps=1
|
||||
--max_train_steps=4
|
||||
--checkpointing_steps=2
|
||||
""".split()
|
||||
|
||||
run_command(self._launch_args + test_args)
|
||||
|
||||
self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-2", "checkpoint-4"})
|
||||
|
||||
resume_run_args = f"""
|
||||
{self.script_path}
|
||||
--pretrained_model_name_or_path={self.pretrained_model_name_or_path}
|
||||
--instance_data_dir={self.instance_data_dir}
|
||||
--output_dir={tmpdir}
|
||||
--instance_prompt={self.instance_prompt}
|
||||
--resolution=64
|
||||
--train_batch_size=1
|
||||
--gradient_accumulation_steps=1
|
||||
--max_train_steps=8
|
||||
--checkpointing_steps=2
|
||||
--resume_from_checkpoint=checkpoint-4
|
||||
--checkpoints_total_limit=2
|
||||
""".split()
|
||||
|
||||
run_command(self._launch_args + resume_run_args)
|
||||
|
||||
self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-6", "checkpoint-8"})
|
||||
|
||||
def test_dreambooth_lora_with_metadata(self):
|
||||
# Use a `lora_alpha` that is different from `rank`.
|
||||
lora_alpha = 8
|
||||
rank = 4
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
test_args = f"""
|
||||
{self.script_path}
|
||||
--pretrained_model_name_or_path {self.pretrained_model_name_or_path}
|
||||
--instance_data_dir {self.instance_data_dir}
|
||||
--instance_prompt {self.instance_prompt}
|
||||
--resolution 64
|
||||
--train_batch_size 1
|
||||
--gradient_accumulation_steps 1
|
||||
--max_train_steps 2
|
||||
--lora_alpha={lora_alpha}
|
||||
--rank={rank}
|
||||
--learning_rate 5.0e-04
|
||||
--scale_lr
|
||||
--lr_scheduler constant
|
||||
--lr_warmup_steps 0
|
||||
--output_dir {tmpdir}
|
||||
""".split()
|
||||
|
||||
run_command(self._launch_args + test_args)
|
||||
# save_pretrained smoke test
|
||||
state_dict_file = os.path.join(tmpdir, "pytorch_lora_weights.safetensors")
|
||||
self.assertTrue(os.path.isfile(state_dict_file))
|
||||
|
||||
# Check if the metadata was properly serialized.
|
||||
with safetensors.torch.safe_open(state_dict_file, framework="pt", device="cpu") as f:
|
||||
metadata = f.metadata() or {}
|
||||
|
||||
metadata.pop("format", None)
|
||||
raw = metadata.get(LORA_ADAPTER_METADATA_KEY)
|
||||
if raw:
|
||||
raw = json.loads(raw)
|
||||
|
||||
loaded_lora_alpha = raw["transformer.lora_alpha"]
|
||||
self.assertTrue(loaded_lora_alpha == lora_alpha)
|
||||
loaded_lora_rank = raw["transformer.r"]
|
||||
self.assertTrue(loaded_lora_rank == rank)
|
||||
@@ -807,7 +807,7 @@ def main(args):
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
logging_dir = Path(args.output_dir, args.logging_dir)
|
||||
|
||||
@@ -13,6 +13,20 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
|
||||
# /// script
|
||||
# dependencies = [
|
||||
# "diffusers @ git+https://github.com/huggingface/diffusers.git",
|
||||
# "torch>=2.0.0",
|
||||
# "accelerate>=0.31.0",
|
||||
# "transformers>=4.41.2",
|
||||
# "ftfy",
|
||||
# "tensorboard",
|
||||
# "Jinja2",
|
||||
# "peft>=0.11.1",
|
||||
# "sentencepiece",
|
||||
# ]
|
||||
# ///
|
||||
|
||||
import argparse
|
||||
import copy
|
||||
import gc
|
||||
@@ -1013,7 +1027,7 @@ def main(args):
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
|
||||
|
||||
@@ -756,7 +756,7 @@ def main(args):
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
logging_dir = Path(args.output_dir, args.logging_dir)
|
||||
|
||||
@@ -13,6 +13,20 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
|
||||
# /// script
|
||||
# dependencies = [
|
||||
# "diffusers @ git+https://github.com/huggingface/diffusers.git",
|
||||
# "torch>=2.0.0",
|
||||
# "accelerate>=0.31.0",
|
||||
# "transformers>=4.41.2",
|
||||
# "ftfy",
|
||||
# "tensorboard",
|
||||
# "Jinja2",
|
||||
# "peft>=0.11.1",
|
||||
# "sentencepiece",
|
||||
# ]
|
||||
# ///
|
||||
|
||||
import argparse
|
||||
import copy
|
||||
import itertools
|
||||
@@ -1051,7 +1065,7 @@ def main(args):
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
|
||||
|
||||
@@ -1199,7 +1199,7 @@ def main(args):
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
|
||||
@@ -1614,7 +1614,7 @@ def main(args):
|
||||
)
|
||||
if args.cond_image_column is not None:
|
||||
logger.info("I2I fine-tuning enabled.")
|
||||
batch_sampler = BucketBatchSampler(train_dataset, batch_size=args.train_batch_size, drop_last=False)
|
||||
batch_sampler = BucketBatchSampler(train_dataset, batch_size=args.train_batch_size, drop_last=True)
|
||||
train_dataloader = torch.utils.data.DataLoader(
|
||||
train_dataset,
|
||||
batch_sampler=batch_sampler,
|
||||
|
||||
@@ -58,6 +58,7 @@ from diffusers.training_utils import (
|
||||
compute_density_for_timestep_sampling,
|
||||
compute_loss_weighting_for_sd3,
|
||||
free_memory,
|
||||
offload_models,
|
||||
)
|
||||
from diffusers.utils import (
|
||||
check_min_version,
|
||||
@@ -935,7 +936,7 @@ def main(args):
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
|
||||
@@ -1364,43 +1365,34 @@ def main(args):
|
||||
# provided (i.e. the --instance_prompt is used for all images), we encode the instance prompt once to avoid
|
||||
# the redundant encoding.
|
||||
if not train_dataset.custom_instance_prompts:
|
||||
if args.offload:
|
||||
text_encoding_pipeline = text_encoding_pipeline.to(accelerator.device)
|
||||
(
|
||||
instance_prompt_hidden_states_t5,
|
||||
instance_prompt_hidden_states_llama3,
|
||||
instance_pooled_prompt_embeds,
|
||||
_,
|
||||
_,
|
||||
_,
|
||||
) = compute_text_embeddings(args.instance_prompt, text_encoding_pipeline)
|
||||
if args.offload:
|
||||
text_encoding_pipeline = text_encoding_pipeline.to("cpu")
|
||||
with offload_models(text_encoding_pipeline, device=accelerator.device, offload=args.offload):
|
||||
(
|
||||
instance_prompt_hidden_states_t5,
|
||||
instance_prompt_hidden_states_llama3,
|
||||
instance_pooled_prompt_embeds,
|
||||
_,
|
||||
_,
|
||||
_,
|
||||
) = compute_text_embeddings(args.instance_prompt, text_encoding_pipeline)
|
||||
|
||||
# Handle class prompt for prior-preservation.
|
||||
if args.with_prior_preservation:
|
||||
if args.offload:
|
||||
text_encoding_pipeline = text_encoding_pipeline.to(accelerator.device)
|
||||
(class_prompt_hidden_states_t5, class_prompt_hidden_states_llama3, class_pooled_prompt_embeds, _, _, _) = (
|
||||
compute_text_embeddings(args.class_prompt, text_encoding_pipeline)
|
||||
)
|
||||
if args.offload:
|
||||
text_encoding_pipeline = text_encoding_pipeline.to("cpu")
|
||||
with offload_models(text_encoding_pipeline, device=accelerator.device, offload=args.offload):
|
||||
(class_prompt_hidden_states_t5, class_prompt_hidden_states_llama3, class_pooled_prompt_embeds, _, _, _) = (
|
||||
compute_text_embeddings(args.class_prompt, text_encoding_pipeline)
|
||||
)
|
||||
|
||||
validation_embeddings = {}
|
||||
if args.validation_prompt is not None:
|
||||
if args.offload:
|
||||
text_encoding_pipeline = text_encoding_pipeline.to(accelerator.device)
|
||||
(
|
||||
validation_embeddings["prompt_embeds_t5"],
|
||||
validation_embeddings["prompt_embeds_llama3"],
|
||||
validation_embeddings["pooled_prompt_embeds"],
|
||||
validation_embeddings["negative_prompt_embeds_t5"],
|
||||
validation_embeddings["negative_prompt_embeds_llama3"],
|
||||
validation_embeddings["negative_pooled_prompt_embeds"],
|
||||
) = compute_text_embeddings(args.validation_prompt, text_encoding_pipeline)
|
||||
if args.offload:
|
||||
text_encoding_pipeline = text_encoding_pipeline.to("cpu")
|
||||
with offload_models(text_encoding_pipeline, device=accelerator.device, offload=args.offload):
|
||||
(
|
||||
validation_embeddings["prompt_embeds_t5"],
|
||||
validation_embeddings["prompt_embeds_llama3"],
|
||||
validation_embeddings["pooled_prompt_embeds"],
|
||||
validation_embeddings["negative_prompt_embeds_t5"],
|
||||
validation_embeddings["negative_prompt_embeds_llama3"],
|
||||
validation_embeddings["negative_pooled_prompt_embeds"],
|
||||
) = compute_text_embeddings(args.validation_prompt, text_encoding_pipeline)
|
||||
|
||||
# If custom instance prompts are NOT provided (i.e. the instance prompt is used for all images),
|
||||
# pack the statically computed variables appropriately here. This is so that we don't
|
||||
@@ -1581,12 +1573,10 @@ def main(args):
|
||||
if args.cache_latents:
|
||||
model_input = latents_cache[step].sample()
|
||||
else:
|
||||
if args.offload:
|
||||
vae = vae.to(accelerator.device)
|
||||
pixel_values = batch["pixel_values"].to(dtype=vae.dtype)
|
||||
with offload_models(vae, device=accelerator.device, offload=args.offload):
|
||||
pixel_values = batch["pixel_values"].to(dtype=vae.dtype)
|
||||
model_input = vae.encode(pixel_values).latent_dist.sample()
|
||||
if args.offload:
|
||||
vae = vae.to("cpu")
|
||||
|
||||
model_input = (model_input - vae_config_shift_factor) * vae_config_scaling_factor
|
||||
model_input = model_input.to(dtype=weight_dtype)
|
||||
|
||||
|
||||
@@ -859,7 +859,7 @@ def main(args):
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -13,6 +13,20 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
|
||||
# /// script
|
||||
# dependencies = [
|
||||
# "diffusers @ git+https://github.com/huggingface/diffusers.git",
|
||||
# "torch>=2.0.0",
|
||||
# "accelerate>=1.0.0",
|
||||
# "transformers>=4.47.0",
|
||||
# "ftfy",
|
||||
# "tensorboard",
|
||||
# "Jinja2",
|
||||
# "peft>=0.14.0",
|
||||
# "sentencepiece",
|
||||
# ]
|
||||
# ///
|
||||
|
||||
import argparse
|
||||
import copy
|
||||
import itertools
|
||||
@@ -852,7 +866,7 @@ def main(args):
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
|
||||
|
||||
@@ -1063,7 +1063,7 @@ def main(args):
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
|
||||
|
||||
@@ -983,7 +983,7 @@ def main(args):
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
if args.do_edm_style_training and args.snr_gamma is not None:
|
||||
|
||||
@@ -988,7 +988,7 @@ def main(args):
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
|
||||
|
||||
@@ -13,7 +13,7 @@ To incorporate additional condition latents, we expand the input features of Flu
|
||||
> As the model is gated, before using it with diffusers you first need to go to the [FLUX.1 [dev] Hugging Face page](https://huggingface.co/black-forest-labs/FLUX.1-dev), fill in the form and accept the gate. Once you are in, you need to log in so that your system knows you’ve accepted the gate. Use the command below to log in:
|
||||
|
||||
```bash
|
||||
huggingface-cli login
|
||||
hf auth login
|
||||
```
|
||||
|
||||
The example command below shows how to launch fine-tuning for pose conditions. The dataset ([`raulc0399/open_pose_controlnet`](https://huggingface.co/datasets/raulc0399/open_pose_controlnet)) being used here already has the pose conditions of the original images, so we don't have to compute them.
|
||||
|
||||
@@ -697,7 +697,7 @@ def main(args):
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
logging_out_dir = Path(args.output_dir, args.logging_dir)
|
||||
|
||||
@@ -725,7 +725,7 @@ def main(args):
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
if args.use_lora_bias and args.gaussian_init_lora:
|
||||
raise ValueError("`gaussian` LoRA init scheme isn't supported when `use_lora_bias` is True.")
|
||||
|
||||
@@ -430,7 +430,7 @@ def main():
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
if args.non_ema_revision is not None:
|
||||
|
||||
@@ -483,7 +483,7 @@ def main():
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
if args.non_ema_revision is not None:
|
||||
|
||||
@@ -41,7 +41,7 @@ For all our examples, we will directly store the trained weights on the Hub, so
|
||||
Run the following command to authenticate your token
|
||||
|
||||
```bash
|
||||
huggingface-cli login
|
||||
hf auth login
|
||||
```
|
||||
|
||||
We also use [Weights and Biases](https://docs.wandb.ai/quickstart) logging by default, because it is really useful to monitor the training progress by regularly generating sample images during training. To install wandb, run
|
||||
|
||||
@@ -444,7 +444,7 @@ def main():
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
logging_dir = os.path.join(args.output_dir, args.logging_dir)
|
||||
|
||||
@@ -330,7 +330,7 @@ def main():
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
logging_dir = Path(args.output_dir, args.logging_dir)
|
||||
|
||||
@@ -342,7 +342,7 @@ def main():
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
logging_dir = Path(args.output_dir, args.logging_dir)
|
||||
|
||||
@@ -445,7 +445,7 @@ def main():
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
logging_dir = os.path.join(args.output_dir, args.logging_dir)
|
||||
|
||||
@@ -1249,7 +1249,7 @@ class EasyPipelineForText2Image(AutoPipelineForText2Image):
|
||||
<Tip>
|
||||
|
||||
To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with
|
||||
`huggingface-cli login`.
|
||||
`hf auth login`.
|
||||
|
||||
</Tip>
|
||||
|
||||
@@ -1358,7 +1358,7 @@ class EasyPipelineForText2Image(AutoPipelineForText2Image):
|
||||
<Tip>
|
||||
|
||||
To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with
|
||||
`huggingface-cli login`.
|
||||
`hf auth login`.
|
||||
|
||||
</Tip>
|
||||
|
||||
@@ -1507,7 +1507,7 @@ class EasyPipelineForImage2Image(AutoPipelineForImage2Image):
|
||||
<Tip>
|
||||
|
||||
To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with
|
||||
`huggingface-cli login`.
|
||||
`hf auth login`.
|
||||
|
||||
</Tip>
|
||||
|
||||
@@ -1617,7 +1617,7 @@ class EasyPipelineForImage2Image(AutoPipelineForImage2Image):
|
||||
<Tip>
|
||||
|
||||
To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with
|
||||
`huggingface-cli login`.
|
||||
`hf auth login`.
|
||||
|
||||
</Tip>
|
||||
|
||||
@@ -1766,7 +1766,7 @@ class EasyPipelineForInpainting(AutoPipelineForInpainting):
|
||||
<Tip>
|
||||
|
||||
To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with
|
||||
`huggingface-cli login`.
|
||||
`hf auth login
|
||||
|
||||
</Tip>
|
||||
|
||||
@@ -1875,7 +1875,7 @@ class EasyPipelineForInpainting(AutoPipelineForInpainting):
|
||||
<Tip>
|
||||
|
||||
To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with
|
||||
`huggingface-cli login`.
|
||||
`hf auth login
|
||||
|
||||
</Tip>
|
||||
|
||||
|
||||
@@ -568,7 +568,7 @@ def main(args):
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
logging_dir = Path(args.output_dir, args.logging_dir)
|
||||
|
||||
@@ -789,7 +789,7 @@ def main(args):
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
|
||||
|
||||
@@ -899,7 +899,7 @@ def main(args):
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
logging_dir = Path(args.output_dir, args.logging_dir)
|
||||
|
||||
@@ -470,7 +470,7 @@ def main(args):
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
logging_dir = Path(args.output_dir, args.logging_dir)
|
||||
|
||||
@@ -512,7 +512,7 @@ def main(args):
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
logging_dir = Path(args.output_dir, args.logging_dir)
|
||||
|
||||
@@ -502,7 +502,7 @@ def main(args):
|
||||
if args.report_to == "wandb" and args.hub_token is not None:
|
||||
raise ValueError(
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
" Please use `hf auth login` to authenticate with the Hub."
|
||||
)
|
||||
|
||||
logging_dir = Path(args.output_dir, args.logging_dir)
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user