Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 6530f8d592 | |||
| 6cc1f9137e |
@@ -1,7 +1,4 @@
|
||||
contact_links:
|
||||
- name: Blank issue
|
||||
url: https://github.com/huggingface/diffusers/issues/new
|
||||
about: Other
|
||||
- name: Forum
|
||||
url: https://discuss.huggingface.co/
|
||||
about: General usage questions and community discussions
|
||||
about: General usage questions and community discussions
|
||||
|
||||
@@ -13,7 +13,5 @@ jobs:
|
||||
with:
|
||||
commit_sha: ${{ github.sha }}
|
||||
package: diffusers
|
||||
notebook_folder: diffusers_doc
|
||||
languages: en ko
|
||||
secrets:
|
||||
token: ${{ secrets.HUGGINGFACE_PUSH }}
|
||||
|
||||
@@ -14,4 +14,3 @@ jobs:
|
||||
commit_sha: ${{ github.event.pull_request.head.sha }}
|
||||
pr_number: ${{ github.event.number }}
|
||||
package: diffusers
|
||||
languages: en ko
|
||||
|
||||
@@ -27,8 +27,9 @@ jobs:
|
||||
pip install .[quality]
|
||||
- name: Check quality
|
||||
run: |
|
||||
black --check examples tests src utils scripts
|
||||
ruff examples tests src utils scripts
|
||||
black --check --preview examples tests src utils scripts
|
||||
isort --check-only examples tests src utils scripts
|
||||
flake8 examples tests src utils scripts
|
||||
doc-builder style src/diffusers docs/source --max_len 119 --check_only --path_to_docs docs/source
|
||||
|
||||
check_repository_consistency:
|
||||
@@ -47,4 +48,3 @@ jobs:
|
||||
run: |
|
||||
python utils/check_copies.py
|
||||
python utils/check_dummies.py
|
||||
make deps_table_check_updated
|
||||
|
||||
@@ -36,11 +36,6 @@ jobs:
|
||||
runner: docker-cpu
|
||||
image: diffusers/diffusers-onnxruntime-cpu
|
||||
report: onnx_cpu
|
||||
- name: PyTorch Example CPU tests on Ubuntu
|
||||
framework: pytorch_examples
|
||||
runner: docker-cpu
|
||||
image: diffusers/diffusers-pytorch-cpu
|
||||
report: torch_cpu
|
||||
|
||||
name: ${{ matrix.config.name }}
|
||||
|
||||
@@ -95,13 +90,6 @@ jobs:
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
tests/
|
||||
|
||||
- name: Run example PyTorch CPU tests
|
||||
if: ${{ matrix.config.framework == 'pytorch_examples' }}
|
||||
run: |
|
||||
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
examples/test_examples.py
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: cat reports/tests_${{ matrix.config.report }}_failures_short.txt
|
||||
|
||||
@@ -1,165 +0,0 @@
|
||||
name: Slow tests on main
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
|
||||
env:
|
||||
DIFFUSERS_IS_CI: yes
|
||||
HF_HOME: /mnt/cache
|
||||
OMP_NUM_THREADS: 8
|
||||
MKL_NUM_THREADS: 8
|
||||
PYTEST_TIMEOUT: 600
|
||||
RUN_SLOW: no
|
||||
|
||||
jobs:
|
||||
run_fast_tests:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
config:
|
||||
- name: Fast PyTorch CPU tests on Ubuntu
|
||||
framework: pytorch
|
||||
runner: docker-cpu
|
||||
image: diffusers/diffusers-pytorch-cpu
|
||||
report: torch_cpu
|
||||
- name: Fast Flax CPU tests on Ubuntu
|
||||
framework: flax
|
||||
runner: docker-cpu
|
||||
image: diffusers/diffusers-flax-cpu
|
||||
report: flax_cpu
|
||||
- name: Fast ONNXRuntime CPU tests on Ubuntu
|
||||
framework: onnxruntime
|
||||
runner: docker-cpu
|
||||
image: diffusers/diffusers-onnxruntime-cpu
|
||||
report: onnx_cpu
|
||||
- name: PyTorch Example CPU tests on Ubuntu
|
||||
framework: pytorch_examples
|
||||
runner: docker-cpu
|
||||
image: diffusers/diffusers-pytorch-cpu
|
||||
report: torch_cpu
|
||||
|
||||
name: ${{ matrix.config.name }}
|
||||
|
||||
runs-on: ${{ matrix.config.runner }}
|
||||
|
||||
container:
|
||||
image: ${{ matrix.config.image }}
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt-get update && apt-get install libsndfile1-dev -y
|
||||
python -m pip install -e .[quality,test]
|
||||
python -m pip install -U git+https://github.com/huggingface/transformers
|
||||
python -m pip install git+https://github.com/huggingface/accelerate
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
python utils/print_env.py
|
||||
|
||||
- name: Run fast PyTorch CPU tests
|
||||
if: ${{ matrix.config.framework == 'pytorch' }}
|
||||
run: |
|
||||
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
tests/
|
||||
|
||||
- name: Run fast Flax TPU tests
|
||||
if: ${{ matrix.config.framework == 'flax' }}
|
||||
run: |
|
||||
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "Flax" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
tests/
|
||||
|
||||
- name: Run fast ONNXRuntime CPU tests
|
||||
if: ${{ matrix.config.framework == 'onnxruntime' }}
|
||||
run: |
|
||||
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "Onnx" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
tests/
|
||||
|
||||
- name: Run example PyTorch CPU tests
|
||||
if: ${{ matrix.config.framework == 'pytorch_examples' }}
|
||||
run: |
|
||||
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
examples/test_examples.py
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: cat reports/tests_${{ matrix.config.report }}_failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: pr_${{ matrix.config.report }}_test_reports
|
||||
path: reports
|
||||
|
||||
run_fast_tests_apple_m1:
|
||||
name: Fast PyTorch MPS tests on MacOS
|
||||
runs-on: [ self-hosted, apple-m1 ]
|
||||
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: Clean checkout
|
||||
shell: arch -arch arm64 bash {0}
|
||||
run: |
|
||||
git clean -fxd
|
||||
|
||||
- name: Setup miniconda
|
||||
uses: ./.github/actions/setup-miniconda
|
||||
with:
|
||||
python-version: 3.9
|
||||
|
||||
- name: Install dependencies
|
||||
shell: arch -arch arm64 bash {0}
|
||||
run: |
|
||||
${CONDA_RUN} python -m pip install --upgrade pip
|
||||
${CONDA_RUN} python -m pip install -e .[quality,test]
|
||||
${CONDA_RUN} python -m pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
${CONDA_RUN} python -m pip install git+https://github.com/huggingface/accelerate
|
||||
${CONDA_RUN} python -m pip install -U git+https://github.com/huggingface/transformers
|
||||
|
||||
- name: Environment
|
||||
shell: arch -arch arm64 bash {0}
|
||||
run: |
|
||||
${CONDA_RUN} python utils/print_env.py
|
||||
|
||||
- name: Run fast PyTorch tests on M1 (MPS)
|
||||
shell: arch -arch arm64 bash {0}
|
||||
env:
|
||||
HF_HOME: /System/Volumes/Data/mnt/cache
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
run: |
|
||||
${CONDA_RUN} python -m pytest -n 0 -s -v --make-reports=tests_torch_mps tests/
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: cat reports/tests_torch_mps_failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: pr_torch_mps_test_reports
|
||||
path: reports
|
||||
@@ -169,6 +169,3 @@ tags
|
||||
|
||||
# dependencies
|
||||
/transformers
|
||||
|
||||
# ruff
|
||||
.ruff_cache
|
||||
|
||||
@@ -1,40 +0,0 @@
|
||||
cff-version: 1.2.0
|
||||
title: 'Diffusers: State-of-the-art diffusion models'
|
||||
message: >-
|
||||
If you use this software, please cite it using the
|
||||
metadata from this file.
|
||||
type: software
|
||||
authors:
|
||||
- given-names: Patrick
|
||||
family-names: von Platen
|
||||
- given-names: Suraj
|
||||
family-names: Patil
|
||||
- given-names: Anton
|
||||
family-names: Lozhkov
|
||||
- given-names: Pedro
|
||||
family-names: Cuenca
|
||||
- given-names: Nathan
|
||||
family-names: Lambert
|
||||
- given-names: Kashif
|
||||
family-names: Rasul
|
||||
- given-names: Mishig
|
||||
family-names: Davaadorj
|
||||
- given-names: Thomas
|
||||
family-names: Wolf
|
||||
repository-code: 'https://github.com/huggingface/diffusers'
|
||||
abstract: >-
|
||||
Diffusers provides pretrained diffusion models across
|
||||
multiple modalities, such as vision and audio, and serves
|
||||
as a modular toolbox for inference and training of
|
||||
diffusion models.
|
||||
keywords:
|
||||
- deep-learning
|
||||
- pytorch
|
||||
- image-generation
|
||||
- diffusion
|
||||
- text2image
|
||||
- image2image
|
||||
- score-based-generative-modeling
|
||||
- stable-diffusion
|
||||
license: Apache-2.0
|
||||
version: 0.12.1
|
||||
+2
-2
@@ -1,5 +1,5 @@
|
||||
<!---
|
||||
Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
@@ -177,7 +177,7 @@ Follow these steps to start contributing ([supported Python versions](https://gi
|
||||
$ make style
|
||||
```
|
||||
|
||||
🧨 Diffusers also uses `ruff` and a few custom scripts to check for coding mistakes. Quality
|
||||
🧨 Diffusers also uses `flake8` and a few custom scripts to check for coding mistakes. Quality
|
||||
control runs in CI, however you can also run the same checks with:
|
||||
|
||||
```bash
|
||||
|
||||
@@ -9,8 +9,9 @@ modified_only_fixup:
|
||||
$(eval modified_py_files := $(shell python utils/get_modified_files.py $(check_dirs)))
|
||||
@if test -n "$(modified_py_files)"; then \
|
||||
echo "Checking/fixing $(modified_py_files)"; \
|
||||
black $(modified_py_files); \
|
||||
ruff $(modified_py_files); \
|
||||
black --preview $(modified_py_files); \
|
||||
isort $(modified_py_files); \
|
||||
flake8 $(modified_py_files); \
|
||||
else \
|
||||
echo "No library .py files were modified"; \
|
||||
fi
|
||||
@@ -40,23 +41,22 @@ repo-consistency:
|
||||
# this target runs checks on all files
|
||||
|
||||
quality:
|
||||
black --check $(check_dirs)
|
||||
ruff $(check_dirs)
|
||||
black --check --preview $(check_dirs)
|
||||
isort --check-only $(check_dirs)
|
||||
flake8 $(check_dirs)
|
||||
doc-builder style src/diffusers docs/source --max_len 119 --check_only --path_to_docs docs/source
|
||||
python utils/check_doc_toc.py
|
||||
|
||||
# Format source code automatically and check is there are any problems left that need manual fixing
|
||||
|
||||
extra_style_checks:
|
||||
python utils/custom_init_isort.py
|
||||
doc-builder style src/diffusers docs/source --max_len 119 --path_to_docs docs/source
|
||||
python utils/check_doc_toc.py --fix_and_overwrite
|
||||
|
||||
# this target runs checks on all files and potentially modifies some of them
|
||||
|
||||
style:
|
||||
black $(check_dirs)
|
||||
ruff $(check_dirs) --fix
|
||||
black --preview $(check_dirs)
|
||||
isort $(check_dirs)
|
||||
${MAKE} autogenerate_code
|
||||
${MAKE} extra_style_checks
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
<p align="center">
|
||||
<br>
|
||||
<img src="./docs/source/en/imgs/diffusers_library.jpg" width="400"/>
|
||||
<img src="https://github.com/huggingface/diffusers/raw/main/docs/source/imgs/diffusers_library.jpg" width="400"/>
|
||||
<br>
|
||||
<p>
|
||||
<p align="center">
|
||||
@@ -284,53 +284,6 @@ output = pipeline(
|
||||
output_images = pipeline.numpy_to_pil(np.asarray(output.reshape((num_samples,) + output.shape[-3:])))
|
||||
```
|
||||
|
||||
Diffusers also has a Text-guided inpainting pipeline with Flax/Jax
|
||||
|
||||
```python
|
||||
import jax
|
||||
import numpy as np
|
||||
from flax.jax_utils import replicate
|
||||
from flax.training.common_utils import shard
|
||||
import PIL
|
||||
import requests
|
||||
from io import BytesIO
|
||||
|
||||
|
||||
from diffusers import FlaxStableDiffusionInpaintPipeline
|
||||
|
||||
def download_image(url):
|
||||
response = requests.get(url)
|
||||
return PIL.Image.open(BytesIO(response.content)).convert("RGB")
|
||||
img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
|
||||
mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
|
||||
|
||||
init_image = download_image(img_url).resize((512, 512))
|
||||
mask_image = download_image(mask_url).resize((512, 512))
|
||||
|
||||
pipeline, params = FlaxStableDiffusionInpaintPipeline.from_pretrained("xvjiarui/stable-diffusion-2-inpainting")
|
||||
|
||||
prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
|
||||
prng_seed = jax.random.PRNGKey(0)
|
||||
num_inference_steps = 50
|
||||
|
||||
num_samples = jax.device_count()
|
||||
prompt = num_samples * [prompt]
|
||||
init_image = num_samples * [init_image]
|
||||
mask_image = num_samples * [mask_image]
|
||||
prompt_ids, processed_masked_images, processed_masks = pipeline.prepare_inputs(prompt, init_image, mask_image)
|
||||
|
||||
|
||||
# shard inputs and rng
|
||||
params = replicate(params)
|
||||
prng_seed = jax.random.split(prng_seed, jax.device_count())
|
||||
prompt_ids = shard(prompt_ids)
|
||||
processed_masked_images = shard(processed_masked_images)
|
||||
processed_masks = shard(processed_masks)
|
||||
|
||||
images = pipeline(prompt_ids, processed_masks, processed_masked_images, params, prng_seed, num_inference_steps, jit=True).images
|
||||
images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
|
||||
```
|
||||
|
||||
### Image-to-Image text-guided generation with Stable Diffusion
|
||||
|
||||
The `StableDiffusionImg2ImgPipeline` lets you pass a text prompt and an initial image to condition the generation of new images.
|
||||
@@ -467,12 +420,12 @@ image.save("ddpm_generated_image.png")
|
||||
- [Unconditional Diffusion with continuous scheduler](https://huggingface.co/google/ncsnpp-ffhq-1024)
|
||||
|
||||
**Other Image Notebooks**:
|
||||
* [image-to-image generation with Stable Diffusion](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb) ,
|
||||
* [tweak images via repeated Stable Diffusion seeds](https://colab.research.google.com/github/pcuenca/diffusers-examples/blob/main/notebooks/stable-diffusion-seeds.ipynb) ,
|
||||
* [image-to-image generation with Stable Diffusion](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb) ,
|
||||
* [tweak images via repeated Stable Diffusion seeds](https://colab.research.google.com/github/pcuenca/diffusers-examples/blob/main/notebooks/stable-diffusion-seeds.ipynb) ,
|
||||
|
||||
**Diffusers for Other Modalities**:
|
||||
* [Molecule conformation generation](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/geodiff_molecule_conformation.ipynb) ,
|
||||
* [Model-based reinforcement learning](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/reinforcement_learning_with_diffusers.ipynb) ,
|
||||
* [Molecule conformation generation](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/geodiff_molecule_conformation.ipynb) ,
|
||||
* [Model-based reinforcement learning](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/reinforcement_learning_with_diffusers.ipynb) ,
|
||||
|
||||
### Web Demos
|
||||
If you just want to play around with some web demos, you can try out the following 🚀 Spaces:
|
||||
|
||||
@@ -34,8 +34,8 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
|
||||
datasets \
|
||||
hf-doc-builder \
|
||||
huggingface-hub \
|
||||
Jinja2 \
|
||||
librosa \
|
||||
modelcards \
|
||||
numpy \
|
||||
scipy \
|
||||
tensorboard \
|
||||
|
||||
@@ -36,8 +36,8 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
|
||||
datasets \
|
||||
hf-doc-builder \
|
||||
huggingface-hub \
|
||||
Jinja2 \
|
||||
librosa \
|
||||
modelcards \
|
||||
numpy \
|
||||
scipy \
|
||||
tensorboard \
|
||||
|
||||
@@ -34,8 +34,8 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
|
||||
datasets \
|
||||
hf-doc-builder \
|
||||
huggingface-hub \
|
||||
Jinja2 \
|
||||
librosa \
|
||||
modelcards \
|
||||
numpy \
|
||||
scipy \
|
||||
tensorboard \
|
||||
|
||||
@@ -34,8 +34,8 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
|
||||
datasets \
|
||||
hf-doc-builder \
|
||||
huggingface-hub \
|
||||
Jinja2 \
|
||||
librosa \
|
||||
modelcards \
|
||||
numpy \
|
||||
scipy \
|
||||
tensorboard \
|
||||
|
||||
@@ -33,8 +33,8 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
|
||||
datasets \
|
||||
hf-doc-builder \
|
||||
huggingface-hub \
|
||||
Jinja2 \
|
||||
librosa \
|
||||
modelcards \
|
||||
numpy \
|
||||
scipy \
|
||||
tensorboard \
|
||||
|
||||
@@ -33,8 +33,8 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
|
||||
datasets \
|
||||
hf-doc-builder \
|
||||
huggingface-hub \
|
||||
Jinja2 \
|
||||
librosa \
|
||||
modelcards \
|
||||
numpy \
|
||||
scipy \
|
||||
tensorboard \
|
||||
|
||||
+2
-2
@@ -1,5 +1,5 @@
|
||||
<!---
|
||||
Copyright 2023- The HuggingFace Team. All rights reserved.
|
||||
Copyright 2022- The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
@@ -54,7 +54,7 @@ doc-builder preview {package_name} {path_to_docs}
|
||||
For example:
|
||||
|
||||
```bash
|
||||
doc-builder preview diffusers docs/source/en
|
||||
doc-builder preview diffusers docs/source/
|
||||
```
|
||||
|
||||
The docs will be viewable at [http://localhost:3000](http://localhost:3000). You can also preview the docs once you have opened a PR. You will see a bot add a comment to a link where the documentation with your changes lives.
|
||||
|
||||
@@ -1,57 +0,0 @@
|
||||
### Translating the Diffusers documentation into your language
|
||||
|
||||
As part of our mission to democratize machine learning, we'd love to make the Diffusers library available in many more languages! Follow the steps below if you want to help translate the documentation into your language 🙏.
|
||||
|
||||
**🗞️ Open an issue**
|
||||
|
||||
To get started, navigate to the [Issues](https://github.com/huggingface/diffusers/issues) page of this repo and check if anyone else has opened an issue for your language. If not, open a new issue by selecting the "Translation template" from the "New issue" button.
|
||||
|
||||
Once an issue exists, post a comment to indicate which chapters you'd like to work on, and we'll add your name to the list.
|
||||
|
||||
|
||||
**🍴 Fork the repository**
|
||||
|
||||
First, you'll need to [fork the Diffusers repo](https://docs.github.com/en/get-started/quickstart/fork-a-repo). You can do this by clicking on the **Fork** button on the top-right corner of this repo's page.
|
||||
|
||||
Once you've forked the repo, you'll want to get the files on your local machine for editing. You can do that by cloning the fork with Git as follows:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/YOUR-USERNAME/diffusers.git
|
||||
```
|
||||
|
||||
**📋 Copy-paste the English version with a new language code**
|
||||
|
||||
The documentation files are in one leading directory:
|
||||
|
||||
- [`docs/source`](https://github.com/huggingface/diffusers/tree/main/docs/source): All the documentation materials are organized here by language.
|
||||
|
||||
You'll only need to copy the files in the [`docs/source/en`](https://github.com/huggingface/diffusers/tree/main/docs/source/en) directory, so first navigate to your fork of the repo and run the following:
|
||||
|
||||
```bash
|
||||
cd ~/path/to/diffusers/docs
|
||||
cp -r source/en source/LANG-ID
|
||||
```
|
||||
|
||||
Here, `LANG-ID` should be one of the ISO 639-1 or ISO 639-2 language codes -- see [here](https://www.loc.gov/standards/iso639-2/php/code_list.php) for a handy table.
|
||||
|
||||
**✍️ Start translating**
|
||||
|
||||
The fun part comes - translating the text!
|
||||
|
||||
The first thing we recommend is translating the part of the `_toctree.yml` file that corresponds to your doc chapter. This file is used to render the table of contents on the website.
|
||||
|
||||
> 🙋 If the `_toctree.yml` file doesn't yet exist for your language, you can create one by copy-pasting from the English version and deleting the sections unrelated to your chapter. Just make sure it exists in the `docs/source/LANG-ID/` directory!
|
||||
|
||||
The fields you should add are `local` (with the name of the file containing the translation; e.g. `autoclass_tutorial`), and `title` (with the title of the doc in your language; e.g. `Load pretrained instances with an AutoClass`) -- as a reference, here is the `_toctree.yml` for [English](https://github.com/huggingface/diffusers/blob/main/docs/source/en/_toctree.yml):
|
||||
|
||||
```yaml
|
||||
- sections:
|
||||
- local: pipeline_tutorial # Do not change this! Use the same name for your .md file
|
||||
title: Pipelines for inference # Translate this!
|
||||
...
|
||||
title: Tutorials # Translate this!
|
||||
```
|
||||
|
||||
Once you have translated the `_toctree.yml` file, you can start translating the [MDX](https://mdxjs.com/) files associated with your docs chapter.
|
||||
|
||||
> 🙋 If you'd like others to help you with the translation, you should [open an issue](https://github.com/huggingface/diffusers/issues) and tag @patrickvonplaten.
|
||||
@@ -1,9 +0,0 @@
|
||||
# docstyle-ignore
|
||||
INSTALL_CONTENT = """
|
||||
# Diffusers installation
|
||||
! pip install diffusers transformers datasets accelerate
|
||||
# To install from source instead of the last release, comment the command above and uncomment the following one.
|
||||
# ! pip install git+https://github.com/huggingface/diffusers.git
|
||||
"""
|
||||
|
||||
notebook_first_cells = [{"type": "code", "content": INSTALL_CONTENT}]
|
||||
@@ -0,0 +1,193 @@
|
||||
- sections:
|
||||
- local: index
|
||||
title: "🧨 Diffusers"
|
||||
- local: quicktour
|
||||
title: "Quicktour"
|
||||
- local: installation
|
||||
title: "Installation"
|
||||
title: "Get started"
|
||||
- sections:
|
||||
- sections:
|
||||
- local: using-diffusers/loading
|
||||
title: "Loading Pipelines, Models, and Schedulers"
|
||||
- local: using-diffusers/schedulers
|
||||
title: "Using different Schedulers"
|
||||
- local: using-diffusers/configuration
|
||||
title: "Configuring Pipelines, Models, and Schedulers"
|
||||
- local: using-diffusers/custom_pipeline_overview
|
||||
title: "Loading and Adding Custom Pipelines"
|
||||
title: "Loading & Hub"
|
||||
- sections:
|
||||
- local: using-diffusers/unconditional_image_generation
|
||||
title: "Unconditional Image Generation"
|
||||
- local: using-diffusers/conditional_image_generation
|
||||
title: "Text-to-Image Generation"
|
||||
- local: using-diffusers/img2img
|
||||
title: "Text-Guided Image-to-Image"
|
||||
- local: using-diffusers/inpaint
|
||||
title: "Text-Guided Image-Inpainting"
|
||||
- local: using-diffusers/depth2img
|
||||
title: "Text-Guided Depth-to-Image"
|
||||
- local: using-diffusers/reusing_seeds
|
||||
title: "Reusing seeds for deterministic generation"
|
||||
- local: using-diffusers/custom_pipeline_examples
|
||||
title: "Community Pipelines"
|
||||
- local: using-diffusers/contribute_pipeline
|
||||
title: "How to contribute a Pipeline"
|
||||
title: "Pipelines for Inference"
|
||||
- sections:
|
||||
- local: using-diffusers/rl
|
||||
title: "Reinforcement Learning"
|
||||
- local: using-diffusers/audio
|
||||
title: "Audio"
|
||||
- local: using-diffusers/other-modalities
|
||||
title: "Other Modalities"
|
||||
title: "Taking Diffusers Beyond Images"
|
||||
title: "Using Diffusers"
|
||||
- sections:
|
||||
- local: optimization/fp16
|
||||
title: "Memory and Speed"
|
||||
- local: optimization/xformers
|
||||
title: "xFormers"
|
||||
- local: optimization/onnx
|
||||
title: "ONNX"
|
||||
- local: optimization/open_vino
|
||||
title: "OpenVINO"
|
||||
- local: optimization/mps
|
||||
title: "MPS"
|
||||
- local: optimization/habana
|
||||
title: "Habana Gaudi"
|
||||
title: "Optimization/Special Hardware"
|
||||
- sections:
|
||||
- local: training/overview
|
||||
title: "Overview"
|
||||
- local: training/unconditional_training
|
||||
title: "Unconditional Image Generation"
|
||||
- local: training/text_inversion
|
||||
title: "Textual Inversion"
|
||||
- local: training/dreambooth
|
||||
title: "Dreambooth"
|
||||
- local: training/text2image
|
||||
title: "Text-to-image fine-tuning"
|
||||
title: "Training"
|
||||
- sections:
|
||||
- local: conceptual/stable_diffusion
|
||||
title: "Stable Diffusion"
|
||||
- local: conceptual/philosophy
|
||||
title: "Philosophy"
|
||||
- local: conceptual/contribution
|
||||
title: "How to contribute?"
|
||||
title: "Conceptual Guides"
|
||||
- sections:
|
||||
- sections:
|
||||
- local: api/models
|
||||
title: "Models"
|
||||
- local: api/diffusion_pipeline
|
||||
title: "Diffusion Pipeline"
|
||||
- local: api/logging
|
||||
title: "Logging"
|
||||
- local: api/configuration
|
||||
title: "Configuration"
|
||||
- local: api/outputs
|
||||
title: "Outputs"
|
||||
title: "Main Classes"
|
||||
|
||||
- sections:
|
||||
- local: api/pipelines/overview
|
||||
title: "Overview"
|
||||
- local: api/pipelines/alt_diffusion
|
||||
title: "AltDiffusion"
|
||||
- local: api/pipelines/cycle_diffusion
|
||||
title: "Cycle Diffusion"
|
||||
- local: api/pipelines/ddim
|
||||
title: "DDIM"
|
||||
- local: api/pipelines/ddpm
|
||||
title: "DDPM"
|
||||
- local: api/pipelines/latent_diffusion
|
||||
title: "Latent Diffusion"
|
||||
- local: api/pipelines/latent_diffusion_uncond
|
||||
title: "Unconditional Latent Diffusion"
|
||||
- local: api/pipelines/paint_by_example
|
||||
title: "PaintByExample"
|
||||
- local: api/pipelines/pndm
|
||||
title: "PNDM"
|
||||
- local: api/pipelines/score_sde_ve
|
||||
title: "Score SDE VE"
|
||||
- sections:
|
||||
- local: api/pipelines/stable_diffusion/overview
|
||||
title: "Overview"
|
||||
- local: api/pipelines/stable_diffusion/text2img
|
||||
title: "Text-to-Image"
|
||||
- local: api/pipelines/stable_diffusion/img2img
|
||||
title: "Image-to-Image"
|
||||
- local: api/pipelines/stable_diffusion/inpaint
|
||||
title: "Inpaint"
|
||||
- local: api/pipelines/stable_diffusion/depth2img
|
||||
title: "Depth-to-Image"
|
||||
- local: api/pipelines/stable_diffusion/image_variation
|
||||
title: "Image-Variation"
|
||||
- local: api/pipelines/stable_diffusion/upscale
|
||||
title: "Super-Resolution"
|
||||
title: "Stable Diffusion"
|
||||
- local: api/pipelines/stable_diffusion_2
|
||||
title: "Stable Diffusion 2"
|
||||
- local: api/pipelines/stable_diffusion_safe
|
||||
title: "Safe Stable Diffusion"
|
||||
- local: api/pipelines/stochastic_karras_ve
|
||||
title: "Stochastic Karras VE"
|
||||
- local: api/pipelines/dance_diffusion
|
||||
title: "Dance Diffusion"
|
||||
- local: api/pipelines/unclip
|
||||
title: "UnCLIP"
|
||||
- local: api/pipelines/versatile_diffusion
|
||||
title: "Versatile Diffusion"
|
||||
- local: api/pipelines/vq_diffusion
|
||||
title: "VQ Diffusion"
|
||||
- local: api/pipelines/repaint
|
||||
title: "RePaint"
|
||||
- local: api/pipelines/audio_diffusion
|
||||
title: "Audio Diffusion"
|
||||
title: "Pipelines"
|
||||
- sections:
|
||||
- local: api/schedulers/overview
|
||||
title: "Overview"
|
||||
- local: api/schedulers/ddim
|
||||
title: "DDIM"
|
||||
- local: api/schedulers/ddpm
|
||||
title: "DDPM"
|
||||
- local: api/schedulers/singlestep_dpm_solver
|
||||
title: "Singlestep DPM-Solver"
|
||||
- local: api/schedulers/multistep_dpm_solver
|
||||
title: "Multistep DPM-Solver"
|
||||
- local: api/schedulers/heun
|
||||
title: "Heun Scheduler"
|
||||
- local: api/schedulers/dpm_discrete
|
||||
title: "DPM Discrete Scheduler"
|
||||
- local: api/schedulers/dpm_discrete_ancestral
|
||||
title: "DPM Discrete Scheduler with ancestral sampling"
|
||||
- local: api/schedulers/stochastic_karras_ve
|
||||
title: "Stochastic Kerras VE"
|
||||
- local: api/schedulers/lms_discrete
|
||||
title: "Linear Multistep"
|
||||
- local: api/schedulers/pndm
|
||||
title: "PNDM"
|
||||
- local: api/schedulers/score_sde_ve
|
||||
title: "VE-SDE"
|
||||
- local: api/schedulers/ipndm
|
||||
title: "IPNDM"
|
||||
- local: api/schedulers/score_sde_vp
|
||||
title: "VP-SDE"
|
||||
- local: api/schedulers/euler
|
||||
title: "Euler scheduler"
|
||||
- local: api/schedulers/euler_ancestral
|
||||
title: "Euler Ancestral Scheduler"
|
||||
- local: api/schedulers/vq_diffusion
|
||||
title: "VQDiffusionScheduler"
|
||||
- local: api/schedulers/repaint
|
||||
title: "RePaint Scheduler"
|
||||
title: "Schedulers"
|
||||
- sections:
|
||||
- local: api/experimental/rl
|
||||
title: "RL Planning"
|
||||
title: "Experimental Features"
|
||||
title: "API"
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -34,7 +34,6 @@ Any pipeline object can be saved locally with [`~DiffusionPipeline.save_pretrain
|
||||
- __call__
|
||||
- device
|
||||
- to
|
||||
- components
|
||||
|
||||
## ImagePipelineOutput
|
||||
By default diffusion pipelines return an object of class
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -64,12 +64,6 @@ The models are built on the base class ['ModelMixin'] that is a `torch.nn.module
|
||||
## PriorTransformerOutput
|
||||
[[autodoc]] models.prior_transformer.PriorTransformerOutput
|
||||
|
||||
## ControlNetOutput
|
||||
[[autodoc]] models.controlnet.ControlNetOutput
|
||||
|
||||
## ControlNetModel
|
||||
[[autodoc]] ControlNetModel
|
||||
|
||||
## FlaxModelMixin
|
||||
[[autodoc]] FlaxModelMixin
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
+1
-1
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
+1
-1
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
+1
-1
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
+1
-1
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
+1
-1
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
+1
-1
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -46,7 +46,6 @@ available a colab notebook to directly try them out.
|
||||
|---|---|:---:|:---:|
|
||||
| [alt_diffusion](./alt_diffusion) | [**AltDiffusion**](https://arxiv.org/abs/2211.06679) | Image-to-Image Text-Guided Generation | -
|
||||
| [audio_diffusion](./audio_diffusion) | [**Audio Diffusion**](https://github.com/teticio/audio_diffusion.git) | Unconditional Audio Generation |
|
||||
| [controlnet](./api/pipelines/stable_diffusion/controlnet) | [**ControlNet with Stable Diffusion**](https://arxiv.org/abs/2302.05543) | Image-to-Image Text-Guided Generation | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/controlnet.ipynb)
|
||||
| [cycle_diffusion](./cycle_diffusion) | [**Cycle Diffusion**](https://arxiv.org/abs/2210.05559) | Image-to-Image Text-Guided Generation |
|
||||
| [dance_diffusion](./dance_diffusion) | [**Dance Diffusion**](https://github.com/williamberman/diffusers.git) | Unconditional Audio Generation |
|
||||
| [ddpm](./ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation |
|
||||
@@ -58,24 +57,13 @@ available a colab notebook to directly try them out.
|
||||
| [pndm](./pndm) | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778) | Unconditional Image Generation |
|
||||
| [score_sde_ve](./score_sde_ve) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation |
|
||||
| [score_sde_vp](./score_sde_vp) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation |
|
||||
| [semantic_stable_diffusion](./semantic_stable_diffusion) | [**SEGA: Instructing Diffusion using Semantic Dimensions**](https://arxiv.org/abs/2301.12247) | Text-to-Image Generation |
|
||||
| [stable_diffusion_text2img](./stable_diffusion/text2img) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-to-Image Generation | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
|
||||
| [stable_diffusion_img2img](./stable_diffusion/img2img) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Image-to-Image Text-Guided Generation | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
|
||||
| [stable_diffusion_inpaint](./stable_diffusion/inpaint) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
|
||||
| [stable_diffusion_panorama](./stable_diffusion/panorama) | [**MultiDiffusion: Fusing Diffusion Paths for Controlled Image Generation**](https://arxiv.org/abs/2302.08113) | Text-Guided Panorama View Generation |
|
||||
| [stable_diffusion_pix2pix](./stable_diffusion/pix2pix) | [**InstructPix2Pix: Learning to Follow Image Editing Instructions**](https://arxiv.org/abs/2211.09800) | Text-Based Image Editing |
|
||||
| [stable_diffusion_pix2pix_zero](./stable_diffusion/pix2pix_zero) | [**Zero-shot Image-to-Image Translation**](https://arxiv.org/abs/2302.03027) | Text-Based Image Editing |
|
||||
| [stable_diffusion_attend_and_excite](./stable_diffusion/attend_and_excite) | [**Attend and Excite: Attention-Based Semantic Guidance for Text-to-Image Diffusion Models**](https://arxiv.org/abs/2301.13826) | Text-to-Image Generation |
|
||||
| [stable_diffusion_self_attention_guidance](./stable_diffusion/self_attention_guidance) | [**Self-Attention Guidance**](https://arxiv.org/abs/2210.00939) | Text-to-Image Generation |
|
||||
| [stable_diffusion_image_variation](./stable_diffusion/image_variation) | [**Stable Diffusion Image Variations**](https://github.com/LambdaLabsML/lambda-diffusers#stable-diffusion-image-variations) | Image-to-Image Generation |
|
||||
| [stable_diffusion_latent_upscale](./stable_diffusion/latent_upscale) | [**Stable Diffusion Latent Upscaler**](https://twitter.com/StabilityAI/status/1590531958815064065) | Text-Guided Super Resolution Image-to-Image |
|
||||
| [stable_diffusion_2](./stable_diffusion_2/) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-to-Image Generation |
|
||||
| [stable_diffusion](./stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-to-Image Generation | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
|
||||
| [stable_diffusion](./stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Image-to-Image Text-Guided Generation | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
|
||||
| [stable_diffusion](./stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
|
||||
| [stable_diffusion_2](./stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-to-Image Generation |
|
||||
| [stable_diffusion_2](./stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Image Inpainting |
|
||||
| [stable_diffusion_2](./stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Depth-to-Image Text-Guided Generation |
|
||||
| [stable_diffusion_2](./stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Super Resolution Image-to-Image |
|
||||
| [stable_diffusion_safe](./stable_diffusion_safe) | [**Safe Stable Diffusion**](https://arxiv.org/abs/2211.05105) | Text-Guided Generation | [](https://colab.research.google.com/github/ml-research/safe-latent-diffusion/blob/main/examples/Safe%20Latent%20Diffusion.ipynb)
|
||||
| [stable_unclip](./stable_unclip) | **Stable unCLIP** | Text-to-Image Generation |
|
||||
| [stable_unclip](./stable_unclip) | **Stable unCLIP** | Image-to-Image Text-Guided Generation |
|
||||
| [stochastic_karras_ve](./stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation |
|
||||
| [unclip](./unclip) | [Hierarchical Text-Conditional Image Generation with CLIP Latents](https://arxiv.org/abs/2204.06125) | Text-to-Image Generation |
|
||||
| [versatile_diffusion](./versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Text-to-Image Generation |
|
||||
+1
-1
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
+1
-1
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
+1
-1
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
+1
-1
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
+1
-4
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -20,9 +20,6 @@ The original codebase can be found here: [CampVis/stable-diffusion](https://gith
|
||||
|
||||
[`StableDiffusionImg2ImgPipeline`] is compatible with all Stable Diffusion checkpoints for [Text-to-Image](./text2img)
|
||||
|
||||
The pipeline uses the diffusion-denoising mechanism proposed by SDEdit ([SDEdit: Guided Image Synthesis and Editing with Stochastic Differential Equations](https://arxiv.org/abs/2108.01073)
|
||||
proposed by Chenlin Meng, Yutong He, Yang Song, Jiaming Song, Jiajun Wu, Jun-Yan Zhu, Stefano Ermon).
|
||||
|
||||
[[autodoc]] StableDiffusionImg2ImgPipeline
|
||||
- all
|
||||
- __call__
|
||||
+1
-1
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
+57
-5
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -31,10 +31,6 @@ For more details about how Stable Diffusion works and how it differs from the ba
|
||||
| [StableDiffusionDepth2ImgPipeline](./depth2img) | **Experimental** – *Depth-to-Image Text-Guided Generation * | | Coming soon
|
||||
| [StableDiffusionImageVariationPipeline](./image_variation) | **Experimental** – *Image Variation Generation * | | [🤗 Stable Diffusion Image Variations](https://huggingface.co/spaces/lambdalabs/stable-diffusion-image-variations)
|
||||
| [StableDiffusionUpscalePipeline](./upscale) | **Experimental** – *Text-Guided Image Super-Resolution * | | Coming soon
|
||||
| [StableDiffusionLatentUpscalePipeline](./latent_upscale) | **Experimental** – *Text-Guided Image Super-Resolution * | | Coming soon
|
||||
| [StableDiffusionInstructPix2PixPipeline](./pix2pix) | **Experimental** – *Text-Based Image Editing * | | [InstructPix2Pix: Learning to Follow Image Editing Instructions](https://huggingface.co/spaces/timbrooks/instruct-pix2pix)
|
||||
| [StableDiffusionAttendAndExcitePipeline](./attend_and_excite) | **Experimental** – *Text-to-Image Generation * | | [Attend-and-Excite: Attention-Based Semantic Guidance for Text-to-Image Diffusion Models](https://huggingface.co/spaces/AttendAndExcite/Attend-and-Excite)
|
||||
| [StableDiffusionPix2PixZeroPipeline](./pix2pix_zero) | **Experimental** – *Text-Based Image Editing * | | [Zero-shot Image-to-Image Translation](https://arxiv.org/abs/2302.03027)
|
||||
|
||||
|
||||
|
||||
@@ -79,3 +75,59 @@ If you want to use all possible use cases in a single `DiffusionPipeline` you ca
|
||||
|
||||
## StableDiffusionPipelineOutput
|
||||
[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput
|
||||
|
||||
## StableDiffusionPipeline
|
||||
[[autodoc]] StableDiffusionPipeline
|
||||
- all
|
||||
- __call__
|
||||
- enable_attention_slicing
|
||||
- disable_attention_slicing
|
||||
- enable_xformers_memory_efficient_attention
|
||||
- disable_xformers_memory_efficient_attention
|
||||
|
||||
|
||||
|
||||
## StableDiffusionImg2ImgPipeline
|
||||
[[autodoc]] StableDiffusionImg2ImgPipeline
|
||||
- all
|
||||
- __call__
|
||||
- enable_attention_slicing
|
||||
- disable_attention_slicing
|
||||
- enable_xformers_memory_efficient_attention
|
||||
- disable_xformers_memory_efficient_attention
|
||||
|
||||
## StableDiffusionInpaintPipeline
|
||||
[[autodoc]] StableDiffusionInpaintPipeline
|
||||
- all
|
||||
- __call__
|
||||
- enable_attention_slicing
|
||||
- disable_attention_slicing
|
||||
- enable_xformers_memory_efficient_attention
|
||||
- disable_xformers_memory_efficient_attention
|
||||
|
||||
## StableDiffusionDepth2ImgPipeline
|
||||
[[autodoc]] StableDiffusionDepth2ImgPipeline
|
||||
- all
|
||||
- __call__
|
||||
- enable_attention_slicing
|
||||
- disable_attention_slicing
|
||||
- enable_xformers_memory_efficient_attention
|
||||
- disable_xformers_memory_efficient_attention
|
||||
|
||||
## StableDiffusionImageVariationPipeline
|
||||
[[autodoc]] StableDiffusionImageVariationPipeline
|
||||
- all
|
||||
- __call__
|
||||
- enable_attention_slicing
|
||||
- disable_attention_slicing
|
||||
- enable_xformers_memory_efficient_attention
|
||||
- disable_xformers_memory_efficient_attention
|
||||
|
||||
## StableDiffusionUpscalePipeline
|
||||
[[autodoc]] StableDiffusionUpscalePipeline
|
||||
- all
|
||||
- __call__
|
||||
- enable_attention_slicing
|
||||
- disable_attention_slicing
|
||||
- enable_xformers_memory_efficient_attention
|
||||
- disable_xformers_memory_efficient_attention
|
||||
+3
-5
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -17,7 +17,7 @@ specific language governing permissions and limitations under the License.
|
||||
The Stable Diffusion model was created by the researchers and engineers from [CompVis](https://github.com/CompVis), [Stability AI](https://stability.ai/), [runway](https://github.com/runwayml), and [LAION](https://laion.ai/). The [`StableDiffusionPipeline`] is capable of generating photo-realistic images given any text input using Stable Diffusion.
|
||||
|
||||
The original codebase can be found here:
|
||||
- *Stable Diffusion V1*: [CompVis/stable-diffusion](https://github.com/CompVis/stable-diffusion)
|
||||
- *Stable Diffusion V1*: [CampVis/stable-diffusion](https://github.com/CompVis/stable-diffusion)
|
||||
- *Stable Diffusion v2*: [Stability-AI/stablediffusion](https://github.com/Stability-AI/stablediffusion)
|
||||
|
||||
Available Checkpoints are:
|
||||
@@ -36,6 +36,4 @@ Available Checkpoints are:
|
||||
- enable_vae_slicing
|
||||
- disable_vae_slicing
|
||||
- enable_xformers_memory_efficient_attention
|
||||
- disable_xformers_memory_efficient_attention
|
||||
- enable_vae_tiling
|
||||
- disable_vae_tiling
|
||||
- disable_xformers_memory_efficient_attention
|
||||
+1
-1
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
+1
-1
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
+3
-3
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -24,7 +24,7 @@ The abstract of the paper is the following:
|
||||
|
||||
| Pipeline | Tasks | Colab | Demo
|
||||
|---|---|:---:|:---:|
|
||||
| [pipeline_stable_diffusion_safe.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py) | *Text-to-Image Generation* | [](https://colab.research.google.com/github/ml-research/safe-latent-diffusion/blob/main/examples/Safe%20Latent%20Diffusion.ipynb) | [](https://huggingface.co/spaces/AIML-TUDA/unsafe-vs-safe-stable-diffusion)
|
||||
| [pipeline_stable_diffusion_safe.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py) | *Text-to-Image Generation* | [](https://colab.research.google.com/github/ml-research/safe-latent-diffusion/blob/main/examples/Safe%20Latent%20Diffusion.ipynb) | -
|
||||
|
||||
## Tips
|
||||
|
||||
@@ -58,7 +58,7 @@ You may use the 4 configurations defined in the [Safe Latent Diffusion paper](ht
|
||||
>>> out = pipeline(prompt=prompt, **SafetyConfig.MAX)
|
||||
```
|
||||
|
||||
The following configurations are available: `SafetyConfig.WEAK`, `SafetyConfig.MEDIUM`, `SafetyConfig.STRONG`, and `SafetyConfig.MAX`.
|
||||
The following configurations are available: `SafetyConfig.WEAK`, `SafetyConfig.MEDIUM`, `SafetyConfig.STRONg`, and `SafetyConfig.MAX`.
|
||||
|
||||
### How to load and use different schedulers.
|
||||
|
||||
+1
-1
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
+1
-1
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
+1
-1
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
+1
-1
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
+1
-1
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
+1
-1
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
+1
-1
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
+1
-1
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
+2
-11
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -37,18 +37,16 @@ To this end, the design of schedulers is such that:
|
||||
|
||||
- Schedulers can be used interchangeably between diffusion models in inference to find the preferred trade-off between speed and generation quality.
|
||||
- Schedulers are currently by default in PyTorch, but are designed to be framework independent (partial Jax support currently exists).
|
||||
- Many diffusion pipelines, such as [`StableDiffusionPipeline`] and [`DiTPipeline`] can use any of [`KarrasDiffusionSchedulers`]
|
||||
|
||||
## Schedulers Summary
|
||||
|
||||
The following table summarizes all officially supported schedulers, their corresponding paper
|
||||
|
||||
|
||||
| Scheduler | Paper |
|
||||
|---|---|
|
||||
| [ddim](./ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) |
|
||||
| [ddim_inverse](./ddim_inverse) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) |
|
||||
| [ddpm](./ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) |
|
||||
| [deis](./deis) | [**DEISMultistepScheduler**](https://arxiv.org/abs/2204.13902) |
|
||||
| [singlestep_dpm_solver](./singlestep_dpm_solver) | [**Singlestep DPM-Solver**](https://arxiv.org/abs/2206.00927) |
|
||||
| [multistep_dpm_solver](./multistep_dpm_solver) | [**Multistep DPM-Solver**](https://arxiv.org/abs/2206.00927) |
|
||||
| [heun](./heun) | [**Heun scheduler inspired by Karras et. al paper**](https://arxiv.org/abs/2206.00364) |
|
||||
@@ -63,7 +61,6 @@ The following table summarizes all officially supported schedulers, their corres
|
||||
| [euler](./euler) | [**Euler scheduler**](https://arxiv.org/abs/2206.00364) |
|
||||
| [euler_ancestral](./euler_ancestral) | [**Euler Ancestral scheduler**](https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L72) |
|
||||
| [vq_diffusion](./vq_diffusion) | [**VQDiffusionScheduler**](https://arxiv.org/abs/2111.14822) |
|
||||
| [unipc](./unipc) | [**UniPCMultistepScheduler**](https://arxiv.org/abs/2302.04867) |
|
||||
| [repaint](./repaint) | [**RePaint scheduler**](https://arxiv.org/abs/2201.09865) |
|
||||
|
||||
## API
|
||||
@@ -83,10 +80,4 @@ The class [`SchedulerOutput`] contains the outputs from any schedulers `step(...
|
||||
|
||||
[[autodoc]] schedulers.scheduling_utils.SchedulerOutput
|
||||
|
||||
### KarrasDiffusionSchedulers
|
||||
|
||||
`KarrasDiffusionSchedulers` encompasses the main generalization of schedulers in Diffusers. The schedulers in this class are distinguished, at a high level, by their noise sampling strategy; the type of network and scaling; and finally the training strategy or how the loss is weighed.
|
||||
|
||||
The different schedulers, depending on the type of ODE solver, fall into the above taxonomy and provide a good abstraction for the design of the main schedulers implemented in Diffusers. The schedulers in this class are given below:
|
||||
|
||||
[[autodoc]] schedulers.scheduling_utils.KarrasDiffusionSchedulers
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
+1
-1
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
+1
-1
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
+1
-1
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
+1
-1
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
+1
-1
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
+2
-2
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -177,7 +177,7 @@ Follow these steps to start contributing ([supported Python versions](https://gi
|
||||
$ make style
|
||||
```
|
||||
|
||||
🧨 Diffusers also uses `ruff` and a few custom scripts to check for coding mistakes. Quality
|
||||
🧨 Diffusers also uses `flake8` and a few custom scripts to check for coding mistakes. Quality
|
||||
control runs in CI, however you can also run the same checks with:
|
||||
|
||||
```bash
|
||||
@@ -0,0 +1,17 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Philosophy
|
||||
|
||||
- Readability and clarity are preferred over highly optimized code. A strong importance is put on providing readable, intuitive and elementary code design. *E.g.*, the provided [schedulers](https://github.com/huggingface/diffusers/tree/main/src/diffusers/schedulers) are separated from the provided [models](https://github.com/huggingface/diffusers/tree/main/src/diffusers/models) and use well-commented code that can be read alongside the original paper.
|
||||
- Diffusers is **modality independent** and focuses on providing pretrained models and tools to build systems that generate **continuous outputs**, *e.g.* vision and audio. This is one of the guiding goals even if the initial pipelines are devoted to vision tasks.
|
||||
- Diffusion models and schedulers are provided as concise, elementary building blocks. In contrast, diffusion pipelines are a collection of end-to-end diffusion systems that can be used out-of-the-box, should stay as close as possible to their original implementations and can include components of other libraries, such as text encoders. Examples of diffusion pipelines are [Glide](https://github.com/openai/glide-text2im), [Latent Diffusion](https://github.com/CompVis/latent-diffusion) and [Stable Diffusion](https://github.com/compvis/stable-diffusion).
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -10,7 +10,6 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# 번역중
|
||||
# Stable Diffusion
|
||||
|
||||
열심히 번역을 진행중입니다. 조금만 기다려주세요.
|
||||
감사합니다!
|
||||
Please visit this [very in-detail blog post](https://huggingface.co/blog/stable_diffusion) on Stable Diffusion!
|
||||
@@ -1,240 +0,0 @@
|
||||
- sections:
|
||||
- local: index
|
||||
title: 🧨 Diffusers
|
||||
- local: quicktour
|
||||
title: Quicktour
|
||||
- local: stable_diffusion
|
||||
title: Stable Diffusion
|
||||
- local: installation
|
||||
title: Installation
|
||||
title: Get started
|
||||
- sections:
|
||||
- local: tutorials/basic_training
|
||||
title: Train a diffusion model
|
||||
title: Tutorials
|
||||
- sections:
|
||||
- sections:
|
||||
- local: using-diffusers/loading
|
||||
title: Loading Pipelines, Models, and Schedulers
|
||||
- local: using-diffusers/schedulers
|
||||
title: Using different Schedulers
|
||||
- local: using-diffusers/configuration
|
||||
title: Configuring Pipelines, Models, and Schedulers
|
||||
- local: using-diffusers/custom_pipeline_overview
|
||||
title: Loading and Adding Custom Pipelines
|
||||
- local: using-diffusers/kerascv
|
||||
title: Using KerasCV Stable Diffusion Checkpoints in Diffusers
|
||||
title: Loading & Hub
|
||||
- sections:
|
||||
- local: using-diffusers/unconditional_image_generation
|
||||
title: Unconditional Image Generation
|
||||
- local: using-diffusers/conditional_image_generation
|
||||
title: Text-to-Image Generation
|
||||
- local: using-diffusers/img2img
|
||||
title: Text-Guided Image-to-Image
|
||||
- local: using-diffusers/inpaint
|
||||
title: Text-Guided Image-Inpainting
|
||||
- local: using-diffusers/depth2img
|
||||
title: Text-Guided Depth-to-Image
|
||||
- local: using-diffusers/controlling_generation
|
||||
title: Controlling generation
|
||||
- local: using-diffusers/reusing_seeds
|
||||
title: Reusing seeds for deterministic generation
|
||||
- local: using-diffusers/reproducibility
|
||||
title: Reproducibility
|
||||
- local: using-diffusers/custom_pipeline_examples
|
||||
title: Community Pipelines
|
||||
- local: using-diffusers/contribute_pipeline
|
||||
title: How to contribute a Pipeline
|
||||
- local: using-diffusers/using_safetensors
|
||||
title: Using safetensors
|
||||
- local: using-diffusers/weighted_prompts
|
||||
title: Weighting Prompts
|
||||
title: Pipelines for Inference
|
||||
- sections:
|
||||
- local: using-diffusers/rl
|
||||
title: Reinforcement Learning
|
||||
- local: using-diffusers/audio
|
||||
title: Audio
|
||||
- local: using-diffusers/other-modalities
|
||||
title: Other Modalities
|
||||
title: Taking Diffusers Beyond Images
|
||||
title: Using Diffusers
|
||||
- sections:
|
||||
- local: optimization/fp16
|
||||
title: Memory and Speed
|
||||
- local: optimization/torch2.0
|
||||
title: Torch2.0 support
|
||||
- local: optimization/xformers
|
||||
title: xFormers
|
||||
- local: optimization/onnx
|
||||
title: ONNX
|
||||
- local: optimization/open_vino
|
||||
title: OpenVINO
|
||||
- local: optimization/mps
|
||||
title: MPS
|
||||
- local: optimization/habana
|
||||
title: Habana Gaudi
|
||||
title: Optimization/Special Hardware
|
||||
- sections:
|
||||
- local: training/overview
|
||||
title: Overview
|
||||
- local: training/unconditional_training
|
||||
title: Unconditional Image Generation
|
||||
- local: training/text_inversion
|
||||
title: Textual Inversion
|
||||
- local: training/dreambooth
|
||||
title: DreamBooth
|
||||
- local: training/text2image
|
||||
title: Text-to-image
|
||||
- local: training/lora
|
||||
title: Low-Rank Adaptation of Large Language Models (LoRA)
|
||||
title: Training
|
||||
- sections:
|
||||
- local: conceptual/philosophy
|
||||
title: Philosophy
|
||||
- local: conceptual/contribution
|
||||
title: How to contribute?
|
||||
- local: conceptual/ethical_guidelines
|
||||
title: Diffusers' Ethical Guidelines
|
||||
title: Conceptual Guides
|
||||
- sections:
|
||||
- sections:
|
||||
- local: api/models
|
||||
title: Models
|
||||
- local: api/diffusion_pipeline
|
||||
title: Diffusion Pipeline
|
||||
- local: api/logging
|
||||
title: Logging
|
||||
- local: api/configuration
|
||||
title: Configuration
|
||||
- local: api/outputs
|
||||
title: Outputs
|
||||
- local: api/loaders
|
||||
title: Loaders
|
||||
title: Main Classes
|
||||
- sections:
|
||||
- local: api/pipelines/overview
|
||||
title: Overview
|
||||
- local: api/pipelines/alt_diffusion
|
||||
title: AltDiffusion
|
||||
- local: api/pipelines/audio_diffusion
|
||||
title: Audio Diffusion
|
||||
- local: api/pipelines/cycle_diffusion
|
||||
title: Cycle Diffusion
|
||||
- local: api/pipelines/dance_diffusion
|
||||
title: Dance Diffusion
|
||||
- local: api/pipelines/ddim
|
||||
title: DDIM
|
||||
- local: api/pipelines/ddpm
|
||||
title: DDPM
|
||||
- local: api/pipelines/dit
|
||||
title: DiT
|
||||
- local: api/pipelines/latent_diffusion
|
||||
title: Latent Diffusion
|
||||
- local: api/pipelines/paint_by_example
|
||||
title: PaintByExample
|
||||
- local: api/pipelines/pndm
|
||||
title: PNDM
|
||||
- local: api/pipelines/repaint
|
||||
title: RePaint
|
||||
- local: api/pipelines/stable_diffusion_safe
|
||||
title: Safe Stable Diffusion
|
||||
- local: api/pipelines/score_sde_ve
|
||||
title: Score SDE VE
|
||||
- local: api/pipelines/semantic_stable_diffusion
|
||||
title: Semantic Guidance
|
||||
- sections:
|
||||
- local: api/pipelines/stable_diffusion/overview
|
||||
title: Overview
|
||||
- local: api/pipelines/stable_diffusion/text2img
|
||||
title: Text-to-Image
|
||||
- local: api/pipelines/stable_diffusion/img2img
|
||||
title: Image-to-Image
|
||||
- local: api/pipelines/stable_diffusion/inpaint
|
||||
title: Inpaint
|
||||
- local: api/pipelines/stable_diffusion/depth2img
|
||||
title: Depth-to-Image
|
||||
- local: api/pipelines/stable_diffusion/image_variation
|
||||
title: Image-Variation
|
||||
- local: api/pipelines/stable_diffusion/upscale
|
||||
title: Super-Resolution
|
||||
- local: api/pipelines/stable_diffusion/latent_upscale
|
||||
title: Stable-Diffusion-Latent-Upscaler
|
||||
- local: api/pipelines/stable_diffusion/pix2pix
|
||||
title: InstructPix2Pix
|
||||
- local: api/pipelines/stable_diffusion/attend_and_excite
|
||||
title: Attend and Excite
|
||||
- local: api/pipelines/stable_diffusion/pix2pix_zero
|
||||
title: Pix2Pix Zero
|
||||
- local: api/pipelines/stable_diffusion/self_attention_guidance
|
||||
title: Self-Attention Guidance
|
||||
- local: api/pipelines/stable_diffusion/panorama
|
||||
title: MultiDiffusion Panorama
|
||||
- local: api/pipelines/stable_diffusion/controlnet
|
||||
title: Text-to-Image Generation with ControlNet Conditioning
|
||||
title: Stable Diffusion
|
||||
- local: api/pipelines/stable_diffusion_2
|
||||
title: Stable Diffusion 2
|
||||
- local: api/pipelines/stable_unclip
|
||||
title: Stable unCLIP
|
||||
- local: api/pipelines/stochastic_karras_ve
|
||||
title: Stochastic Karras VE
|
||||
- local: api/pipelines/unclip
|
||||
title: UnCLIP
|
||||
- local: api/pipelines/latent_diffusion_uncond
|
||||
title: Unconditional Latent Diffusion
|
||||
- local: api/pipelines/versatile_diffusion
|
||||
title: Versatile Diffusion
|
||||
- local: api/pipelines/vq_diffusion
|
||||
title: VQ Diffusion
|
||||
title: Pipelines
|
||||
- sections:
|
||||
- local: api/schedulers/overview
|
||||
title: Overview
|
||||
- local: api/schedulers/ddim
|
||||
title: DDIM
|
||||
- local: api/schedulers/ddim_inverse
|
||||
title: DDIMInverse
|
||||
- local: api/schedulers/ddpm
|
||||
title: DDPM
|
||||
- local: api/schedulers/deis
|
||||
title: DEIS
|
||||
- local: api/schedulers/dpm_discrete
|
||||
title: DPM Discrete Scheduler
|
||||
- local: api/schedulers/dpm_discrete_ancestral
|
||||
title: DPM Discrete Scheduler with ancestral sampling
|
||||
- local: api/schedulers/euler_ancestral
|
||||
title: Euler Ancestral Scheduler
|
||||
- local: api/schedulers/euler
|
||||
title: Euler scheduler
|
||||
- local: api/schedulers/heun
|
||||
title: Heun Scheduler
|
||||
- local: api/schedulers/ipndm
|
||||
title: IPNDM
|
||||
- local: api/schedulers/lms_discrete
|
||||
title: Linear Multistep
|
||||
- local: api/schedulers/multistep_dpm_solver
|
||||
title: Multistep DPM-Solver
|
||||
- local: api/schedulers/pndm
|
||||
title: PNDM
|
||||
- local: api/schedulers/repaint
|
||||
title: RePaint Scheduler
|
||||
- local: api/schedulers/singlestep_dpm_solver
|
||||
title: Singlestep DPM-Solver
|
||||
- local: api/schedulers/stochastic_karras_ve
|
||||
title: Stochastic Kerras VE
|
||||
- local: api/schedulers/unipc
|
||||
title: UniPCMultistepScheduler
|
||||
- local: api/schedulers/score_sde_ve
|
||||
title: VE-SDE
|
||||
- local: api/schedulers/score_sde_vp
|
||||
title: VP-SDE
|
||||
- local: api/schedulers/vq_diffusion
|
||||
title: VQDiffusionScheduler
|
||||
title: Schedulers
|
||||
- sections:
|
||||
- local: api/experimental/rl
|
||||
title: RL Planning
|
||||
title: Experimental Features
|
||||
title: API
|
||||
@@ -1,30 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Loaders
|
||||
|
||||
There are many ways to train adapter neural networks for diffusion models, such as
|
||||
- [Textual Inversion](./training/text_inversion.mdx)
|
||||
- [LoRA](https://github.com/cloneofsimo/lora)
|
||||
- [Hypernetworks](https://arxiv.org/abs/1609.09106)
|
||||
|
||||
Such adapter neural networks often only consist of a fraction of the number of weights compared
|
||||
to the pretrained model and as such are very portable. The Diffusers library offers an easy-to-use
|
||||
API to load such adapter neural networks via the [`loaders.py` module](https://github.com/huggingface/diffusers/blob/main/src/diffusers/loaders.py).
|
||||
|
||||
**Note**: This module is still highly experimental and prone to future changes.
|
||||
|
||||
## LoaderMixins
|
||||
|
||||
### UNet2DConditionLoadersMixin
|
||||
|
||||
[[autodoc]] loaders.UNet2DConditionLoadersMixin
|
||||
@@ -1,59 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Scalable Diffusion Models with Transformers (DiT)
|
||||
|
||||
## Overview
|
||||
|
||||
[Scalable Diffusion Models with Transformers](https://arxiv.org/abs/2212.09748) (DiT) by William Peebles and Saining Xie.
|
||||
|
||||
The abstract of the paper is the following:
|
||||
|
||||
*We explore a new class of diffusion models based on the transformer architecture. We train latent diffusion models of images, replacing the commonly-used U-Net backbone with a transformer that operates on latent patches. We analyze the scalability of our Diffusion Transformers (DiTs) through the lens of forward pass complexity as measured by Gflops. We find that DiTs with higher Gflops -- through increased transformer depth/width or increased number of input tokens -- consistently have lower FID. In addition to possessing good scalability properties, our largest DiT-XL/2 models outperform all prior diffusion models on the class-conditional ImageNet 512x512 and 256x256 benchmarks, achieving a state-of-the-art FID of 2.27 on the latter.*
|
||||
|
||||
The original codebase of this paper can be found here: [facebookresearch/dit](https://github.com/facebookresearch/dit).
|
||||
|
||||
## Available Pipelines:
|
||||
|
||||
| Pipeline | Tasks | Colab
|
||||
|---|---|:---:|
|
||||
| [pipeline_dit.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/dit/pipeline_dit.py) | *Conditional Image Generation* | - |
|
||||
|
||||
|
||||
## Usage example
|
||||
|
||||
```python
|
||||
from diffusers import DiTPipeline, DPMSolverMultistepScheduler
|
||||
import torch
|
||||
|
||||
pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256", torch_dtype=torch.float16)
|
||||
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
|
||||
pipe = pipe.to("cuda")
|
||||
|
||||
# pick words from Imagenet class labels
|
||||
pipe.labels # to print all available words
|
||||
|
||||
# pick words that exist in ImageNet
|
||||
words = ["white shark", "umbrella"]
|
||||
|
||||
class_ids = pipe.get_label_ids(words)
|
||||
|
||||
generator = torch.manual_seed(33)
|
||||
output = pipe(class_labels=class_ids, num_inference_steps=25, generator=generator)
|
||||
|
||||
image = output.images[0] # label 'white shark'
|
||||
```
|
||||
|
||||
## DiTPipeline
|
||||
[[autodoc]] DiTPipeline
|
||||
- all
|
||||
- __call__
|
||||
@@ -1,79 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Semantic Guidance
|
||||
|
||||
Semantic Guidance for Diffusion Models was proposed in [SEGA: Instructing Diffusion using Semantic Dimensions](https://arxiv.org/abs/2301.12247) and provides strong semantic control over the image generation.
|
||||
Small changes to the text prompt usually result in entirely different output images. However, with SEGA a variety of changes to the image are enabled that can be controlled easily and intuitively, and stay true to the original image composition.
|
||||
|
||||
The abstract of the paper is the following:
|
||||
|
||||
*Text-to-image diffusion models have recently received a lot of interest for their astonishing ability to produce high-fidelity images from text only. However, achieving one-shot generation that aligns with the user's intent is nearly impossible, yet small changes to the input prompt often result in very different images. This leaves the user with little semantic control. To put the user in control, we show how to interact with the diffusion process to flexibly steer it along semantic directions. This semantic guidance (SEGA) allows for subtle and extensive edits, changes in composition and style, as well as optimizing the overall artistic conception. We demonstrate SEGA's effectiveness on a variety of tasks and provide evidence for its versatility and flexibility.*
|
||||
|
||||
|
||||
*Overview*:
|
||||
|
||||
| Pipeline | Tasks | Colab | Demo
|
||||
|---|---|:---:|:---:|
|
||||
| [pipeline_semantic_stable_diffusion.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion) | *Text-to-Image Generation* | [](https://colab.research.google.com/github/ml-research/semantic-image-editing/blob/main/examples/SemanticGuidance.ipynb) | [Coming Soon](https://huggingface.co/AIML-TUDA)
|
||||
|
||||
## Tips
|
||||
|
||||
- The Semantic Guidance pipeline can be used with any [Stable Diffusion](./api/pipelines/stable_diffusion/text2img) checkpoint.
|
||||
|
||||
### Run Semantic Guidance
|
||||
|
||||
The interface of [`SemanticStableDiffusionPipeline`] provides several additional parameters to influence the image generation.
|
||||
Exemplary usage may look like this:
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import SemanticStableDiffusionPipeline
|
||||
|
||||
pipe = SemanticStableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
|
||||
pipe = pipe.to("cuda")
|
||||
|
||||
out = pipe(
|
||||
prompt="a photo of the face of a woman",
|
||||
num_images_per_prompt=1,
|
||||
guidance_scale=7,
|
||||
editing_prompt=[
|
||||
"smiling, smile", # Concepts to apply
|
||||
"glasses, wearing glasses",
|
||||
"curls, wavy hair, curly hair",
|
||||
"beard, full beard, mustache",
|
||||
],
|
||||
reverse_editing_direction=[False, False, False, False], # Direction of guidance i.e. increase all concepts
|
||||
edit_warmup_steps=[10, 10, 10, 10], # Warmup period for each concept
|
||||
edit_guidance_scale=[4, 5, 5, 5.4], # Guidance scale for each concept
|
||||
edit_threshold=[
|
||||
0.99,
|
||||
0.975,
|
||||
0.925,
|
||||
0.96,
|
||||
], # Threshold for each concept. Threshold equals the percentile of the latent space that will be discarded. I.e. threshold=0.99 uses 1% of the latent dimensions
|
||||
edit_momentum_scale=0.3, # Momentum scale that will be added to the latent guidance
|
||||
edit_mom_beta=0.6, # Momentum beta
|
||||
edit_weights=[1, 1, 1, 1, 1], # Weights of the individual concepts against each other
|
||||
)
|
||||
```
|
||||
|
||||
For more examples check the colab notebook.
|
||||
|
||||
## StableDiffusionSafePipelineOutput
|
||||
[[autodoc]] pipelines.semantic_stable_diffusion.SemanticStableDiffusionPipelineOutput
|
||||
- all
|
||||
|
||||
## SemanticStableDiffusionPipeline
|
||||
[[autodoc]] SemanticStableDiffusionPipeline
|
||||
- all
|
||||
- __call__
|
||||
@@ -1,75 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Attend and Excite: Attention-Based Semantic Guidance for Text-to-Image Diffusion Models
|
||||
|
||||
## Overview
|
||||
|
||||
Attend and Excite for Stable Diffusion was proposed in [Attend-and-Excite: Attention-Based Semantic Guidance for Text-to-Image Diffusion Models](https://attendandexcite.github.io/Attend-and-Excite/) and provides textual attention control over the image generation.
|
||||
|
||||
The abstract of the paper is the following:
|
||||
|
||||
*Text-to-image diffusion models have recently received a lot of interest for their astonishing ability to produce high-fidelity images from text only. However, achieving one-shot generation that aligns with the user's intent is nearly impossible, yet small changes to the input prompt often result in very different images. This leaves the user with little semantic control. To put the user in control, we show how to interact with the diffusion process to flexibly steer it along semantic directions. This semantic guidance (SEGA) allows for subtle and extensive edits, changes in composition and style, as well as optimizing the overall artistic conception. We demonstrate SEGA's effectiveness on a variety of tasks and provide evidence for its versatility and flexibility.*
|
||||
|
||||
Resources
|
||||
|
||||
* [Project Page](https://attendandexcite.github.io/Attend-and-Excite/)
|
||||
* [Paper](https://arxiv.org/abs/2301.13826)
|
||||
* [Original Code](https://github.com/AttendAndExcite/Attend-and-Excite)
|
||||
* [Demo](https://huggingface.co/spaces/AttendAndExcite/Attend-and-Excite)
|
||||
|
||||
|
||||
## Available Pipelines:
|
||||
|
||||
| Pipeline | Tasks | Colab | Demo
|
||||
|---|---|:---:|:---:|
|
||||
| [pipeline_semantic_stable_diffusion_attend_and_excite.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_semantic_stable_diffusion_attend_and_excite) | *Text-to-Image Generation* | - | https://huggingface.co/spaces/AttendAndExcite/Attend-and-Excite
|
||||
|
||||
|
||||
### Usage example
|
||||
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import StableDiffusionAttendAndExcitePipeline
|
||||
|
||||
model_id = "CompVis/stable-diffusion-v1-4"
|
||||
pipe = StableDiffusionAttendAndExcitePipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
|
||||
pipe = pipe.to("cuda")
|
||||
|
||||
prompt = "a cat and a frog"
|
||||
|
||||
# use get_indices function to find out indices of the tokens you want to alter
|
||||
pipe.get_indices(prompt)
|
||||
|
||||
token_indices = [2, 5]
|
||||
seed = 6141
|
||||
generator = torch.Generator("cuda").manual_seed(seed)
|
||||
|
||||
images = pipe(
|
||||
prompt=prompt,
|
||||
token_indices=token_indices,
|
||||
guidance_scale=7.5,
|
||||
generator=generator,
|
||||
num_inference_steps=50,
|
||||
max_iter_to_alter=25,
|
||||
).images
|
||||
|
||||
image = images[0]
|
||||
image.save(f"../images/{prompt}_{seed}.png")
|
||||
```
|
||||
|
||||
|
||||
## StableDiffusionAttendAndExcitePipeline
|
||||
[[autodoc]] StableDiffusionAttendAndExcitePipeline
|
||||
- all
|
||||
- __call__
|
||||
@@ -1,167 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Text-to-Image Generation with ControlNet Conditioning
|
||||
|
||||
## Overview
|
||||
|
||||
[Adding Conditional Control to Text-to-Image Diffusion Models](https://arxiv.org/abs/2302.05543) by Lvmin Zhang and Maneesh Agrawala.
|
||||
|
||||
Using the pretrained models we can provide control images (for example, a depth map) to control Stable Diffusion text-to-image generation so that it follows the structure of the depth image and fills in the details.
|
||||
|
||||
The abstract of the paper is the following:
|
||||
|
||||
*We present a neural network structure, ControlNet, to control pretrained large diffusion models to support additional input conditions. The ControlNet learns task-specific conditions in an end-to-end way, and the learning is robust even when the training dataset is small (< 50k). Moreover, training a ControlNet is as fast as fine-tuning a diffusion model, and the model can be trained on a personal devices. Alternatively, if powerful computation clusters are available, the model can scale to large amounts (millions to billions) of data. We report that large diffusion models like Stable Diffusion can be augmented with ControlNets to enable conditional inputs like edge maps, segmentation maps, keypoints, etc. This may enrich the methods to control large diffusion models and further facilitate related applications.*
|
||||
|
||||
This model was contributed by the amazing community contributor [takuma104](https://huggingface.co/takuma104) ❤️ .
|
||||
|
||||
Resources:
|
||||
|
||||
* [Paper](https://arxiv.org/abs/2302.05543)
|
||||
* [Original Code](https://github.com/lllyasviel/ControlNet)
|
||||
|
||||
## Available Pipelines:
|
||||
|
||||
| Pipeline | Tasks | Demo
|
||||
|---|---|:---:|
|
||||
| [StableDiffusionControlNetPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py) | *Text-to-Image Generation with ControlNet Conditioning* | [Colab Example](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/controlnet.ipynb)
|
||||
|
||||
## Usage example
|
||||
|
||||
In the following we give a simple example of how to use a *ControlNet* checkpoint with Diffusers for inference.
|
||||
The inference pipeline is the same for all pipelines:
|
||||
|
||||
* 1. Take an image and run it through a pre-conditioning processor.
|
||||
* 2. Run the pre-processed image through the [`StableDiffusionControlNetPipeline`].
|
||||
|
||||
Let's have a look at a simple example using the [Canny Edge ControlNet](https://huggingface.co/lllyasviel/sd-controlnet-canny).
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionControlNetPipeline
|
||||
from diffusers.utils import load_image
|
||||
|
||||
# Let's load the popular vermeer image
|
||||
image = load_image(
|
||||
"https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"
|
||||
)
|
||||
```
|
||||
|
||||

|
||||
|
||||
Next, we process the image to get the canny image. This is step *1.* - running the pre-conditioning processor. The pre-conditioning processor is different for every ControlNet. Please see the model cards of the [official checkpoints](#controlnet-with-stable-diffusion-1.5) for more information about other models.
|
||||
|
||||
First, we need to install opencv:
|
||||
|
||||
```
|
||||
pip install opencv-contrib-python
|
||||
```
|
||||
|
||||
Next, let's also install all required Hugging Face libraries:
|
||||
|
||||
```
|
||||
pip install diffusers transformers git+https://github.com/huggingface/accelerate.git
|
||||
```
|
||||
|
||||
Then we can retrieve the canny edges of the image.
|
||||
|
||||
```python
|
||||
import cv2
|
||||
from PIL import Image
|
||||
import numpy as np
|
||||
|
||||
image = np.array(image)
|
||||
|
||||
low_threshold = 100
|
||||
high_threshold = 200
|
||||
|
||||
image = cv2.Canny(image, low_threshold, high_threshold)
|
||||
image = image[:, :, None]
|
||||
image = np.concatenate([image, image, image], axis=2)
|
||||
canny_image = Image.fromarray(image)
|
||||
```
|
||||
|
||||
Let's take a look at the processed image.
|
||||
|
||||

|
||||
|
||||
Now, we load the official [Stable Diffusion 1.5 Model](runwayml/stable-diffusion-v1-5) as well as the ControlNet for canny edges.
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
|
||||
import torch
|
||||
|
||||
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
|
||||
pipe = StableDiffusionControlNetPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
|
||||
)
|
||||
```
|
||||
|
||||
To speed-up things and reduce memory, let's enable model offloading and use the fast [`UniPCMultistepScheduler`].
|
||||
|
||||
```py
|
||||
from diffusers import UniPCMultistepScheduler
|
||||
|
||||
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
# this command loads the individual model components on GPU on-demand.
|
||||
pipe.enable_model_cpu_offload()
|
||||
```
|
||||
|
||||
Finally, we can run the pipeline:
|
||||
|
||||
```py
|
||||
generator = torch.manual_seed(0)
|
||||
|
||||
out_image = pipe(
|
||||
"disco dancer with colorful lights", num_inference_steps=20, generator=generator, image=canny_image
|
||||
).images[0]
|
||||
```
|
||||
|
||||
This should take only around 3-4 seconds on GPU (depending on hardware). The output image then looks as follows:
|
||||
|
||||

|
||||
|
||||
|
||||
**Note**: To see how to run all other ControlNet checkpoints, please have a look at [ControlNet with Stable Diffusion 1.5](#controlnet-with-stable-diffusion-1.5)
|
||||
|
||||
<!-- TODO: add space -->
|
||||
|
||||
## Available checkpoints
|
||||
|
||||
ControlNet requires a *control image* in addition to the text-to-image *prompt*.
|
||||
Each pretrained model is trained using a different conditioning method that requires different images for conditioning the generated outputs. For example, Canny edge conditioning requires the control image to be the output of a Canny filter, while depth conditioning requires the control image to be a depth map. See the overview and image examples below to know more.
|
||||
|
||||
All checkpoints can be found under the authors' namespace [lllyasviel](https://huggingface.co/lllyasviel).
|
||||
|
||||
### ControlNet with Stable Diffusion 1.5
|
||||
|
||||
| Model Name | Control Image Overview| Control Image Example | Generated Image Example |
|
||||
|---|---|---|---|
|
||||
|[lllyasviel/sd-controlnet-canny](https://huggingface.co/lllyasviel/sd-controlnet-canny)<br/> *Trained with canny edge detection* | A monochrome image with white edges on a black background.|<a href="https://huggingface.co/takuma104/controlnet_dev/blob/main/gen_compare/control_images/converted/control_bird_canny.png"><img width="64" style="margin:0;padding:0;" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/control_images/converted/control_bird_canny.png"/></a>|<a href="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/output_images/diffusers/output_bird_canny_1.png"><img width="64" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/output_images/diffusers/output_bird_canny_1.png"/></a>|
|
||||
|[lllyasviel/sd-controlnet-depth](https://huggingface.co/lllyasviel/sd-controlnet-depth)<br/> *Trained with Midas depth estimation* |A grayscale image with black representing deep areas and white representing shallow areas.|<a href="https://huggingface.co/takuma104/controlnet_dev/blob/main/gen_compare/control_images/converted/control_vermeer_depth.png"><img width="64" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/control_images/converted/control_vermeer_depth.png"/></a>|<a href="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/output_images/diffusers/output_vermeer_depth_2.png"><img width="64" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/output_images/diffusers/output_vermeer_depth_2.png"/></a>|
|
||||
|[lllyasviel/sd-controlnet-hed](https://huggingface.co/lllyasviel/sd-controlnet-hed)<br/> *Trained with HED edge detection (soft edge)* |A monochrome image with white soft edges on a black background.|<a href="https://huggingface.co/takuma104/controlnet_dev/blob/main/gen_compare/control_images/converted/control_bird_hed.png"><img width="64" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/control_images/converted/control_bird_hed.png"/></a>|<a href="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/output_images/diffusers/output_bird_hed_1.png"><img width="64" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/output_images/diffusers/output_bird_hed_1.png"/></a> |
|
||||
|[lllyasviel/sd-controlnet-mlsd](https://huggingface.co/lllyasviel/sd-controlnet-mlsd)<br/> *Trained with M-LSD line detection* |A monochrome image composed only of white straight lines on a black background.|<a href="https://huggingface.co/takuma104/controlnet_dev/blob/main/gen_compare/control_images/converted/control_room_mlsd.png"><img width="64" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/control_images/converted/control_room_mlsd.png"/></a>|<a href="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/output_images/diffusers/output_room_mlsd_0.png"><img width="64" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/output_images/diffusers/output_room_mlsd_0.png"/></a>|
|
||||
|[lllyasviel/sd-controlnet-normal](https://huggingface.co/lllyasviel/sd-controlnet-normal)<br/> *Trained with normal map* |A [normal mapped](https://en.wikipedia.org/wiki/Normal_mapping) image.|<a href="https://huggingface.co/takuma104/controlnet_dev/blob/main/gen_compare/control_images/converted/control_human_normal.png"><img width="64" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/control_images/converted/control_human_normal.png"/></a>|<a href="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/output_images/diffusers/output_human_normal_1.png"><img width="64" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/output_images/diffusers/output_human_normal_1.png"/></a>|
|
||||
|[lllyasviel/sd-controlnet-openpose](https://huggingface.co/lllyasviel/sd-controlnet_openpose)<br/> *Trained with OpenPose bone image* |A [OpenPose bone](https://github.com/CMU-Perceptual-Computing-Lab/openpose) image.|<a href="https://huggingface.co/takuma104/controlnet_dev/blob/main/gen_compare/control_images/converted/control_human_openpose.png"><img width="64" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/control_images/converted/control_human_openpose.png"/></a>|<a href="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/output_images/diffusers/output_human_openpose_0.png"><img width="64" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/output_images/diffusers/output_human_openpose_0.png"/></a>|
|
||||
|[lllyasviel/sd-controlnet-scribble](https://huggingface.co/lllyasviel/sd-controlnet_scribble)<br/> *Trained with human scribbles* |A hand-drawn monochrome image with white outlines on a black background.|<a href="https://huggingface.co/takuma104/controlnet_dev/blob/main/gen_compare/control_images/converted/control_vermeer_scribble.png"><img width="64" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/control_images/converted/control_vermeer_scribble.png"/></a>|<a href="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/output_images/diffusers/output_vermeer_scribble_0.png"><img width="64" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/output_images/diffusers/output_vermeer_scribble_0.png"/></a> |
|
||||
|[lllyasviel/sd-controlnet-seg](https://huggingface.co/lllyasviel/sd-controlnet_seg)<br/>*Trained with semantic segmentation* |An [ADE20K](https://groups.csail.mit.edu/vision/datasets/ADE20K/)'s segmentation protocol image.|<a href="https://huggingface.co/takuma104/controlnet_dev/blob/main/gen_compare/control_images/converted/control_room_seg.png"><img width="64" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/control_images/converted/control_room_seg.png"/></a>|<a href="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/output_images/diffusers/output_room_seg_1.png"><img width="64" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/output_images/diffusers/output_room_seg_1.png"/></a> |
|
||||
|
||||
## StableDiffusionControlNetPipeline
|
||||
[[autodoc]] StableDiffusionControlNetPipeline
|
||||
- all
|
||||
- __call__
|
||||
- enable_attention_slicing
|
||||
- disable_attention_slicing
|
||||
- enable_vae_slicing
|
||||
- disable_vae_slicing
|
||||
- enable_xformers_memory_efficient_attention
|
||||
- disable_xformers_memory_efficient_attention
|
||||
@@ -1,33 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Stable Diffusion Latent Upscaler
|
||||
|
||||
## StableDiffusionLatentUpscalePipeline
|
||||
|
||||
The Stable Diffusion Latent Upscaler model was created by [Katherine Crowson](https://github.com/crowsonkb/k-diffusion) in collaboration with [Stability AI](https://stability.ai/). It can be used on top of any [`StableDiffusionUpscalePipeline`] checkpoint to enhance its output image resolution by a factor of 2.
|
||||
|
||||
A notebook that demonstrates the original implementation can be found here:
|
||||
- [Stable Diffusion Upscaler Demo](https://colab.research.google.com/drive/1o1qYJcFeywzCIdkfKJy7cTpgZTCM2EI4)
|
||||
|
||||
Available Checkpoints are:
|
||||
- *stabilityai/latent-upscaler*: [stabilityai/sd-x2-latent-upscaler](https://huggingface.co/stabilityai/sd-x2-latent-upscaler)
|
||||
|
||||
|
||||
[[autodoc]] StableDiffusionLatentUpscalePipeline
|
||||
- all
|
||||
- __call__
|
||||
- enable_sequential_cpu_offload
|
||||
- enable_attention_slicing
|
||||
- disable_attention_slicing
|
||||
- enable_xformers_memory_efficient_attention
|
||||
- disable_xformers_memory_efficient_attention
|
||||
@@ -1,58 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# MultiDiffusion: Fusing Diffusion Paths for Controlled Image Generation
|
||||
|
||||
## Overview
|
||||
|
||||
[MultiDiffusion: Fusing Diffusion Paths for Controlled Image Generation](https://arxiv.org/abs/2302.08113) by Omer Bar-Tal, Lior Yariv, Yaron Lipman, and Tali Dekel.
|
||||
|
||||
The abstract of the paper is the following:
|
||||
|
||||
*Recent advances in text-to-image generation with diffusion models present transformative capabilities in image quality. However, user controllability of the generated image, and fast adaptation to new tasks still remains an open challenge, currently mostly addressed by costly and long re-training and fine-tuning or ad-hoc adaptations to specific image generation tasks. In this work, we present MultiDiffusion, a unified framework that enables versatile and controllable image generation, using a pre-trained text-to-image diffusion model, without any further training or finetuning. At the center of our approach is a new generation process, based on an optimization task that binds together multiple diffusion generation processes with a shared set of parameters or constraints. We show that MultiDiffusion can be readily applied to generate high quality and diverse images that adhere to user-provided controls, such as desired aspect ratio (e.g., panorama), and spatial guiding signals, ranging from tight segmentation masks to bounding boxes.
|
||||
|
||||
Resources:
|
||||
|
||||
* [Project Page](https://multidiffusion.github.io/).
|
||||
* [Paper](https://arxiv.org/abs/2302.08113).
|
||||
* [Original Code](https://github.com/omerbt/MultiDiffusion).
|
||||
* [Demo](https://huggingface.co/spaces/weizmannscience/MultiDiffusion).
|
||||
|
||||
## Available Pipelines:
|
||||
|
||||
| Pipeline | Tasks | Demo
|
||||
|---|---|:---:|
|
||||
| [StableDiffusionPanoramaPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py) | *Text-Guided Panorama View Generation* | [🤗 Space](https://huggingface.co/spaces/weizmannscience/MultiDiffusion)) |
|
||||
|
||||
<!-- TODO: add Colab -->
|
||||
|
||||
## Usage example
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import StableDiffusionPanoramaPipeline, DDIMScheduler
|
||||
|
||||
model_ckpt = "stabilityai/stable-diffusion-2-base"
|
||||
scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler")
|
||||
pipe = StableDiffusionPanoramaPipeline.from_pretrained(model_ckpt, scheduler=scheduler, torch_dtype=torch.float16)
|
||||
|
||||
pipe = pipe.to("cuda")
|
||||
|
||||
prompt = "a photo of the dolomites"
|
||||
image = pipe(prompt).images[0]
|
||||
image.save("dolomites.png")
|
||||
```
|
||||
|
||||
## StableDiffusionPanoramaPipeline
|
||||
[[autodoc]] StableDiffusionPanoramaPipeline
|
||||
- __call__
|
||||
- all
|
||||
@@ -1,70 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# InstructPix2Pix: Learning to Follow Image Editing Instructions
|
||||
|
||||
## Overview
|
||||
|
||||
[InstructPix2Pix: Learning to Follow Image Editing Instructions](https://arxiv.org/abs/2211.09800) by Tim Brooks, Aleksander Holynski and Alexei A. Efros.
|
||||
|
||||
The abstract of the paper is the following:
|
||||
|
||||
*We propose a method for editing images from human instructions: given an input image and a written instruction that tells the model what to do, our model follows these instructions to edit the image. To obtain training data for this problem, we combine the knowledge of two large pretrained models -- a language model (GPT-3) and a text-to-image model (Stable Diffusion) -- to generate a large dataset of image editing examples. Our conditional diffusion model, InstructPix2Pix, is trained on our generated data, and generalizes to real images and user-written instructions at inference time. Since it performs edits in the forward pass and does not require per example fine-tuning or inversion, our model edits images quickly, in a matter of seconds. We show compelling editing results for a diverse collection of input images and written instructions.*
|
||||
|
||||
Resources:
|
||||
|
||||
* [Project Page](https://www.timothybrooks.com/instruct-pix2pix).
|
||||
* [Paper](https://arxiv.org/abs/2211.09800).
|
||||
* [Original Code](https://github.com/timothybrooks/instruct-pix2pix).
|
||||
* [Demo](https://huggingface.co/spaces/timbrooks/instruct-pix2pix).
|
||||
|
||||
|
||||
## Available Pipelines:
|
||||
|
||||
| Pipeline | Tasks | Demo
|
||||
|---|---|:---:|
|
||||
| [StableDiffusionInstructPix2PixPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py) | *Text-Based Image Editing* | [🤗 Space](https://huggingface.co/spaces/timbrooks/instruct-pix2pix) |
|
||||
|
||||
<!-- TODO: add Colab -->
|
||||
|
||||
## Usage example
|
||||
|
||||
```python
|
||||
import PIL
|
||||
import requests
|
||||
import torch
|
||||
from diffusers import StableDiffusionInstructPix2PixPipeline
|
||||
|
||||
model_id = "timbrooks/instruct-pix2pix"
|
||||
pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
|
||||
|
||||
url = "https://huggingface.co/datasets/diffusers/diffusers-images-docs/resolve/main/mountain.png"
|
||||
|
||||
|
||||
def download_image(url):
|
||||
image = PIL.Image.open(requests.get(url, stream=True).raw)
|
||||
image = PIL.ImageOps.exif_transpose(image)
|
||||
image = image.convert("RGB")
|
||||
return image
|
||||
|
||||
|
||||
image = download_image(url)
|
||||
|
||||
prompt = "make the mountains snowy"
|
||||
images = pipe(prompt, image=image, num_inference_steps=20, image_guidance_scale=1.5, guidance_scale=7).images
|
||||
images[0].save("snowy_mountains.png")
|
||||
```
|
||||
|
||||
## StableDiffusionInstructPix2PixPipeline
|
||||
[[autodoc]] StableDiffusionInstructPix2PixPipeline
|
||||
- __call__
|
||||
- all
|
||||
@@ -1,291 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Zero-shot Image-to-Image Translation
|
||||
|
||||
## Overview
|
||||
|
||||
[Zero-shot Image-to-Image Translation](https://arxiv.org/abs/2302.03027).
|
||||
|
||||
The abstract of the paper is the following:
|
||||
|
||||
*Large-scale text-to-image generative models have shown their remarkable ability to synthesize diverse and high-quality images. However, it is still challenging to directly apply these models for editing real images for two reasons. First, it is hard for users to come up with a perfect text prompt that accurately describes every visual detail in the input image. Second, while existing models can introduce desirable changes in certain regions, they often dramatically alter the input content and introduce unexpected changes in unwanted regions. In this work, we propose pix2pix-zero, an image-to-image translation method that can preserve the content of the original image without manual prompting. We first automatically discover editing directions that reflect desired edits in the text embedding space. To preserve the general content structure after editing, we further propose cross-attention guidance, which aims to retain the cross-attention maps of the input image throughout the diffusion process. In addition, our method does not need additional training for these edits and can directly use the existing pre-trained text-to-image diffusion model. We conduct extensive experiments and show that our method outperforms existing and concurrent works for both real and synthetic image editing.*
|
||||
|
||||
Resources:
|
||||
|
||||
* [Project Page](https://pix2pixzero.github.io/).
|
||||
* [Paper](https://arxiv.org/abs/2302.03027).
|
||||
* [Original Code](https://github.com/pix2pixzero/pix2pix-zero).
|
||||
* [Demo](https://huggingface.co/spaces/pix2pix-zero-library/pix2pix-zero-demo).
|
||||
|
||||
## Tips
|
||||
|
||||
* The pipeline can be conditioned on real input images. Check out the code examples below to know more.
|
||||
* The pipeline exposes two arguments namely `source_embeds` and `target_embeds`
|
||||
that let you control the direction of the semantic edits in the final image to be generated. Let's say,
|
||||
you wanted to translate from "cat" to "dog". In this case, the edit direction will be "cat -> dog". To reflect
|
||||
this in the pipeline, you simply have to set the embeddings related to the phrases including "cat" to
|
||||
`source_embeds` and "dog" to `target_embeds`. Refer to the code example below for more details.
|
||||
* When you're using this pipeline from a prompt, specify the _source_ concept in the prompt. Taking
|
||||
the above example, a valid input prompt would be: "a high resolution painting of a **cat** in the style of van gough".
|
||||
* If you wanted to reverse the direction in the example above, i.e., "dog -> cat", then it's recommended to:
|
||||
* Swap the `source_embeds` and `target_embeds`.
|
||||
* Change the input prompt to include "dog".
|
||||
* To learn more about how the source and target embeddings are generated, refer to the [original
|
||||
paper](https://arxiv.org/abs/2302.03027). Below, we also provide some directions on how to generate the embeddings.
|
||||
* Note that the quality of the outputs generated with this pipeline is dependent on how good the `source_embeds` and `target_embeds` are. Please, refer to [this discussion](#generating-source-and-target-embeddings) for some suggestions on the topic.
|
||||
|
||||
## Available Pipelines:
|
||||
|
||||
| Pipeline | Tasks | Demo
|
||||
|---|---|:---:|
|
||||
| [StableDiffusionPix2PixZeroPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py) | *Text-Based Image Editing* | [🤗 Space](https://huggingface.co/spaces/pix2pix-zero-library/pix2pix-zero-demo) |
|
||||
|
||||
<!-- TODO: add Colab -->
|
||||
|
||||
## Usage example
|
||||
|
||||
### Based on an image generated with the input prompt
|
||||
|
||||
```python
|
||||
import requests
|
||||
import torch
|
||||
|
||||
from diffusers import DDIMScheduler, StableDiffusionPix2PixZeroPipeline
|
||||
|
||||
|
||||
def download(embedding_url, local_filepath):
|
||||
r = requests.get(embedding_url)
|
||||
with open(local_filepath, "wb") as f:
|
||||
f.write(r.content)
|
||||
|
||||
|
||||
model_ckpt = "CompVis/stable-diffusion-v1-4"
|
||||
pipeline = StableDiffusionPix2PixZeroPipeline.from_pretrained(
|
||||
model_ckpt, conditions_input_image=False, torch_dtype=torch.float16
|
||||
)
|
||||
pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
|
||||
pipeline.to("cuda")
|
||||
|
||||
prompt = "a high resolution painting of a cat in the style of van gogh"
|
||||
src_embs_url = "https://github.com/pix2pixzero/pix2pix-zero/raw/main/assets/embeddings_sd_1.4/cat.pt"
|
||||
target_embs_url = "https://github.com/pix2pixzero/pix2pix-zero/raw/main/assets/embeddings_sd_1.4/dog.pt"
|
||||
|
||||
for url in [src_embs_url, target_embs_url]:
|
||||
download(url, url.split("/")[-1])
|
||||
|
||||
src_embeds = torch.load(src_embs_url.split("/")[-1])
|
||||
target_embeds = torch.load(target_embs_url.split("/")[-1])
|
||||
|
||||
images = pipeline(
|
||||
prompt,
|
||||
source_embeds=src_embeds,
|
||||
target_embeds=target_embeds,
|
||||
num_inference_steps=50,
|
||||
cross_attention_guidance_amount=0.15,
|
||||
).images
|
||||
images[0].save("edited_image_dog.png")
|
||||
```
|
||||
|
||||
### Based on an input image
|
||||
|
||||
When the pipeline is conditioned on an input image, we first obtain an inverted
|
||||
noise from it using a `DDIMInverseScheduler` with the help of a generated caption. Then
|
||||
the inverted noise is used to start the generation process.
|
||||
|
||||
First, let's load our pipeline:
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import BlipForConditionalGeneration, BlipProcessor
|
||||
from diffusers import DDIMScheduler, DDIMInverseScheduler, StableDiffusionPix2PixZeroPipeline
|
||||
|
||||
captioner_id = "Salesforce/blip-image-captioning-base"
|
||||
processor = BlipProcessor.from_pretrained(captioner_id)
|
||||
model = BlipForConditionalGeneration.from_pretrained(captioner_id, torch_dtype=torch.float16, low_cpu_mem_usage=True)
|
||||
|
||||
sd_model_ckpt = "CompVis/stable-diffusion-v1-4"
|
||||
pipeline = StableDiffusionPix2PixZeroPipeline.from_pretrained(
|
||||
sd_model_ckpt,
|
||||
caption_generator=model,
|
||||
caption_processor=processor,
|
||||
torch_dtype=torch.float16,
|
||||
safety_checker=None,
|
||||
)
|
||||
pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
|
||||
pipeline.inverse_scheduler = DDIMInverseScheduler.from_config(pipeline.scheduler.config)
|
||||
pipeline.enable_model_cpu_offload()
|
||||
```
|
||||
|
||||
Then, we load an input image for conditioning and obtain a suitable caption for it:
|
||||
|
||||
```py
|
||||
import requests
|
||||
from PIL import Image
|
||||
|
||||
img_url = "https://github.com/pix2pixzero/pix2pix-zero/raw/main/assets/test_images/cats/cat_6.png"
|
||||
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB").resize((512, 512))
|
||||
caption = pipeline.generate_caption(raw_image)
|
||||
```
|
||||
|
||||
Then we employ the generated caption and the input image to get the inverted noise:
|
||||
|
||||
```py
|
||||
generator = torch.manual_seed(0)
|
||||
inv_latents = pipeline.invert(caption, image=raw_image, generator=generator).latents
|
||||
```
|
||||
|
||||
Now, generate the image with edit directions:
|
||||
|
||||
```py
|
||||
# See the "Generating source and target embeddings" section below to
|
||||
# automate the generation of these captions with a pre-trained model like Flan-T5 as explained below.
|
||||
source_prompts = ["a cat sitting on the street", "a cat playing in the field", "a face of a cat"]
|
||||
target_prompts = ["a dog sitting on the street", "a dog playing in the field", "a face of a dog"]
|
||||
|
||||
source_embeds = pipeline.get_embeds(source_prompts, batch_size=2)
|
||||
target_embeds = pipeline.get_embeds(target_prompts, batch_size=2)
|
||||
|
||||
|
||||
image = pipeline(
|
||||
caption,
|
||||
source_embeds=source_embeds,
|
||||
target_embeds=target_embeds,
|
||||
num_inference_steps=50,
|
||||
cross_attention_guidance_amount=0.15,
|
||||
generator=generator,
|
||||
latents=inv_latents,
|
||||
negative_prompt=caption,
|
||||
).images[0]
|
||||
image.save("edited_image.png")
|
||||
```
|
||||
|
||||
## Generating source and target embeddings
|
||||
|
||||
The authors originally used the [GPT-3 API](https://openai.com/api/) to generate the source and target captions for discovering
|
||||
edit directions. However, we can also leverage open source and public models for the same purpose.
|
||||
Below, we provide an end-to-end example with the [Flan-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5) model
|
||||
for generating captions and [CLIP](https://huggingface.co/docs/transformers/model_doc/clip) for
|
||||
computing embeddings on the generated captions.
|
||||
|
||||
**1. Load the generation model**:
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import AutoTokenizer, T5ForConditionalGeneration
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xl")
|
||||
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl", device_map="auto", torch_dtype=torch.float16)
|
||||
```
|
||||
|
||||
**2. Construct a starting prompt**:
|
||||
|
||||
```py
|
||||
source_concept = "cat"
|
||||
target_concept = "dog"
|
||||
|
||||
source_text = f"Provide a caption for images containing a {source_concept}. "
|
||||
"The captions should be in English and should be no longer than 150 characters."
|
||||
|
||||
target_text = f"Provide a caption for images containing a {target_concept}. "
|
||||
"The captions should be in English and should be no longer than 150 characters."
|
||||
```
|
||||
|
||||
Here, we're interested in the "cat -> dog" direction.
|
||||
|
||||
**3. Generate captions**:
|
||||
|
||||
We can use a utility like so for this purpose.
|
||||
|
||||
```py
|
||||
def generate_captions(input_prompt):
|
||||
input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids.to("cuda")
|
||||
|
||||
outputs = model.generate(
|
||||
input_ids, temperature=0.8, num_return_sequences=16, do_sample=True, max_new_tokens=128, top_k=10
|
||||
)
|
||||
return tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
||||
```
|
||||
|
||||
And then we just call it to generate our captions:
|
||||
|
||||
```py
|
||||
source_captions = generate_captions(source_text)
|
||||
target_captions = generate_captions(target_concept)
|
||||
```
|
||||
|
||||
We encourage you to play around with the different parameters supported by the
|
||||
`generate()` method ([documentation](https://huggingface.co/docs/transformers/main/en/main_classes/text_generation#transformers.generation_tf_utils.TFGenerationMixin.generate)) for the generation quality you are looking for.
|
||||
|
||||
**4. Load the embedding model**:
|
||||
|
||||
Here, we need to use the same text encoder model used by the subsequent Stable Diffusion model.
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionPix2PixZeroPipeline
|
||||
|
||||
pipeline = StableDiffusionPix2PixZeroPipeline.from_pretrained(
|
||||
"CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16
|
||||
)
|
||||
pipeline = pipeline.to("cuda")
|
||||
tokenizer = pipeline.tokenizer
|
||||
text_encoder = pipeline.text_encoder
|
||||
```
|
||||
|
||||
**5. Compute embeddings**:
|
||||
|
||||
```py
|
||||
import torch
|
||||
|
||||
def embed_captions(sentences, tokenizer, text_encoder, device="cuda"):
|
||||
with torch.no_grad():
|
||||
embeddings = []
|
||||
for sent in sentences:
|
||||
text_inputs = tokenizer(
|
||||
sent,
|
||||
padding="max_length",
|
||||
max_length=tokenizer.model_max_length,
|
||||
truncation=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
text_input_ids = text_inputs.input_ids
|
||||
prompt_embeds = text_encoder(text_input_ids.to(device), attention_mask=None)[0]
|
||||
embeddings.append(prompt_embeds)
|
||||
return torch.concatenate(embeddings, dim=0).mean(dim=0).unsqueeze(0)
|
||||
|
||||
source_embeddings = embed_captions(source_captions, tokenizer, text_encoder)
|
||||
target_embeddings = embed_captions(target_captions, tokenizer, text_encoder)
|
||||
```
|
||||
|
||||
And you're done! [Here](https://colab.research.google.com/drive/1tz2C1EdfZYAPlzXXbTnf-5PRBiR8_R1F?usp=sharing) is a Colab Notebook that you can use to interact with the entire process.
|
||||
|
||||
Now, you can use these embeddings directly while calling the pipeline:
|
||||
|
||||
```py
|
||||
from diffusers import DDIMScheduler
|
||||
|
||||
pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
|
||||
|
||||
images = pipeline(
|
||||
prompt,
|
||||
source_embeds=source_embeddings,
|
||||
target_embeds=target_embeddings,
|
||||
num_inference_steps=50,
|
||||
cross_attention_guidance_amount=0.15,
|
||||
).images
|
||||
images[0].save("edited_image_dog.png")
|
||||
```
|
||||
|
||||
## StableDiffusionPix2PixZeroPipeline
|
||||
[[autodoc]] StableDiffusionPix2PixZeroPipeline
|
||||
- __call__
|
||||
- all
|
||||
@@ -1,64 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Self-Attention Guidance (SAG)
|
||||
|
||||
## Overview
|
||||
|
||||
[Self-Attention Guidance](https://arxiv.org/abs/2210.00939) by Susung Hong et al.
|
||||
|
||||
The abstract of the paper is the following:
|
||||
|
||||
*Denoising diffusion models (DDMs) have been drawing much attention for their appreciable sample quality and diversity. Despite their remarkable performance, DDMs remain black boxes on which further study is necessary to take a profound step. Motivated by this, we delve into the design of conventional U-shaped diffusion models. More specifically, we investigate the self-attention modules within these models through carefully designed experiments and explore their characteristics. In addition, inspired by the studies that substantiate the effectiveness of the guidance schemes, we present plug-and-play diffusion guidance, namely Self-Attention Guidance (SAG), that can drastically boost the performance of existing diffusion models. Our method, SAG, extracts the intermediate attention map from a diffusion model at every iteration and selects tokens above a certain attention score for masking and blurring to obtain a partially blurred input. Subsequently, we measure the dissimilarity between the predicted noises obtained from feeding the blurred and original input to the diffusion model and leverage it as guidance. With this guidance, we observe apparent improvements in a wide range of diffusion models, e.g., ADM, IDDPM, and Stable Diffusion, and show that the results further improve by combining our method with the conventional guidance scheme. We provide extensive ablation studies to verify our choices.*
|
||||
|
||||
Resources:
|
||||
|
||||
* [Project Page](https://ku-cvlab.github.io/Self-Attention-Guidance).
|
||||
* [Paper](https://arxiv.org/abs/2210.00939).
|
||||
* [Original Code](https://github.com/KU-CVLAB/Self-Attention-Guidance).
|
||||
* [Demo](https://colab.research.google.com/github/SusungHong/Self-Attention-Guidance/blob/main/SAG_Stable.ipynb).
|
||||
|
||||
|
||||
## Available Pipelines:
|
||||
|
||||
| Pipeline | Tasks | Demo
|
||||
|---|---|:---:|
|
||||
| [StableDiffusionSAGPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py) | *Text-to-Image Generation* | [Colab](https://colab.research.google.com/github/SusungHong/Self-Attention-Guidance/blob/main/SAG_Stable.ipynb) |
|
||||
|
||||
## Usage example
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import StableDiffusionSAGPipeline
|
||||
from accelerate.utils import set_seed
|
||||
|
||||
pipe = StableDiffusionSAGPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
|
||||
pipe = pipe.to("cuda")
|
||||
|
||||
seed = 8978
|
||||
prompt = "."
|
||||
guidance_scale = 7.5
|
||||
num_images_per_prompt = 1
|
||||
|
||||
sag_scale = 1.0
|
||||
|
||||
set_seed(seed)
|
||||
images = pipe(
|
||||
prompt, num_images_per_prompt=num_images_per_prompt, guidance_scale=guidance_scale, sag_scale=sag_scale
|
||||
).images
|
||||
images[0].save("example.png")
|
||||
```
|
||||
|
||||
## StableDiffusionSAGPipeline
|
||||
[[autodoc]] StableDiffusionSAGPipeline
|
||||
- __call__
|
||||
- all
|
||||
@@ -1,97 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Stable unCLIP
|
||||
|
||||
Stable unCLIP checkpoints are finetuned from [stable diffusion 2.1](./stable_diffusion_2) checkpoints to condition on CLIP image embeddings.
|
||||
Stable unCLIP also still conditions on text embeddings. Given the two separate conditionings, stable unCLIP can be used
|
||||
for text guided image variation. When combined with an unCLIP prior, it can also be used for full text to image generation.
|
||||
|
||||
## Tips
|
||||
|
||||
Stable unCLIP takes a `noise_level` as input during inference. `noise_level` determines how much noise is added
|
||||
to the image embeddings. A higher `noise_level` increases variation in the final un-noised images. By default,
|
||||
we do not add any additional noise to the image embeddings i.e. `noise_level = 0`.
|
||||
|
||||
### Available checkpoints:
|
||||
|
||||
TODO
|
||||
|
||||
### Text-to-Image Generation
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import StableUnCLIPPipeline
|
||||
|
||||
pipe = StableUnCLIPPipeline.from_pretrained(
|
||||
"fusing/stable-unclip-2-1-l", torch_dtype=torch.float16
|
||||
) # TODO update model path
|
||||
pipe = pipe.to("cuda")
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
images = pipe(prompt).images
|
||||
images[0].save("astronaut_horse.png")
|
||||
```
|
||||
|
||||
|
||||
### Text guided Image-to-Image Variation
|
||||
|
||||
```python
|
||||
import requests
|
||||
import torch
|
||||
from PIL import Image
|
||||
from io import BytesIO
|
||||
|
||||
from diffusers import StableUnCLIPImg2ImgPipeline
|
||||
|
||||
pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
|
||||
"fusing/stable-unclip-2-1-l-img2img", torch_dtype=torch.float16
|
||||
) # TODO update model path
|
||||
pipe = pipe.to("cuda")
|
||||
|
||||
url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
|
||||
|
||||
response = requests.get(url)
|
||||
init_image = Image.open(BytesIO(response.content)).convert("RGB")
|
||||
init_image = init_image.resize((768, 512))
|
||||
|
||||
prompt = "A fantasy landscape, trending on artstation"
|
||||
|
||||
images = pipe(prompt, init_image).images
|
||||
images[0].save("fantasy_landscape.png")
|
||||
```
|
||||
|
||||
### StableUnCLIPPipeline
|
||||
|
||||
[[autodoc]] StableUnCLIPPipeline
|
||||
- all
|
||||
- __call__
|
||||
- enable_attention_slicing
|
||||
- disable_attention_slicing
|
||||
- enable_vae_slicing
|
||||
- disable_vae_slicing
|
||||
- enable_xformers_memory_efficient_attention
|
||||
- disable_xformers_memory_efficient_attention
|
||||
|
||||
|
||||
### StableUnCLIPImg2ImgPipeline
|
||||
|
||||
[[autodoc]] StableUnCLIPImg2ImgPipeline
|
||||
- all
|
||||
- __call__
|
||||
- enable_attention_slicing
|
||||
- disable_attention_slicing
|
||||
- enable_vae_slicing
|
||||
- disable_vae_slicing
|
||||
- enable_xformers_memory_efficient_attention
|
||||
- disable_xformers_memory_efficient_attention
|
||||
|
||||
@@ -1,21 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Inverse Denoising Diffusion Implicit Models (DDIMInverse)
|
||||
|
||||
## Overview
|
||||
|
||||
This scheduler is the inverted scheduler of [Denoising Diffusion Implicit Models](https://arxiv.org/abs/2010.02502) (DDIM) by Jiaming Song, Chenlin Meng and Stefano Ermon.
|
||||
The implementation is mostly based on the DDIM inversion definition of [Null-text Inversion for Editing Real Images using Guided Diffusion Models](https://arxiv.org/pdf/2211.09794.pdf)
|
||||
|
||||
## DDIMInverseScheduler
|
||||
[[autodoc]] DDIMInverseScheduler
|
||||
@@ -1,22 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# DEIS
|
||||
|
||||
Fast Sampling of Diffusion Models with Exponential Integrator.
|
||||
|
||||
## Overview
|
||||
|
||||
Original paper can be found [here](https://arxiv.org/abs/2204.13902). The original implementation can be found [here](https://github.com/qsh-zh/deis).
|
||||
|
||||
## DEISMultistepScheduler
|
||||
[[autodoc]] DEISMultistepScheduler
|
||||
@@ -1,24 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# UniPC
|
||||
|
||||
## Overview
|
||||
|
||||
UniPC is a training-free framework designed for the fast sampling of diffusion models, which consists of a corrector (UniC) and a predictor (UniP) that share a unified analytical form and support arbitrary orders.
|
||||
|
||||
For more details about the method, please refer to the [[paper]](https://arxiv.org/abs/2302.04867) and the [[code]](https://github.com/wl-zhao/UniPC).
|
||||
|
||||
Fast Sampling of Diffusion Models with Exponential Integrator.
|
||||
|
||||
## UniPCMultistepScheduler
|
||||
[[autodoc]] UniPCMultistepScheduler
|
||||
@@ -1,49 +0,0 @@
|
||||
# 🧨 Diffusers’ Ethical Guidelines
|
||||
|
||||
## Preamble
|
||||
|
||||
[Diffusers](https://huggingface.co/docs/diffusers/index) provides pre-trained diffusion models and serves as a modular toolbox for inference and training.
|
||||
|
||||
Given its real case applications in the world and potential negative impacts on society, we think it is important to provide the project with ethical guidelines to guide the development, users’ contributions, and usage of the Diffusers library.
|
||||
|
||||
The risks associated with using this technology are still being examined, but to name a few: copyrights issues for artists; deep-fake exploitation; sexual content generation in inappropriate contexts; non-consensual impersonation; harmful social biases perpetuating the oppression of marginalized groups.
|
||||
We will keep tracking risks and adapt the following guidelines based on the community's responsiveness and valuable feedback.
|
||||
|
||||
|
||||
## Scope
|
||||
|
||||
The Diffusers community will apply the following ethical guidelines to the project’s development and help coordinate how the community will integrate the contributions, especially concerning sensitive topics related to ethical concerns.
|
||||
|
||||
|
||||
## Ethical guidelines
|
||||
|
||||
The following ethical guidelines apply generally, but we will primarily implement them when dealing with ethically sensitive issues while making a technical choice. Furthermore, we commit to adapting those ethical principles over time following emerging harms related to the state of the art of the technology in question.
|
||||
|
||||
- **Transparency**: we are committed to being transparent in managing PRs, explaining our choices to users, and making technical decisions.
|
||||
|
||||
- **Consistency**: we are committed to guaranteeing our users the same level of attention in project management, keeping it technically stable and consistent.
|
||||
|
||||
- **Simplicity**: with a desire to make it easy to use and exploit the Diffusers library, we are committed to keeping the project’s goals lean and coherent.
|
||||
|
||||
- **Accessibility**: the Diffusers project helps lower the entry bar for contributors who can help run it even without technical expertise. Doing so makes research artifacts more accessible to the community.
|
||||
|
||||
- **Reproducibility**: we aim to be transparent about the reproducibility of upstream code, models, and datasets when made available through the Diffusers library.
|
||||
|
||||
- **Responsibility**: as a community and through teamwork, we hold a collective responsibility to our users by anticipating and mitigating this technology's potential risks and dangers.
|
||||
|
||||
|
||||
## Examples of implementations: Safety features and Mechanisms
|
||||
|
||||
The team works daily to make the technical and non-technical tools available to deal with the potential ethical and social risks associated with diffusion technology. Moreover, the community's input is invaluable in ensuring these features' implementation and raising awareness with us.
|
||||
|
||||
- [**Community tab**](https://huggingface.co/docs/hub/repositories-pull-requests-discussions): it enables the community to discuss and better collaborate on a project.
|
||||
|
||||
- **Bias exploration and evaluation**: the Hugging Face team provides a [space](https://huggingface.co/spaces/society-ethics/DiffusionBiasExplorer) to demonstrate the biases in Stable Diffusion interactively. In this sense, we support and encourage bias explorers and evaluations.
|
||||
|
||||
- **Encouraging safety in deployment**
|
||||
|
||||
- [**Safe Stable Diffusion**](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion_safe): It mitigates the well-known issue that models, like Stable Diffusion, that are trained on unfiltered, web-crawled datasets tend to suffer from inappropriate degeneration. Related paper: [Safe Latent Diffusion: Mitigating Inappropriate Degeneration in Diffusion Models](https://arxiv.org/abs/2211.05105).
|
||||
|
||||
- **Staged released on the Hub**: in particularly sensitive situations, access to some repositories should be restricted. This staged release is an intermediary step that allows the repository’s authors to have more control over its use.
|
||||
|
||||
- **Licensing**: [OpenRAILs](https://huggingface.co/blog/open_rail), a new type of licensing, allow us to ensure free access while having a set of restrictions that ensure more responsible use.
|
||||
@@ -1,110 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Philosophy
|
||||
|
||||
🧨 Diffusers provides **state-of-the-art** pretrained diffusion models across multiple modalities.
|
||||
Its purpose is to serve as a **modular toolbox** for both inference and training.
|
||||
|
||||
We aim at building a library that stands the test of time and therefore take API design very seriously.
|
||||
|
||||
In a nutshell, Diffusers is built to be a natural extension of PyTorch. Therefore, most of our design choices are based on [PyTorch's Design Principles](https://pytorch.org/docs/stable/community/design.html#pytorch-design-philosophy). Let's go over the most important ones:
|
||||
|
||||
## Usability over Performance
|
||||
|
||||
- While Diffusers has many built-in performance-enhancing features (see [Memory and Speed](https://huggingface.co/docs/diffusers/optimization/fp16)), models are always loaded with the highest precision and lowest optimization. Therefore, by default diffusion pipelines are always instantiated on CPU with float32 precision if not otherwise defined by the user. This ensures usability across different platforms and accelerators and means that no complex installations are required to run the library.
|
||||
- Diffusers aim at being a **light-weight** package and therefore has very few required dependencies, but many soft dependencies that can improve performance (such as `accelerate`, `safetensors`, `onnx`, etc...). We strive to keep the library as lightweight as possible so that it can be added without much concern as a dependency on other packages.
|
||||
- Diffusers prefers simple, self-explainable code over condensed, magic code. This means that short-hand code syntaxes such as lambda functions, and advanced PyTorch operators are often not desired.
|
||||
|
||||
## Simple over easy
|
||||
|
||||
As PyTorch states, **explicit is better than implicit** and **simple is better than complex**. This design philosophy is reflected in multiple parts of the library:
|
||||
- We follow PyTorch's API with methods like [`DiffusionPipeline.to`](https://huggingface.co/docs/diffusers/main/en/api/diffusion_pipeline#diffusers.DiffusionPipeline.to) to let the user handle device management.
|
||||
- Raising concise error messages is preferred to silently correct erroneous input. Diffusers aims at teaching the user, rather than making the library as easy to use as possible.
|
||||
- Complex model vs. scheduler logic is exposed instead of magically handled inside. Schedulers/Samplers are separated from diffusion models with minimal dependencies on each other. This forces the user to write the unrolled denoising loop. However, the separation allows for easier debugging and gives the user more control over adapting the denoising process or switching out diffusion models or schedulers.
|
||||
- Separately trained components of the diffusion pipeline, *e.g.* the text encoder, the unet, and the variational autoencoder, each have their own model class. This forces the user to handle the interaction between the different model components, and the serialization format separates the model components into different files. However, this allows for easier debugging and customization. Dreambooth or textual inversion training
|
||||
is very simple thanks to diffusers' ability to separate single components of the diffusion pipeline.
|
||||
|
||||
## Tweakable, contributor-friendly over abstraction
|
||||
|
||||
For large parts of the library, Diffusers adopts an important design principle of the [Transformers library](https://github.com/huggingface/transformers), which is to prefer copy-pasted code over hasty abstractions. This design principle is very opinionated and stands in stark contrast to popular design principles such as [Don't repeat yourself (DRY)](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself).
|
||||
In short, just like Transformers does for modeling files, diffusers prefers to keep an extremely low level of abstraction and very self-contained code for pipelines and schedulers.
|
||||
Functions, long code blocks, and even classes can be copied across multiple files which at first can look like a bad, sloppy design choice that makes the library unmaintainable.
|
||||
**However**, this design has proven to be extremely successful for Transformers and makes a lot of sense for community-driven, open-source machine learning libraries because:
|
||||
- Machine Learning is an extremely fast-moving field in which paradigms, model architectures, and algorithms are changing rapidly, which therefore makes it very difficult to define long-lasting code abstractions.
|
||||
- Machine Learning practitioners like to be able to quickly tweak existing code for ideation and research and therefore prefer self-contained code over one that contains many abstractions.
|
||||
- Open-source libraries rely on community contributions and therefore must build a library that is easy to contribute to. The more abstract the code, the more dependencies, the harder to read, and the harder to contribute to. Contributors simply stop contributing to very abstract libraries out of fear of breaking vital functionality. If contributing to a library cannot break other fundamental code, not only is it more inviting for potential new contributors, but it is also easier to review and contribute to multiple parts in parallel.
|
||||
|
||||
At Hugging Face, we call this design the **single-file policy** which means that almost all of the code of a certain class should be written in a single, self-contained file. To read more about the philosophy, you can have a look
|
||||
at [this blog post](https://huggingface.co/blog/transformers-design-philosophy).
|
||||
|
||||
In diffusers, we follow this philosophy for both pipelines and schedulers, but only partly for diffusion models. The reason we don't follow this design fully for diffusion models is because almost all diffusion pipelines, such
|
||||
as [DDPM](https://huggingface.co/docs/diffusers/v0.12.0/en/api/pipelines/ddpm), [Stable Diffusion](https://huggingface.co/docs/diffusers/v0.12.0/en/api/pipelines/stable_diffusion/overview#stable-diffusion-pipelines), [UnCLIP (Dalle-2)](https://huggingface.co/docs/diffusers/v0.12.0/en/api/pipelines/unclip#overview) and [Imagen](https://imagen.research.google/) all rely on the same diffusion model, the [UNet](https://huggingface.co/docs/diffusers/api/models#diffusers.UNet2DConditionModel).
|
||||
|
||||
Great, now you should have generally understood why 🧨 Diffusers is designed the way it is 🤗.
|
||||
We try to apply these design principles consistently across the library. Nevertheless, there are some minor exceptions to the philosophy or some unlucky design choices. If you have feedback regarding the design, we would ❤️ to hear it [directly on GitHub](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=&template=feedback.md&title=).
|
||||
|
||||
## Design Philosophy in Details
|
||||
|
||||
Now, let's look a bit into the nitty-gritty details of the design philosophy. Diffusers essentially consist of three major classes, [pipelines](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines), [models](https://github.com/huggingface/diffusers/tree/main/src/diffusers/models), and [schedulers](https://github.com/huggingface/diffusers/tree/main/src/diffusers/schedulers).
|
||||
Let's walk through more in-detail design decisions for each class.
|
||||
|
||||
### Pipelines
|
||||
|
||||
Pipelines are designed to be easy to use (therefore do not follow [*Simple over easy*](#simple-over-easy) 100%)), are not feature complete, and should loosely be seen as examples of how to use [models](#models) and [schedulers](#schedulers) for inference.
|
||||
|
||||
The following design principles are followed:
|
||||
- Pipelines follow the single-file policy. All pipelines can be found in individual directories under src/diffusers/pipelines. One pipeline folder corresponds to one diffusion paper/project/release. Multiple pipeline files can be gathered in one pipeline folder, as it’s done for [`src/diffusers/pipelines/stable-diffusion`](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/stable_diffusion). If pipelines share similar functionality, one can make use of the [#Copied from mechanism](https://github.com/huggingface/diffusers/blob/125d783076e5bd9785beb05367a2d2566843a271/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py#L251).
|
||||
- Pipelines all inherit from [`DiffusionPipeline`]
|
||||
- Every pipeline consists of different model and scheduler components, that are documented in the [`model_index.json` file](https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/model_index.json), are accessible under the same name as attributes of the pipeline and can be shared between pipelines with [`DiffusionPipeline.components`](https://huggingface.co/docs/diffusers/main/en/api/diffusion_pipeline#diffusers.DiffusionPipeline.components) function.
|
||||
- Every pipeline should be loadable via the [`DiffusionPipeline.from_pretrained`](https://huggingface.co/docs/diffusers/main/en/api/diffusion_pipeline#diffusers.DiffusionPipeline.from_pretrained) function.
|
||||
- Pipelines should be used **only** for inference.
|
||||
- Pipelines should be very readable, self-explanatory, and easy to tweak.
|
||||
- Pipelines should be designed to build on top of each other and be easy to integrate into higher-level APIs.
|
||||
- Pipelines are **not** intended to be feature-complete user interfaces. For future complete user interfaces one should rather have a look at [InvokeAI](https://github.com/invoke-ai/InvokeAI), [Diffuzers](https://github.com/abhishekkrthakur/diffuzers), and [lama-cleaner](https://github.com/Sanster/lama-cleaner)
|
||||
- Every pipeline should have one and only one way to run it via a `__call__` method. The naming of the `__call__` arguments should be shared across all pipelines.
|
||||
- Pipelines should be named after the task they are intended to solve.
|
||||
- In almost all cases, novel diffusion pipelines shall be implemented in a new pipeline folder/file.
|
||||
|
||||
### Models
|
||||
|
||||
Models are designed as configurable toolboxes that are natural extensions of [PyTorch's Module class](https://pytorch.org/docs/stable/generated/torch.nn.Module.html). They only partly follow the **single-file policy**.
|
||||
|
||||
The following design principles are followed:
|
||||
- Models correspond to **a type of model architecture**. *E.g.* the [`UNet2DConditionModel`] class is used for all UNet variations that expect 2D image inputs and are conditioned on some context.
|
||||
- All models can be found in [`src/diffusers/models`](https://github.com/huggingface/diffusers/tree/main/src/diffusers/models) and every model architecture shall be defined in its file, e.g. [`unet_2d_condition.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unet_2d_condition.py), [`transformer_2d.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/transformer_2d.py), etc...
|
||||
- Models **do not** follow the single-file policy and should make use of smaller model building blocks, such as [`attention.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention.py), [`resnet.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/resnet.py), [`embeddings.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/embeddings.py), etc... **Note**: This is in stark contrast to Transformers' modeling files and shows that models do not really follow the single-file policy.
|
||||
- Models intend to expose complexity, just like PyTorch's module does, and give clear error messages.
|
||||
- Models all inherit from `ModelMixin` and `ConfigMixin`.
|
||||
- Models can be optimized for performance when it doesn’t demand major code changes, keeps backward compatibility, and gives significant memory or compute gain.
|
||||
- Models should by default have the highest precision and lowest performance setting.
|
||||
- To integrate new model checkpoints whose general architecture can be classified as an architecture that already exists in Diffusers, the existing model architecture shall be adapted to make it work with the new checkpoint. One should only create a new file if the model architecture is fundamentally different.
|
||||
- Models should be designed to be easily extendable to future changes. This can be achieved by limiting public function arguments, configuration arguments, and "foreseeing" future changes, *e.g.* it is usually better to add `string` "...type" arguments that can easily be extended to new future types instead of boolean `is_..._type` arguments. Only the minimum amount of changes shall be made to existing architectures to make a new model checkpoint work.
|
||||
- The model design is a difficult trade-off between keeping code readable and concise and supporting many model checkpoints. For most parts of the modeling code, classes shall be adapted for new model checkpoints, while there are some exceptions where it is preferred to add new classes to make sure the code is kept concise and
|
||||
readable longterm, such as [UNet blocks](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unet_2d_blocks.py) and [Attention processors](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
|
||||
|
||||
### Schedulers
|
||||
|
||||
Schedulers are responsible to guide the denoising process for inference as well as to define a noise schedule for training. They are designed as individual classes with loadable configuration files and strongly follow the **single-file policy**.
|
||||
|
||||
The following design principles are followed:
|
||||
- All schedulers are found in [`src/diffusers/schedulers`](https://github.com/huggingface/diffusers/tree/main/src/diffusers/schedulers).
|
||||
- Schedulers are **not** allowed to import from large utils files and shall be kept very self-contained.
|
||||
- One scheduler python file corresponds to one scheduler algorithm (as might be defined in a paper).
|
||||
- If schedulers share similar functionalities, we can make use of the `#Copied from` mechanism.
|
||||
- Schedulers all inherit from `SchedulerMixin` and `ConfigMixin`.
|
||||
- Schedulers can be easily swapped out with the [`ConfigMixin.from_config`](https://huggingface.co/docs/diffusers/main/en/api/configuration#diffusers.ConfigMixin.from_config) method as explained in detail [here](./using-diffusers/schedulers.mdx).
|
||||
- Every scheduler has to have a `set_num_inference_steps`, and a `step` function. `set_num_inference_steps(...)` has to be called before every denoising process, *i.e.* before `step(...)` is called.
|
||||
- Every scheduler exposes the timesteps to be "looped over" via a `timesteps` attribute, which is an array of timesteps the model will be called upon
|
||||
- The `step(...)` function takes a predicted model output and the "current" sample (x_t) and returns the "previous", slightly more denoised sample (x_t-1).
|
||||
- Given the complexity of diffusion schedulers, the `step` function does not expose all the complexity and can be a bit of a "black box".
|
||||
- In almost all cases, novel schedulers shall be implemented in a new scheduling file.
|
||||
@@ -1,76 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
<p align="center">
|
||||
<br>
|
||||
<img src="https://raw.githubusercontent.com/huggingface/diffusers/77aadfee6a891ab9fcfb780f87c693f7a5beeb8e/docs/source/imgs/diffusers_library.jpg" width="400"/>
|
||||
<br>
|
||||
</p>
|
||||
|
||||
# 🧨 Diffusers
|
||||
|
||||
🤗 Diffusers provides pretrained vision and audio diffusion models, and serves as a modular toolbox for inference and training.
|
||||
|
||||
More precisely, 🤗 Diffusers offers:
|
||||
|
||||
- State-of-the-art diffusion pipelines that can be run in inference with just a couple of lines of code (see [**Using Diffusers**](./using-diffusers/conditional_image_generation)) or have a look at [**Pipelines**](#pipelines) to get an overview of all supported pipelines and their corresponding papers.
|
||||
- Various noise schedulers that can be used interchangeably for the preferred speed vs. quality trade-off in inference. For more information see [**Schedulers**](./api/schedulers/overview).
|
||||
- Multiple types of models, such as UNet, can be used as building blocks in an end-to-end diffusion system. See [**Models**](./api/models) for more details
|
||||
- Training examples to show how to train the most popular diffusion model tasks. For more information see [**Training**](./training/overview).
|
||||
|
||||
## 🧨 Diffusers Pipelines
|
||||
|
||||
The following table summarizes all officially supported pipelines, their corresponding paper, and if
|
||||
available a colab notebook to directly try them out.
|
||||
|
||||
| Pipeline | Paper | Tasks | Colab
|
||||
|---|---|:---:|:---:|
|
||||
| [alt_diffusion](./api/pipelines/alt_diffusion) | [**AltDiffusion**](https://arxiv.org/abs/2211.06679) | Image-to-Image Text-Guided Generation |
|
||||
| [audio_diffusion](./api/pipelines/audio_diffusion) | [**Audio Diffusion**](https://github.com/teticio/audio-diffusion.git) | Unconditional Audio Generation | [](https://colab.research.google.com/github/teticio/audio-diffusion/blob/master/notebooks/audio_diffusion_pipeline.ipynb)
|
||||
| [controlnet](./api/pipelines/stable_diffusion/controlnet) | [**ControlNet with Stable Diffusion**](https://arxiv.org/abs/2302.05543) | Image-to-Image Text-Guided Generation | [
|
||||
| [cycle_diffusion](./api/pipelines/cycle_diffusion) | [**Cycle Diffusion**](https://arxiv.org/abs/2210.05559) | Image-to-Image Text-Guided Generation |
|
||||
| [dance_diffusion](./api/pipelines/dance_diffusion) | [**Dance Diffusion**](https://github.com/williamberman/diffusers.git) | Unconditional Audio Generation |
|
||||
| [ddpm](./api/pipelines/ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation |
|
||||
| [ddim](./api/pipelines/ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) | Unconditional Image Generation |
|
||||
| [latent_diffusion](./api/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Text-to-Image Generation |
|
||||
| [latent_diffusion](./api/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Super Resolution Image-to-Image |
|
||||
| [latent_diffusion_uncond](./api/pipelines/latent_diffusion_uncond) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | Unconditional Image Generation |
|
||||
| [paint_by_example](./api/pipelines/paint_by_example) | [**Paint by Example: Exemplar-based Image Editing with Diffusion Models**](https://arxiv.org/abs/2211.13227) | Image-Guided Image Inpainting |
|
||||
| [pndm](./api/pipelines/pndm) | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778) | Unconditional Image Generation |
|
||||
| [score_sde_ve](./api/pipelines/score_sde_ve) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation |
|
||||
| [score_sde_vp](./api/pipelines/score_sde_vp) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation |
|
||||
| [semantic_stable_diffusion](./api/pipelines/semantic_stable_diffusion) | [**Semantic Guidance**](https://arxiv.org/abs/2301.12247) | Text-Guided Generation | [](https://colab.research.google.com/github/ml-research/semantic-image-editing/blob/main/examples/SemanticGuidance.ipynb)
|
||||
| [stable_diffusion_text2img](./api/pipelines/stable_diffusion/text2img) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-to-Image Generation | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
|
||||
| [stable_diffusion_img2img](./api/pipelines/stable_diffusion/img2img) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Image-to-Image Text-Guided Generation | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
|
||||
| [stable_diffusion_inpaint](./api/pipelines/stable_diffusion/inpaint) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
|
||||
| [stable_diffusion_panorama](./api/pipelines/stable_diffusion/panorama) | [**MultiDiffusion**](https://multidiffusion.github.io/) | Text-to-Panorama Generation |
|
||||
| [stable_diffusion_pix2pix](./api/pipelines/stable_diffusion/pix2pix) | [**InstructPix2Pix**](https://github.com/timothybrooks/instruct-pix2pix) | Text-Guided Image Editing|
|
||||
| [stable_diffusion_pix2pix_zero](./api/pipelines/stable_diffusion/pix2pix_zero) | [**Zero-shot Image-to-Image Translation**](https://pix2pixzero.github.io/) | Text-Guided Image Editing |
|
||||
| [stable_diffusion_attend_and_excite](./api/pipelines/stable_diffusion/attend_and_excite) | [**Attend and Excite for Stable Diffusion**](https://attendandexcite.github.io/Attend-and-Excite/) | Text-to-Image Generation |
|
||||
| [stable_diffusion_self_attention_guidance](./api/pipelines/stable_diffusion/self_attention_guidance) | [**Self-Attention Guidance**](https://ku-cvlab.github.io/Self-Attention-Guidance) | Text-to-Image Generation |
|
||||
| [stable_diffusion_image_variation](./stable_diffusion/image_variation) | [**Stable Diffusion Image Variations**](https://github.com/LambdaLabsML/lambda-diffusers#stable-diffusion-image-variations) | Image-to-Image Generation |
|
||||
| [stable_diffusion_latent_upscale](./stable_diffusion/latent_upscale) | [**Stable Diffusion Latent Upscaler**](https://twitter.com/StabilityAI/status/1590531958815064065) | Text-Guided Super Resolution Image-to-Image |
|
||||
| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-to-Image Generation |
|
||||
| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Image Inpainting |
|
||||
| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Depth-Conditional Stable Diffusion**](https://github.com/Stability-AI/stablediffusion#depth-conditional-stable-diffusion) | Depth-to-Image Generation |
|
||||
| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Super Resolution Image-to-Image |
|
||||
| [stable_diffusion_safe](./api/pipelines/stable_diffusion_safe) | [**Safe Stable Diffusion**](https://arxiv.org/abs/2211.05105) | Text-Guided Generation | [](https://colab.research.google.com/github/ml-research/safe-latent-diffusion/blob/main/examples/Safe%20Latent%20Diffusion.ipynb)
|
||||
| [stable_unclip](./stable_unclip) | **Stable unCLIP** | Text-to-Image Generation |
|
||||
| [stable_unclip](./stable_unclip) | **Stable unCLIP** | Image-to-Image Text-Guided Generation |
|
||||
| [stochastic_karras_ve](./api/pipelines/stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation |
|
||||
| [unclip](./api/pipelines/unclip) | [Hierarchical Text-Conditional Image Generation with CLIP Latents](https://arxiv.org/abs/2204.06125) | Text-to-Image Generation |
|
||||
| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Text-to-Image Generation |
|
||||
| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Image Variations Generation |
|
||||
| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Dual Image and Text Guided Generation |
|
||||
| [vq_diffusion](./api/pipelines/vq_diffusion) | [Vector Quantized Diffusion Model for Text-to-Image Synthesis](https://arxiv.org/abs/2111.14822) | Text-to-Image Generation |
|
||||
|
||||
**Note**: Pipelines are simple examples of how to play around with the diffusion systems as described in the corresponding papers.
|
||||
@@ -1,208 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Accelerated PyTorch 2.0 support in Diffusers
|
||||
|
||||
Starting from version `0.13.0`, Diffusers supports the latest optimization from the upcoming [PyTorch 2.0](https://pytorch.org/get-started/pytorch-2.0/) release. These include:
|
||||
1. Support for accelerated transformers implementation with memory-efficient attention – no extra dependencies required.
|
||||
2. [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) support for extra performance boost when individual models are compiled.
|
||||
|
||||
|
||||
## Installation
|
||||
To benefit from the accelerated transformers implementation and `torch.compile`, we will need to install the nightly version of PyTorch, as the stable version is yet to be released. The first step is to install CUDA 11.7 or CUDA 11.8,
|
||||
as PyTorch 2.0 does not support the previous versions. Once CUDA is installed, torch nightly can be installed using:
|
||||
|
||||
```bash
|
||||
pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu117
|
||||
```
|
||||
|
||||
## Using accelerated transformers and torch.compile.
|
||||
|
||||
|
||||
1. **Accelerated Transformers implementation**
|
||||
|
||||
PyTorch 2.0 includes an optimized and memory-efficient attention implementation through the [`torch.nn.functional.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention) function, which automatically enables several optimizations depending on the inputs and the GPU type. This is similar to the `memory_efficient_attention` from [xFormers](https://github.com/facebookresearch/xformers), but built natively into PyTorch.
|
||||
|
||||
These optimizations will be enabled by default in Diffusers if PyTorch 2.0 is installed and if `torch.nn.functional.scaled_dot_product_attention` is available. To use it, just install `torch 2.0` as suggested above and simply use the pipeline. For example:
|
||||
|
||||
```Python
|
||||
import torch
|
||||
from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
|
||||
pipe = pipe.to("cuda")
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
image = pipe(prompt).images[0]
|
||||
```
|
||||
|
||||
If you want to enable it explicitly (which is not required), you can do so as shown below.
|
||||
|
||||
```Python
|
||||
import torch
|
||||
from diffusers import StableDiffusionPipeline
|
||||
from diffusers.models.cross_attention import AttnProcessor2_0
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")
|
||||
pipe.unet.set_attn_processor(AttnProcessor2_0())
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
image = pipe(prompt).images[0]
|
||||
```
|
||||
|
||||
This should be as fast and memory efficient as `xFormers`. More details [in our benchmark](#benchmark).
|
||||
|
||||
|
||||
2. **torch.compile**
|
||||
|
||||
To get an additional speedup, we can use the new `torch.compile` feature. To do so, we simply wrap our `unet` with `torch.compile`. For more information and different options, refer to the
|
||||
[torch compile docs](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html).
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to(
|
||||
"cuda"
|
||||
)
|
||||
pipe.unet = torch.compile(pipe.unet)
|
||||
|
||||
batch_size = 10
|
||||
prompt = "A photo of an astronaut riding a horse on marse."
|
||||
images = pipe(prompt, num_inference_steps=steps, num_images_per_prompt=batch_size).images
|
||||
```
|
||||
|
||||
Depending on the type of GPU, `compile()` can yield between 2-9% of _additional speed-up_ over the accelerated transformer optimizations. Note, however, that compilation is able to squeeze more performance improvements in more recent GPU architectures such as Ampere (A100, 3090), Ada (4090) and Hopper (H100).
|
||||
|
||||
Compilation takes some time to complete, so it is best suited for situations where you need to prepare your pipeline once and then perform the same type of inference operations multiple times.
|
||||
|
||||
|
||||
## Benchmark
|
||||
|
||||
We conducted a simple benchmark on different GPUs to compare vanilla attention, xFormers, `torch.nn.functional.scaled_dot_product_attention` and `torch.compile+torch.nn.functional.scaled_dot_product_attention`.
|
||||
For the benchmark we used the the [stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) model with 50 steps. The `xFormers` benchmark is done using the `torch==1.13.1` version, while the accelerated transformers optimizations are tested using nightly versions of PyTorch 2.0. The tables below summarize the results we got.
|
||||
|
||||
The `Speed over xformers` columns denote the speed-up gained over `xFormers` using the `torch.compile+torch.nn.functional.scaled_dot_product_attention`.
|
||||
|
||||
|
||||
### FP16 benchmark
|
||||
|
||||
The table below shows the benchmark results for inference using `fp16`. As we can see, `torch.nn.functional.scaled_dot_product_attention` is as fast as `xFormers` (sometimes slightly faster/slower) on all the GPUs we tested.
|
||||
And using `torch.compile` gives further speed-up of up of 10% over `xFormers`, but it's mostly noticeable on the A100 GPU.
|
||||
|
||||
___The time reported is in seconds.___
|
||||
|
||||
| GPU | Batch Size | Vanilla Attention | xFormers | PyTorch2.0 SDPA | SDPA + torch.compile | Speed over xformers (%) |
|
||||
| --- | --- | --- | --- | --- | --- | --- |
|
||||
| A100 | 10 | 12.02 | 8.7 | 8.79 | 7.89 | 9.31 |
|
||||
| A100 | 16 | 18.95 | 13.57 | 13.67 | 12.25 | 9.73 |
|
||||
| A100 | 32 (1) | OOM | 26.56 | 26.68 | 24.08 | 9.34 |
|
||||
| A100 | 64 | | 52.51 | 53.03 | 47.81 | 8.95 |
|
||||
| | | | | | | |
|
||||
| A10 | 4 | 13.94 | 9.81 | 10.01 | 9.35 | 4.69 |
|
||||
| A10 | 8 | 27.09 | 19 | 19.53 | 18.33 | 3.53 |
|
||||
| A10 | 10 | 33.69 | 23.53 | 24.19 | 22.52 | 4.29 |
|
||||
| A10 | 16 | OOM | 37.55 | 38.31 | 36.81 | 1.97 |
|
||||
| A10 | 32 (1) | | 77.19 | 78.43 | 76.64 | 0.71 |
|
||||
| A10 | 64 (1) | | 173.59 | 158.99 | 155.14 | 10.63 |
|
||||
| | | | | | | |
|
||||
| T4 | 4 | 38.81 | 30.09 | 29.74 | 27.55 | 8.44 |
|
||||
| T4 | 8 | OOM | 55.71 | 55.99 | 53.85 | 3.34 |
|
||||
| T4 | 10 | OOM | 68.96 | 69.86 | 65.35 | 5.23 |
|
||||
| T4 | 16 | OOM | 111.47 | 113.26 | 106.93 | 4.07 |
|
||||
| | | | | | | |
|
||||
| V100 | 4 | 9.84 | 8.16 | 8.09 | 7.65 | 6.25 |
|
||||
| V100 | 8 | OOM | 15.62 | 15.44 | 14.59 | 6.59 |
|
||||
| V100 | 10 | OOM | 19.52 | 19.28 | 18.18 | 6.86 |
|
||||
| V100 | 16 | OOM | 30.29 | 29.84 | 28.22 | 6.83 |
|
||||
| | | | | | | |
|
||||
| 3090 | 4 | 10.04 | 7.82 | 7.89 | 7.47 | 4.48 |
|
||||
| 3090 | 8 | 19.27 | 14.97 | 15.04 | 14.22 | 5.01 |
|
||||
| 3090 | 10| 24.08 | 18.7 | 18.7 | 17.69 | 5.40 |
|
||||
| 3090 | 16 | OOM | 29.06 | 29.06 | 28.2 | 2.96 |
|
||||
| 3090 | 32 (1) | | 58.05 | 58 | 54.88 | 5.46 |
|
||||
| 3090 | 64 (1) | | 126.54 | 126.03 | 117.33 | 7.28 |
|
||||
| | | | | | | |
|
||||
| 3090 Ti | 4 | 9.07 | 7.14 | 7.15 | 6.81 | 4.62 |
|
||||
| 3090 Ti | 8 | 17.51 | 13.65 | 13.72 | 12.99 | 4.84 |
|
||||
| 3090 Ti | 10 (2) | 21.79 | 16.85 | 16.93 | 16.02 | 4.93 |
|
||||
| 3090 Ti | 16 | OOM | 26.1 | 26.28 | 25.46 | 2.45 |
|
||||
| 3090 Ti | 32 (1) | | 51.78 | 52.04 | 49.15 | 5.08 |
|
||||
| 3090 Ti | 64 (1) | | 112.02 | 112.33 | 103.91 | 7.24 |
|
||||
| | | | | | | |
|
||||
| 4090 | 4 | 10.48 | 8.37 | 8.32 | 8.01 | 4.30 |
|
||||
| 4090 | 8 | 14.33 | 10.22 | 10.42 | 9.78 | 4.31 |
|
||||
| 4090 | 16 | | 17.07 | 17.46 | 17.15 | -0.47 |
|
||||
| 4090 | 32 (1) | | 39.03 | 39.86 | 37.97 | 2.72 |
|
||||
| 4090 | 64 (1) | | 77.29 | 79.44 | 77.67 | -0.49 |
|
||||
|
||||
|
||||
|
||||
### FP32 benchmark
|
||||
|
||||
The table below shows the benchmark results for inference using `fp32`. In this case, `torch.nn.functional.scaled_dot_product_attention` is faster than `xFormers` on all the GPUs we tested.
|
||||
|
||||
Using `torch.compile` in addition to the accelerated transformers implementation can yield up to 19% performance improvement over `xFormers` in Ampere and Ada cards, and up to 20% (Ampere) or 28% (Ada) over vanilla attention.
|
||||
|
||||
| GPU | Batch Size | Vanilla Attention | xFormers | PyTorch2.0 SDPA | SDPA + torch.compile | Speed over xformers (%) | Speed over vanilla (%) |
|
||||
| --- | --- | --- | --- | --- | --- | --- | --- |
|
||||
| A100 | 4 | 16.56 | 12.42 | 12.2 | 11.84 | 4.67 | 28.50 |
|
||||
| A100 | 10 | OOM | 29.93 | 29.44 | 28.5 | 4.78 | |
|
||||
| A100 | 16 | | 47.08 | 46.27 | 44.8 | 4.84 | |
|
||||
| A100 | 32 | | 92.89 | 91.34 | 88.35 | 4.89 | |
|
||||
| A100 | 64 | | 185.3 | 182.71 | 176.48 | 4.76 | |
|
||||
| | | | | | | |
|
||||
| A10 | 1 | 10.59 | 8.81 | 7.51 | 7.35 | 16.57 | 30.59 |
|
||||
| A10 | 4 | 34.77 | 27.63 | 22.77 | 22.07 | 20.12 | 36.53 |
|
||||
| A10 | 8 | | 56.19 | 43.53 | 43.86 | 21.94 | |
|
||||
| A10 | 16 | | 116.49 | 88.56 | 86.64 | 25.62 | |
|
||||
| A10 | 32 | | 221.95 | 175.74 | 168.18 | 24.23 | |
|
||||
| A10 | 48 | | 333.23 | 264.84 | | 20.52 | |
|
||||
| | | | | | | |
|
||||
| T4 | 1 | 28.2 | 24.49 | 23.93 | 23.56 | 3.80 | 16.45 |
|
||||
| T4 | 2 | 52.77 | 45.7 | 45.88 | 45.06 | 1.40 | 14.61 |
|
||||
| T4 | 4 | OOM | 85.72 | 85.78 | 84.48 | 1.45 | |
|
||||
| T4 | 8 | | 149.64 | 150.75 | 148.4 | 0.83 | |
|
||||
| | | | | | | |
|
||||
| V100 | 1 | 7.4 | 6.84 | 6.8 | 6.66 | 2.63 | 10.00 |
|
||||
| V100 | 2 | 13.85 | 12.81 | 12.66 | 12.35 | 3.59 | 10.83 |
|
||||
| V100 | 4 | OOM | 25.73 | 25.31 | 24.78 | 3.69 | |
|
||||
| V100 | 8 | | 43.95 | 43.37 | 42.25 | 3.87 | |
|
||||
| V100 | 16 | | 84.99 | 84.73 | 82.55 | 2.87 | |
|
||||
| | | | | | | |
|
||||
| 3090 | 1 | 7.09 | 6.78 | 6.11 | 6.03 | 11.06 | 14.95 |
|
||||
| 3090 | 4 | 22.69 | 21.45 | 18.67 | 18.09 | 15.66 | 20.27 |
|
||||
| 3090 | 8 | | 42.59 | 36.75 | 35.59 | 16.44 | |
|
||||
| 3090 | 16 | | 85.35 | 72.37 | 70.25 | 17.69 | |
|
||||
| 3090 | 32 (1) | | 162.05 | 138.99 | 134.53 | 16.98 | |
|
||||
| 3090 | 48 | | 241.91 | 207.75 | | 14.12 | |
|
||||
| | | | | | | |
|
||||
| 3090 Ti | 1 | 6.45 | 6.19 | 5.64 | 5.49 | 11.31 | 14.88 |
|
||||
| 3090 Ti | 4 | 20.32 | 19.31 | 16.9 | 16.37 | 15.23 | 19.44 |
|
||||
| 3090 Ti | 8 (2) | | 37.93 | 33.05 | 31.99 | 15.66 | |
|
||||
| 3090 Ti | 16 | | 75.37 | 65.25 | 64.32 | 14.66 | |
|
||||
| 3090 Ti | 32 (1) | | 142.55 | 124.44 | 120.74 | 15.30 | |
|
||||
| 3090 Ti | 48 | | 213.19 | 186.55 | | 12.50 | |
|
||||
| | | | | | | |
|
||||
| 4090 | 1 | 5.54 | 4.99 | 4.51 | 4.44 | 11.02 | 19.86 |
|
||||
| 4090 | 4 | 13.67 | 11.4 | 10.3 | 9.84 | 13.68 | 28.02 |
|
||||
| 4090 | 8 | | 19.79 | 17.13 | 16.19 | 18.19 | |
|
||||
| 4090 | 16 | | 38.62 | 33.14 | 32.31 | 16.34 | |
|
||||
| 4090 | 32 (1) | | 76.57 | 65.96 | 62.05 | 18.96 | |
|
||||
| 4090 | 48 | | 114.44 | 98.78 | | 13.68 | |
|
||||
|
||||
|
||||
|
||||
(1) Batch Size >= 32 requires enable_vae_slicing() because of https://github.com/pytorch/pytorch/issues/81665
|
||||
This is required for PyTorch 1.13.1, and also for PyTorch 2.0 and batch size of 64
|
||||
|
||||
For more details about how this benchmark was run, please refer to [this PR](https://github.com/huggingface/diffusers/pull/2303).
|
||||
@@ -1,35 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Installing xFormers
|
||||
|
||||
We recommend the use of [xFormers](https://github.com/facebookresearch/xformers) for both inference and training. In our tests, the optimizations performed in the attention blocks allow for both faster speed and reduced memory consumption.
|
||||
|
||||
Starting from version `0.0.16` of xFormers, released on January 2023, installation can be easily performed using pre-built pip wheels:
|
||||
|
||||
```bash
|
||||
pip install xformers
|
||||
```
|
||||
|
||||
<Tip>
|
||||
|
||||
The xFormers PIP package requires the latest version of PyTorch (1.13.1 as of xFormers 0.0.16). If you need to use a previous version of PyTorch, then we recommend you install xFormers from source using [the project instructions](https://github.com/facebookresearch/xformers#installing-xformers).
|
||||
|
||||
</Tip>
|
||||
|
||||
After xFormers is installed, you can use `enable_xformers_memory_efficient_attention()` for faster inference and reduced memory consumption, as discussed [here](fp16#memory-efficient-attention).
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
According to [this issue](https://github.com/huggingface/diffusers/issues/2234#issuecomment-1416931212), xFormers `v0.0.16` cannot be used for training (fine-tune or Dreambooth) in some GPUs. If you observe that problem, please install a development version as indicated in that comment.
|
||||
|
||||
</Tip>
|
||||
@@ -1,313 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
# Quicktour
|
||||
|
||||
Diffusion models are trained to denoise random Gaussian noise step-by-step to generate a sample of interest, such as an image or audio. This has sparked a tremendous amount of interest in generative AI, and you have probably seen examples of diffusion generated images on the internet. 🧨 Diffusers is a library aimed at making diffusion models widely accessible to everyone.
|
||||
|
||||
Whether you're a developer or an everyday user, this quicktour will introduce you to 🧨 Diffusers and help you get up and generating quickly! There are three main components of the library to know about:
|
||||
|
||||
* The [`DiffusionPipeline`] is a high-level end-to-end class designed to rapidly generate samples from pretrained diffusion models for inference.
|
||||
* Popular pretrained [model](./api/models) architectures and modules that can be used as building blocks for creating diffusion systems.
|
||||
* Many different [schedulers](./api/schedulers/overview) - algorithms that control how noise is added for training, and how to generate denoised images during inference.
|
||||
|
||||
The quicktour will show you how to use the [`DiffusionPipeline`] for inference, and then walk you through how to combine a model and scheduler to replicate what's happening inside the [`DiffusionPipeline`].
|
||||
|
||||
<Tip>
|
||||
|
||||
The quicktour is a simplified version of the introductory 🧨 Diffusers [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/diffusers_intro.ipynb) to help you get started quickly. If you want to learn more about 🧨 Diffusers goal, design philosophy, and additional details about it's core API, check out the notebook!
|
||||
|
||||
</Tip>
|
||||
|
||||
Before you begin, make sure you have all the necessary libraries installed:
|
||||
|
||||
```bash
|
||||
pip install --upgrade diffusers accelerate transformers
|
||||
```
|
||||
|
||||
- [🤗 Accelerate](https://huggingface.co/docs/accelerate/index) speeds up model loading for inference and training.
|
||||
- [🤗 Transformers](https://huggingface.co/docs/transformers/index) is required to run the most popular diffusion models, such as [Stable Diffusion](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/overview).
|
||||
|
||||
## DiffusionPipeline
|
||||
|
||||
The [`DiffusionPipeline`] is the easiest way to use a pretrained diffusion system for inference. It is an end-to-end system containing the model and the scheduler. You can use the [`DiffusionPipeline`] out-of-the-box for many tasks. Take a look at the table below for some supported tasks, and for a complete list of supported tasks, check out the [🧨 Diffusers Summary](./api/pipelines/overview#diffusers-summary) table.
|
||||
|
||||
| **Task** | **Description** | **Pipeline**
|
||||
|------------------------------|--------------------------------------------------------------------------------------------------------------|-----------------|
|
||||
| Unconditional Image Generation | generate an image from Gaussian noise | [unconditional_image_generation](./using-diffusers/unconditional_image_generation) |
|
||||
| Text-Guided Image Generation | generate an image given a text prompt | [conditional_image_generation](./using-diffusers/conditional_image_generation) |
|
||||
| Text-Guided Image-to-Image Translation | adapt an image guided by a text prompt | [img2img](./using-diffusers/img2img) |
|
||||
| Text-Guided Image-Inpainting | fill the masked part of an image given the image, the mask and a text prompt | [inpaint](./using-diffusers/inpaint) |
|
||||
| Text-Guided Depth-to-Image Translation | adapt parts of an image guided by a text prompt while preserving structure via depth estimation | [depth2img](./using-diffusers/depth2img) |
|
||||
|
||||
Start by creating an instance of a [`DiffusionPipeline`] and specify which pipeline checkpoint you would like to download.
|
||||
You can use the [`DiffusionPipeline`] for any [checkpoint](https://huggingface.co/models?library=diffusers&sort=downloads) stored on the Hugging Face Hub.
|
||||
In this quicktour, you'll load the [`stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5) checkpoint for text-to-image generation.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
For [Stable Diffusion](https://huggingface.co/CompVis/stable-diffusion) models, please carefully read the [license](https://huggingface.co/spaces/CompVis/stable-diffusion-license) first before running the model. 🧨 Diffusers implements a [`safety_checker`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/safety_checker.py) to prevent offensive or harmful content, but the model's improved image generation capabilities can still produce potentially harmful content.
|
||||
|
||||
</Tip>
|
||||
|
||||
Load the model with the [`~DiffusionPipeline.from_pretrained`] method:
|
||||
|
||||
```python
|
||||
>>> from diffusers import DiffusionPipeline
|
||||
|
||||
>>> pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
|
||||
```
|
||||
|
||||
The [`DiffusionPipeline`] downloads and caches all modeling, tokenization, and scheduling components. You'll see that the Stable Diffusion pipeline is composed of the [`UNet2DConditionModel`] and [`PNDMScheduler`] among other things:
|
||||
|
||||
```py
|
||||
>>> pipeline
|
||||
StableDiffusionPipeline {
|
||||
"_class_name": "StableDiffusionPipeline",
|
||||
"_diffusers_version": "0.13.1",
|
||||
...,
|
||||
"scheduler": [
|
||||
"diffusers",
|
||||
"PNDMScheduler"
|
||||
],
|
||||
...,
|
||||
"unet": [
|
||||
"diffusers",
|
||||
"UNet2DConditionModel"
|
||||
],
|
||||
"vae": [
|
||||
"diffusers",
|
||||
"AutoencoderKL"
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
We strongly recommend running the pipeline on a GPU because the model consists of roughly 1.4 billion parameters.
|
||||
You can move the generator object to a GPU, just like you would in PyTorch:
|
||||
|
||||
```python
|
||||
>>> pipeline.to("cuda")
|
||||
```
|
||||
|
||||
Now you can pass a text prompt to the `pipeline` to generate an image, and then access the denoised image. By default, the image output is wrapped in a [`PIL.Image`](https://pillow.readthedocs.io/en/stable/reference/Image.html?highlight=image#the-image-class) object.
|
||||
|
||||
```python
|
||||
>>> image = pipeline("An image of a squirrel in Picasso style").images[0]
|
||||
>>> image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/image_of_squirrel_painting.png"/>
|
||||
</div>
|
||||
|
||||
Save the image by calling `save`:
|
||||
|
||||
```python
|
||||
>>> image.save("image_of_squirrel_painting.png")
|
||||
```
|
||||
|
||||
### Local pipeline
|
||||
|
||||
You can also use the pipeline locally. The only difference is you need to download the weights first:
|
||||
|
||||
```
|
||||
git lfs install
|
||||
git clone https://huggingface.co/runwayml/stable-diffusion-v1-5
|
||||
```
|
||||
|
||||
Then load the saved weights into the pipeline:
|
||||
|
||||
```python
|
||||
>>> pipeline = DiffusionPipeline.from_pretrained("./stable-diffusion-v1-5")
|
||||
```
|
||||
|
||||
Now you can run the pipeline as you would in the section above.
|
||||
|
||||
### Swapping schedulers
|
||||
|
||||
Different schedulers come with different denoising speeds and quality trade-offs. The best way to find out which one works best for you is to try them out! One of the main features of 🧨 Diffusers is to allow you to easily switch between schedulers. For example, to replace the default [`PNDMScheduler`] with the [`EulerDiscreteScheduler`], load it with the [`~diffusers.ConfigMixin.from_config`] method:
|
||||
|
||||
```py
|
||||
>>> from diffusers import EulerDiscreteScheduler
|
||||
|
||||
>>> pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
|
||||
>>> pipeline.scheduler = EulerDiscreteScheduler.from_config(pipeline.scheduler.config)
|
||||
```
|
||||
|
||||
Try generating an image with the new scheduler and see if you notice a difference!
|
||||
|
||||
In the next section, you'll take a closer look at the components - the model and scheduler - that make up the [`DiffusionPipeline`] and learn how to use these components to generate an image of a cat.
|
||||
|
||||
## Models
|
||||
|
||||
Most models take a noisy sample, and at each timestep it predicts the *noise residual* (other models learn to predict the previous sample directly or the velocity or [`v-prediction`](https://github.com/huggingface/diffusers/blob/5e5ce13e2f89ac45a0066cb3f369462a3cf1d9ef/src/diffusers/schedulers/scheduling_ddim.py#L110)), the difference between a less noisy image and the input image. You can mix and match models to create other diffusion systems.
|
||||
|
||||
Models are initiated with the [`~ModelMixin.from_pretrained`] method which also locally caches the model weights so it is faster the next time you load the model. For the quicktour, you'll load the [`UNet2DModel`], a basic unconditional image generation model with a checkpoint trained on cat images:
|
||||
|
||||
```py
|
||||
>>> from diffusers import UNet2DModel
|
||||
|
||||
>>> repo_id = "google/ddpm-cat-256"
|
||||
>>> model = UNet2DModel.from_pretrained(repo_id)
|
||||
```
|
||||
|
||||
To access the model parameters, call `model.config`:
|
||||
|
||||
```py
|
||||
>>> model.config
|
||||
```
|
||||
|
||||
The model configuration is a 🧊 frozen 🧊 dictionary, which means those parameters can't be changed after the model is created. This is intentional and ensures that the parameters used to define the model architecture at the start remain the same, while other parameters can still be adjusted during inference.
|
||||
|
||||
Some of the most important parameters are:
|
||||
|
||||
* `sample_size`: the height and width dimension of the input sample.
|
||||
* `in_channels`: the number of input channels of the input sample.
|
||||
* `down_block_types` and `up_block_types`: the type of down- and upsampling blocks used to create the UNet architecture.
|
||||
* `block_out_channels`: the number of output channels of the downsampling blocks; also used in reverse order for the number of input channels of the upsampling blocks.
|
||||
* `layers_per_block`: the number of ResNet blocks present in each UNet block.
|
||||
|
||||
To use the model for inference, create the image shape with random Gaussian noise. It should have a `batch` axis because the model can receive multiple random noises, a `channel` axis corresponding to the number of input channels, and a `sample_size` axis for the height and width of the image:
|
||||
|
||||
```py
|
||||
>>> import torch
|
||||
|
||||
>>> torch.manual_seed(0)
|
||||
|
||||
>>> noisy_sample = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
|
||||
>>> noisy_sample.shape
|
||||
torch.Size([1, 3, 256, 256])
|
||||
```
|
||||
|
||||
For inference, pass the noisy image to the model and a `timestep`. The `timestep` indicates how noisy the input image is, with more noise at the beginning and less at the end. This helps the model determine its position in the diffusion process, whether it is closer to the start or the end. Use the `sample` method to get the model output:
|
||||
|
||||
```py
|
||||
>>> with torch.no_grad():
|
||||
... noisy_residual = model(sample=noisy_sample, timestep=2).sample
|
||||
```
|
||||
|
||||
To generate actual examples though, you'll need a scheduler to guide the denoising process. In the next section, you'll learn how to couple a model with a scheduler.
|
||||
|
||||
## Schedulers
|
||||
|
||||
Schedulers manage going from a noisy sample to a less noisy sample given the model output - in this case, it is the `noisy_residual`.
|
||||
|
||||
<Tip>
|
||||
|
||||
🧨 Diffusers is a toolbox for building diffusion systems. While the [`DiffusionPipeline`] is a convenient way to get started with a pre-built diffusion system, you can also choose your own the model and scheduler components separately to build a custom diffusion system.
|
||||
|
||||
</Tip>
|
||||
|
||||
For the quicktour, you'll instantiate the [`DDPMScheduler`] with it's [`~diffusers.ConfigMixin.from_config`] method:
|
||||
|
||||
```py
|
||||
>>> from diffusers import DDPMScheduler
|
||||
|
||||
>>> scheduler = DDPMScheduler.from_config(repo_id)
|
||||
>>> scheduler
|
||||
DDPMScheduler {
|
||||
"_class_name": "DDPMScheduler",
|
||||
"_diffusers_version": "0.13.1",
|
||||
"beta_end": 0.02,
|
||||
"beta_schedule": "linear",
|
||||
"beta_start": 0.0001,
|
||||
"clip_sample": true,
|
||||
"clip_sample_range": 1.0,
|
||||
"num_train_timesteps": 1000,
|
||||
"prediction_type": "epsilon",
|
||||
"trained_betas": null,
|
||||
"variance_type": "fixed_small"
|
||||
}
|
||||
```
|
||||
|
||||
<Tip>
|
||||
|
||||
💡 Notice how the scheduler is instantiated from a configuration. Unlike a model, a scheduler does not have trainable weights and is parameter-free!
|
||||
|
||||
</Tip>
|
||||
|
||||
Some of the most important parameters are:
|
||||
|
||||
* `num_train_timesteps`: the length of the denoising process or in other words, the number of timesteps required to process random Gaussian noise into a data sample.
|
||||
* `beta_schedule`: the type of noise schedule to use for inference and training.
|
||||
* `beta_start` and `beta_end`: the start and end noise values for the noise schedule.
|
||||
|
||||
To predict a slightly less noisy image, pass the following to the scheduler's [`~diffusers.DDPMScheduler.step`] method: model output, `timestep`, and current `sample`.
|
||||
|
||||
```py
|
||||
>>> less_noisy_sample = scheduler.step(model_output=noisy_residual, timestep=2, sample=noisy_sample).prev_sample
|
||||
>>> less_noisy_sample.shape
|
||||
```
|
||||
|
||||
The `less_noisy_sample` can be passed to the next `timestep` where it'll get even less noisier! Let's bring it all together now and visualize the entire denoising process.
|
||||
|
||||
First, create a function that postprocesses and displays the denoised image as a `PIL.Image`:
|
||||
|
||||
```py
|
||||
>>> import PIL.Image
|
||||
>>> import numpy as np
|
||||
|
||||
|
||||
>>> def display_sample(sample, i):
|
||||
... image_processed = sample.cpu().permute(0, 2, 3, 1)
|
||||
... image_processed = (image_processed + 1.0) * 127.5
|
||||
... image_processed = image_processed.numpy().astype(np.uint8)
|
||||
|
||||
... image_pil = PIL.Image.fromarray(image_processed[0])
|
||||
... display(f"Image at step {i}")
|
||||
... display(image_pil)
|
||||
```
|
||||
|
||||
To speed up the denoising process, move the input and model to a GPU:
|
||||
|
||||
```py
|
||||
>>> model.to("cuda")
|
||||
>>> noisy_sample = noisy_sample.to("cuda")
|
||||
```
|
||||
|
||||
Now create a denoising loop that predicts the residual of the less noisy sample, and computes the less noisy sample with the scheduler:
|
||||
|
||||
```py
|
||||
>>> import tqdm
|
||||
|
||||
>>> sample = noisy_sample
|
||||
|
||||
>>> for i, t in enumerate(tqdm.tqdm(scheduler.timesteps)):
|
||||
... # 1. predict noise residual
|
||||
... with torch.no_grad():
|
||||
... residual = model(sample, t).sample
|
||||
|
||||
... # 2. compute less noisy image and set x_t -> x_t-1
|
||||
... sample = scheduler.step(residual, t, sample).prev_sample
|
||||
|
||||
... # 3. optionally look at image
|
||||
... if (i + 1) % 50 == 0:
|
||||
... display_sample(sample, i + 1)
|
||||
```
|
||||
|
||||
Sit back and watch as a cat is generated from nothing but noise! 😻
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/diffusion-quicktour.png"/>
|
||||
</div>
|
||||
|
||||
## Next steps
|
||||
|
||||
Hopefully you generated some cool images with 🧨 Diffusers in this quicktour! For your next steps, you can:
|
||||
|
||||
* Train or finetune a model to generate your own images in the [training](./tutorials/basic_training) tutorial.
|
||||
* See example official and community [training or finetuning scripts](https://github.com/huggingface/diffusers/tree/main/examples#-diffusers-examples) for a variety of use cases.
|
||||
* Learn more about loading, accessing, changing and comparing schedulers in the [Using different Schedulers](./using-diffusers/schedulers) guide.
|
||||
* Explore prompt engineering, speed and memory optimizations, and tips and tricks for generating higher quality images with the [Stable Diffusion](./stable_diffusion) guide.
|
||||
* Dive deeper into speeding up 🧨 Diffusers with guides on [optimized PyTorch on a GPU](./optimization/fp16), and inference guides for running [Stable Diffusion on Apple Silicon (M1/M2)](./optimization/mps) and [ONNX Runtime](./optimization/onnx).
|
||||
@@ -1,333 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# The Stable Diffusion Guide 🎨
|
||||
<a target="_blank" href="https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_101_guide.ipynb">
|
||||
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
|
||||
</a>
|
||||
|
||||
## Intro
|
||||
|
||||
Stable Diffusion is a [Latent Diffusion model](https://github.com/CompVis/latent-diffusion) developed by researchers from the Machine Vision and Learning group at LMU Munich, *a.k.a* CompVis.
|
||||
Model checkpoints were publicly released at the end of August 2022 by a collaboration of Stability AI, CompVis, and Runway with support from EleutherAI and LAION. For more information, you can check out [the official blog post](https://stability.ai/blog/stable-diffusion-public-release).
|
||||
|
||||
Since its public release the community has done an incredible job at working together to make the stable diffusion checkpoints **faster**, **more memory efficient**, and **more performant**.
|
||||
|
||||
🧨 Diffusers offers a simple API to run stable diffusion with all memory, computing, and quality improvements.
|
||||
|
||||
This notebook walks you through the improvements one-by-one so you can best leverage [`StableDiffusionPipeline`] for **inference**.
|
||||
|
||||
## Prompt Engineering 🎨
|
||||
|
||||
When running *Stable Diffusion* in inference, we usually want to generate a certain type, or style of image and then improve upon it. Improving upon a previously generated image means running inference over and over again with a different prompt and potentially a different seed until we are happy with our generation.
|
||||
|
||||
So to begin with, it is most important to speed up stable diffusion as much as possible to generate as many pictures as possible in a given amount of time.
|
||||
|
||||
This can be done by both improving the **computational efficiency** (speed) and the **memory efficiency** (GPU RAM).
|
||||
|
||||
Let's start by looking into computational efficiency first.
|
||||
|
||||
Throughout the notebook, we will focus on [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5):
|
||||
|
||||
``` python
|
||||
model_id = "runwayml/stable-diffusion-v1-5"
|
||||
```
|
||||
|
||||
Let's load the pipeline.
|
||||
|
||||
## Speed Optimization
|
||||
|
||||
``` python
|
||||
from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained(model_id)
|
||||
```
|
||||
|
||||
We aim at generating a beautiful photograph of an *old warrior chief* and will later try to find the best prompt to generate such a photograph. For now, let's keep the prompt simple:
|
||||
|
||||
``` python
|
||||
prompt = "portrait photo of a old warrior chief"
|
||||
```
|
||||
|
||||
To begin with, we should make sure we run inference on GPU, so let's move the pipeline to GPU, just like you would with any PyTorch module.
|
||||
|
||||
``` python
|
||||
pipe = pipe.to("cuda")
|
||||
```
|
||||
|
||||
To generate an image, you should use the [~`StableDiffusionPipeline.__call__`] method.
|
||||
|
||||
To make sure we can reproduce more or less the same image in every call, let's make use of the generator. See the documentation on reproducibility [here](./conceptual/reproducibility) for more information.
|
||||
|
||||
``` python
|
||||
generator = torch.Generator("cuda").manual_seed(0)
|
||||
```
|
||||
|
||||
Now, let's take a spin on it.
|
||||
|
||||
``` python
|
||||
image = pipe(prompt, generator=generator).images[0]
|
||||
image
|
||||
```
|
||||
|
||||

|
||||
|
||||
Cool, this now took roughly 30 seconds on a T4 GPU (you might see faster inference if your allocated GPU is better than a T4).
|
||||
|
||||
The default run we did above used full float32 precision and ran the default number of inference steps (50). The easiest speed-ups come from switching to float16 (or half) precision and simply running fewer inference steps. Let's load the model now in float16 instead.
|
||||
|
||||
``` python
|
||||
import torch
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
|
||||
pipe = pipe.to("cuda")
|
||||
```
|
||||
|
||||
And we can again call the pipeline to generate an image.
|
||||
|
||||
``` python
|
||||
generator = torch.Generator("cuda").manual_seed(0)
|
||||
|
||||
image = pipe(prompt, generator=generator).images[0]
|
||||
image
|
||||
```
|
||||

|
||||
|
||||
Cool, this is almost three times as fast for arguably the same image quality.
|
||||
|
||||
We strongly suggest always running your pipelines in float16 as so far we have very rarely seen degradations in quality because of it.
|
||||
|
||||
Next, let's see if we need to use 50 inference steps or whether we could use significantly fewer. The number of inference steps is associated with the denoising scheduler we use. Choosing a more efficient scheduler could help us decrease the number of steps.
|
||||
|
||||
Let's have a look at all the schedulers the stable diffusion pipeline is compatible with.
|
||||
|
||||
``` python
|
||||
pipe.scheduler.compatibles
|
||||
```
|
||||
|
||||
```
|
||||
[diffusers.schedulers.scheduling_dpmsolver_singlestep.DPMSolverSinglestepScheduler,
|
||||
diffusers.schedulers.scheduling_lms_discrete.LMSDiscreteScheduler,
|
||||
diffusers.schedulers.scheduling_heun_discrete.HeunDiscreteScheduler,
|
||||
diffusers.schedulers.scheduling_pndm.PNDMScheduler,
|
||||
diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler,
|
||||
diffusers.schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteScheduler,
|
||||
diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler,
|
||||
diffusers.schedulers.scheduling_ddpm.DDPMScheduler,
|
||||
diffusers.schedulers.scheduling_ddim.DDIMScheduler]
|
||||
```
|
||||
|
||||
Cool, that's a lot of schedulers.
|
||||
|
||||
🧨 Diffusers is constantly adding a bunch of novel schedulers/samplers that can be used with Stable Diffusion. For more information, we recommend taking a look at the official documentation [here](https://huggingface.co/docs/diffusers/main/en/api/schedulers/overview).
|
||||
|
||||
Alright, right now Stable Diffusion is using the `PNDMScheduler` which usually requires around 50 inference steps. However, other schedulers such as `DPMSolverMultistepScheduler` or `DPMSolverSinglestepScheduler` seem to get away with just 20 to 25 inference steps. Let's try them out.
|
||||
|
||||
You can set a new scheduler by making use of the [from_config](https://huggingface.co/docs/diffusers/main/en/api/configuration#diffusers.ConfigMixin.from_config) function.
|
||||
|
||||
``` python
|
||||
from diffusers import DPMSolverMultistepScheduler
|
||||
|
||||
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
|
||||
```
|
||||
|
||||
Now, let's try to reduce the number of inference steps to just 20.
|
||||
|
||||
``` python
|
||||
generator = torch.Generator("cuda").manual_seed(0)
|
||||
|
||||
image = pipe(prompt, generator=generator, num_inference_steps=20).images[0]
|
||||
image
|
||||
```
|
||||
|
||||

|
||||
|
||||
The image now does look a little different, but it's arguably still of equally high quality. We now cut inference time to just 4 seconds though 😍.
|
||||
|
||||
## Memory Optimization
|
||||
|
||||
Less memory used in generation indirectly implies more speed, since we're often trying to maximize how many images we can generate per second. Usually, the more images per inference run, the more images per second too.
|
||||
|
||||
The easiest way to see how many images we can generate at once is to simply try it out, and see when we get a *"Out-of-memory (OOM)"* error.
|
||||
|
||||
We can run batched inference by simply passing a list of prompts and generators. Let's define a quick function that generates a batch for us.
|
||||
|
||||
``` python
|
||||
def get_inputs(batch_size=1):
|
||||
generator = [torch.Generator("cuda").manual_seed(i) for i in range(batch_size)]
|
||||
prompts = batch_size * [prompt]
|
||||
num_inference_steps = 20
|
||||
|
||||
return {"prompt": prompts, "generator": generator, "num_inference_steps": num_inference_steps}
|
||||
```
|
||||
This function returns a list of prompts and a list of generators, so we can reuse the generator that produced a result we like.
|
||||
|
||||
We also need a method that allows us to easily display a batch of images.
|
||||
|
||||
``` python
|
||||
from PIL import Image
|
||||
|
||||
def image_grid(imgs, rows=2, cols=2):
|
||||
w, h = imgs[0].size
|
||||
grid = Image.new('RGB', size=(cols*w, rows*h))
|
||||
|
||||
for i, img in enumerate(imgs):
|
||||
grid.paste(img, box=(i%cols*w, i//cols*h))
|
||||
return grid
|
||||
```
|
||||
|
||||
Cool, let's see how much memory we can use starting with `batch_size=4`.
|
||||
|
||||
``` python
|
||||
images = pipe(**get_inputs(batch_size=4)).images
|
||||
image_grid(images)
|
||||
```
|
||||
|
||||

|
||||
|
||||
Going over a batch_size of 4 will error out in this notebook (assuming we are running it on a T4 GPU). Also, we can see we only generate slightly more images per second (3.75s/image) compared to 4s/image previously.
|
||||
|
||||
However, the community has found some nice tricks to improve the memory constraints further. After stable diffusion was released, the community found improvements within days and shared them freely over GitHub - open-source at its finest! I believe the original idea came from [this](https://github.com/basujindal/stable-diffusion/pull/117) GitHub thread.
|
||||
|
||||
By far most of the memory is taken up by the cross-attention layers. Instead of running this operation in batch, one can run it sequentially to save a significant amount of memory.
|
||||
|
||||
It can easily be enabled by calling `enable_attention_slicing` as is documented [here](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline.enable_attention_slicing).
|
||||
|
||||
``` python
|
||||
pipe.enable_attention_slicing()
|
||||
```
|
||||
|
||||
Great, now that attention slicing is enabled, let's try to double the batch size again, going for `batch_size=8`.
|
||||
|
||||
``` python
|
||||
images = pipe(**get_inputs(batch_size=8)).images
|
||||
image_grid(images, rows=2, cols=4)
|
||||
```
|
||||
|
||||

|
||||
|
||||
Nice, it works. However, the speed gain is again not very big (it might however be much more significant on other GPUs).
|
||||
|
||||
We're at roughly 3.5 seconds per image 🔥 which is probably the fastest we can be with a simple T4 without sacrificing quality.
|
||||
|
||||
Next, let's look into how to improve the quality!
|
||||
|
||||
## Quality Improvements
|
||||
|
||||
Now that our image generation pipeline is blazing fast, let's try to get maximum image quality.
|
||||
|
||||
First of all, image quality is extremely subjective, so it's difficult to make general claims here.
|
||||
|
||||
The most obvious step to take to improve quality is to use *better checkpoints*. Since the release of Stable Diffusion, many improved versions have been released, which are summarized here:
|
||||
|
||||
- *Official Release - 22 Aug 2022*: [Stable-Diffusion 1.4](https://huggingface.co/CompVis/stable-diffusion-v1-4)
|
||||
- *20 October 2022*: [Stable-Diffusion 1.5](https://huggingface.co/runwayml/stable-diffusion-v1-5)
|
||||
- *24 Nov 2022*: [Stable-Diffusion 2.0](https://huggingface.co/stabilityai/stable-diffusion-2-0)
|
||||
- *7 Dec 2022*: [Stable-Diffusion 2.1](https://huggingface.co/stabilityai/stable-diffusion-2-1)
|
||||
|
||||
Newer versions don't necessarily mean better image quality with the same parameters. People mentioned that *2.0* is slightly worse than *1.5* for certain prompts, but given the right prompt engineering *2.0* and *2.1* seem to be better.
|
||||
|
||||
Overall, we strongly recommend just trying the models out and reading up on advice online (e.g. it has been shown that using negative prompts is very important for 2.0 and 2.1 to get the highest possible quality. See for example [this nice blog post](https://minimaxir.com/2022/11/stable-diffusion-negative-prompt/).
|
||||
|
||||
Additionally, the community has started fine-tuning many of the above versions on certain styles with some of them having an extremely high quality and gaining a lot of traction.
|
||||
|
||||
We recommend having a look at all [diffusers checkpoints sorted by downloads and trying out the different checkpoints](https://huggingface.co/models?library=diffusers).
|
||||
|
||||
For the following, we will stick to v1.5 for simplicity.
|
||||
|
||||
Next, we can also try to optimize single components of the pipeline, e.g. switching out the latent decoder. For more details on how the whole Stable Diffusion pipeline works, please have a look at [this blog post](https://huggingface.co/blog/stable_diffusion).
|
||||
|
||||
Let's load [stabilityai's newest auto-decoder](https://huggingface.co/stabilityai/stable-diffusion-2-1).
|
||||
|
||||
``` python
|
||||
from diffusers import AutoencoderKL
|
||||
|
||||
vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16).to("cuda")
|
||||
```
|
||||
|
||||
Now we can set it to the vae of the pipeline to use it.
|
||||
|
||||
``` python
|
||||
pipe.vae = vae
|
||||
```
|
||||
|
||||
Let's run the same prompt as before to compare quality.
|
||||
|
||||
``` python
|
||||
images = pipe(**get_inputs(batch_size=8)).images
|
||||
image_grid(images, rows=2, cols=4)
|
||||
```
|
||||
|
||||

|
||||
|
||||
Seems like the difference is only very minor, but the new generations are arguably a bit *sharper*.
|
||||
|
||||
Cool, finally, let's look a bit into prompt engineering.
|
||||
|
||||
Our goal was to generate a photo of an old warrior chief. Let's now try to bring a bit more color into the photos and make the look more impressive.
|
||||
|
||||
Originally our prompt was "*portrait photo of an old warrior chief*".
|
||||
|
||||
To improve the prompt, it often helps to add cues that could have been used online to save high-quality photos, as well as add more details.
|
||||
Essentially, when doing prompt engineering, one has to think:
|
||||
|
||||
- How was the photo or similar photos of the one I want probably stored on the internet?
|
||||
- What additional detail can I give that steers the models into the style that I want?
|
||||
|
||||
Cool, let's add more details.
|
||||
|
||||
``` python
|
||||
prompt += ", tribal panther make up, blue on red, side profile, looking away, serious eyes"
|
||||
```
|
||||
|
||||
and let's also add some cues that usually help to generate higher quality images.
|
||||
|
||||
``` python
|
||||
prompt += " 50mm portrait photography, hard rim lighting photography--beta --ar 2:3 --beta --upbeta"
|
||||
prompt
|
||||
```
|
||||
|
||||
Cool, let's now try this prompt.
|
||||
|
||||
``` python
|
||||
images = pipe(**get_inputs(batch_size=8)).images
|
||||
image_grid(images, rows=2, cols=4)
|
||||
```
|
||||
|
||||

|
||||
|
||||
Pretty impressive! We got some very high-quality image generations there. The 2nd image is my personal favorite, so I'll re-use this seed and see whether I can tweak the prompts slightly by using "oldest warrior", "old", "", and "young" instead of "old".
|
||||
|
||||
``` python
|
||||
prompts = [
|
||||
"portrait photo of the oldest warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes 50mm portrait photography, hard rim lighting photography--beta --ar 2:3 --beta --upbeta",
|
||||
"portrait photo of a old warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes 50mm portrait photography, hard rim lighting photography--beta --ar 2:3 --beta --upbeta",
|
||||
"portrait photo of a warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes 50mm portrait photography, hard rim lighting photography--beta --ar 2:3 --beta --upbeta",
|
||||
"portrait photo of a young warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes 50mm portrait photography, hard rim lighting photography--beta --ar 2:3 --beta --upbeta",
|
||||
]
|
||||
|
||||
generator = [torch.Generator("cuda").manual_seed(1) for _ in range(len(prompts))] # 1 because we want the 2nd image
|
||||
|
||||
images = pipe(prompt=prompts, generator=generator, num_inference_steps=25).images
|
||||
image_grid(images)
|
||||
```
|
||||
|
||||

|
||||
|
||||
The first picture looks nice! The eye movement slightly changed and looks nice. This finished up our 101-guide on how to use Stable Diffusion 🤗.
|
||||
|
||||
For more information on optimization or other guides, I recommend taking a look at the following:
|
||||
|
||||
- [Blog post about Stable Diffusion](https://huggingface.co/blog/stable_diffusion): In-detail blog post explaining Stable Diffusion.
|
||||
- [FlashAttention](https://huggingface.co/docs/diffusers/optimization/xformers): XFormers flash attention can optimize your model even further with more speed and memory improvements.
|
||||
- [Dreambooth](https://huggingface.co/docs/diffusers/training/dreambooth) - Quickly customize the model by fine-tuning it.
|
||||
- [General info on Stable Diffusion](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/overview) - Info on other tasks that are powered by Stable Diffusion.
|
||||
@@ -1,472 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# DreamBooth
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
[DreamBooth](https://arxiv.org/abs/2208.12242) is a method to personalize text-to-image models like Stable Diffusion given just a few (3-5) images of a subject. It allows the model to generate contextualized images of the subject in different scenes, poses, and views.
|
||||
|
||||

|
||||
<small>Dreambooth examples from the <a href="https://dreambooth.github.io">project's blog.</a></small>
|
||||
|
||||
This guide will show you how to finetune DreamBooth with the [`CompVis/stable-diffusion-v1-4`](https://huggingface.co/CompVis/stable-diffusion-v1-4) model for various GPU sizes, and with Flax. All the training scripts for DreamBooth used in this guide can be found [here](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth) if you're interested in digging deeper and seeing how things work.
|
||||
|
||||
Before running the scripts, make sure you install the library's training dependencies. We also recommend installing 🧨 Diffusers from the `main` GitHub branch:
|
||||
|
||||
```bash
|
||||
pip install git+https://github.com/huggingface/diffusers
|
||||
pip install -U -r diffusers/examples/dreambooth/requirements.txt
|
||||
```
|
||||
|
||||
xFormers is not part of the training requirements, but we recommend you [install](../optimization/xformers) it if you can because it could make your training faster and less memory intensive.
|
||||
|
||||
After all the dependencies have been set up, initialize a [🤗 Accelerate](https://github.com/huggingface/accelerate/) environment with:
|
||||
|
||||
```bash
|
||||
accelerate config
|
||||
```
|
||||
|
||||
To setup a default 🤗 Accelerate environment without choosing any configurations:
|
||||
|
||||
```bash
|
||||
accelerate config default
|
||||
```
|
||||
|
||||
Or if your environment doesn't support an interactive shell like a notebook, you can use:
|
||||
|
||||
```py
|
||||
from accelerate.utils import write_basic_config
|
||||
|
||||
write_basic_config()
|
||||
```
|
||||
|
||||
## Finetuning
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
DreamBooth finetuning is very sensitive to hyperparameters and easy to overfit. We recommend you take a look at our [in-depth analysis](https://huggingface.co/blog/dreambooth) with recommended settings for different subjects to help you choose the appropriate hyperparameters.
|
||||
|
||||
</Tip>
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
Let's try DreamBooth with a [few images of a dog](https://drive.google.com/drive/folders/1BO_dyz-p65qhBRRMRA4TbZ8qW4rB99JZ); download and save them to a directory and then set the `INSTANCE_DIR` environment variable to that path:
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
|
||||
export INSTANCE_DIR="path_to_training_images"
|
||||
export OUTPUT_DIR="path_to_saved_model"
|
||||
```
|
||||
|
||||
Then you can launch the training script (you can find the full training script [here](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth.py)) with the following command:
|
||||
|
||||
```bash
|
||||
accelerate launch train_dreambooth.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--instance_data_dir=$INSTANCE_DIR \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
--instance_prompt="a photo of sks dog" \
|
||||
--resolution=512 \
|
||||
--train_batch_size=1 \
|
||||
--gradient_accumulation_steps=1 \
|
||||
--learning_rate=5e-6 \
|
||||
--lr_scheduler="constant" \
|
||||
--lr_warmup_steps=0 \
|
||||
--max_train_steps=400
|
||||
```
|
||||
</pt>
|
||||
<jax>
|
||||
If you have access to TPUs or want to train even faster, you can try out the [Flax training script](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth_flax.py). The Flax training script doesn't support gradient checkpointing or gradient accumulation, so you'll need a GPU with at least 30GB of memory.
|
||||
|
||||
Before running the script, make sure you have the requirements installed:
|
||||
|
||||
```bash
|
||||
pip install -U -r requirements.txt
|
||||
```
|
||||
|
||||
Now you can launch the training script with the following command:
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
|
||||
export INSTANCE_DIR="path-to-instance-images"
|
||||
export OUTPUT_DIR="path-to-save-model"
|
||||
|
||||
python train_dreambooth_flax.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--instance_data_dir=$INSTANCE_DIR \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
--instance_prompt="a photo of sks dog" \
|
||||
--resolution=512 \
|
||||
--train_batch_size=1 \
|
||||
--learning_rate=5e-6 \
|
||||
--max_train_steps=400
|
||||
```
|
||||
</jax>
|
||||
</frameworkcontent>
|
||||
|
||||
## Finetuning with prior-preserving loss
|
||||
|
||||
Prior preservation is used to avoid overfitting and language-drift (check out the [paper](https://arxiv.org/abs/2208.12242) to learn more if you're interested). For prior preservation, you use other images of the same class as part of the training process. The nice thing is that you can generate those images using the Stable Diffusion model itself! The training script will save the generated images to a local path you specify.
|
||||
|
||||
The author's recommend generating `num_epochs * num_samples` images for prior preservation. In most cases, 200-300 images work well.
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
```bash
|
||||
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
|
||||
export INSTANCE_DIR="path_to_training_images"
|
||||
export CLASS_DIR="path_to_class_images"
|
||||
export OUTPUT_DIR="path_to_saved_model"
|
||||
|
||||
accelerate launch train_dreambooth.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--instance_data_dir=$INSTANCE_DIR \
|
||||
--class_data_dir=$CLASS_DIR \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
--with_prior_preservation --prior_loss_weight=1.0 \
|
||||
--instance_prompt="a photo of sks dog" \
|
||||
--class_prompt="a photo of dog" \
|
||||
--resolution=512 \
|
||||
--train_batch_size=1 \
|
||||
--gradient_accumulation_steps=1 \
|
||||
--learning_rate=5e-6 \
|
||||
--lr_scheduler="constant" \
|
||||
--lr_warmup_steps=0 \
|
||||
--num_class_images=200 \
|
||||
--max_train_steps=800
|
||||
```
|
||||
</pt>
|
||||
<jax>
|
||||
```bash
|
||||
export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
|
||||
export INSTANCE_DIR="path-to-instance-images"
|
||||
export CLASS_DIR="path-to-class-images"
|
||||
export OUTPUT_DIR="path-to-save-model"
|
||||
|
||||
python train_dreambooth_flax.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--instance_data_dir=$INSTANCE_DIR \
|
||||
--class_data_dir=$CLASS_DIR \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
--with_prior_preservation --prior_loss_weight=1.0 \
|
||||
--instance_prompt="a photo of sks dog" \
|
||||
--class_prompt="a photo of dog" \
|
||||
--resolution=512 \
|
||||
--train_batch_size=1 \
|
||||
--learning_rate=5e-6 \
|
||||
--num_class_images=200 \
|
||||
--max_train_steps=800
|
||||
```
|
||||
</jax>
|
||||
</frameworkcontent>
|
||||
|
||||
## Finetuning the text encoder and UNet
|
||||
|
||||
The script also allows you to finetune the `text_encoder` along with the `unet`. In our experiments (check out the [Training Stable Diffusion with DreamBooth using 🧨 Diffusers](https://huggingface.co/blog/dreambooth) post for more details), this yields much better results, especially when generating images of faces.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Training the text encoder requires additional memory and it won't fit on a 16GB GPU. You'll need at least 24GB VRAM to use this option.
|
||||
|
||||
</Tip>
|
||||
|
||||
Pass the `--train_text_encoder` argument to the training script to enable finetuning the `text_encoder` and `unet`:
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
```bash
|
||||
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
|
||||
export INSTANCE_DIR="path_to_training_images"
|
||||
export CLASS_DIR="path_to_class_images"
|
||||
export OUTPUT_DIR="path_to_saved_model"
|
||||
|
||||
accelerate launch train_dreambooth.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--train_text_encoder \
|
||||
--instance_data_dir=$INSTANCE_DIR \
|
||||
--class_data_dir=$CLASS_DIR \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
--with_prior_preservation --prior_loss_weight=1.0 \
|
||||
--instance_prompt="a photo of sks dog" \
|
||||
--class_prompt="a photo of dog" \
|
||||
--resolution=512 \
|
||||
--train_batch_size=1 \
|
||||
--use_8bit_adam
|
||||
--gradient_checkpointing \
|
||||
--learning_rate=2e-6 \
|
||||
--lr_scheduler="constant" \
|
||||
--lr_warmup_steps=0 \
|
||||
--num_class_images=200 \
|
||||
--max_train_steps=800
|
||||
```
|
||||
</pt>
|
||||
<jax>
|
||||
```bash
|
||||
export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
|
||||
export INSTANCE_DIR="path-to-instance-images"
|
||||
export CLASS_DIR="path-to-class-images"
|
||||
export OUTPUT_DIR="path-to-save-model"
|
||||
|
||||
python train_dreambooth_flax.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--train_text_encoder \
|
||||
--instance_data_dir=$INSTANCE_DIR \
|
||||
--class_data_dir=$CLASS_DIR \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
--with_prior_preservation --prior_loss_weight=1.0 \
|
||||
--instance_prompt="a photo of sks dog" \
|
||||
--class_prompt="a photo of dog" \
|
||||
--resolution=512 \
|
||||
--train_batch_size=1 \
|
||||
--learning_rate=2e-6 \
|
||||
--num_class_images=200 \
|
||||
--max_train_steps=800
|
||||
```
|
||||
</jax>
|
||||
</frameworkcontent>
|
||||
|
||||
## Finetuning with LoRA
|
||||
|
||||
You can also use Low-Rank Adaptation of Large Language Models (LoRA), a fine-tuning technique for accelerating training large models, on DreamBooth. For more details, take a look at the [LoRA training](training/lora#dreambooth) guide.
|
||||
|
||||
## Saving checkpoints while training
|
||||
|
||||
It's easy to overfit while training with Dreambooth, so sometimes it's useful to save regular checkpoints during the training process. One of the intermediate checkpoints might actually work better than the final model! Pass the following argument to the training script to enable saving checkpoints:
|
||||
|
||||
```bash
|
||||
--checkpointing_steps=500
|
||||
```
|
||||
|
||||
This saves the full training state in subfolders of your `output_dir`. Subfolder names begin with the prefix `checkpoint-`, followed by the number of steps performed so far; for example, `checkpoint-1500` would be a checkpoint saved after 1500 training steps.
|
||||
|
||||
### Resume training from a saved checkpoint
|
||||
|
||||
If you want to resume training from any of the saved checkpoints, you can pass the argument `--resume_from_checkpoint` to the script and specify the name of the checkpoint you want to use. You can also use the special string `"latest"` to resume from the last saved checkpoint (the one with the largest number of steps). For example, the following would resume training from the checkpoint saved after 1500 steps:
|
||||
|
||||
```bash
|
||||
--resume_from_checkpoint="checkpoint-1500"
|
||||
```
|
||||
|
||||
This is a good opportunity to tweak some of your hyperparameters if you wish.
|
||||
|
||||
### Inference from a saved checkpoint
|
||||
|
||||
Saved checkpoints are stored in a format suitable for resuming training. They not only include the model weights, but also the state of the optimizer, data loaders, and learning rate.
|
||||
|
||||
If you have **`"accelerate>=0.16.0"`** installed, use the following code to run
|
||||
inference from an intermediate checkpoint.
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline, UNet2DConditionModel
|
||||
from transformers import CLIPTextModel
|
||||
import torch
|
||||
|
||||
# Load the pipeline with the same arguments (model, revision) that were used for training
|
||||
model_id = "CompVis/stable-diffusion-v1-4"
|
||||
|
||||
unet = UNet2DConditionModel.from_pretrained("/sddata/dreambooth/daruma-v2-1/checkpoint-100/unet")
|
||||
|
||||
# if you have trained with `--args.train_text_encoder` make sure to also load the text encoder
|
||||
text_encoder = CLIPTextModel.from_pretrained("/sddata/dreambooth/daruma-v2-1/checkpoint-100/text_encoder")
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(model_id, unet=unet, text_encoder=text_encoder, dtype=torch.float16)
|
||||
pipeline.to("cuda")
|
||||
|
||||
# Perform inference, or save, or push to the hub
|
||||
pipeline.save_pretrained("dreambooth-pipeline")
|
||||
```
|
||||
|
||||
If you have **`"accelerate<0.16.0"`** installed, you need to convert it to an inference pipeline first:
|
||||
|
||||
```python
|
||||
from accelerate import Accelerator
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
# Load the pipeline with the same arguments (model, revision) that were used for training
|
||||
model_id = "CompVis/stable-diffusion-v1-4"
|
||||
pipeline = DiffusionPipeline.from_pretrained(model_id)
|
||||
|
||||
accelerator = Accelerator()
|
||||
|
||||
# Use text_encoder if `--train_text_encoder` was used for the initial training
|
||||
unet, text_encoder = accelerator.prepare(pipeline.unet, pipeline.text_encoder)
|
||||
|
||||
# Restore state from a checkpoint path. You have to use the absolute path here.
|
||||
accelerator.load_state("/sddata/dreambooth/daruma-v2-1/checkpoint-100")
|
||||
|
||||
# Rebuild the pipeline with the unwrapped models (assignment to .unet and .text_encoder should work too)
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
model_id,
|
||||
unet=accelerator.unwrap_model(unet),
|
||||
text_encoder=accelerator.unwrap_model(text_encoder),
|
||||
)
|
||||
|
||||
# Perform inference, or save, or push to the hub
|
||||
pipeline.save_pretrained("dreambooth-pipeline")
|
||||
```
|
||||
|
||||
## Optimizations for different GPU sizes
|
||||
|
||||
Depending on your hardware, there are a few different ways to optimize DreamBooth on GPUs from 16GB to just 8GB!
|
||||
|
||||
### xFormers
|
||||
|
||||
[xFormers](https://github.com/facebookresearch/xformers) is a toolbox for optimizing Transformers, and it include a [memory-efficient attention](https://facebookresearch.github.io/xformers/components/ops.html#module-xformers.ops) mechanism that is used in 🧨 Diffusers. You'll need to [install xFormers](./optimization/xformers) and then add the following argument to your training script:
|
||||
|
||||
```bash
|
||||
--enable_xformers_memory_efficient_attention
|
||||
```
|
||||
|
||||
xFormers is not available in Flax.
|
||||
|
||||
### Set gradients to none
|
||||
|
||||
Another way you can lower your memory footprint is to [set the gradients](https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html) to `None` instead of zero. However, this may change certain behaviors, so if you run into any issues, try removing this argument. Add the following argument to your training script to set the gradients to `None`:
|
||||
|
||||
```bash
|
||||
--set_grads_to_none
|
||||
```
|
||||
|
||||
### 16GB GPU
|
||||
|
||||
With the help of gradient checkpointing and [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) 8-bit optimizer, it's possible to train DreamBooth on a 16GB GPU. Make sure you have bitsandbytes installed:
|
||||
|
||||
```bash
|
||||
pip install bitsandbytes
|
||||
```
|
||||
|
||||
Then pass the `--use_8bit_adam` option to the training script:
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
|
||||
export INSTANCE_DIR="path_to_training_images"
|
||||
export CLASS_DIR="path_to_class_images"
|
||||
export OUTPUT_DIR="path_to_saved_model"
|
||||
|
||||
accelerate launch train_dreambooth.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--instance_data_dir=$INSTANCE_DIR \
|
||||
--class_data_dir=$CLASS_DIR \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
--with_prior_preservation --prior_loss_weight=1.0 \
|
||||
--instance_prompt="a photo of sks dog" \
|
||||
--class_prompt="a photo of dog" \
|
||||
--resolution=512 \
|
||||
--train_batch_size=1 \
|
||||
--gradient_accumulation_steps=2 --gradient_checkpointing \
|
||||
--use_8bit_adam \
|
||||
--learning_rate=5e-6 \
|
||||
--lr_scheduler="constant" \
|
||||
--lr_warmup_steps=0 \
|
||||
--num_class_images=200 \
|
||||
--max_train_steps=800
|
||||
```
|
||||
|
||||
### 12GB GPU
|
||||
|
||||
To run DreamBooth on a 12GB GPU, you'll need to enable gradient checkpointing, the 8-bit optimizer, xFormers, and set the gradients to `None`:
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
|
||||
export INSTANCE_DIR="path-to-instance-images"
|
||||
export CLASS_DIR="path-to-class-images"
|
||||
export OUTPUT_DIR="path-to-save-model"
|
||||
|
||||
accelerate launch train_dreambooth.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--instance_data_dir=$INSTANCE_DIR \
|
||||
--class_data_dir=$CLASS_DIR \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
--with_prior_preservation --prior_loss_weight=1.0 \
|
||||
--instance_prompt="a photo of sks dog" \
|
||||
--class_prompt="a photo of dog" \
|
||||
--resolution=512 \
|
||||
--train_batch_size=1 \
|
||||
--gradient_accumulation_steps=1 --gradient_checkpointing \
|
||||
--use_8bit_adam \
|
||||
--enable_xformers_memory_efficient_attention \
|
||||
--set_grads_to_none \
|
||||
--learning_rate=2e-6 \
|
||||
--lr_scheduler="constant" \
|
||||
--lr_warmup_steps=0 \
|
||||
--num_class_images=200 \
|
||||
--max_train_steps=800
|
||||
```
|
||||
|
||||
### 8 GB GPU
|
||||
|
||||
For 8GB GPUs, you'll need the help of [DeepSpeed](https://www.deepspeed.ai/) to offload some
|
||||
tensors from the VRAM to either the CPU or NVME, enabling training with less GPU memory.
|
||||
|
||||
Run the following command to configure your 🤗 Accelerate environment:
|
||||
|
||||
```bash
|
||||
accelerate config
|
||||
```
|
||||
|
||||
During configuration, confirm that you want to use DeepSpeed. Now it's possible to train on under 8GB VRAM by combining DeepSpeed stage 2, fp16 mixed precision, and offloading the model parameters and the optimizer state to the CPU. The drawback is that this requires more system RAM, about 25 GB. See [the DeepSpeed documentation](https://huggingface.co/docs/accelerate/usage_guides/deepspeed) for more configuration options.
|
||||
|
||||
You should also change the default Adam optimizer to DeepSpeed's optimized version of Adam
|
||||
[`deepspeed.ops.adam.DeepSpeedCPUAdam`](https://deepspeed.readthedocs.io/en/latest/optimizers.html#adam-cpu) for a substantial speedup. Enabling `DeepSpeedCPUAdam` requires your system's CUDA toolchain version to be the same as the one installed with PyTorch.
|
||||
|
||||
8-bit optimizers don't seem to be compatible with DeepSpeed at the moment.
|
||||
|
||||
Launch training with the following command:
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
|
||||
export INSTANCE_DIR="path_to_training_images"
|
||||
export CLASS_DIR="path_to_class_images"
|
||||
export OUTPUT_DIR="path_to_saved_model"
|
||||
|
||||
accelerate launch train_dreambooth.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--instance_data_dir=$INSTANCE_DIR \
|
||||
--class_data_dir=$CLASS_DIR \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
--with_prior_preservation --prior_loss_weight=1.0 \
|
||||
--instance_prompt="a photo of sks dog" \
|
||||
--class_prompt="a photo of dog" \
|
||||
--resolution=512 \
|
||||
--train_batch_size=1 \
|
||||
--sample_batch_size=1 \
|
||||
--gradient_accumulation_steps=1 --gradient_checkpointing \
|
||||
--learning_rate=5e-6 \
|
||||
--lr_scheduler="constant" \
|
||||
--lr_warmup_steps=0 \
|
||||
--num_class_images=200 \
|
||||
--max_train_steps=800 \
|
||||
--mixed_precision=fp16
|
||||
```
|
||||
|
||||
## Inference
|
||||
|
||||
Once you have trained a model, specify the path to where the model is saved, and use it for inference in the [`StableDiffusionPipeline`]. Make sure your prompts include the special `identifier` used during training (`sks` in the previous examples).
|
||||
|
||||
If you have **`"accelerate>=0.16.0"`** installed, you can use the following code to run
|
||||
inference from an intermediate checkpoint:
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionPipeline
|
||||
import torch
|
||||
|
||||
model_id = "path_to_saved_model"
|
||||
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
|
||||
|
||||
prompt = "A photo of sks dog in a bucket"
|
||||
image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]
|
||||
|
||||
image.save("dog-bucket.png")
|
||||
```
|
||||
|
||||
You may also run inference from any of the [saved training checkpoints](#inference-from-a-saved-checkpoint).
|
||||
@@ -1,214 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Low-Rank Adaptation of Large Language Models (LoRA)
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Currently, LoRA is only supported for the attention layers of the [`UNet2DConditionalModel`].
|
||||
|
||||
</Tip>
|
||||
|
||||
[Low-Rank Adaptation of Large Language Models (LoRA)](https://arxiv.org/abs/2106.09685) is a training method that accelerates the training of large models while consuming less memory. It adds pairs of rank-decomposition weight matrices (called **update matrices**) to existing weights, and **only** trains those newly added weights. This has a couple of advantages:
|
||||
|
||||
- Previous pretrained weights are kept frozen so the model is not as prone to [catastrophic forgetting](https://www.pnas.org/doi/10.1073/pnas.1611835114).
|
||||
- Rank-decomposition matrices have significantly fewer parameters than the original model, which means that trained LoRA weights are easily portable.
|
||||
- LoRA matrices are generally added to the attention layers of the original model. 🧨 Diffusers provides the [`~diffusers.loaders.UNet2DConditionLoadersMixin.load_attn_procs`] method to load the LoRA weights into a model's attention layers. You can control the extent to which the model is adapted toward new training images via a `scale` parameter.
|
||||
- The greater memory-efficiency allows you to run fine-tuning on consumer GPUs like the Tesla T4, RTX 3080 or even the RTX 2080 Ti! GPUs like the T4 are free and readily accessible in Kaggle or Google Colab notebooks.
|
||||
|
||||
<Tip>
|
||||
|
||||
💡 LoRA is not only limited to attention layers. The authors found that amending
|
||||
the attention layers of a language model is sufficient to obtain good downstream performance with great efficiency. This is why it's common to just add the LoRA weights to the attention layers of a model. Check out the [Using LoRA for efficient Stable Diffusion fine-tuning](https://huggingface.co/blog/lora) blog for more information about how LoRA works!
|
||||
|
||||
</Tip>
|
||||
|
||||
[cloneofsimo](https://github.com/cloneofsimo) was the first to try out LoRA training for Stable Diffusion in the popular [lora](https://github.com/cloneofsimo/lora) GitHub repository. 🧨 Diffusers now supports finetuning with LoRA for [text-to-image generation](https://github.com/huggingface/diffusers/tree/main/examples/text_to_image#training-with-lora) and [DreamBooth](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth#training-with-low-rank-adaptation-of-large-language-models-lora). This guide will show you how to do both.
|
||||
|
||||
If you'd like to store or share your model with the community, login to your Hugging Face account (create [one](hf.co/join) if you don't have one already):
|
||||
|
||||
```bash
|
||||
huggingface-cli login
|
||||
```
|
||||
|
||||
## Text-to-image
|
||||
|
||||
Finetuning a model like Stable Diffusion, which has billions of parameters, can be slow and difficult. With LoRA, it is much easier and faster to finetune a diffusion model. It can run on hardware with as little as 11GB of GPU RAM without resorting to tricks such as 8-bit optimizers.
|
||||
|
||||
### Training[[text-to-image-training]]
|
||||
|
||||
Let's finetune [`stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5) on the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset to generate your own Pokémon.
|
||||
|
||||
To start, make sure you have the `MODEL_NAME` and `DATASET_NAME` environment variables set. The `OUTPUT_DIR` and `HUB_MODEL_ID` variables are optional and specify where to save the model to on the Hub:
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="runwayml/stable-diffusion-v1-5"
|
||||
export OUTPUT_DIR="/sddata/finetune/lora/pokemon"
|
||||
export HUB_MODEL_ID="pokemon-lora"
|
||||
export DATASET_NAME="lambdalabs/pokemon-blip-captions"
|
||||
```
|
||||
|
||||
There are some flags to be aware of before you start training:
|
||||
|
||||
* `--push_to_hub` stores the trained LoRA embeddings on the Hub.
|
||||
* `--report_to=wandb` reports and logs the training results to your Weights & Biases dashboard (as an example, take a look at this [report](https://wandb.ai/pcuenq/text2image-fine-tune/runs/b4k1w0tn?workspace=user-pcuenq)).
|
||||
* `--learning_rate=1e-04`, you can afford to use a higher learning rate than you normally would with LoRA.
|
||||
|
||||
Now you're ready to launch the training (you can find the full training script [here](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py)):
|
||||
|
||||
```bash
|
||||
accelerate launch --mixed_precision="fp16" train_text_to_image_lora.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--dataset_name=$DATASET_NAME \
|
||||
--dataloader_num_workers=8 \
|
||||
--resolution=512 --center_crop --random_flip \
|
||||
--train_batch_size=1 \
|
||||
--gradient_accumulation_steps=4 \
|
||||
--max_train_steps=15000 \
|
||||
--learning_rate=1e-04 \
|
||||
--max_grad_norm=1 \
|
||||
--lr_scheduler="cosine" --lr_warmup_steps=0 \
|
||||
--output_dir=${OUTPUT_DIR} \
|
||||
--push_to_hub \
|
||||
--hub_model_id=${HUB_MODEL_ID} \
|
||||
--report_to=wandb \
|
||||
--checkpointing_steps=500 \
|
||||
--validation_prompt="A pokemon with blue eyes." \
|
||||
--seed=1337
|
||||
```
|
||||
|
||||
### Inference[[text-to-image-inference]]
|
||||
|
||||
Now you can use the model for inference by loading the base model in the [`StableDiffusionPipeline`] and then the [`DPMSolverMultistepScheduler`]:
|
||||
|
||||
```py
|
||||
>>> import torch
|
||||
>>> from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
|
||||
|
||||
>>> model_base = "runwayml/stable-diffusion-v1-5"
|
||||
|
||||
>>> pipe = StableDiffusionPipeline.from_pretrained(model_base, torch_dtype=torch.float16)
|
||||
>>> pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
|
||||
```
|
||||
|
||||
Load the LoRA weights from your finetuned model *on top of the base model weights*, and then move the pipeline to a GPU for faster inference. When you merge the LoRA weights with the frozen pretrained model weights, you can optionally adjust how much of the weights to merge with the `scale` parameter:
|
||||
|
||||
<Tip>
|
||||
|
||||
💡 A `scale` value of `0` is the same as not using your LoRA weights and you're only using the base model weights, and a `scale` value of `1` means you're only using the fully finetuned LoRA weights. Values between `0` and `1` interpolates between the two weights.
|
||||
|
||||
</Tip>
|
||||
|
||||
```py
|
||||
>>> pipe.unet.load_attn_procs(model_path)
|
||||
>>> pipe.to("cuda")
|
||||
# use half the weights from the LoRA finetuned model and half the weights from the base model
|
||||
|
||||
>>> image = pipe(
|
||||
... "A pokemon with blue eyes.", num_inference_steps=25, guidance_scale=7.5, cross_attention_kwargs={"scale": 0.5}
|
||||
... ).images[0]
|
||||
# use the weights from the fully finetuned LoRA model
|
||||
|
||||
>>> image = pipe("A pokemon with blue eyes.", num_inference_steps=25, guidance_scale=7.5).images[0]
|
||||
>>> image.save("blue_pokemon.png")
|
||||
```
|
||||
|
||||
## DreamBooth
|
||||
|
||||
[DreamBooth](https://arxiv.org/abs/2208.12242) is a finetuning technique for personalizing a text-to-image model like Stable Diffusion to generate photorealistic images of a subject in different contexts, given a few images of the subject. However, DreamBooth is very sensitive to hyperparameters and it is easy to overfit. Some important hyperparameters to consider include those that affect the training time (learning rate, number of training steps), and inference time (number of steps, scheduler type).
|
||||
|
||||
<Tip>
|
||||
|
||||
💡 Take a look at the [Training Stable Diffusion with DreamBooth using 🧨 Diffusers](https://huggingface.co/blog/dreambooth) blog for an in-depth analysis of DreamBooth experiments and recommended settings.
|
||||
|
||||
</Tip>
|
||||
|
||||
### Training[[dreambooth-training]]
|
||||
|
||||
Let's finetune [`stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5) with DreamBooth and LoRA with some 🐶 [dog images](https://drive.google.com/drive/folders/1BO_dyz-p65qhBRRMRA4TbZ8qW4rB99JZ). Download and save these images to a directory.
|
||||
|
||||
To start, make sure you have the `MODEL_NAME` and `INSTANCE_DIR` (path to directory containing images) environment variables set. The `OUTPUT_DIR` variables is optional and specifies where to save the model to on the Hub:
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="runwayml/stable-diffusion-v1-5"
|
||||
export INSTANCE_DIR="path-to-instance-images"
|
||||
export OUTPUT_DIR="path-to-save-model"
|
||||
```
|
||||
|
||||
There are some flags to be aware of before you start training:
|
||||
|
||||
* `--push_to_hub` stores the trained LoRA embeddings on the Hub.
|
||||
* `--report_to=wandb` reports and logs the training results to your Weights & Biases dashboard (as an example, take a look at this [report](https://wandb.ai/pcuenq/text2image-fine-tune/runs/b4k1w0tn?workspace=user-pcuenq)).
|
||||
* `--learning_rate=1e-04`, you can afford to use a higher learning rate than you normally would with LoRA.
|
||||
|
||||
Now you're ready to launch the training (you can find the full training script [here](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth_lora.py)):
|
||||
|
||||
```bash
|
||||
accelerate launch train_dreambooth_lora.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--instance_data_dir=$INSTANCE_DIR \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
--instance_prompt="a photo of sks dog" \
|
||||
--resolution=512 \
|
||||
--train_batch_size=1 \
|
||||
--gradient_accumulation_steps=1 \
|
||||
--checkpointing_steps=100 \
|
||||
--learning_rate=1e-4 \
|
||||
--report_to="wandb" \
|
||||
--lr_scheduler="constant" \
|
||||
--lr_warmup_steps=0 \
|
||||
--max_train_steps=500 \
|
||||
--validation_prompt="A photo of sks dog in a bucket" \
|
||||
--validation_epochs=50 \
|
||||
--seed="0" \
|
||||
--push_to_hub
|
||||
```
|
||||
|
||||
### Inference[[dreambooth-inference]]
|
||||
|
||||
Now you can use the model for inference by loading the base model in the [`StableDiffusionPipeline`]:
|
||||
|
||||
```py
|
||||
>>> import torch
|
||||
>>> from diffusers import StableDiffusionPipeline
|
||||
|
||||
>>> model_base = "runwayml/stable-diffusion-v1-5"
|
||||
|
||||
>>> pipe = StableDiffusionPipeline.from_pretrained(model_base, torch_dtype=torch.float16)
|
||||
```
|
||||
|
||||
Load the LoRA weights from your finetuned DreamBooth model *on top of the base model weights*, and then move the pipeline to a GPU for faster inference. When you merge the LoRA weights with the frozen pretrained model weights, you can optionally adjust how much of the weights to merge with the `scale` parameter:
|
||||
|
||||
<Tip>
|
||||
|
||||
💡 A `scale` value of `0` is the same as not using your LoRA weights and you're only using the base model weights, and a `scale` value of `1` means you're only using the fully finetuned LoRA weights. Values between `0` and `1` interpolates between the two weights.
|
||||
|
||||
</Tip>
|
||||
|
||||
```py
|
||||
>>> pipe.unet.load_attn_procs(model_path)
|
||||
>>> pipe.to("cuda")
|
||||
# use half the weights from the LoRA finetuned model and half the weights from the base model
|
||||
|
||||
>>> image = pipe(
|
||||
... "A picture of a sks dog in a bucket.",
|
||||
... num_inference_steps=25,
|
||||
... guidance_scale=7.5,
|
||||
... cross_attention_kwargs={"scale": 0.5},
|
||||
... ).images[0]
|
||||
# use the weights from the fully finetuned LoRA model
|
||||
|
||||
>>> image = pipe("A picture of a sks dog in a bucket.", num_inference_steps=25, guidance_scale=7.5).images[0]
|
||||
>>> image.save("bucket-dog.png")
|
||||
```
|
||||
@@ -1,220 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
|
||||
# Text-to-image
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
The text-to-image fine-tuning script is experimental. It's easy to overfit and run into issues like catastrophic forgetting. We recommend you explore different hyperparameters to get the best results on your dataset.
|
||||
|
||||
</Tip>
|
||||
|
||||
Text-to-image models like Stable Diffusion generate an image from a text prompt. This guide will show you how to finetune the [`CompVis/stable-diffusion-v1-4`](https://huggingface.co/CompVis/stable-diffusion-v1-4) model on your own dataset with PyTorch and Flax. All the training scripts for text-to-image finetuning used in this guide can be found in this [repository](https://github.com/huggingface/diffusers/tree/main/examples/text_to_image) if you're interested in taking a closer look.
|
||||
|
||||
Before running the scripts, make sure to install the library's training dependencies:
|
||||
|
||||
```bash
|
||||
pip install git+https://github.com/huggingface/diffusers.git
|
||||
pip install -U -r requirements.txt
|
||||
```
|
||||
|
||||
And initialize an [🤗 Accelerate](https://github.com/huggingface/accelerate/) environment with:
|
||||
|
||||
```bash
|
||||
accelerate config
|
||||
```
|
||||
|
||||
If you have already cloned the repo, then you won't need to go through these steps. Instead, you can pass the path to your local checkout to the training script and it will be loaded from there.
|
||||
|
||||
## Hardware requirements
|
||||
|
||||
Using `gradient_checkpointing` and `mixed_precision`, it should be possible to finetune the model on a single 24GB GPU. For higher `batch_size`'s and faster training, it's better to use GPUs with more than 30GB of GPU memory. You can also use JAX/Flax for fine-tuning on TPUs or GPUs, which will be covered [below](#flax-jax-finetuning).
|
||||
|
||||
You can reduce your memory footprint even more by enabling memory efficient attention with xFormers. Make sure you have [xFormers installed](./optimization/xformers) and pass the `--enable_xformers_memory_efficient_attention` flag to the training script.
|
||||
|
||||
xFormers is not available for Flax.
|
||||
|
||||
## Upload model to Hub
|
||||
|
||||
Store your model on the Hub by adding the following argument to the training script:
|
||||
|
||||
```bash
|
||||
--push_to_hub
|
||||
```
|
||||
|
||||
## Save and load checkpoints
|
||||
|
||||
It is a good idea to regularly save checkpoints in case anything happens during training. To save a checkpoint, pass the following argument to the training script:
|
||||
|
||||
```bash
|
||||
--checkpointing_steps=500
|
||||
```
|
||||
|
||||
Every 500 steps, the full training state is saved in a subfolder in the `output_dir`. The checkpoint has the format `checkpoint-` followed by the number of steps trained so far. For example, `checkpoint-1500` is a checkpoint saved after 1500 training steps.
|
||||
|
||||
To load a checkpoint to resume training, pass the argument `--resume_from_checkpoint` to the training script and specify the checkpoint you want to resume from. For example, the following argument resumes training from the checkpoint saved after 1500 training steps:
|
||||
|
||||
```bash
|
||||
--resume_from_checkpoint="checkpoint-1500"
|
||||
```
|
||||
|
||||
## Fine-tuning
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
Launch the [PyTorch training script](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py) for a fine-tuning run on the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset like this:
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
|
||||
export dataset_name="lambdalabs/pokemon-blip-captions"
|
||||
|
||||
accelerate launch train_text_to_image.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--dataset_name=$dataset_name \
|
||||
--use_ema \
|
||||
--resolution=512 --center_crop --random_flip \
|
||||
--train_batch_size=1 \
|
||||
--gradient_accumulation_steps=4 \
|
||||
--gradient_checkpointing \
|
||||
--mixed_precision="fp16" \
|
||||
--max_train_steps=15000 \
|
||||
--learning_rate=1e-05 \
|
||||
--max_grad_norm=1 \
|
||||
--lr_scheduler="constant" --lr_warmup_steps=0 \
|
||||
--output_dir="sd-pokemon-model"
|
||||
```
|
||||
|
||||
To finetune on your own dataset, prepare the dataset according to the format required by 🤗 [Datasets](https://huggingface.co/docs/datasets/index). You can [upload your dataset to the Hub](https://huggingface.co/docs/datasets/image_dataset#upload-dataset-to-the-hub), or you can [prepare a local folder with your files](https://huggingface.co/docs/datasets/image_dataset#imagefolder).
|
||||
|
||||
Modify the script if you want to use custom loading logic. We left pointers in the code in the appropriate places to help you. 🤗 The example script below shows how to finetune on a local dataset in `TRAIN_DIR` and where to save the model to in `OUTPUT_DIR`:
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
|
||||
export TRAIN_DIR="path_to_your_dataset"
|
||||
export OUTPUT_DIR="path_to_save_model"
|
||||
|
||||
accelerate launch train_text_to_image.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--train_data_dir=$TRAIN_DIR \
|
||||
--use_ema \
|
||||
--resolution=512 --center_crop --random_flip \
|
||||
--train_batch_size=1 \
|
||||
--gradient_accumulation_steps=4 \
|
||||
--gradient_checkpointing \
|
||||
--mixed_precision="fp16" \
|
||||
--max_train_steps=15000 \
|
||||
--learning_rate=1e-05 \
|
||||
--max_grad_norm=1 \
|
||||
--lr_scheduler="constant" --lr_warmup_steps=0 \
|
||||
--output_dir=${OUTPUT_DIR}
|
||||
```
|
||||
</pt>
|
||||
<jax>
|
||||
With Flax, it's possible to train a Stable Diffusion model faster on TPUs and GPUs thanks to [@duongna211](https://github.com/duongna21). This is very efficient on TPU hardware but works great on GPUs too. The Flax training script doesn't support features like gradient checkpointing or gradient accumulation yet, so you'll need a GPU with at least 30GB of memory or a TPU v3.
|
||||
|
||||
Before running the script, make sure you have the requirements installed:
|
||||
|
||||
```bash
|
||||
pip install -U -r requirements_flax.txt
|
||||
```
|
||||
|
||||
Now you can launch the [Flax training script](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_flax.py) like this:
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="runwayml/stable-diffusion-v1-5"
|
||||
export dataset_name="lambdalabs/pokemon-blip-captions"
|
||||
|
||||
python train_text_to_image_flax.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--dataset_name=$dataset_name \
|
||||
--resolution=512 --center_crop --random_flip \
|
||||
--train_batch_size=1 \
|
||||
--max_train_steps=15000 \
|
||||
--learning_rate=1e-05 \
|
||||
--max_grad_norm=1 \
|
||||
--output_dir="sd-pokemon-model"
|
||||
```
|
||||
|
||||
To finetune on your own dataset, prepare the dataset according to the format required by 🤗 [Datasets](https://huggingface.co/docs/datasets/index). You can [upload your dataset to the Hub](https://huggingface.co/docs/datasets/image_dataset#upload-dataset-to-the-hub), or you can [prepare a local folder with your files](https://huggingface.co/docs/datasets/image_dataset#imagefolder).
|
||||
|
||||
Modify the script if you want to use custom loading logic. We left pointers in the code in the appropriate places to help you. 🤗 The example script below shows how to finetune on a local dataset in `TRAIN_DIR`:
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
|
||||
export TRAIN_DIR="path_to_your_dataset"
|
||||
|
||||
python train_text_to_image_flax.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--train_data_dir=$TRAIN_DIR \
|
||||
--resolution=512 --center_crop --random_flip \
|
||||
--train_batch_size=1 \
|
||||
--mixed_precision="fp16" \
|
||||
--max_train_steps=15000 \
|
||||
--learning_rate=1e-05 \
|
||||
--max_grad_norm=1 \
|
||||
--output_dir="sd-pokemon-model"
|
||||
```
|
||||
</jax>
|
||||
</frameworkcontent>
|
||||
|
||||
## LoRA
|
||||
|
||||
You can also use Low-Rank Adaptation of Large Language Models (LoRA), a fine-tuning technique for accelerating training large models, for fine-tuning text-to-image models. For more details, take a look at the [LoRA training](lora#text-to-image) guide.
|
||||
|
||||
## Inference
|
||||
|
||||
Now you can load the fine-tuned model for inference by passing the model path or model name on the Hub to the [`StableDiffusionPipeline`]:
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
```python
|
||||
from diffusers import StableDiffusionPipeline
|
||||
|
||||
model_path = "path_to_saved_model"
|
||||
pipe = StableDiffusionPipeline.from_pretrained(model_path, torch_dtype=torch.float16)
|
||||
pipe.to("cuda")
|
||||
|
||||
image = pipe(prompt="yoda").images[0]
|
||||
image.save("yoda-pokemon.png")
|
||||
```
|
||||
</pt>
|
||||
<jax>
|
||||
```python
|
||||
import jax
|
||||
import numpy as np
|
||||
from flax.jax_utils import replicate
|
||||
from flax.training.common_utils import shard
|
||||
from diffusers import FlaxStableDiffusionPipeline
|
||||
|
||||
model_path = "path_to_saved_model"
|
||||
pipe, params = FlaxStableDiffusionPipeline.from_pretrained(model_path, dtype=jax.numpy.bfloat16)
|
||||
|
||||
prompt = "yoda pokemon"
|
||||
prng_seed = jax.random.PRNGKey(0)
|
||||
num_inference_steps = 50
|
||||
|
||||
num_samples = jax.device_count()
|
||||
prompt = num_samples * [prompt]
|
||||
prompt_ids = pipeline.prepare_inputs(prompt)
|
||||
|
||||
# shard inputs and rng
|
||||
params = replicate(params)
|
||||
prng_seed = jax.random.split(prng_seed, jax.device_count())
|
||||
prompt_ids = shard(prompt_ids)
|
||||
|
||||
images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images
|
||||
images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
|
||||
image.save("yoda-pokemon.png")
|
||||
```
|
||||
</jax>
|
||||
</frameworkcontent>
|
||||
@@ -1,215 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
|
||||
|
||||
# Textual Inversion
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
[Textual Inversion](https://arxiv.org/abs/2208.01618) is a technique for capturing novel concepts from a small number of example images. While the technique was originally demonstrated with a [latent diffusion model](https://github.com/CompVis/latent-diffusion), it has since been applied to other model variants like [Stable Diffusion](https://huggingface.co/docs/diffusers/main/en/conceptual/stable_diffusion). The learned concepts can be used to better control the images generated from text-to-image pipelines. It learns new "words" in the text encoder's embedding space, which are used within text prompts for personalized image generation.
|
||||
|
||||

|
||||
<small>By using just 3-5 images you can teach new concepts to a model such as Stable Diffusion for personalized image generation <a href="https://github.com/rinongal/textual_inversion">(image source)</a></small>
|
||||
|
||||
This guide will show you how to train a [`runwayml/stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5) model with Textual Inversion. All the training scripts for Textual Inversion used in this guide can be found [here](https://github.com/huggingface/diffusers/tree/main/examples/textual_inversion) if you're interested in taking a closer look at how things work under the hood.
|
||||
|
||||
<Tip>
|
||||
|
||||
There is a community-created collection of trained Textual Inversion models in the [Stable Diffusion Textual Inversion Concepts Library](https://huggingface.co/sd-concepts-library) which are readily available for inference. Over time, this'll hopefully grow into a useful resource as more concepts are added!
|
||||
|
||||
</Tip>
|
||||
|
||||
Before you begin, make sure you install the library's training dependencies:
|
||||
|
||||
```bash
|
||||
pip install diffusers accelerate transformers
|
||||
```
|
||||
|
||||
After all the dependencies have been set up, initialize a [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
|
||||
|
||||
```bash
|
||||
accelerate config
|
||||
```
|
||||
|
||||
To setup a default 🤗 Accelerate environment without choosing any configurations:
|
||||
|
||||
```bash
|
||||
accelerate config default
|
||||
```
|
||||
|
||||
Or if your environment doesn't support an interactive shell like a notebook, you can use:
|
||||
|
||||
```bash
|
||||
from accelerate.utils import write_basic_config
|
||||
|
||||
write_basic_config()
|
||||
```
|
||||
|
||||
Finally, you try and [install xFormers](https://huggingface.co/docs/diffusers/main/en/training/optimization/xformers) to reduce your memory footprint with xFormers memory-efficient attention. Once you have xFormers installed, add the `--enable_xformers_memory_efficient_attention` argument to the training script. xFormers is not supported for Flax.
|
||||
|
||||
## Upload model to Hub
|
||||
|
||||
If you want to store your model on the Hub, add the following argument to the training script:
|
||||
|
||||
```bash
|
||||
--push_to_hub
|
||||
```
|
||||
|
||||
## Save and load checkpoints
|
||||
|
||||
It is often a good idea to regularly save checkpoints of your model during training. This way, you can resume training from a saved checkpoint if your training is interrupted for any reason. To save a checkpoint, pass the following argument to the training script to save the full training state in a subfolder in `output_dir` every 500 steps:
|
||||
|
||||
```bash
|
||||
--checkpointing_steps=500
|
||||
```
|
||||
|
||||
To resume training from a saved checkpoint, pass the following argument to the training script and the specific checkpoint you'd like to resume from:
|
||||
|
||||
```bash
|
||||
--resume_from_checkpoint="checkpoint-1500"
|
||||
```
|
||||
|
||||
## Finetuning
|
||||
|
||||
For your training dataset, download these [images of a cat statue](https://drive.google.com/drive/folders/1fmJMs25nxS_rSNqS5hTcRdLem_YQXbq5) and store them in a directory.
|
||||
|
||||
Set the `MODEL_NAME` environment variable to the model repository id, and the `DATA_DIR` environment variable to the path of the directory containing the images. Now you can launch the [training script](https://github.com/huggingface/diffusers/blob/main/examples/textual_inversion/textual_inversion.py):
|
||||
|
||||
<Tip>
|
||||
|
||||
💡 A full training run takes ~1 hour on one V100 GPU. While you're waiting for the training to complete, feel free to check out [how Textual Inversion works](#how-it-works) in the section below if you're curious!
|
||||
|
||||
</Tip>
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
```bash
|
||||
export MODEL_NAME="runwayml/stable-diffusion-v1-5"
|
||||
export DATA_DIR="path-to-dir-containing-images"
|
||||
|
||||
accelerate launch textual_inversion.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--train_data_dir=$DATA_DIR \
|
||||
--learnable_property="object" \
|
||||
--placeholder_token="<cat-toy>" --initializer_token="toy" \
|
||||
--resolution=512 \
|
||||
--train_batch_size=1 \
|
||||
--gradient_accumulation_steps=4 \
|
||||
--max_train_steps=3000 \
|
||||
--learning_rate=5.0e-04 --scale_lr \
|
||||
--lr_scheduler="constant" \
|
||||
--lr_warmup_steps=0 \
|
||||
--output_dir="textual_inversion_cat"
|
||||
```
|
||||
</pt>
|
||||
<jax>
|
||||
If you have access to TPUs, try out the [Flax training script](https://github.com/huggingface/diffusers/blob/main/examples/textual_inversion/textual_inversion_flax.py) to train even faster (this'll also work for GPUs). With the same configuration settings, the Flax training script should be at least 70% faster than the PyTorch training script! ⚡️
|
||||
|
||||
Before you begin, make sure you install the Flax specific dependencies:
|
||||
|
||||
```bash
|
||||
pip install -U -r requirements_flax.txt
|
||||
```
|
||||
|
||||
Then you can launch the [training script](https://github.com/huggingface/diffusers/blob/main/examples/textual_inversion/textual_inversion_flax.py):
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
|
||||
export DATA_DIR="path-to-dir-containing-images"
|
||||
|
||||
python textual_inversion_flax.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--train_data_dir=$DATA_DIR \
|
||||
--learnable_property="object" \
|
||||
--placeholder_token="<cat-toy>" --initializer_token="toy" \
|
||||
--resolution=512 \
|
||||
--train_batch_size=1 \
|
||||
--max_train_steps=3000 \
|
||||
--learning_rate=5.0e-04 --scale_lr \
|
||||
--output_dir="textual_inversion_cat"
|
||||
```
|
||||
</jax>
|
||||
</frameworkcontent>
|
||||
|
||||
### Intermediate logging
|
||||
|
||||
If you're interested in following along with your model training progress, you can save the generated images from the training process. Add the following arguments to the training script to enable intermediate logging:
|
||||
|
||||
- `validation_prompt`, the prompt used to generate samples (this is set to `None` by default and intermediate logging is disabled)
|
||||
- `num_validation_images`, the number of sample images to generate
|
||||
- `validation_steps`, the number of steps before generating `num_validation_images` from the `validation_prompt`
|
||||
|
||||
```bash
|
||||
--validation_prompt="A <cat-toy> backpack"
|
||||
--num_validation_images=4
|
||||
--validation_steps=100
|
||||
```
|
||||
|
||||
## Inference
|
||||
|
||||
Once you have trained a model, you can use it for inference with the [`StableDiffusionPipeline]. Make sure you include the `placeholder_token` in your prompt, in this case, it is `<cat-toy>`.
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
```python
|
||||
from diffusers import StableDiffusionPipeline
|
||||
|
||||
model_id = "path-to-your-trained-model"
|
||||
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
|
||||
|
||||
prompt = "A <cat-toy> backpack"
|
||||
|
||||
image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]
|
||||
|
||||
image.save("cat-backpack.png")
|
||||
```
|
||||
</pt>
|
||||
<jax>
|
||||
```python
|
||||
import jax
|
||||
import numpy as np
|
||||
from flax.jax_utils import replicate
|
||||
from flax.training.common_utils import shard
|
||||
from diffusers import FlaxStableDiffusionPipeline
|
||||
|
||||
model_path = "path-to-your-trained-model"
|
||||
pipe, params = FlaxStableDiffusionPipeline.from_pretrained(model_path, dtype=jax.numpy.bfloat16)
|
||||
|
||||
prompt = "A <cat-toy> backpack"
|
||||
prng_seed = jax.random.PRNGKey(0)
|
||||
num_inference_steps = 50
|
||||
|
||||
num_samples = jax.device_count()
|
||||
prompt = num_samples * [prompt]
|
||||
prompt_ids = pipeline.prepare_inputs(prompt)
|
||||
|
||||
# shard inputs and rng
|
||||
params = replicate(params)
|
||||
prng_seed = jax.random.split(prng_seed, jax.device_count())
|
||||
prompt_ids = shard(prompt_ids)
|
||||
|
||||
images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images
|
||||
images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
|
||||
image.save("cat-backpack.png")
|
||||
```
|
||||
</jax>
|
||||
</frameworkcontent>
|
||||
|
||||
## How it works
|
||||
|
||||

|
||||
<small>Architecture overview from the Textual Inversion <a href="https://textual-inversion.github.io/">blog post.</a></small>
|
||||
|
||||
Usually, text prompts are tokenized into an embedding before being passed to a model, which is often a transformer. Textual Inversion does something similar, but it learns a new token embedding, `v*`, from a special token `S*` in the diagram above. The model output is used to condition the diffusion model, which helps the diffusion model understand the prompt and new concepts from just a few example images.
|
||||
|
||||
To do this, Textual Inversion uses a generator model and noisy versions of the training images. The generator tries to predict less noisy versions of the images, and the token embedding `v*` is optimized based on how well the generator does. If the token embedding successfully captures the new concept, it gives more useful information to the diffusion model and helps create clearer images with less noise. This optimization process typically occurs after several thousand steps of exposure to a variety of prompt and image variants.
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user