Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| dc8856e2e1 |
@@ -1,7 +1,4 @@
|
||||
contact_links:
|
||||
- name: Blank issue
|
||||
url: https://github.com/huggingface/diffusers/issues/new
|
||||
about: Other
|
||||
- name: Forum
|
||||
url: https://discuss.huggingface.co/
|
||||
about: General usage questions and community discussions
|
||||
about: General usage questions and community discussions
|
||||
|
||||
@@ -13,7 +13,6 @@ jobs:
|
||||
with:
|
||||
commit_sha: ${{ github.sha }}
|
||||
package: diffusers
|
||||
notebook_folder: diffusers_doc
|
||||
languages: en ko
|
||||
secrets:
|
||||
token: ${{ secrets.HUGGINGFACE_PUSH }}
|
||||
|
||||
@@ -27,8 +27,9 @@ jobs:
|
||||
pip install .[quality]
|
||||
- name: Check quality
|
||||
run: |
|
||||
black --check examples tests src utils scripts
|
||||
ruff examples tests src utils scripts
|
||||
black --check --preview examples tests src utils scripts
|
||||
isort --check-only examples tests src utils scripts
|
||||
flake8 examples tests src utils scripts
|
||||
doc-builder style src/diffusers docs/source --max_len 119 --check_only --path_to_docs docs/source
|
||||
|
||||
check_repository_consistency:
|
||||
@@ -47,4 +48,3 @@ jobs:
|
||||
run: |
|
||||
python utils/check_copies.py
|
||||
python utils/check_dummies.py
|
||||
make deps_table_check_updated
|
||||
|
||||
@@ -36,11 +36,6 @@ jobs:
|
||||
runner: docker-cpu
|
||||
image: diffusers/diffusers-onnxruntime-cpu
|
||||
report: onnx_cpu
|
||||
- name: PyTorch Example CPU tests on Ubuntu
|
||||
framework: pytorch_examples
|
||||
runner: docker-cpu
|
||||
image: diffusers/diffusers-pytorch-cpu
|
||||
report: torch_cpu
|
||||
|
||||
name: ${{ matrix.config.name }}
|
||||
|
||||
@@ -95,13 +90,6 @@ jobs:
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
tests/
|
||||
|
||||
- name: Run example PyTorch CPU tests
|
||||
if: ${{ matrix.config.framework == 'pytorch_examples' }}
|
||||
run: |
|
||||
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
examples/test_examples.py
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: cat reports/tests_${{ matrix.config.report }}_failures_short.txt
|
||||
|
||||
@@ -1,165 +0,0 @@
|
||||
name: Slow tests on main
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
|
||||
env:
|
||||
DIFFUSERS_IS_CI: yes
|
||||
HF_HOME: /mnt/cache
|
||||
OMP_NUM_THREADS: 8
|
||||
MKL_NUM_THREADS: 8
|
||||
PYTEST_TIMEOUT: 600
|
||||
RUN_SLOW: no
|
||||
|
||||
jobs:
|
||||
run_fast_tests:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
config:
|
||||
- name: Fast PyTorch CPU tests on Ubuntu
|
||||
framework: pytorch
|
||||
runner: docker-cpu
|
||||
image: diffusers/diffusers-pytorch-cpu
|
||||
report: torch_cpu
|
||||
- name: Fast Flax CPU tests on Ubuntu
|
||||
framework: flax
|
||||
runner: docker-cpu
|
||||
image: diffusers/diffusers-flax-cpu
|
||||
report: flax_cpu
|
||||
- name: Fast ONNXRuntime CPU tests on Ubuntu
|
||||
framework: onnxruntime
|
||||
runner: docker-cpu
|
||||
image: diffusers/diffusers-onnxruntime-cpu
|
||||
report: onnx_cpu
|
||||
- name: PyTorch Example CPU tests on Ubuntu
|
||||
framework: pytorch_examples
|
||||
runner: docker-cpu
|
||||
image: diffusers/diffusers-pytorch-cpu
|
||||
report: torch_cpu
|
||||
|
||||
name: ${{ matrix.config.name }}
|
||||
|
||||
runs-on: ${{ matrix.config.runner }}
|
||||
|
||||
container:
|
||||
image: ${{ matrix.config.image }}
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt-get update && apt-get install libsndfile1-dev -y
|
||||
python -m pip install -e .[quality,test]
|
||||
python -m pip install -U git+https://github.com/huggingface/transformers
|
||||
python -m pip install git+https://github.com/huggingface/accelerate
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
python utils/print_env.py
|
||||
|
||||
- name: Run fast PyTorch CPU tests
|
||||
if: ${{ matrix.config.framework == 'pytorch' }}
|
||||
run: |
|
||||
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
tests/
|
||||
|
||||
- name: Run fast Flax TPU tests
|
||||
if: ${{ matrix.config.framework == 'flax' }}
|
||||
run: |
|
||||
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "Flax" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
tests/
|
||||
|
||||
- name: Run fast ONNXRuntime CPU tests
|
||||
if: ${{ matrix.config.framework == 'onnxruntime' }}
|
||||
run: |
|
||||
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "Onnx" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
tests/
|
||||
|
||||
- name: Run example PyTorch CPU tests
|
||||
if: ${{ matrix.config.framework == 'pytorch_examples' }}
|
||||
run: |
|
||||
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
examples/test_examples.py
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: cat reports/tests_${{ matrix.config.report }}_failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: pr_${{ matrix.config.report }}_test_reports
|
||||
path: reports
|
||||
|
||||
run_fast_tests_apple_m1:
|
||||
name: Fast PyTorch MPS tests on MacOS
|
||||
runs-on: [ self-hosted, apple-m1 ]
|
||||
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: Clean checkout
|
||||
shell: arch -arch arm64 bash {0}
|
||||
run: |
|
||||
git clean -fxd
|
||||
|
||||
- name: Setup miniconda
|
||||
uses: ./.github/actions/setup-miniconda
|
||||
with:
|
||||
python-version: 3.9
|
||||
|
||||
- name: Install dependencies
|
||||
shell: arch -arch arm64 bash {0}
|
||||
run: |
|
||||
${CONDA_RUN} python -m pip install --upgrade pip
|
||||
${CONDA_RUN} python -m pip install -e .[quality,test]
|
||||
${CONDA_RUN} python -m pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
${CONDA_RUN} python -m pip install git+https://github.com/huggingface/accelerate
|
||||
${CONDA_RUN} python -m pip install -U git+https://github.com/huggingface/transformers
|
||||
|
||||
- name: Environment
|
||||
shell: arch -arch arm64 bash {0}
|
||||
run: |
|
||||
${CONDA_RUN} python utils/print_env.py
|
||||
|
||||
- name: Run fast PyTorch tests on M1 (MPS)
|
||||
shell: arch -arch arm64 bash {0}
|
||||
env:
|
||||
HF_HOME: /System/Volumes/Data/mnt/cache
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
run: |
|
||||
${CONDA_RUN} python -m pytest -n 0 -s -v --make-reports=tests_torch_mps tests/
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: cat reports/tests_torch_mps_failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: pr_torch_mps_test_reports
|
||||
path: reports
|
||||
@@ -169,8 +169,3 @@ tags
|
||||
|
||||
# dependencies
|
||||
/transformers
|
||||
|
||||
# ruff
|
||||
.ruff_cache
|
||||
|
||||
wandb
|
||||
@@ -1,40 +0,0 @@
|
||||
cff-version: 1.2.0
|
||||
title: 'Diffusers: State-of-the-art diffusion models'
|
||||
message: >-
|
||||
If you use this software, please cite it using the
|
||||
metadata from this file.
|
||||
type: software
|
||||
authors:
|
||||
- given-names: Patrick
|
||||
family-names: von Platen
|
||||
- given-names: Suraj
|
||||
family-names: Patil
|
||||
- given-names: Anton
|
||||
family-names: Lozhkov
|
||||
- given-names: Pedro
|
||||
family-names: Cuenca
|
||||
- given-names: Nathan
|
||||
family-names: Lambert
|
||||
- given-names: Kashif
|
||||
family-names: Rasul
|
||||
- given-names: Mishig
|
||||
family-names: Davaadorj
|
||||
- given-names: Thomas
|
||||
family-names: Wolf
|
||||
repository-code: 'https://github.com/huggingface/diffusers'
|
||||
abstract: >-
|
||||
Diffusers provides pretrained diffusion models across
|
||||
multiple modalities, such as vision and audio, and serves
|
||||
as a modular toolbox for inference and training of
|
||||
diffusion models.
|
||||
keywords:
|
||||
- deep-learning
|
||||
- pytorch
|
||||
- image-generation
|
||||
- diffusion
|
||||
- text2image
|
||||
- image2image
|
||||
- score-based-generative-modeling
|
||||
- stable-diffusion
|
||||
license: Apache-2.0
|
||||
version: 0.12.1
|
||||
+2
-2
@@ -1,5 +1,5 @@
|
||||
<!---
|
||||
Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
@@ -177,7 +177,7 @@ Follow these steps to start contributing ([supported Python versions](https://gi
|
||||
$ make style
|
||||
```
|
||||
|
||||
🧨 Diffusers also uses `ruff` and a few custom scripts to check for coding mistakes. Quality
|
||||
🧨 Diffusers also uses `flake8` and a few custom scripts to check for coding mistakes. Quality
|
||||
control runs in CI, however you can also run the same checks with:
|
||||
|
||||
```bash
|
||||
|
||||
@@ -9,8 +9,9 @@ modified_only_fixup:
|
||||
$(eval modified_py_files := $(shell python utils/get_modified_files.py $(check_dirs)))
|
||||
@if test -n "$(modified_py_files)"; then \
|
||||
echo "Checking/fixing $(modified_py_files)"; \
|
||||
black $(modified_py_files); \
|
||||
ruff $(modified_py_files); \
|
||||
black --preview $(modified_py_files); \
|
||||
isort $(modified_py_files); \
|
||||
flake8 $(modified_py_files); \
|
||||
else \
|
||||
echo "No library .py files were modified"; \
|
||||
fi
|
||||
@@ -40,8 +41,9 @@ repo-consistency:
|
||||
# this target runs checks on all files
|
||||
|
||||
quality:
|
||||
black --check $(check_dirs)
|
||||
ruff $(check_dirs)
|
||||
black --check --preview $(check_dirs)
|
||||
isort --check-only $(check_dirs)
|
||||
flake8 $(check_dirs)
|
||||
doc-builder style src/diffusers docs/source --max_len 119 --check_only --path_to_docs docs/source
|
||||
python utils/check_doc_toc.py
|
||||
|
||||
@@ -55,8 +57,8 @@ extra_style_checks:
|
||||
# this target runs checks on all files and potentially modifies some of them
|
||||
|
||||
style:
|
||||
black $(check_dirs)
|
||||
ruff $(check_dirs) --fix
|
||||
black --preview $(check_dirs)
|
||||
isort $(check_dirs)
|
||||
${MAKE} autogenerate_code
|
||||
${MAKE} extra_style_checks
|
||||
|
||||
|
||||
@@ -15,138 +15,481 @@
|
||||
</a>
|
||||
</p>
|
||||
|
||||
🤗 Diffusers is the go-to library for state-of-the-art pretrained diffusion models for generating images, audio, and even 3D structures of molecules. Whether you're looking for a simple inference solution or training your own diffusion models, 🤗 Diffusers is a modular toolbox that supports both. Our library is designed with a focus on [usability over performance](https://huggingface.co/docs/diffusers/conceptual/philosophy#usability-over-performance), [simple over easy](https://huggingface.co/docs/diffusers/conceptual/philosophy#simple-over-easy), and [customizability over abstractions](https://huggingface.co/docs/diffusers/conceptual/philosophy#tweakable-contributorfriendly-over-abstraction).
|
||||
🤗 Diffusers provides pretrained diffusion models across multiple modalities, such as vision and audio, and serves
|
||||
as a modular toolbox for inference and training of diffusion models.
|
||||
|
||||
🤗 Diffusers offers three core components:
|
||||
More precisely, 🤗 Diffusers offers:
|
||||
|
||||
- State-of-the-art [diffusion pipelines](https://huggingface.co/docs/diffusers/api/pipelines/overview) that can be run in inference with just a few lines of code.
|
||||
- Interchangeable noise [schedulers](https://huggingface.co/docs/diffusers/api/schedulers/overview) for different diffusion speeds and output quality.
|
||||
- Pretrained [models](https://huggingface.co/docs/diffusers/api/models) that can be used as building blocks, and combined with schedulers, for creating your own end-to-end diffusion systems.
|
||||
- State-of-the-art diffusion pipelines that can be run in inference with just a couple of lines of code (see [src/diffusers/pipelines](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines)). Check [this overview](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/README.md#pipelines-summary) to see all supported pipelines and their corresponding official papers.
|
||||
- Various noise schedulers that can be used interchangeably for the preferred speed vs. quality trade-off in inference (see [src/diffusers/schedulers](https://github.com/huggingface/diffusers/tree/main/src/diffusers/schedulers)).
|
||||
- Multiple types of models, such as UNet, can be used as building blocks in an end-to-end diffusion system (see [src/diffusers/models](https://github.com/huggingface/diffusers/tree/main/src/diffusers/models)).
|
||||
- Training examples to show how to train the most popular diffusion model tasks (see [examples](https://github.com/huggingface/diffusers/tree/main/examples), *e.g.* [unconditional-image-generation](https://github.com/huggingface/diffusers/tree/main/examples/unconditional_image_generation)).
|
||||
|
||||
## Installation
|
||||
|
||||
We recommend installing 🤗 Diffusers in a virtual environment from PyPi or Conda. For more details about installing [PyTorch](https://pytorch.org/get-started/locally/) and [Flax](https://flax.readthedocs.io/en/latest/installation.html), please refer to their official documentation.
|
||||
### For PyTorch
|
||||
|
||||
### PyTorch
|
||||
|
||||
With `pip` (official package):
|
||||
**With `pip`** (official package)
|
||||
|
||||
```bash
|
||||
pip install --upgrade diffusers[torch]
|
||||
```
|
||||
|
||||
With `conda` (maintained by the community):
|
||||
**With `conda`** (maintained by the community)
|
||||
|
||||
```sh
|
||||
conda install -c conda-forge diffusers
|
||||
```
|
||||
|
||||
### Flax
|
||||
### For Flax
|
||||
|
||||
With `pip` (official package):
|
||||
**With `pip`**
|
||||
|
||||
```bash
|
||||
pip install --upgrade diffusers[flax]
|
||||
```
|
||||
|
||||
### Apple Silicon (M1/M2) support
|
||||
**Apple Silicon (M1/M2) support**
|
||||
|
||||
Please refer to the [How to use Stable Diffusion in Apple Silicon](https://huggingface.co/docs/diffusers/optimization/mps) guide.
|
||||
Please, refer to [the documentation](https://huggingface.co/docs/diffusers/optimization/mps).
|
||||
|
||||
## Contributing
|
||||
|
||||
We ❤️ contributions from the open-source community!
|
||||
If you want to contribute to this library, please check out our [Contribution guide](https://github.com/huggingface/diffusers/blob/main/CONTRIBUTING.md).
|
||||
You can look out for [issues](https://github.com/huggingface/diffusers/issues) you'd like to tackle to contribute to the library.
|
||||
- See [Good first issues](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22) for general opportunities to contribute
|
||||
- See [New model/pipeline](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22New+pipeline%2Fmodel%22) to contribute exciting new diffusion models / diffusion pipelines
|
||||
- See [New scheduler](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22New+scheduler%22)
|
||||
|
||||
Also, say 👋 in our public Discord channel <a href="https://discord.gg/G7tWnz98XR"><img alt="Join us on Discord" src="https://img.shields.io/discord/823813159592001537?color=5865F2&logo=discord&logoColor=white"></a>. We discuss the hottest trends about diffusion models, help each other with contributions, personal projects or
|
||||
just hang out ☕.
|
||||
|
||||
## Quickstart
|
||||
|
||||
Generating outputs is super easy with 🤗 Diffusers. To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 4000+ checkpoints):
|
||||
In order to get started, we recommend taking a look at two notebooks:
|
||||
|
||||
- The [Getting started with Diffusers](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/diffusers_intro.ipynb) [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/diffusers_intro.ipynb) notebook, which showcases an end-to-end example of usage for diffusion models, schedulers and pipelines.
|
||||
Take a look at this notebook to learn how to use the pipeline abstraction, which takes care of everything (model, scheduler, noise handling) for you, and also to understand each independent building block in the library.
|
||||
- The [Training a diffusers model](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb) [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb) notebook summarizes diffusion models training methods. This notebook takes a step-by-step approach to training your
|
||||
diffusion models on an image dataset, with explanatory graphics.
|
||||
|
||||
## Stable Diffusion is fully compatible with `diffusers`!
|
||||
|
||||
Stable Diffusion is a text-to-image latent diffusion model created by the researchers and engineers from [CompVis](https://github.com/CompVis), [Stability AI](https://stability.ai/), [LAION](https://laion.ai/) and [RunwayML](https://runwayml.com/). It's trained on 512x512 images from a subset of the [LAION-5B](https://laion.ai/blog/laion-5b/) database. This model uses a frozen CLIP ViT-L/14 text encoder to condition the model on text prompts. With its 860M UNet and 123M text encoder, the model is relatively lightweight and runs on a GPU with at least 4GB VRAM.
|
||||
See the [model card](https://huggingface.co/CompVis/stable-diffusion) for more information.
|
||||
|
||||
|
||||
### Text-to-Image generation with Stable Diffusion
|
||||
|
||||
First let's install
|
||||
|
||||
```bash
|
||||
pip install --upgrade diffusers transformers accelerate
|
||||
```
|
||||
|
||||
We recommend using the model in [half-precision (`fp16`)](https://pytorch.org/blog/accelerating-training-on-nvidia-gpus-with-pytorch-automatic-mixed-precision/) as it gives almost always the same results as full
|
||||
precision while being roughly twice as fast and requiring half the amount of GPU RAM.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
|
||||
pipe = pipe.to("cuda")
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
image = pipe(prompt).images[0]
|
||||
```
|
||||
|
||||
#### Running the model locally
|
||||
|
||||
You can also simply download the model folder and pass the path to the local folder to the `StableDiffusionPipeline`.
|
||||
|
||||
```
|
||||
git lfs install
|
||||
git clone https://huggingface.co/runwayml/stable-diffusion-v1-5
|
||||
```
|
||||
|
||||
Assuming the folder is stored locally under `./stable-diffusion-v1-5`, you can run stable diffusion
|
||||
as follows:
|
||||
|
||||
```python
|
||||
pipe = StableDiffusionPipeline.from_pretrained("./stable-diffusion-v1-5")
|
||||
pipe = pipe.to("cuda")
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
image = pipe(prompt).images[0]
|
||||
```
|
||||
|
||||
If you are limited by GPU memory, you might want to consider chunking the attention computation in addition
|
||||
to using `fp16`.
|
||||
The following snippet should result in less than 4GB VRAM.
|
||||
|
||||
```python
|
||||
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
|
||||
pipe = pipe.to("cuda")
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
pipe.enable_attention_slicing()
|
||||
image = pipe(prompt).images[0]
|
||||
```
|
||||
|
||||
If you wish to use a different scheduler (e.g.: DDIM, LMS, PNDM/PLMS), you can instantiate
|
||||
it before the pipeline and pass it to `from_pretrained`.
|
||||
|
||||
```python
|
||||
from diffusers import LMSDiscreteScheduler
|
||||
|
||||
pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
image = pipe(prompt).images[0]
|
||||
|
||||
image.save("astronaut_rides_horse.png")
|
||||
```
|
||||
|
||||
If you want to run Stable Diffusion on CPU or you want to have maximum precision on GPU,
|
||||
please run the model in the default *full-precision* setting:
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
|
||||
|
||||
# disable the following line if you run on CPU
|
||||
pipe = pipe.to("cuda")
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
image = pipe(prompt).images[0]
|
||||
|
||||
image.save("astronaut_rides_horse.png")
|
||||
```
|
||||
|
||||
### JAX/Flax
|
||||
|
||||
Diffusers offers a JAX / Flax implementation of Stable Diffusion for very fast inference. JAX shines specially on TPU hardware because each TPU server has 8 accelerators working in parallel, but it runs great on GPUs too.
|
||||
|
||||
Running the pipeline with the default PNDMScheduler:
|
||||
|
||||
```python
|
||||
import jax
|
||||
import numpy as np
|
||||
from flax.jax_utils import replicate
|
||||
from flax.training.common_utils import shard
|
||||
|
||||
from diffusers import FlaxStableDiffusionPipeline
|
||||
|
||||
pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", revision="flax", dtype=jax.numpy.bfloat16
|
||||
)
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
|
||||
prng_seed = jax.random.PRNGKey(0)
|
||||
num_inference_steps = 50
|
||||
|
||||
num_samples = jax.device_count()
|
||||
prompt = num_samples * [prompt]
|
||||
prompt_ids = pipeline.prepare_inputs(prompt)
|
||||
|
||||
# shard inputs and rng
|
||||
params = replicate(params)
|
||||
prng_seed = jax.random.split(prng_seed, jax.device_count())
|
||||
prompt_ids = shard(prompt_ids)
|
||||
|
||||
images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images
|
||||
images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
|
||||
```
|
||||
|
||||
**Note**:
|
||||
If you are limited by TPU memory, please make sure to load the `FlaxStableDiffusionPipeline` in `bfloat16` precision instead of the default `float32` precision as done above. You can do so by telling diffusers to load the weights from "bf16" branch.
|
||||
|
||||
```python
|
||||
import jax
|
||||
import numpy as np
|
||||
from flax.jax_utils import replicate
|
||||
from flax.training.common_utils import shard
|
||||
|
||||
from diffusers import FlaxStableDiffusionPipeline
|
||||
|
||||
pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", revision="bf16", dtype=jax.numpy.bfloat16
|
||||
)
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
|
||||
prng_seed = jax.random.PRNGKey(0)
|
||||
num_inference_steps = 50
|
||||
|
||||
num_samples = jax.device_count()
|
||||
prompt = num_samples * [prompt]
|
||||
prompt_ids = pipeline.prepare_inputs(prompt)
|
||||
|
||||
# shard inputs and rng
|
||||
params = replicate(params)
|
||||
prng_seed = jax.random.split(prng_seed, jax.device_count())
|
||||
prompt_ids = shard(prompt_ids)
|
||||
|
||||
images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images
|
||||
images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
|
||||
```
|
||||
|
||||
Diffusers also has a Image-to-Image generation pipeline with Flax/Jax
|
||||
```python
|
||||
import jax
|
||||
import numpy as np
|
||||
import jax.numpy as jnp
|
||||
from flax.jax_utils import replicate
|
||||
from flax.training.common_utils import shard
|
||||
import requests
|
||||
from io import BytesIO
|
||||
from PIL import Image
|
||||
from diffusers import FlaxStableDiffusionImg2ImgPipeline
|
||||
|
||||
def create_key(seed=0):
|
||||
return jax.random.PRNGKey(seed)
|
||||
rng = create_key(0)
|
||||
|
||||
url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
|
||||
response = requests.get(url)
|
||||
init_img = Image.open(BytesIO(response.content)).convert("RGB")
|
||||
init_img = init_img.resize((768, 512))
|
||||
|
||||
prompts = "A fantasy landscape, trending on artstation"
|
||||
|
||||
pipeline, params = FlaxStableDiffusionImg2ImgPipeline.from_pretrained(
|
||||
"CompVis/stable-diffusion-v1-4", revision="flax",
|
||||
dtype=jnp.bfloat16,
|
||||
)
|
||||
|
||||
num_samples = jax.device_count()
|
||||
rng = jax.random.split(rng, jax.device_count())
|
||||
prompt_ids, processed_image = pipeline.prepare_inputs(prompt=[prompts]*num_samples, image = [init_img]*num_samples)
|
||||
p_params = replicate(params)
|
||||
prompt_ids = shard(prompt_ids)
|
||||
processed_image = shard(processed_image)
|
||||
|
||||
output = pipeline(
|
||||
prompt_ids=prompt_ids,
|
||||
image=processed_image,
|
||||
params=p_params,
|
||||
prng_seed=rng,
|
||||
strength=0.75,
|
||||
num_inference_steps=50,
|
||||
jit=True,
|
||||
height=512,
|
||||
width=768).images
|
||||
|
||||
output_images = pipeline.numpy_to_pil(np.asarray(output.reshape((num_samples,) + output.shape[-3:])))
|
||||
```
|
||||
|
||||
### Image-to-Image text-guided generation with Stable Diffusion
|
||||
|
||||
The `StableDiffusionImg2ImgPipeline` lets you pass a text prompt and an initial image to condition the generation of new images.
|
||||
|
||||
```python
|
||||
import requests
|
||||
import torch
|
||||
from PIL import Image
|
||||
from io import BytesIO
|
||||
|
||||
from diffusers import StableDiffusionImg2ImgPipeline
|
||||
|
||||
# load the pipeline
|
||||
device = "cuda"
|
||||
model_id_or_path = "runwayml/stable-diffusion-v1-5"
|
||||
pipe = StableDiffusionImg2ImgPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16)
|
||||
|
||||
# or download via git clone https://huggingface.co/runwayml/stable-diffusion-v1-5
|
||||
# and pass `model_id_or_path="./stable-diffusion-v1-5"`.
|
||||
pipe = pipe.to(device)
|
||||
|
||||
# let's download an initial image
|
||||
url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
|
||||
|
||||
response = requests.get(url)
|
||||
init_image = Image.open(BytesIO(response.content)).convert("RGB")
|
||||
init_image = init_image.resize((768, 512))
|
||||
|
||||
prompt = "A fantasy landscape, trending on artstation"
|
||||
|
||||
images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
|
||||
|
||||
images[0].save("fantasy_landscape.png")
|
||||
```
|
||||
You can also run this example on colab [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
|
||||
|
||||
### In-painting using Stable Diffusion
|
||||
|
||||
The `StableDiffusionInpaintPipeline` lets you edit specific parts of an image by providing a mask and a text prompt.
|
||||
|
||||
```python
|
||||
import PIL
|
||||
import requests
|
||||
import torch
|
||||
from io import BytesIO
|
||||
|
||||
from diffusers import StableDiffusionInpaintPipeline
|
||||
|
||||
def download_image(url):
|
||||
response = requests.get(url)
|
||||
return PIL.Image.open(BytesIO(response.content)).convert("RGB")
|
||||
|
||||
img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
|
||||
mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
|
||||
|
||||
init_image = download_image(img_url).resize((512, 512))
|
||||
mask_image = download_image(mask_url).resize((512, 512))
|
||||
|
||||
pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16)
|
||||
pipe = pipe.to("cuda")
|
||||
|
||||
prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
|
||||
image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0]
|
||||
```
|
||||
|
||||
### Tweak prompts reusing seeds and latents
|
||||
|
||||
You can generate your own latents to reproduce results, or tweak your prompt on a specific result you liked.
|
||||
Please have a look at [Reusing seeds for deterministic generation](https://huggingface.co/docs/diffusers/main/en/using-diffusers/reusing_seeds).
|
||||
|
||||
## Fine-Tuning Stable Diffusion
|
||||
|
||||
Fine-tuning techniques make it possible to adapt Stable Diffusion to your own dataset, or add new subjects to it. These are some of the techniques supported in `diffusers`:
|
||||
|
||||
Textual Inversion is a technique for capturing novel concepts from a small number of example images in a way that can later be used to control text-to-image pipelines. It does so by learning new 'words' in the embedding space of the pipeline's text encoder. These special words can then be used within text prompts to achieve very fine-grained control of the resulting images.
|
||||
|
||||
- Textual Inversion. Capture novel concepts from a small set of sample images, and associate them with new "words" in the embedding space of the text encoder. Please, refer to [our training examples](https://github.com/huggingface/diffusers/tree/main/examples/textual_inversion) or [documentation](https://huggingface.co/docs/diffusers/training/text_inversion) to try for yourself.
|
||||
|
||||
- Dreambooth. Another technique to capture new concepts in Stable Diffusion. This method fine-tunes the UNet (and, optionally, also the text encoder) of the pipeline to achieve impressive results. Please, refer to [our training example](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth) and [training report](https://huggingface.co/blog/dreambooth) for additional details and training recommendations.
|
||||
|
||||
- Full Stable Diffusion fine-tuning. If you have a more sizable dataset with a specific look or style, you can fine-tune Stable Diffusion so that it outputs images following those examples. This was the approach taken to create [a Pokémon Stable Diffusion model](https://huggingface.co/justinpinkney/pokemon-stable-diffusion) (by Justing Pinkney / Lambda Labs), [a Japanese specific version of Stable Diffusion](https://huggingface.co/spaces/rinna/japanese-stable-diffusion) (by [Rinna Co.](https://github.com/rinnakk/japanese-stable-diffusion/) and others. You can start at [our text-to-image fine-tuning example](https://github.com/huggingface/diffusers/tree/main/examples/text_to_image) and go from there.
|
||||
|
||||
|
||||
## Stable Diffusion Community Pipelines
|
||||
|
||||
The release of Stable Diffusion as an open source model has fostered a lot of interesting ideas and experimentation.
|
||||
Our [Community Examples folder](https://github.com/huggingface/diffusers/tree/main/examples/community) contains many ideas worth exploring, like interpolating to create animated videos, using CLIP Guidance for additional prompt fidelity, term weighting, and much more! [Take a look](https://huggingface.co/docs/diffusers/using-diffusers/custom_pipeline_overview) and [contribute your own](https://huggingface.co/docs/diffusers/using-diffusers/contribute_pipeline).
|
||||
|
||||
## Other Examples
|
||||
|
||||
There are many ways to try running Diffusers! Here we outline code-focused tools (primarily using `DiffusionPipeline`s and Google Colab) and interactive web-tools.
|
||||
|
||||
### Running Code
|
||||
|
||||
If you want to run the code yourself 💻, you can try out:
|
||||
- [Text-to-Image Latent Diffusion](https://huggingface.co/CompVis/ldm-text2im-large-256)
|
||||
```python
|
||||
# !pip install diffusers["torch"] transformers
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
|
||||
pipeline.to("cuda")
|
||||
pipeline("An image of a squirrel in Picasso style").images[0]
|
||||
device = "cuda"
|
||||
model_id = "CompVis/ldm-text2im-large-256"
|
||||
|
||||
# load model and scheduler
|
||||
ldm = DiffusionPipeline.from_pretrained(model_id)
|
||||
ldm = ldm.to(device)
|
||||
|
||||
# run pipeline in inference (sample random noise and denoise)
|
||||
prompt = "A painting of a squirrel eating a burger"
|
||||
image = ldm([prompt], num_inference_steps=50, eta=0.3, guidance_scale=6).images[0]
|
||||
|
||||
# save image
|
||||
image.save("squirrel.png")
|
||||
```
|
||||
|
||||
You can also dig into the models and schedulers toolbox to build your own diffusion system:
|
||||
|
||||
- [Unconditional Diffusion with discrete scheduler](https://huggingface.co/google/ddpm-celebahq-256)
|
||||
```python
|
||||
from diffusers import DDPMScheduler, UNet2DModel
|
||||
from PIL import Image
|
||||
import torch
|
||||
import numpy as np
|
||||
# !pip install diffusers["torch"]
|
||||
from diffusers import DDPMPipeline, DDIMPipeline, PNDMPipeline
|
||||
|
||||
scheduler = DDPMScheduler.from_pretrained("google/ddpm-cat-256")
|
||||
model = UNet2DModel.from_pretrained("google/ddpm-cat-256").to("cuda")
|
||||
scheduler.set_timesteps(50)
|
||||
model_id = "google/ddpm-celebahq-256"
|
||||
device = "cuda"
|
||||
|
||||
sample_size = model.config.sample_size
|
||||
noise = torch.randn((1, 3, sample_size, sample_size)).to("cuda")
|
||||
input = noise
|
||||
# load model and scheduler
|
||||
ddpm = DDPMPipeline.from_pretrained(model_id) # you can replace DDPMPipeline with DDIMPipeline or PNDMPipeline for faster inference
|
||||
ddpm.to(device)
|
||||
|
||||
for t in scheduler.timesteps:
|
||||
with torch.no_grad():
|
||||
noisy_residual = model(input, t).sample
|
||||
prev_noisy_sample = scheduler.step(noisy_residual, t, input).prev_sample
|
||||
input = prev_noisy_sample
|
||||
# run pipeline in inference (sample random noise and denoise)
|
||||
image = ddpm().images[0]
|
||||
|
||||
image = (input / 2 + 0.5).clamp(0, 1)
|
||||
image = image.cpu().permute(0, 2, 3, 1).numpy()[0]
|
||||
image = Image.fromarray((image * 255).round().astype("uint8"))
|
||||
image
|
||||
# save image
|
||||
image.save("ddpm_generated_image.png")
|
||||
```
|
||||
- [Unconditional Latent Diffusion](https://huggingface.co/CompVis/ldm-celebahq-256)
|
||||
- [Unconditional Diffusion with continuous scheduler](https://huggingface.co/google/ncsnpp-ffhq-1024)
|
||||
|
||||
Check out the [Quickstart](https://huggingface.co/docs/diffusers/quicktour) to launch your diffusion journey today!
|
||||
**Other Image Notebooks**:
|
||||
* [image-to-image generation with Stable Diffusion](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb) ,
|
||||
* [tweak images via repeated Stable Diffusion seeds](https://colab.research.google.com/github/pcuenca/diffusers-examples/blob/main/notebooks/stable-diffusion-seeds.ipynb) ,
|
||||
|
||||
## How to navigate the documentation
|
||||
**Diffusers for Other Modalities**:
|
||||
* [Molecule conformation generation](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/geodiff_molecule_conformation.ipynb) ,
|
||||
* [Model-based reinforcement learning](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/reinforcement_learning_with_diffusers.ipynb) ,
|
||||
|
||||
| **Documentation** | **What can I learn?** |
|
||||
|---------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| Tutorial | A basic crash course for learning how to use the library's most important features like using models and schedulers to build your own diffusion system, and training your own diffusion model. |
|
||||
| Loading | Guides for how to load and configure all the components (pipelines, models, and schedulers) of the library, as well as how to use different schedulers. |
|
||||
| Pipelines for inference | Guides for how to use pipelines for different inference tasks, batched generation, controlling generated outputs and randomness, and how to contribute a pipeline to the library. |
|
||||
| Optimization | Guides for how to optimize your diffusion model to run faster and consume less memory. |
|
||||
| [Training](https://huggingface.co/docs/diffusers/training/overview) | Guides for how to train a diffusion model for different tasks with different training techniques. |
|
||||
### Web Demos
|
||||
If you just want to play around with some web demos, you can try out the following 🚀 Spaces:
|
||||
| Model | Hugging Face Spaces |
|
||||
|-------------------------------- |------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| Text-to-Image Latent Diffusion | [](https://huggingface.co/spaces/CompVis/text2img-latent-diffusion) |
|
||||
| Faces generator | [](https://huggingface.co/spaces/CompVis/celeba-latent-diffusion) |
|
||||
| DDPM with different schedulers | [](https://huggingface.co/spaces/fusing/celeba-diffusion) |
|
||||
| Conditional generation from sketch | [](https://huggingface.co/spaces/huggingface/diffuse-the-rest) |
|
||||
| Composable diffusion | [](https://huggingface.co/spaces/Shuang59/Composable-Diffusion) |
|
||||
|
||||
## Supported pipelines
|
||||
## Definitions
|
||||
|
||||
| Pipeline | Paper | Tasks |
|
||||
|---|---|:---:|
|
||||
| [alt_diffusion](./api/pipelines/alt_diffusion) | [**AltDiffusion**](https://arxiv.org/abs/2211.06679) | Image-to-Image Text-Guided Generation |
|
||||
| [audio_diffusion](./api/pipelines/audio_diffusion) | [**Audio Diffusion**](https://github.com/teticio/audio-diffusion.git) | Unconditional Audio Generation |
|
||||
| [controlnet](./api/pipelines/stable_diffusion/controlnet) | [**ControlNet with Stable Diffusion**](https://arxiv.org/abs/2302.05543) | Image-to-Image Text-Guided Generation |
|
||||
| [cycle_diffusion](./api/pipelines/cycle_diffusion) | [**Cycle Diffusion**](https://arxiv.org/abs/2210.05559) | Image-to-Image Text-Guided Generation |
|
||||
| [dance_diffusion](./api/pipelines/dance_diffusion) | [**Dance Diffusion**](https://github.com/williamberman/diffusers.git) | Unconditional Audio Generation |
|
||||
| [ddpm](./api/pipelines/ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation |
|
||||
| [ddim](./api/pipelines/ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) | Unconditional Image Generation |
|
||||
| [latent_diffusion](./api/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Text-to-Image Generation |
|
||||
| [latent_diffusion](./api/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Super Resolution Image-to-Image |
|
||||
| [latent_diffusion_uncond](./api/pipelines/latent_diffusion_uncond) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | Unconditional Image Generation |
|
||||
| [paint_by_example](./api/pipelines/paint_by_example) | [**Paint by Example: Exemplar-based Image Editing with Diffusion Models**](https://arxiv.org/abs/2211.13227) | Image-Guided Image Inpainting |
|
||||
| [pndm](./api/pipelines/pndm) | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778) | Unconditional Image Generation |
|
||||
| [score_sde_ve](./api/pipelines/score_sde_ve) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation |
|
||||
| [score_sde_vp](./api/pipelines/score_sde_vp) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation |
|
||||
| [semantic_stable_diffusion](./api/pipelines/semantic_stable_diffusion) | [**Semantic Guidance**](https://arxiv.org/abs/2301.12247) | Text-Guided Generation |
|
||||
| [stable_diffusion_text2img](./api/pipelines/stable_diffusion/text2img) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-to-Image Generation |
|
||||
| [stable_diffusion_img2img](./api/pipelines/stable_diffusion/img2img) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Image-to-Image Text-Guided Generation |
|
||||
| [stable_diffusion_inpaint](./api/pipelines/stable_diffusion/inpaint) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting |
|
||||
| [stable_diffusion_panorama](./api/pipelines/stable_diffusion/panorama) | [**MultiDiffusion**](https://multidiffusion.github.io/) | Text-to-Panorama Generation |
|
||||
| [stable_diffusion_pix2pix](./api/pipelines/stable_diffusion/pix2pix) | [**InstructPix2Pix**](https://github.com/timothybrooks/instruct-pix2pix) | Text-Guided Image Editing|
|
||||
| [stable_diffusion_pix2pix_zero](./api/pipelines/stable_diffusion/pix2pix_zero) | [**Zero-shot Image-to-Image Translation**](https://pix2pixzero.github.io/) | Text-Guided Image Editing |
|
||||
| [stable_diffusion_attend_and_excite](./api/pipelines/stable_diffusion/attend_and_excite) | [**Attend and Excite for Stable Diffusion**](https://attendandexcite.github.io/Attend-and-Excite/) | Text-to-Image Generation |
|
||||
| [stable_diffusion_self_attention_guidance](./api/pipelines/stable_diffusion/self_attention_guidance) | [**Self-Attention Guidance**](https://ku-cvlab.github.io/Self-Attention-Guidance) | Text-to-Image Generation |
|
||||
| [stable_diffusion_image_variation](./stable_diffusion/image_variation) | [**Stable Diffusion Image Variations**](https://github.com/LambdaLabsML/lambda-diffusers#stable-diffusion-image-variations) | Image-to-Image Generation |
|
||||
| [stable_diffusion_latent_upscale](./stable_diffusion/latent_upscale) | [**Stable Diffusion Latent Upscaler**](https://twitter.com/StabilityAI/status/1590531958815064065) | Text-Guided Super Resolution Image-to-Image |
|
||||
| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-to-Image Generation |
|
||||
| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Image Inpainting |
|
||||
| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Depth-Conditional Stable Diffusion**](https://github.com/Stability-AI/stablediffusion#depth-conditional-stable-diffusion) | Depth-to-Image Generation |
|
||||
| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Super Resolution Image-to-Image |
|
||||
| [stable_diffusion_safe](./api/pipelines/stable_diffusion_safe) | [**Safe Stable Diffusion**](https://arxiv.org/abs/2211.05105) | Text-Guided Generation |
|
||||
| [stable_unclip](./stable_unclip) | **Stable unCLIP** | Text-to-Image Generation |
|
||||
| [stable_unclip](./stable_unclip) | **Stable unCLIP** | Image-to-Image Text-Guided Generation |
|
||||
| [stochastic_karras_ve](./api/pipelines/stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation |
|
||||
| [unclip](./api/pipelines/unclip) | [Hierarchical Text-Conditional Image Generation with CLIP Latents](https://arxiv.org/abs/2204.06125) | Text-to-Image Generation |
|
||||
| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Text-to-Image Generation |
|
||||
| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Image Variations Generation |
|
||||
| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Dual Image and Text Guided Generation |
|
||||
| [vq_diffusion](./api/pipelines/vq_diffusion) | [Vector Quantized Diffusion Model for Text-to-Image Synthesis](https://arxiv.org/abs/2111.14822) | Text-to-Image Generation |
|
||||
**Models**: Neural network that models $p_\theta(\mathbf{x}_{t-1}|\mathbf{x}_t)$ (see image below) and is trained end-to-end to *denoise* a noisy input to an image.
|
||||
*Examples*: UNet, Conditioned UNet, 3D UNet, Transformer UNet
|
||||
|
||||
<p align="center">
|
||||
<img src="https://user-images.githubusercontent.com/10695622/174349667-04e9e485-793b-429a-affe-096e8199ad5b.png" width="800"/>
|
||||
<br>
|
||||
<em> Figure from DDPM paper (https://arxiv.org/abs/2006.11239). </em>
|
||||
<p>
|
||||
|
||||
**Schedulers**: Algorithm class for both **inference** and **training**.
|
||||
The class provides functionality to compute previous image according to alpha, beta schedule as well as predict noise for training. Also known as **Samplers**.
|
||||
*Examples*: [DDPM](https://arxiv.org/abs/2006.11239), [DDIM](https://arxiv.org/abs/2010.02502), [PNDM](https://arxiv.org/abs/2202.09778), [DEIS](https://arxiv.org/abs/2204.13902)
|
||||
|
||||
<p align="center">
|
||||
<img src="https://user-images.githubusercontent.com/10695622/174349706-53d58acc-a4d1-4cda-b3e8-432d9dc7ad38.png" width="800"/>
|
||||
<br>
|
||||
<em> Sampling and training algorithms. Figure from DDPM paper (https://arxiv.org/abs/2006.11239). </em>
|
||||
<p>
|
||||
|
||||
|
||||
**Diffusion Pipeline**: End-to-end pipeline that includes multiple diffusion models, possible text encoders, ...
|
||||
*Examples*: Glide, Latent-Diffusion, Imagen, DALL-E 2
|
||||
|
||||
<p align="center">
|
||||
<img src="https://user-images.githubusercontent.com/10695622/174348898-481bd7c2-5457-4830-89bc-f0907756f64c.jpeg" width="550"/>
|
||||
<br>
|
||||
<em> Figure from ImageGen (https://imagen.research.google/). </em>
|
||||
<p>
|
||||
|
||||
## Philosophy
|
||||
|
||||
- Readability and clarity is preferred over highly optimized code. A strong importance is put on providing readable, intuitive and elementary code design. *E.g.*, the provided [schedulers](https://github.com/huggingface/diffusers/tree/main/src/diffusers/schedulers) are separated from the provided [models](https://github.com/huggingface/diffusers/tree/main/src/diffusers/models) and provide well-commented code that can be read alongside the original paper.
|
||||
- Diffusers is **modality independent** and focuses on providing pretrained models and tools to build systems that generate **continuous outputs**, *e.g.* vision and audio.
|
||||
- Diffusion models and schedulers are provided as concise, elementary building blocks. In contrast, diffusion pipelines are a collection of end-to-end diffusion systems that can be used out-of-the-box, should stay as close as possible to their original implementation and can include components of another library, such as text-encoders. Examples for diffusion pipelines are [Glide](https://github.com/openai/glide-text2im) and [Latent Diffusion](https://github.com/CompVis/latent-diffusion).
|
||||
|
||||
## In the works
|
||||
|
||||
For the first release, 🤗 Diffusers focuses on text-to-image diffusion techniques. However, diffusers can be used for much more than that! Over the upcoming releases, we'll be focusing on:
|
||||
|
||||
- Diffusers for audio
|
||||
- Diffusers for reinforcement learning (initial work happening in https://github.com/huggingface/diffusers/pull/105).
|
||||
- Diffusers for video generation
|
||||
- Diffusers for molecule generation (initial work happening in https://github.com/huggingface/diffusers/pull/54)
|
||||
|
||||
A few pipeline components are already being worked on, namely:
|
||||
|
||||
- BDDMPipeline for spectrogram-to-sound vocoding
|
||||
- GLIDEPipeline to support OpenAI's GLIDE model
|
||||
- Grad-TTS for text to audio generation / conditional audio generation
|
||||
|
||||
We want diffusers to be a toolbox useful for diffusers models in general; if you find yourself limited in any way by the current API, or would like to see additional models, schedulers, or techniques, please open a [GitHub issue](https://github.com/huggingface/diffusers/issues) mentioning what you would like to see.
|
||||
|
||||
## Credits
|
||||
|
||||
@@ -154,7 +497,7 @@ This library concretizes previous work by many different authors and would not h
|
||||
|
||||
- @CompVis' latent diffusion models library, available [here](https://github.com/CompVis/latent-diffusion)
|
||||
- @hojonathanho original DDPM implementation, available [here](https://github.com/hojonathanho/diffusion) as well as the extremely useful translation into PyTorch by @pesser, available [here](https://github.com/pesser/pytorch_diffusion)
|
||||
- @ermongroup's DDIM implementation, available [here](https://github.com/ermongroup/ddim)
|
||||
- @ermongroup's DDIM implementation, available [here](https://github.com/ermongroup/ddim).
|
||||
- @yang-song's Score-VE and Score-VP implementations, available [here](https://github.com/yang-song/score_sde_pytorch)
|
||||
|
||||
We also want to thank @heejkoo for the very helpful overview of papers, code and resources on diffusion models, available [here](https://github.com/heejkoo/Awesome-Diffusion-Models) as well as @crowsonkb and @rromb for useful discussions and insights.
|
||||
|
||||
@@ -34,8 +34,8 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
|
||||
datasets \
|
||||
hf-doc-builder \
|
||||
huggingface-hub \
|
||||
Jinja2 \
|
||||
librosa \
|
||||
modelcards \
|
||||
numpy \
|
||||
scipy \
|
||||
tensorboard \
|
||||
|
||||
@@ -36,8 +36,8 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
|
||||
datasets \
|
||||
hf-doc-builder \
|
||||
huggingface-hub \
|
||||
Jinja2 \
|
||||
librosa \
|
||||
modelcards \
|
||||
numpy \
|
||||
scipy \
|
||||
tensorboard \
|
||||
|
||||
@@ -34,8 +34,8 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
|
||||
datasets \
|
||||
hf-doc-builder \
|
||||
huggingface-hub \
|
||||
Jinja2 \
|
||||
librosa \
|
||||
modelcards \
|
||||
numpy \
|
||||
scipy \
|
||||
tensorboard \
|
||||
|
||||
@@ -34,8 +34,8 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
|
||||
datasets \
|
||||
hf-doc-builder \
|
||||
huggingface-hub \
|
||||
Jinja2 \
|
||||
librosa \
|
||||
modelcards \
|
||||
numpy \
|
||||
scipy \
|
||||
tensorboard \
|
||||
|
||||
@@ -33,8 +33,8 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
|
||||
datasets \
|
||||
hf-doc-builder \
|
||||
huggingface-hub \
|
||||
Jinja2 \
|
||||
librosa \
|
||||
modelcards \
|
||||
numpy \
|
||||
scipy \
|
||||
tensorboard \
|
||||
|
||||
@@ -27,16 +27,17 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
|
||||
torch \
|
||||
torchvision \
|
||||
torchaudio \
|
||||
--extra-index-url https://download.pytorch.org/whl/cu117 && \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
accelerate \
|
||||
datasets \
|
||||
hf-doc-builder \
|
||||
huggingface-hub \
|
||||
Jinja2 \
|
||||
librosa \
|
||||
modelcards \
|
||||
numpy \
|
||||
scipy \
|
||||
tensorboard \
|
||||
transformers
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
CMD ["/bin/bash"]
|
||||
+1
-1
@@ -1,5 +1,5 @@
|
||||
<!---
|
||||
Copyright 2023- The HuggingFace Team. All rights reserved.
|
||||
Copyright 2022- The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
|
||||
@@ -1,9 +0,0 @@
|
||||
# docstyle-ignore
|
||||
INSTALL_CONTENT = """
|
||||
# Diffusers installation
|
||||
! pip install diffusers transformers datasets accelerate
|
||||
# To install from source instead of the last release, comment the command above and uncomment the following one.
|
||||
# ! pip install git+https://github.com/huggingface/diffusers.git
|
||||
"""
|
||||
|
||||
notebook_first_cells = [{"type": "code", "content": INSTALL_CONTENT}]
|
||||
+17
-73
@@ -8,30 +8,18 @@
|
||||
- local: installation
|
||||
title: Installation
|
||||
title: Get started
|
||||
- sections:
|
||||
- local: tutorials/tutorial_overview
|
||||
title: Overview
|
||||
- local: using-diffusers/write_own_pipeline
|
||||
title: Understanding models and schedulers
|
||||
- local: tutorials/basic_training
|
||||
title: Train a diffusion model
|
||||
title: Tutorials
|
||||
- sections:
|
||||
- sections:
|
||||
- local: using-diffusers/loading_overview
|
||||
title: Overview
|
||||
- local: using-diffusers/loading
|
||||
title: Load pipelines, models, and schedulers
|
||||
title: Loading Pipelines, Models, and Schedulers
|
||||
- local: using-diffusers/schedulers
|
||||
title: Load and compare different schedulers
|
||||
title: Using different Schedulers
|
||||
- local: using-diffusers/configuration
|
||||
title: Configuring Pipelines, Models, and Schedulers
|
||||
- local: using-diffusers/custom_pipeline_overview
|
||||
title: Load and add custom pipelines
|
||||
- local: using-diffusers/kerascv
|
||||
title: Load KerasCV Stable Diffusion checkpoints
|
||||
title: Loading and Adding Custom Pipelines
|
||||
title: Loading & Hub
|
||||
- sections:
|
||||
- local: using-diffusers/pipeline_overview
|
||||
title: Overview
|
||||
- local: using-diffusers/unconditional_image_generation
|
||||
title: Unconditional Image Generation
|
||||
- local: using-diffusers/conditional_image_generation
|
||||
@@ -44,31 +32,11 @@
|
||||
title: Text-Guided Depth-to-Image
|
||||
- local: using-diffusers/reusing_seeds
|
||||
title: Reusing seeds for deterministic generation
|
||||
- local: using-diffusers/reproducibility
|
||||
title: Reproducibility
|
||||
- local: using-diffusers/custom_pipeline_examples
|
||||
title: Community Pipelines
|
||||
- local: using-diffusers/contribute_pipeline
|
||||
title: How to contribute a Pipeline
|
||||
- local: using-diffusers/using_safetensors
|
||||
title: Using safetensors
|
||||
- local: using-diffusers/weighted_prompts
|
||||
title: Weighting Prompts
|
||||
title: Pipelines for Inference
|
||||
- sections:
|
||||
- local: training/overview
|
||||
title: Overview
|
||||
- local: training/unconditional_training
|
||||
title: Unconditional image generation
|
||||
- local: training/text_inversion
|
||||
title: Textual Inversion
|
||||
- local: training/dreambooth
|
||||
title: DreamBooth
|
||||
- local: training/text2image
|
||||
title: Text-to-image
|
||||
- local: training/lora
|
||||
title: Low-Rank Adaptation of Large Language Models (LoRA)
|
||||
title: Training
|
||||
- sections:
|
||||
- local: using-diffusers/rl
|
||||
title: Reinforcement Learning
|
||||
@@ -79,12 +47,8 @@
|
||||
title: Taking Diffusers Beyond Images
|
||||
title: Using Diffusers
|
||||
- sections:
|
||||
- local: optimization/opt_overview
|
||||
title: Overview
|
||||
- local: optimization/fp16
|
||||
title: Memory and Speed
|
||||
- local: optimization/torch2.0
|
||||
title: Torch2.0 support
|
||||
- local: optimization/xformers
|
||||
title: xFormers
|
||||
- local: optimization/onnx
|
||||
@@ -96,17 +60,23 @@
|
||||
- local: optimization/habana
|
||||
title: Habana Gaudi
|
||||
title: Optimization/Special Hardware
|
||||
- sections:
|
||||
- local: training/overview
|
||||
title: Overview
|
||||
- local: training/unconditional_training
|
||||
title: Unconditional Image Generation
|
||||
- local: training/text_inversion
|
||||
title: Textual Inversion
|
||||
- local: training/dreambooth
|
||||
title: Dreambooth
|
||||
- local: training/text2image
|
||||
title: Text-to-image fine-tuning
|
||||
title: Training
|
||||
- sections:
|
||||
- local: conceptual/philosophy
|
||||
title: Philosophy
|
||||
- local: using-diffusers/controlling_generation
|
||||
title: Controlled generation
|
||||
- local: conceptual/contribution
|
||||
title: How to contribute?
|
||||
- local: conceptual/ethical_guidelines
|
||||
title: Diffusers' Ethical Guidelines
|
||||
- local: conceptual/evaluation
|
||||
title: Evaluating Diffusion Models
|
||||
title: Conceptual Guides
|
||||
- sections:
|
||||
- sections:
|
||||
@@ -120,8 +90,6 @@
|
||||
title: Configuration
|
||||
- local: api/outputs
|
||||
title: Outputs
|
||||
- local: api/loaders
|
||||
title: Loaders
|
||||
title: Main Classes
|
||||
- sections:
|
||||
- local: api/pipelines/overview
|
||||
@@ -138,8 +106,6 @@
|
||||
title: DDIM
|
||||
- local: api/pipelines/ddpm
|
||||
title: DDPM
|
||||
- local: api/pipelines/dit
|
||||
title: DiT
|
||||
- local: api/pipelines/latent_diffusion
|
||||
title: Latent Diffusion
|
||||
- local: api/pipelines/paint_by_example
|
||||
@@ -152,8 +118,6 @@
|
||||
title: Safe Stable Diffusion
|
||||
- local: api/pipelines/score_sde_ve
|
||||
title: Score SDE VE
|
||||
- local: api/pipelines/semantic_stable_diffusion
|
||||
title: Semantic Guidance
|
||||
- sections:
|
||||
- local: api/pipelines/stable_diffusion/overview
|
||||
title: Overview
|
||||
@@ -169,25 +133,9 @@
|
||||
title: Image-Variation
|
||||
- local: api/pipelines/stable_diffusion/upscale
|
||||
title: Super-Resolution
|
||||
- local: api/pipelines/stable_diffusion/latent_upscale
|
||||
title: Stable-Diffusion-Latent-Upscaler
|
||||
- local: api/pipelines/stable_diffusion/pix2pix
|
||||
title: InstructPix2Pix
|
||||
- local: api/pipelines/stable_diffusion/attend_and_excite
|
||||
title: Attend and Excite
|
||||
- local: api/pipelines/stable_diffusion/pix2pix_zero
|
||||
title: Pix2Pix Zero
|
||||
- local: api/pipelines/stable_diffusion/self_attention_guidance
|
||||
title: Self-Attention Guidance
|
||||
- local: api/pipelines/stable_diffusion/panorama
|
||||
title: MultiDiffusion Panorama
|
||||
- local: api/pipelines/stable_diffusion/controlnet
|
||||
title: Text-to-Image Generation with ControlNet Conditioning
|
||||
title: Stable Diffusion
|
||||
- local: api/pipelines/stable_diffusion_2
|
||||
title: Stable Diffusion 2
|
||||
- local: api/pipelines/stable_unclip
|
||||
title: Stable unCLIP
|
||||
- local: api/pipelines/stochastic_karras_ve
|
||||
title: Stochastic Karras VE
|
||||
- local: api/pipelines/unclip
|
||||
@@ -204,8 +152,6 @@
|
||||
title: Overview
|
||||
- local: api/schedulers/ddim
|
||||
title: DDIM
|
||||
- local: api/schedulers/ddim_inverse
|
||||
title: DDIMInverse
|
||||
- local: api/schedulers/ddpm
|
||||
title: DDPM
|
||||
- local: api/schedulers/deis
|
||||
@@ -234,8 +180,6 @@
|
||||
title: Singlestep DPM-Solver
|
||||
- local: api/schedulers/stochastic_karras_ve
|
||||
title: Stochastic Kerras VE
|
||||
- local: api/schedulers/unipc
|
||||
title: UniPCMultistepScheduler
|
||||
- local: api/schedulers/score_sde_ve
|
||||
title: VE-SDE
|
||||
- local: api/schedulers/score_sde_vp
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -12,8 +12,8 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# Configuration
|
||||
|
||||
Schedulers from [`~schedulers.scheduling_utils.SchedulerMixin`] and models from [`ModelMixin`] inherit from [`ConfigMixin`] which conveniently takes care of storing all the parameters that are
|
||||
passed to their respective `__init__` methods in a JSON-configuration file.
|
||||
In Diffusers, schedulers of type [`schedulers.scheduling_utils.SchedulerMixin`], and models of type [`ModelMixin`] inherit from [`ConfigMixin`] which conveniently takes care of storing all parameters that are
|
||||
passed to the respective `__init__` methods in a JSON-configuration file.
|
||||
|
||||
## ConfigMixin
|
||||
|
||||
@@ -21,5 +21,3 @@ passed to their respective `__init__` methods in a JSON-configuration file.
|
||||
- load_config
|
||||
- from_config
|
||||
- save_config
|
||||
- to_json_file
|
||||
- to_json_string
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -34,7 +34,6 @@ Any pipeline object can be saved locally with [`~DiffusionPipeline.save_pretrain
|
||||
- __call__
|
||||
- device
|
||||
- to
|
||||
- components
|
||||
|
||||
## ImagePipelineOutput
|
||||
By default diffusion pipelines return an object of class
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,30 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Loaders
|
||||
|
||||
There are many ways to train adapter neural networks for diffusion models, such as
|
||||
- [Textual Inversion](./training/text_inversion.mdx)
|
||||
- [LoRA](https://github.com/cloneofsimo/lora)
|
||||
- [Hypernetworks](https://arxiv.org/abs/1609.09106)
|
||||
|
||||
Such adapter neural networks often only consist of a fraction of the number of weights compared
|
||||
to the pretrained model and as such are very portable. The Diffusers library offers an easy-to-use
|
||||
API to load such adapter neural networks via the [`loaders.py` module](https://github.com/huggingface/diffusers/blob/main/src/diffusers/loaders.py).
|
||||
|
||||
**Note**: This module is still highly experimental and prone to future changes.
|
||||
|
||||
## LoaderMixins
|
||||
|
||||
### UNet2DConditionLoadersMixin
|
||||
|
||||
[[autodoc]] loaders.UNet2DConditionLoadersMixin
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -64,12 +64,6 @@ The models are built on the base class ['ModelMixin'] that is a `torch.nn.module
|
||||
## PriorTransformerOutput
|
||||
[[autodoc]] models.prior_transformer.PriorTransformerOutput
|
||||
|
||||
## ControlNetOutput
|
||||
[[autodoc]] models.controlnet.ControlNetOutput
|
||||
|
||||
## ControlNetModel
|
||||
[[autodoc]] ControlNetModel
|
||||
|
||||
## FlaxModelMixin
|
||||
[[autodoc]] FlaxModelMixin
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,59 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Scalable Diffusion Models with Transformers (DiT)
|
||||
|
||||
## Overview
|
||||
|
||||
[Scalable Diffusion Models with Transformers](https://arxiv.org/abs/2212.09748) (DiT) by William Peebles and Saining Xie.
|
||||
|
||||
The abstract of the paper is the following:
|
||||
|
||||
*We explore a new class of diffusion models based on the transformer architecture. We train latent diffusion models of images, replacing the commonly-used U-Net backbone with a transformer that operates on latent patches. We analyze the scalability of our Diffusion Transformers (DiTs) through the lens of forward pass complexity as measured by Gflops. We find that DiTs with higher Gflops -- through increased transformer depth/width or increased number of input tokens -- consistently have lower FID. In addition to possessing good scalability properties, our largest DiT-XL/2 models outperform all prior diffusion models on the class-conditional ImageNet 512x512 and 256x256 benchmarks, achieving a state-of-the-art FID of 2.27 on the latter.*
|
||||
|
||||
The original codebase of this paper can be found here: [facebookresearch/dit](https://github.com/facebookresearch/dit).
|
||||
|
||||
## Available Pipelines:
|
||||
|
||||
| Pipeline | Tasks | Colab
|
||||
|---|---|:---:|
|
||||
| [pipeline_dit.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/dit/pipeline_dit.py) | *Conditional Image Generation* | - |
|
||||
|
||||
|
||||
## Usage example
|
||||
|
||||
```python
|
||||
from diffusers import DiTPipeline, DPMSolverMultistepScheduler
|
||||
import torch
|
||||
|
||||
pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256", torch_dtype=torch.float16)
|
||||
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
|
||||
pipe = pipe.to("cuda")
|
||||
|
||||
# pick words from Imagenet class labels
|
||||
pipe.labels # to print all available words
|
||||
|
||||
# pick words that exist in ImageNet
|
||||
words = ["white shark", "umbrella"]
|
||||
|
||||
class_ids = pipe.get_label_ids(words)
|
||||
|
||||
generator = torch.manual_seed(33)
|
||||
output = pipe(class_labels=class_ids, num_inference_steps=25, generator=generator)
|
||||
|
||||
image = output.images[0] # label 'white shark'
|
||||
```
|
||||
|
||||
## DiTPipeline
|
||||
[[autodoc]] DiTPipeline
|
||||
- all
|
||||
- __call__
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -46,7 +46,6 @@ available a colab notebook to directly try them out.
|
||||
|---|---|:---:|:---:|
|
||||
| [alt_diffusion](./alt_diffusion) | [**AltDiffusion**](https://arxiv.org/abs/2211.06679) | Image-to-Image Text-Guided Generation | -
|
||||
| [audio_diffusion](./audio_diffusion) | [**Audio Diffusion**](https://github.com/teticio/audio_diffusion.git) | Unconditional Audio Generation |
|
||||
| [controlnet](./api/pipelines/stable_diffusion/controlnet) | [**ControlNet with Stable Diffusion**](https://arxiv.org/abs/2302.05543) | Image-to-Image Text-Guided Generation | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/controlnet.ipynb)
|
||||
| [cycle_diffusion](./cycle_diffusion) | [**Cycle Diffusion**](https://arxiv.org/abs/2210.05559) | Image-to-Image Text-Guided Generation |
|
||||
| [dance_diffusion](./dance_diffusion) | [**Dance Diffusion**](https://github.com/williamberman/diffusers.git) | Unconditional Audio Generation |
|
||||
| [ddpm](./ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation |
|
||||
@@ -58,24 +57,13 @@ available a colab notebook to directly try them out.
|
||||
| [pndm](./pndm) | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778) | Unconditional Image Generation |
|
||||
| [score_sde_ve](./score_sde_ve) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation |
|
||||
| [score_sde_vp](./score_sde_vp) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation |
|
||||
| [semantic_stable_diffusion](./semantic_stable_diffusion) | [**SEGA: Instructing Diffusion using Semantic Dimensions**](https://arxiv.org/abs/2301.12247) | Text-to-Image Generation |
|
||||
| [stable_diffusion_text2img](./stable_diffusion/text2img) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-to-Image Generation | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
|
||||
| [stable_diffusion_img2img](./stable_diffusion/img2img) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Image-to-Image Text-Guided Generation | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
|
||||
| [stable_diffusion_inpaint](./stable_diffusion/inpaint) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
|
||||
| [stable_diffusion_panorama](./stable_diffusion/panorama) | [**MultiDiffusion: Fusing Diffusion Paths for Controlled Image Generation**](https://arxiv.org/abs/2302.08113) | Text-Guided Panorama View Generation |
|
||||
| [stable_diffusion_pix2pix](./stable_diffusion/pix2pix) | [**InstructPix2Pix: Learning to Follow Image Editing Instructions**](https://arxiv.org/abs/2211.09800) | Text-Based Image Editing |
|
||||
| [stable_diffusion_pix2pix_zero](./stable_diffusion/pix2pix_zero) | [**Zero-shot Image-to-Image Translation**](https://arxiv.org/abs/2302.03027) | Text-Based Image Editing |
|
||||
| [stable_diffusion_attend_and_excite](./stable_diffusion/attend_and_excite) | [**Attend and Excite: Attention-Based Semantic Guidance for Text-to-Image Diffusion Models**](https://arxiv.org/abs/2301.13826) | Text-to-Image Generation |
|
||||
| [stable_diffusion_self_attention_guidance](./stable_diffusion/self_attention_guidance) | [**Self-Attention Guidance**](https://arxiv.org/abs/2210.00939) | Text-to-Image Generation |
|
||||
| [stable_diffusion_image_variation](./stable_diffusion/image_variation) | [**Stable Diffusion Image Variations**](https://github.com/LambdaLabsML/lambda-diffusers#stable-diffusion-image-variations) | Image-to-Image Generation |
|
||||
| [stable_diffusion_latent_upscale](./stable_diffusion/latent_upscale) | [**Stable Diffusion Latent Upscaler**](https://twitter.com/StabilityAI/status/1590531958815064065) | Text-Guided Super Resolution Image-to-Image |
|
||||
| [stable_diffusion_2](./stable_diffusion_2/) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-to-Image Generation |
|
||||
| [stable_diffusion](./stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-to-Image Generation | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
|
||||
| [stable_diffusion](./stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Image-to-Image Text-Guided Generation | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
|
||||
| [stable_diffusion](./stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
|
||||
| [stable_diffusion_2](./stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-to-Image Generation |
|
||||
| [stable_diffusion_2](./stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Image Inpainting |
|
||||
| [stable_diffusion_2](./stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Depth-to-Image Text-Guided Generation |
|
||||
| [stable_diffusion_2](./stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Super Resolution Image-to-Image |
|
||||
| [stable_diffusion_safe](./stable_diffusion_safe) | [**Safe Stable Diffusion**](https://arxiv.org/abs/2211.05105) | Text-Guided Generation | [](https://colab.research.google.com/github/ml-research/safe-latent-diffusion/blob/main/examples/Safe%20Latent%20Diffusion.ipynb)
|
||||
| [stable_unclip](./stable_unclip) | **Stable unCLIP** | Text-to-Image Generation |
|
||||
| [stable_unclip](./stable_unclip) | **Stable unCLIP** | Image-to-Image Text-Guided Generation |
|
||||
| [stochastic_karras_ve](./stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation |
|
||||
| [unclip](./unclip) | [Hierarchical Text-Conditional Image Generation with CLIP Latents](https://arxiv.org/abs/2204.06125) | Text-to-Image Generation |
|
||||
| [versatile_diffusion](./versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Text-to-Image Generation |
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,79 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Semantic Guidance
|
||||
|
||||
Semantic Guidance for Diffusion Models was proposed in [SEGA: Instructing Diffusion using Semantic Dimensions](https://arxiv.org/abs/2301.12247) and provides strong semantic control over the image generation.
|
||||
Small changes to the text prompt usually result in entirely different output images. However, with SEGA a variety of changes to the image are enabled that can be controlled easily and intuitively, and stay true to the original image composition.
|
||||
|
||||
The abstract of the paper is the following:
|
||||
|
||||
*Text-to-image diffusion models have recently received a lot of interest for their astonishing ability to produce high-fidelity images from text only. However, achieving one-shot generation that aligns with the user's intent is nearly impossible, yet small changes to the input prompt often result in very different images. This leaves the user with little semantic control. To put the user in control, we show how to interact with the diffusion process to flexibly steer it along semantic directions. This semantic guidance (SEGA) allows for subtle and extensive edits, changes in composition and style, as well as optimizing the overall artistic conception. We demonstrate SEGA's effectiveness on a variety of tasks and provide evidence for its versatility and flexibility.*
|
||||
|
||||
|
||||
*Overview*:
|
||||
|
||||
| Pipeline | Tasks | Colab | Demo
|
||||
|---|---|:---:|:---:|
|
||||
| [pipeline_semantic_stable_diffusion.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion) | *Text-to-Image Generation* | [](https://colab.research.google.com/github/ml-research/semantic-image-editing/blob/main/examples/SemanticGuidance.ipynb) | [Coming Soon](https://huggingface.co/AIML-TUDA)
|
||||
|
||||
## Tips
|
||||
|
||||
- The Semantic Guidance pipeline can be used with any [Stable Diffusion](./api/pipelines/stable_diffusion/text2img) checkpoint.
|
||||
|
||||
### Run Semantic Guidance
|
||||
|
||||
The interface of [`SemanticStableDiffusionPipeline`] provides several additional parameters to influence the image generation.
|
||||
Exemplary usage may look like this:
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import SemanticStableDiffusionPipeline
|
||||
|
||||
pipe = SemanticStableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
|
||||
pipe = pipe.to("cuda")
|
||||
|
||||
out = pipe(
|
||||
prompt="a photo of the face of a woman",
|
||||
num_images_per_prompt=1,
|
||||
guidance_scale=7,
|
||||
editing_prompt=[
|
||||
"smiling, smile", # Concepts to apply
|
||||
"glasses, wearing glasses",
|
||||
"curls, wavy hair, curly hair",
|
||||
"beard, full beard, mustache",
|
||||
],
|
||||
reverse_editing_direction=[False, False, False, False], # Direction of guidance i.e. increase all concepts
|
||||
edit_warmup_steps=[10, 10, 10, 10], # Warmup period for each concept
|
||||
edit_guidance_scale=[4, 5, 5, 5.4], # Guidance scale for each concept
|
||||
edit_threshold=[
|
||||
0.99,
|
||||
0.975,
|
||||
0.925,
|
||||
0.96,
|
||||
], # Threshold for each concept. Threshold equals the percentile of the latent space that will be discarded. I.e. threshold=0.99 uses 1% of the latent dimensions
|
||||
edit_momentum_scale=0.3, # Momentum scale that will be added to the latent guidance
|
||||
edit_mom_beta=0.6, # Momentum beta
|
||||
edit_weights=[1, 1, 1, 1, 1], # Weights of the individual concepts against each other
|
||||
)
|
||||
```
|
||||
|
||||
For more examples check the colab notebook.
|
||||
|
||||
## StableDiffusionSafePipelineOutput
|
||||
[[autodoc]] pipelines.semantic_stable_diffusion.SemanticStableDiffusionPipelineOutput
|
||||
- all
|
||||
|
||||
## SemanticStableDiffusionPipeline
|
||||
[[autodoc]] SemanticStableDiffusionPipeline
|
||||
- all
|
||||
- __call__
|
||||
@@ -1,75 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Attend and Excite: Attention-Based Semantic Guidance for Text-to-Image Diffusion Models
|
||||
|
||||
## Overview
|
||||
|
||||
Attend and Excite for Stable Diffusion was proposed in [Attend-and-Excite: Attention-Based Semantic Guidance for Text-to-Image Diffusion Models](https://attendandexcite.github.io/Attend-and-Excite/) and provides textual attention control over the image generation.
|
||||
|
||||
The abstract of the paper is the following:
|
||||
|
||||
*Text-to-image diffusion models have recently received a lot of interest for their astonishing ability to produce high-fidelity images from text only. However, achieving one-shot generation that aligns with the user's intent is nearly impossible, yet small changes to the input prompt often result in very different images. This leaves the user with little semantic control. To put the user in control, we show how to interact with the diffusion process to flexibly steer it along semantic directions. This semantic guidance (SEGA) allows for subtle and extensive edits, changes in composition and style, as well as optimizing the overall artistic conception. We demonstrate SEGA's effectiveness on a variety of tasks and provide evidence for its versatility and flexibility.*
|
||||
|
||||
Resources
|
||||
|
||||
* [Project Page](https://attendandexcite.github.io/Attend-and-Excite/)
|
||||
* [Paper](https://arxiv.org/abs/2301.13826)
|
||||
* [Original Code](https://github.com/AttendAndExcite/Attend-and-Excite)
|
||||
* [Demo](https://huggingface.co/spaces/AttendAndExcite/Attend-and-Excite)
|
||||
|
||||
|
||||
## Available Pipelines:
|
||||
|
||||
| Pipeline | Tasks | Colab | Demo
|
||||
|---|---|:---:|:---:|
|
||||
| [pipeline_semantic_stable_diffusion_attend_and_excite.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_semantic_stable_diffusion_attend_and_excite) | *Text-to-Image Generation* | - | https://huggingface.co/spaces/AttendAndExcite/Attend-and-Excite
|
||||
|
||||
|
||||
### Usage example
|
||||
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import StableDiffusionAttendAndExcitePipeline
|
||||
|
||||
model_id = "CompVis/stable-diffusion-v1-4"
|
||||
pipe = StableDiffusionAttendAndExcitePipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
|
||||
pipe = pipe.to("cuda")
|
||||
|
||||
prompt = "a cat and a frog"
|
||||
|
||||
# use get_indices function to find out indices of the tokens you want to alter
|
||||
pipe.get_indices(prompt)
|
||||
|
||||
token_indices = [2, 5]
|
||||
seed = 6141
|
||||
generator = torch.Generator("cuda").manual_seed(seed)
|
||||
|
||||
images = pipe(
|
||||
prompt=prompt,
|
||||
token_indices=token_indices,
|
||||
guidance_scale=7.5,
|
||||
generator=generator,
|
||||
num_inference_steps=50,
|
||||
max_iter_to_alter=25,
|
||||
).images
|
||||
|
||||
image = images[0]
|
||||
image.save(f"../images/{prompt}_{seed}.png")
|
||||
```
|
||||
|
||||
|
||||
## StableDiffusionAttendAndExcitePipeline
|
||||
[[autodoc]] StableDiffusionAttendAndExcitePipeline
|
||||
- all
|
||||
- __call__
|
||||
@@ -1,167 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Text-to-Image Generation with ControlNet Conditioning
|
||||
|
||||
## Overview
|
||||
|
||||
[Adding Conditional Control to Text-to-Image Diffusion Models](https://arxiv.org/abs/2302.05543) by Lvmin Zhang and Maneesh Agrawala.
|
||||
|
||||
Using the pretrained models we can provide control images (for example, a depth map) to control Stable Diffusion text-to-image generation so that it follows the structure of the depth image and fills in the details.
|
||||
|
||||
The abstract of the paper is the following:
|
||||
|
||||
*We present a neural network structure, ControlNet, to control pretrained large diffusion models to support additional input conditions. The ControlNet learns task-specific conditions in an end-to-end way, and the learning is robust even when the training dataset is small (< 50k). Moreover, training a ControlNet is as fast as fine-tuning a diffusion model, and the model can be trained on a personal devices. Alternatively, if powerful computation clusters are available, the model can scale to large amounts (millions to billions) of data. We report that large diffusion models like Stable Diffusion can be augmented with ControlNets to enable conditional inputs like edge maps, segmentation maps, keypoints, etc. This may enrich the methods to control large diffusion models and further facilitate related applications.*
|
||||
|
||||
This model was contributed by the amazing community contributor [takuma104](https://huggingface.co/takuma104) ❤️ .
|
||||
|
||||
Resources:
|
||||
|
||||
* [Paper](https://arxiv.org/abs/2302.05543)
|
||||
* [Original Code](https://github.com/lllyasviel/ControlNet)
|
||||
|
||||
## Available Pipelines:
|
||||
|
||||
| Pipeline | Tasks | Demo
|
||||
|---|---|:---:|
|
||||
| [StableDiffusionControlNetPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py) | *Text-to-Image Generation with ControlNet Conditioning* | [Colab Example](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/controlnet.ipynb)
|
||||
|
||||
## Usage example
|
||||
|
||||
In the following we give a simple example of how to use a *ControlNet* checkpoint with Diffusers for inference.
|
||||
The inference pipeline is the same for all pipelines:
|
||||
|
||||
* 1. Take an image and run it through a pre-conditioning processor.
|
||||
* 2. Run the pre-processed image through the [`StableDiffusionControlNetPipeline`].
|
||||
|
||||
Let's have a look at a simple example using the [Canny Edge ControlNet](https://huggingface.co/lllyasviel/sd-controlnet-canny).
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionControlNetPipeline
|
||||
from diffusers.utils import load_image
|
||||
|
||||
# Let's load the popular vermeer image
|
||||
image = load_image(
|
||||
"https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"
|
||||
)
|
||||
```
|
||||
|
||||

|
||||
|
||||
Next, we process the image to get the canny image. This is step *1.* - running the pre-conditioning processor. The pre-conditioning processor is different for every ControlNet. Please see the model cards of the [official checkpoints](#controlnet-with-stable-diffusion-1.5) for more information about other models.
|
||||
|
||||
First, we need to install opencv:
|
||||
|
||||
```
|
||||
pip install opencv-contrib-python
|
||||
```
|
||||
|
||||
Next, let's also install all required Hugging Face libraries:
|
||||
|
||||
```
|
||||
pip install diffusers transformers git+https://github.com/huggingface/accelerate.git
|
||||
```
|
||||
|
||||
Then we can retrieve the canny edges of the image.
|
||||
|
||||
```python
|
||||
import cv2
|
||||
from PIL import Image
|
||||
import numpy as np
|
||||
|
||||
image = np.array(image)
|
||||
|
||||
low_threshold = 100
|
||||
high_threshold = 200
|
||||
|
||||
image = cv2.Canny(image, low_threshold, high_threshold)
|
||||
image = image[:, :, None]
|
||||
image = np.concatenate([image, image, image], axis=2)
|
||||
canny_image = Image.fromarray(image)
|
||||
```
|
||||
|
||||
Let's take a look at the processed image.
|
||||
|
||||

|
||||
|
||||
Now, we load the official [Stable Diffusion 1.5 Model](runwayml/stable-diffusion-v1-5) as well as the ControlNet for canny edges.
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
|
||||
import torch
|
||||
|
||||
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
|
||||
pipe = StableDiffusionControlNetPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
|
||||
)
|
||||
```
|
||||
|
||||
To speed-up things and reduce memory, let's enable model offloading and use the fast [`UniPCMultistepScheduler`].
|
||||
|
||||
```py
|
||||
from diffusers import UniPCMultistepScheduler
|
||||
|
||||
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
# this command loads the individual model components on GPU on-demand.
|
||||
pipe.enable_model_cpu_offload()
|
||||
```
|
||||
|
||||
Finally, we can run the pipeline:
|
||||
|
||||
```py
|
||||
generator = torch.manual_seed(0)
|
||||
|
||||
out_image = pipe(
|
||||
"disco dancer with colorful lights", num_inference_steps=20, generator=generator, image=canny_image
|
||||
).images[0]
|
||||
```
|
||||
|
||||
This should take only around 3-4 seconds on GPU (depending on hardware). The output image then looks as follows:
|
||||
|
||||

|
||||
|
||||
|
||||
**Note**: To see how to run all other ControlNet checkpoints, please have a look at [ControlNet with Stable Diffusion 1.5](#controlnet-with-stable-diffusion-1.5)
|
||||
|
||||
<!-- TODO: add space -->
|
||||
|
||||
## Available checkpoints
|
||||
|
||||
ControlNet requires a *control image* in addition to the text-to-image *prompt*.
|
||||
Each pretrained model is trained using a different conditioning method that requires different images for conditioning the generated outputs. For example, Canny edge conditioning requires the control image to be the output of a Canny filter, while depth conditioning requires the control image to be a depth map. See the overview and image examples below to know more.
|
||||
|
||||
All checkpoints can be found under the authors' namespace [lllyasviel](https://huggingface.co/lllyasviel).
|
||||
|
||||
### ControlNet with Stable Diffusion 1.5
|
||||
|
||||
| Model Name | Control Image Overview| Control Image Example | Generated Image Example |
|
||||
|---|---|---|---|
|
||||
|[lllyasviel/sd-controlnet-canny](https://huggingface.co/lllyasviel/sd-controlnet-canny)<br/> *Trained with canny edge detection* | A monochrome image with white edges on a black background.|<a href="https://huggingface.co/takuma104/controlnet_dev/blob/main/gen_compare/control_images/converted/control_bird_canny.png"><img width="64" style="margin:0;padding:0;" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/control_images/converted/control_bird_canny.png"/></a>|<a href="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/output_images/diffusers/output_bird_canny_1.png"><img width="64" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/output_images/diffusers/output_bird_canny_1.png"/></a>|
|
||||
|[lllyasviel/sd-controlnet-depth](https://huggingface.co/lllyasviel/sd-controlnet-depth)<br/> *Trained with Midas depth estimation* |A grayscale image with black representing deep areas and white representing shallow areas.|<a href="https://huggingface.co/takuma104/controlnet_dev/blob/main/gen_compare/control_images/converted/control_vermeer_depth.png"><img width="64" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/control_images/converted/control_vermeer_depth.png"/></a>|<a href="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/output_images/diffusers/output_vermeer_depth_2.png"><img width="64" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/output_images/diffusers/output_vermeer_depth_2.png"/></a>|
|
||||
|[lllyasviel/sd-controlnet-hed](https://huggingface.co/lllyasviel/sd-controlnet-hed)<br/> *Trained with HED edge detection (soft edge)* |A monochrome image with white soft edges on a black background.|<a href="https://huggingface.co/takuma104/controlnet_dev/blob/main/gen_compare/control_images/converted/control_bird_hed.png"><img width="64" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/control_images/converted/control_bird_hed.png"/></a>|<a href="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/output_images/diffusers/output_bird_hed_1.png"><img width="64" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/output_images/diffusers/output_bird_hed_1.png"/></a> |
|
||||
|[lllyasviel/sd-controlnet-mlsd](https://huggingface.co/lllyasviel/sd-controlnet-mlsd)<br/> *Trained with M-LSD line detection* |A monochrome image composed only of white straight lines on a black background.|<a href="https://huggingface.co/takuma104/controlnet_dev/blob/main/gen_compare/control_images/converted/control_room_mlsd.png"><img width="64" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/control_images/converted/control_room_mlsd.png"/></a>|<a href="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/output_images/diffusers/output_room_mlsd_0.png"><img width="64" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/output_images/diffusers/output_room_mlsd_0.png"/></a>|
|
||||
|[lllyasviel/sd-controlnet-normal](https://huggingface.co/lllyasviel/sd-controlnet-normal)<br/> *Trained with normal map* |A [normal mapped](https://en.wikipedia.org/wiki/Normal_mapping) image.|<a href="https://huggingface.co/takuma104/controlnet_dev/blob/main/gen_compare/control_images/converted/control_human_normal.png"><img width="64" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/control_images/converted/control_human_normal.png"/></a>|<a href="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/output_images/diffusers/output_human_normal_1.png"><img width="64" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/output_images/diffusers/output_human_normal_1.png"/></a>|
|
||||
|[lllyasviel/sd-controlnet-openpose](https://huggingface.co/lllyasviel/sd-controlnet_openpose)<br/> *Trained with OpenPose bone image* |A [OpenPose bone](https://github.com/CMU-Perceptual-Computing-Lab/openpose) image.|<a href="https://huggingface.co/takuma104/controlnet_dev/blob/main/gen_compare/control_images/converted/control_human_openpose.png"><img width="64" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/control_images/converted/control_human_openpose.png"/></a>|<a href="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/output_images/diffusers/output_human_openpose_0.png"><img width="64" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/output_images/diffusers/output_human_openpose_0.png"/></a>|
|
||||
|[lllyasviel/sd-controlnet-scribble](https://huggingface.co/lllyasviel/sd-controlnet_scribble)<br/> *Trained with human scribbles* |A hand-drawn monochrome image with white outlines on a black background.|<a href="https://huggingface.co/takuma104/controlnet_dev/blob/main/gen_compare/control_images/converted/control_vermeer_scribble.png"><img width="64" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/control_images/converted/control_vermeer_scribble.png"/></a>|<a href="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/output_images/diffusers/output_vermeer_scribble_0.png"><img width="64" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/output_images/diffusers/output_vermeer_scribble_0.png"/></a> |
|
||||
|[lllyasviel/sd-controlnet-seg](https://huggingface.co/lllyasviel/sd-controlnet_seg)<br/>*Trained with semantic segmentation* |An [ADE20K](https://groups.csail.mit.edu/vision/datasets/ADE20K/)'s segmentation protocol image.|<a href="https://huggingface.co/takuma104/controlnet_dev/blob/main/gen_compare/control_images/converted/control_room_seg.png"><img width="64" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/control_images/converted/control_room_seg.png"/></a>|<a href="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/output_images/diffusers/output_room_seg_1.png"><img width="64" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare/output_images/diffusers/output_room_seg_1.png"/></a> |
|
||||
|
||||
## StableDiffusionControlNetPipeline
|
||||
[[autodoc]] StableDiffusionControlNetPipeline
|
||||
- all
|
||||
- __call__
|
||||
- enable_attention_slicing
|
||||
- disable_attention_slicing
|
||||
- enable_vae_slicing
|
||||
- disable_vae_slicing
|
||||
- enable_xformers_memory_efficient_attention
|
||||
- disable_xformers_memory_efficient_attention
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -20,17 +20,10 @@ The original codebase can be found here: [CampVis/stable-diffusion](https://gith
|
||||
|
||||
[`StableDiffusionImg2ImgPipeline`] is compatible with all Stable Diffusion checkpoints for [Text-to-Image](./text2img)
|
||||
|
||||
The pipeline uses the diffusion-denoising mechanism proposed by SDEdit ([SDEdit: Guided Image Synthesis and Editing with Stochastic Differential Equations](https://arxiv.org/abs/2108.01073)
|
||||
proposed by Chenlin Meng, Yutong He, Yang Song, Jiaming Song, Jiajun Wu, Jun-Yan Zhu, Stefano Ermon).
|
||||
|
||||
[[autodoc]] StableDiffusionImg2ImgPipeline
|
||||
- all
|
||||
- __call__
|
||||
- enable_attention_slicing
|
||||
- disable_attention_slicing
|
||||
- enable_xformers_memory_efficient_attention
|
||||
- disable_xformers_memory_efficient_attention
|
||||
|
||||
[[autodoc]] FlaxStableDiffusionImg2ImgPipeline
|
||||
- all
|
||||
- __call__
|
||||
- disable_xformers_memory_efficient_attention
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -30,8 +30,4 @@ Available checkpoints are:
|
||||
- enable_attention_slicing
|
||||
- disable_attention_slicing
|
||||
- enable_xformers_memory_efficient_attention
|
||||
- disable_xformers_memory_efficient_attention
|
||||
|
||||
[[autodoc]] FlaxStableDiffusionInpaintPipeline
|
||||
- all
|
||||
- __call__
|
||||
- disable_xformers_memory_efficient_attention
|
||||
@@ -1,33 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Stable Diffusion Latent Upscaler
|
||||
|
||||
## StableDiffusionLatentUpscalePipeline
|
||||
|
||||
The Stable Diffusion Latent Upscaler model was created by [Katherine Crowson](https://github.com/crowsonkb/k-diffusion) in collaboration with [Stability AI](https://stability.ai/). It can be used on top of any [`StableDiffusionUpscalePipeline`] checkpoint to enhance its output image resolution by a factor of 2.
|
||||
|
||||
A notebook that demonstrates the original implementation can be found here:
|
||||
- [Stable Diffusion Upscaler Demo](https://colab.research.google.com/drive/1o1qYJcFeywzCIdkfKJy7cTpgZTCM2EI4)
|
||||
|
||||
Available Checkpoints are:
|
||||
- *stabilityai/latent-upscaler*: [stabilityai/sd-x2-latent-upscaler](https://huggingface.co/stabilityai/sd-x2-latent-upscaler)
|
||||
|
||||
|
||||
[[autodoc]] StableDiffusionLatentUpscalePipeline
|
||||
- all
|
||||
- __call__
|
||||
- enable_sequential_cpu_offload
|
||||
- enable_attention_slicing
|
||||
- disable_attention_slicing
|
||||
- enable_xformers_memory_efficient_attention
|
||||
- disable_xformers_memory_efficient_attention
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -31,10 +31,6 @@ For more details about how Stable Diffusion works and how it differs from the ba
|
||||
| [StableDiffusionDepth2ImgPipeline](./depth2img) | **Experimental** – *Depth-to-Image Text-Guided Generation * | | Coming soon
|
||||
| [StableDiffusionImageVariationPipeline](./image_variation) | **Experimental** – *Image Variation Generation * | | [🤗 Stable Diffusion Image Variations](https://huggingface.co/spaces/lambdalabs/stable-diffusion-image-variations)
|
||||
| [StableDiffusionUpscalePipeline](./upscale) | **Experimental** – *Text-Guided Image Super-Resolution * | | Coming soon
|
||||
| [StableDiffusionLatentUpscalePipeline](./latent_upscale) | **Experimental** – *Text-Guided Image Super-Resolution * | | Coming soon
|
||||
| [StableDiffusionInstructPix2PixPipeline](./pix2pix) | **Experimental** – *Text-Based Image Editing * | | [InstructPix2Pix: Learning to Follow Image Editing Instructions](https://huggingface.co/spaces/timbrooks/instruct-pix2pix)
|
||||
| [StableDiffusionAttendAndExcitePipeline](./attend_and_excite) | **Experimental** – *Text-to-Image Generation * | | [Attend-and-Excite: Attention-Based Semantic Guidance for Text-to-Image Diffusion Models](https://huggingface.co/spaces/AttendAndExcite/Attend-and-Excite)
|
||||
| [StableDiffusionPix2PixZeroPipeline](./pix2pix_zero) | **Experimental** – *Text-Based Image Editing * | | [Zero-shot Image-to-Image Translation](https://arxiv.org/abs/2302.03027)
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,58 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# MultiDiffusion: Fusing Diffusion Paths for Controlled Image Generation
|
||||
|
||||
## Overview
|
||||
|
||||
[MultiDiffusion: Fusing Diffusion Paths for Controlled Image Generation](https://arxiv.org/abs/2302.08113) by Omer Bar-Tal, Lior Yariv, Yaron Lipman, and Tali Dekel.
|
||||
|
||||
The abstract of the paper is the following:
|
||||
|
||||
*Recent advances in text-to-image generation with diffusion models present transformative capabilities in image quality. However, user controllability of the generated image, and fast adaptation to new tasks still remains an open challenge, currently mostly addressed by costly and long re-training and fine-tuning or ad-hoc adaptations to specific image generation tasks. In this work, we present MultiDiffusion, a unified framework that enables versatile and controllable image generation, using a pre-trained text-to-image diffusion model, without any further training or finetuning. At the center of our approach is a new generation process, based on an optimization task that binds together multiple diffusion generation processes with a shared set of parameters or constraints. We show that MultiDiffusion can be readily applied to generate high quality and diverse images that adhere to user-provided controls, such as desired aspect ratio (e.g., panorama), and spatial guiding signals, ranging from tight segmentation masks to bounding boxes.
|
||||
|
||||
Resources:
|
||||
|
||||
* [Project Page](https://multidiffusion.github.io/).
|
||||
* [Paper](https://arxiv.org/abs/2302.08113).
|
||||
* [Original Code](https://github.com/omerbt/MultiDiffusion).
|
||||
* [Demo](https://huggingface.co/spaces/weizmannscience/MultiDiffusion).
|
||||
|
||||
## Available Pipelines:
|
||||
|
||||
| Pipeline | Tasks | Demo
|
||||
|---|---|:---:|
|
||||
| [StableDiffusionPanoramaPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py) | *Text-Guided Panorama View Generation* | [🤗 Space](https://huggingface.co/spaces/weizmannscience/MultiDiffusion)) |
|
||||
|
||||
<!-- TODO: add Colab -->
|
||||
|
||||
## Usage example
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import StableDiffusionPanoramaPipeline, DDIMScheduler
|
||||
|
||||
model_ckpt = "stabilityai/stable-diffusion-2-base"
|
||||
scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler")
|
||||
pipe = StableDiffusionPanoramaPipeline.from_pretrained(model_ckpt, scheduler=scheduler, torch_dtype=torch.float16)
|
||||
|
||||
pipe = pipe.to("cuda")
|
||||
|
||||
prompt = "a photo of the dolomites"
|
||||
image = pipe(prompt).images[0]
|
||||
image.save("dolomites.png")
|
||||
```
|
||||
|
||||
## StableDiffusionPanoramaPipeline
|
||||
[[autodoc]] StableDiffusionPanoramaPipeline
|
||||
- __call__
|
||||
- all
|
||||
@@ -1,70 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# InstructPix2Pix: Learning to Follow Image Editing Instructions
|
||||
|
||||
## Overview
|
||||
|
||||
[InstructPix2Pix: Learning to Follow Image Editing Instructions](https://arxiv.org/abs/2211.09800) by Tim Brooks, Aleksander Holynski and Alexei A. Efros.
|
||||
|
||||
The abstract of the paper is the following:
|
||||
|
||||
*We propose a method for editing images from human instructions: given an input image and a written instruction that tells the model what to do, our model follows these instructions to edit the image. To obtain training data for this problem, we combine the knowledge of two large pretrained models -- a language model (GPT-3) and a text-to-image model (Stable Diffusion) -- to generate a large dataset of image editing examples. Our conditional diffusion model, InstructPix2Pix, is trained on our generated data, and generalizes to real images and user-written instructions at inference time. Since it performs edits in the forward pass and does not require per example fine-tuning or inversion, our model edits images quickly, in a matter of seconds. We show compelling editing results for a diverse collection of input images and written instructions.*
|
||||
|
||||
Resources:
|
||||
|
||||
* [Project Page](https://www.timothybrooks.com/instruct-pix2pix).
|
||||
* [Paper](https://arxiv.org/abs/2211.09800).
|
||||
* [Original Code](https://github.com/timothybrooks/instruct-pix2pix).
|
||||
* [Demo](https://huggingface.co/spaces/timbrooks/instruct-pix2pix).
|
||||
|
||||
|
||||
## Available Pipelines:
|
||||
|
||||
| Pipeline | Tasks | Demo
|
||||
|---|---|:---:|
|
||||
| [StableDiffusionInstructPix2PixPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py) | *Text-Based Image Editing* | [🤗 Space](https://huggingface.co/spaces/timbrooks/instruct-pix2pix) |
|
||||
|
||||
<!-- TODO: add Colab -->
|
||||
|
||||
## Usage example
|
||||
|
||||
```python
|
||||
import PIL
|
||||
import requests
|
||||
import torch
|
||||
from diffusers import StableDiffusionInstructPix2PixPipeline
|
||||
|
||||
model_id = "timbrooks/instruct-pix2pix"
|
||||
pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
|
||||
|
||||
url = "https://huggingface.co/datasets/diffusers/diffusers-images-docs/resolve/main/mountain.png"
|
||||
|
||||
|
||||
def download_image(url):
|
||||
image = PIL.Image.open(requests.get(url, stream=True).raw)
|
||||
image = PIL.ImageOps.exif_transpose(image)
|
||||
image = image.convert("RGB")
|
||||
return image
|
||||
|
||||
|
||||
image = download_image(url)
|
||||
|
||||
prompt = "make the mountains snowy"
|
||||
images = pipe(prompt, image=image, num_inference_steps=20, image_guidance_scale=1.5, guidance_scale=7).images
|
||||
images[0].save("snowy_mountains.png")
|
||||
```
|
||||
|
||||
## StableDiffusionInstructPix2PixPipeline
|
||||
[[autodoc]] StableDiffusionInstructPix2PixPipeline
|
||||
- __call__
|
||||
- all
|
||||
@@ -1,291 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Zero-shot Image-to-Image Translation
|
||||
|
||||
## Overview
|
||||
|
||||
[Zero-shot Image-to-Image Translation](https://arxiv.org/abs/2302.03027).
|
||||
|
||||
The abstract of the paper is the following:
|
||||
|
||||
*Large-scale text-to-image generative models have shown their remarkable ability to synthesize diverse and high-quality images. However, it is still challenging to directly apply these models for editing real images for two reasons. First, it is hard for users to come up with a perfect text prompt that accurately describes every visual detail in the input image. Second, while existing models can introduce desirable changes in certain regions, they often dramatically alter the input content and introduce unexpected changes in unwanted regions. In this work, we propose pix2pix-zero, an image-to-image translation method that can preserve the content of the original image without manual prompting. We first automatically discover editing directions that reflect desired edits in the text embedding space. To preserve the general content structure after editing, we further propose cross-attention guidance, which aims to retain the cross-attention maps of the input image throughout the diffusion process. In addition, our method does not need additional training for these edits and can directly use the existing pre-trained text-to-image diffusion model. We conduct extensive experiments and show that our method outperforms existing and concurrent works for both real and synthetic image editing.*
|
||||
|
||||
Resources:
|
||||
|
||||
* [Project Page](https://pix2pixzero.github.io/).
|
||||
* [Paper](https://arxiv.org/abs/2302.03027).
|
||||
* [Original Code](https://github.com/pix2pixzero/pix2pix-zero).
|
||||
* [Demo](https://huggingface.co/spaces/pix2pix-zero-library/pix2pix-zero-demo).
|
||||
|
||||
## Tips
|
||||
|
||||
* The pipeline can be conditioned on real input images. Check out the code examples below to know more.
|
||||
* The pipeline exposes two arguments namely `source_embeds` and `target_embeds`
|
||||
that let you control the direction of the semantic edits in the final image to be generated. Let's say,
|
||||
you wanted to translate from "cat" to "dog". In this case, the edit direction will be "cat -> dog". To reflect
|
||||
this in the pipeline, you simply have to set the embeddings related to the phrases including "cat" to
|
||||
`source_embeds` and "dog" to `target_embeds`. Refer to the code example below for more details.
|
||||
* When you're using this pipeline from a prompt, specify the _source_ concept in the prompt. Taking
|
||||
the above example, a valid input prompt would be: "a high resolution painting of a **cat** in the style of van gough".
|
||||
* If you wanted to reverse the direction in the example above, i.e., "dog -> cat", then it's recommended to:
|
||||
* Swap the `source_embeds` and `target_embeds`.
|
||||
* Change the input prompt to include "dog".
|
||||
* To learn more about how the source and target embeddings are generated, refer to the [original
|
||||
paper](https://arxiv.org/abs/2302.03027). Below, we also provide some directions on how to generate the embeddings.
|
||||
* Note that the quality of the outputs generated with this pipeline is dependent on how good the `source_embeds` and `target_embeds` are. Please, refer to [this discussion](#generating-source-and-target-embeddings) for some suggestions on the topic.
|
||||
|
||||
## Available Pipelines:
|
||||
|
||||
| Pipeline | Tasks | Demo
|
||||
|---|---|:---:|
|
||||
| [StableDiffusionPix2PixZeroPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py) | *Text-Based Image Editing* | [🤗 Space](https://huggingface.co/spaces/pix2pix-zero-library/pix2pix-zero-demo) |
|
||||
|
||||
<!-- TODO: add Colab -->
|
||||
|
||||
## Usage example
|
||||
|
||||
### Based on an image generated with the input prompt
|
||||
|
||||
```python
|
||||
import requests
|
||||
import torch
|
||||
|
||||
from diffusers import DDIMScheduler, StableDiffusionPix2PixZeroPipeline
|
||||
|
||||
|
||||
def download(embedding_url, local_filepath):
|
||||
r = requests.get(embedding_url)
|
||||
with open(local_filepath, "wb") as f:
|
||||
f.write(r.content)
|
||||
|
||||
|
||||
model_ckpt = "CompVis/stable-diffusion-v1-4"
|
||||
pipeline = StableDiffusionPix2PixZeroPipeline.from_pretrained(
|
||||
model_ckpt, conditions_input_image=False, torch_dtype=torch.float16
|
||||
)
|
||||
pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
|
||||
pipeline.to("cuda")
|
||||
|
||||
prompt = "a high resolution painting of a cat in the style of van gogh"
|
||||
src_embs_url = "https://github.com/pix2pixzero/pix2pix-zero/raw/main/assets/embeddings_sd_1.4/cat.pt"
|
||||
target_embs_url = "https://github.com/pix2pixzero/pix2pix-zero/raw/main/assets/embeddings_sd_1.4/dog.pt"
|
||||
|
||||
for url in [src_embs_url, target_embs_url]:
|
||||
download(url, url.split("/")[-1])
|
||||
|
||||
src_embeds = torch.load(src_embs_url.split("/")[-1])
|
||||
target_embeds = torch.load(target_embs_url.split("/")[-1])
|
||||
|
||||
images = pipeline(
|
||||
prompt,
|
||||
source_embeds=src_embeds,
|
||||
target_embeds=target_embeds,
|
||||
num_inference_steps=50,
|
||||
cross_attention_guidance_amount=0.15,
|
||||
).images
|
||||
images[0].save("edited_image_dog.png")
|
||||
```
|
||||
|
||||
### Based on an input image
|
||||
|
||||
When the pipeline is conditioned on an input image, we first obtain an inverted
|
||||
noise from it using a `DDIMInverseScheduler` with the help of a generated caption. Then
|
||||
the inverted noise is used to start the generation process.
|
||||
|
||||
First, let's load our pipeline:
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import BlipForConditionalGeneration, BlipProcessor
|
||||
from diffusers import DDIMScheduler, DDIMInverseScheduler, StableDiffusionPix2PixZeroPipeline
|
||||
|
||||
captioner_id = "Salesforce/blip-image-captioning-base"
|
||||
processor = BlipProcessor.from_pretrained(captioner_id)
|
||||
model = BlipForConditionalGeneration.from_pretrained(captioner_id, torch_dtype=torch.float16, low_cpu_mem_usage=True)
|
||||
|
||||
sd_model_ckpt = "CompVis/stable-diffusion-v1-4"
|
||||
pipeline = StableDiffusionPix2PixZeroPipeline.from_pretrained(
|
||||
sd_model_ckpt,
|
||||
caption_generator=model,
|
||||
caption_processor=processor,
|
||||
torch_dtype=torch.float16,
|
||||
safety_checker=None,
|
||||
)
|
||||
pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
|
||||
pipeline.inverse_scheduler = DDIMInverseScheduler.from_config(pipeline.scheduler.config)
|
||||
pipeline.enable_model_cpu_offload()
|
||||
```
|
||||
|
||||
Then, we load an input image for conditioning and obtain a suitable caption for it:
|
||||
|
||||
```py
|
||||
import requests
|
||||
from PIL import Image
|
||||
|
||||
img_url = "https://github.com/pix2pixzero/pix2pix-zero/raw/main/assets/test_images/cats/cat_6.png"
|
||||
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB").resize((512, 512))
|
||||
caption = pipeline.generate_caption(raw_image)
|
||||
```
|
||||
|
||||
Then we employ the generated caption and the input image to get the inverted noise:
|
||||
|
||||
```py
|
||||
generator = torch.manual_seed(0)
|
||||
inv_latents = pipeline.invert(caption, image=raw_image, generator=generator).latents
|
||||
```
|
||||
|
||||
Now, generate the image with edit directions:
|
||||
|
||||
```py
|
||||
# See the "Generating source and target embeddings" section below to
|
||||
# automate the generation of these captions with a pre-trained model like Flan-T5 as explained below.
|
||||
source_prompts = ["a cat sitting on the street", "a cat playing in the field", "a face of a cat"]
|
||||
target_prompts = ["a dog sitting on the street", "a dog playing in the field", "a face of a dog"]
|
||||
|
||||
source_embeds = pipeline.get_embeds(source_prompts, batch_size=2)
|
||||
target_embeds = pipeline.get_embeds(target_prompts, batch_size=2)
|
||||
|
||||
|
||||
image = pipeline(
|
||||
caption,
|
||||
source_embeds=source_embeds,
|
||||
target_embeds=target_embeds,
|
||||
num_inference_steps=50,
|
||||
cross_attention_guidance_amount=0.15,
|
||||
generator=generator,
|
||||
latents=inv_latents,
|
||||
negative_prompt=caption,
|
||||
).images[0]
|
||||
image.save("edited_image.png")
|
||||
```
|
||||
|
||||
## Generating source and target embeddings
|
||||
|
||||
The authors originally used the [GPT-3 API](https://openai.com/api/) to generate the source and target captions for discovering
|
||||
edit directions. However, we can also leverage open source and public models for the same purpose.
|
||||
Below, we provide an end-to-end example with the [Flan-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5) model
|
||||
for generating captions and [CLIP](https://huggingface.co/docs/transformers/model_doc/clip) for
|
||||
computing embeddings on the generated captions.
|
||||
|
||||
**1. Load the generation model**:
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import AutoTokenizer, T5ForConditionalGeneration
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xl")
|
||||
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl", device_map="auto", torch_dtype=torch.float16)
|
||||
```
|
||||
|
||||
**2. Construct a starting prompt**:
|
||||
|
||||
```py
|
||||
source_concept = "cat"
|
||||
target_concept = "dog"
|
||||
|
||||
source_text = f"Provide a caption for images containing a {source_concept}. "
|
||||
"The captions should be in English and should be no longer than 150 characters."
|
||||
|
||||
target_text = f"Provide a caption for images containing a {target_concept}. "
|
||||
"The captions should be in English and should be no longer than 150 characters."
|
||||
```
|
||||
|
||||
Here, we're interested in the "cat -> dog" direction.
|
||||
|
||||
**3. Generate captions**:
|
||||
|
||||
We can use a utility like so for this purpose.
|
||||
|
||||
```py
|
||||
def generate_captions(input_prompt):
|
||||
input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids.to("cuda")
|
||||
|
||||
outputs = model.generate(
|
||||
input_ids, temperature=0.8, num_return_sequences=16, do_sample=True, max_new_tokens=128, top_k=10
|
||||
)
|
||||
return tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
||||
```
|
||||
|
||||
And then we just call it to generate our captions:
|
||||
|
||||
```py
|
||||
source_captions = generate_captions(source_text)
|
||||
target_captions = generate_captions(target_concept)
|
||||
```
|
||||
|
||||
We encourage you to play around with the different parameters supported by the
|
||||
`generate()` method ([documentation](https://huggingface.co/docs/transformers/main/en/main_classes/text_generation#transformers.generation_tf_utils.TFGenerationMixin.generate)) for the generation quality you are looking for.
|
||||
|
||||
**4. Load the embedding model**:
|
||||
|
||||
Here, we need to use the same text encoder model used by the subsequent Stable Diffusion model.
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionPix2PixZeroPipeline
|
||||
|
||||
pipeline = StableDiffusionPix2PixZeroPipeline.from_pretrained(
|
||||
"CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16
|
||||
)
|
||||
pipeline = pipeline.to("cuda")
|
||||
tokenizer = pipeline.tokenizer
|
||||
text_encoder = pipeline.text_encoder
|
||||
```
|
||||
|
||||
**5. Compute embeddings**:
|
||||
|
||||
```py
|
||||
import torch
|
||||
|
||||
def embed_captions(sentences, tokenizer, text_encoder, device="cuda"):
|
||||
with torch.no_grad():
|
||||
embeddings = []
|
||||
for sent in sentences:
|
||||
text_inputs = tokenizer(
|
||||
sent,
|
||||
padding="max_length",
|
||||
max_length=tokenizer.model_max_length,
|
||||
truncation=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
text_input_ids = text_inputs.input_ids
|
||||
prompt_embeds = text_encoder(text_input_ids.to(device), attention_mask=None)[0]
|
||||
embeddings.append(prompt_embeds)
|
||||
return torch.concatenate(embeddings, dim=0).mean(dim=0).unsqueeze(0)
|
||||
|
||||
source_embeddings = embed_captions(source_captions, tokenizer, text_encoder)
|
||||
target_embeddings = embed_captions(target_captions, tokenizer, text_encoder)
|
||||
```
|
||||
|
||||
And you're done! [Here](https://colab.research.google.com/drive/1tz2C1EdfZYAPlzXXbTnf-5PRBiR8_R1F?usp=sharing) is a Colab Notebook that you can use to interact with the entire process.
|
||||
|
||||
Now, you can use these embeddings directly while calling the pipeline:
|
||||
|
||||
```py
|
||||
from diffusers import DDIMScheduler
|
||||
|
||||
pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
|
||||
|
||||
images = pipeline(
|
||||
prompt,
|
||||
source_embeds=source_embeddings,
|
||||
target_embeds=target_embeddings,
|
||||
num_inference_steps=50,
|
||||
cross_attention_guidance_amount=0.15,
|
||||
).images
|
||||
images[0].save("edited_image_dog.png")
|
||||
```
|
||||
|
||||
## StableDiffusionPix2PixZeroPipeline
|
||||
[[autodoc]] StableDiffusionPix2PixZeroPipeline
|
||||
- __call__
|
||||
- all
|
||||
@@ -1,64 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Self-Attention Guidance (SAG)
|
||||
|
||||
## Overview
|
||||
|
||||
[Self-Attention Guidance](https://arxiv.org/abs/2210.00939) by Susung Hong et al.
|
||||
|
||||
The abstract of the paper is the following:
|
||||
|
||||
*Denoising diffusion models (DDMs) have been drawing much attention for their appreciable sample quality and diversity. Despite their remarkable performance, DDMs remain black boxes on which further study is necessary to take a profound step. Motivated by this, we delve into the design of conventional U-shaped diffusion models. More specifically, we investigate the self-attention modules within these models through carefully designed experiments and explore their characteristics. In addition, inspired by the studies that substantiate the effectiveness of the guidance schemes, we present plug-and-play diffusion guidance, namely Self-Attention Guidance (SAG), that can drastically boost the performance of existing diffusion models. Our method, SAG, extracts the intermediate attention map from a diffusion model at every iteration and selects tokens above a certain attention score for masking and blurring to obtain a partially blurred input. Subsequently, we measure the dissimilarity between the predicted noises obtained from feeding the blurred and original input to the diffusion model and leverage it as guidance. With this guidance, we observe apparent improvements in a wide range of diffusion models, e.g., ADM, IDDPM, and Stable Diffusion, and show that the results further improve by combining our method with the conventional guidance scheme. We provide extensive ablation studies to verify our choices.*
|
||||
|
||||
Resources:
|
||||
|
||||
* [Project Page](https://ku-cvlab.github.io/Self-Attention-Guidance).
|
||||
* [Paper](https://arxiv.org/abs/2210.00939).
|
||||
* [Original Code](https://github.com/KU-CVLAB/Self-Attention-Guidance).
|
||||
* [Demo](https://colab.research.google.com/github/SusungHong/Self-Attention-Guidance/blob/main/SAG_Stable.ipynb).
|
||||
|
||||
|
||||
## Available Pipelines:
|
||||
|
||||
| Pipeline | Tasks | Demo
|
||||
|---|---|:---:|
|
||||
| [StableDiffusionSAGPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py) | *Text-to-Image Generation* | [Colab](https://colab.research.google.com/github/SusungHong/Self-Attention-Guidance/blob/main/SAG_Stable.ipynb) |
|
||||
|
||||
## Usage example
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import StableDiffusionSAGPipeline
|
||||
from accelerate.utils import set_seed
|
||||
|
||||
pipe = StableDiffusionSAGPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
|
||||
pipe = pipe.to("cuda")
|
||||
|
||||
seed = 8978
|
||||
prompt = "."
|
||||
guidance_scale = 7.5
|
||||
num_images_per_prompt = 1
|
||||
|
||||
sag_scale = 1.0
|
||||
|
||||
set_seed(seed)
|
||||
images = pipe(
|
||||
prompt, num_images_per_prompt=num_images_per_prompt, guidance_scale=guidance_scale, sag_scale=sag_scale
|
||||
).images
|
||||
images[0].save("example.png")
|
||||
```
|
||||
|
||||
## StableDiffusionSAGPipeline
|
||||
[[autodoc]] StableDiffusionSAGPipeline
|
||||
- __call__
|
||||
- all
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -17,7 +17,7 @@ specific language governing permissions and limitations under the License.
|
||||
The Stable Diffusion model was created by the researchers and engineers from [CompVis](https://github.com/CompVis), [Stability AI](https://stability.ai/), [runway](https://github.com/runwayml), and [LAION](https://laion.ai/). The [`StableDiffusionPipeline`] is capable of generating photo-realistic images given any text input using Stable Diffusion.
|
||||
|
||||
The original codebase can be found here:
|
||||
- *Stable Diffusion V1*: [CompVis/stable-diffusion](https://github.com/CompVis/stable-diffusion)
|
||||
- *Stable Diffusion V1*: [CampVis/stable-diffusion](https://github.com/CompVis/stable-diffusion)
|
||||
- *Stable Diffusion v2*: [Stability-AI/stablediffusion](https://github.com/Stability-AI/stablediffusion)
|
||||
|
||||
Available Checkpoints are:
|
||||
@@ -36,10 +36,4 @@ Available Checkpoints are:
|
||||
- enable_vae_slicing
|
||||
- disable_vae_slicing
|
||||
- enable_xformers_memory_efficient_attention
|
||||
- disable_xformers_memory_efficient_attention
|
||||
- enable_vae_tiling
|
||||
- disable_vae_tiling
|
||||
|
||||
[[autodoc]] FlaxStableDiffusionPipeline
|
||||
- all
|
||||
- __call__
|
||||
- disable_xformers_memory_efficient_attention
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -24,7 +24,7 @@ The abstract of the paper is the following:
|
||||
|
||||
| Pipeline | Tasks | Colab | Demo
|
||||
|---|---|:---:|:---:|
|
||||
| [pipeline_stable_diffusion_safe.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py) | *Text-to-Image Generation* | [](https://colab.research.google.com/github/ml-research/safe-latent-diffusion/blob/main/examples/Safe%20Latent%20Diffusion.ipynb) | [](https://huggingface.co/spaces/AIML-TUDA/unsafe-vs-safe-stable-diffusion)
|
||||
| [pipeline_stable_diffusion_safe.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py) | *Text-to-Image Generation* | [](https://colab.research.google.com/github/ml-research/safe-latent-diffusion/blob/main/examples/Safe%20Latent%20Diffusion.ipynb) | -
|
||||
|
||||
## Tips
|
||||
|
||||
@@ -58,7 +58,7 @@ You may use the 4 configurations defined in the [Safe Latent Diffusion paper](ht
|
||||
>>> out = pipeline(prompt=prompt, **SafetyConfig.MAX)
|
||||
```
|
||||
|
||||
The following configurations are available: `SafetyConfig.WEAK`, `SafetyConfig.MEDIUM`, `SafetyConfig.STRONG`, and `SafetyConfig.MAX`.
|
||||
The following configurations are available: `SafetyConfig.WEAK`, `SafetyConfig.MEDIUM`, `SafetyConfig.STRONg`, and `SafetyConfig.MAX`.
|
||||
|
||||
### How to load and use different schedulers.
|
||||
|
||||
|
||||
@@ -1,97 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Stable unCLIP
|
||||
|
||||
Stable unCLIP checkpoints are finetuned from [stable diffusion 2.1](./stable_diffusion_2) checkpoints to condition on CLIP image embeddings.
|
||||
Stable unCLIP also still conditions on text embeddings. Given the two separate conditionings, stable unCLIP can be used
|
||||
for text guided image variation. When combined with an unCLIP prior, it can also be used for full text to image generation.
|
||||
|
||||
## Tips
|
||||
|
||||
Stable unCLIP takes a `noise_level` as input during inference. `noise_level` determines how much noise is added
|
||||
to the image embeddings. A higher `noise_level` increases variation in the final un-noised images. By default,
|
||||
we do not add any additional noise to the image embeddings i.e. `noise_level = 0`.
|
||||
|
||||
### Available checkpoints:
|
||||
|
||||
TODO
|
||||
|
||||
### Text-to-Image Generation
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import StableUnCLIPPipeline
|
||||
|
||||
pipe = StableUnCLIPPipeline.from_pretrained(
|
||||
"fusing/stable-unclip-2-1-l", torch_dtype=torch.float16
|
||||
) # TODO update model path
|
||||
pipe = pipe.to("cuda")
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
images = pipe(prompt).images
|
||||
images[0].save("astronaut_horse.png")
|
||||
```
|
||||
|
||||
|
||||
### Text guided Image-to-Image Variation
|
||||
|
||||
```python
|
||||
import requests
|
||||
import torch
|
||||
from PIL import Image
|
||||
from io import BytesIO
|
||||
|
||||
from diffusers import StableUnCLIPImg2ImgPipeline
|
||||
|
||||
pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
|
||||
"fusing/stable-unclip-2-1-l-img2img", torch_dtype=torch.float16
|
||||
) # TODO update model path
|
||||
pipe = pipe.to("cuda")
|
||||
|
||||
url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
|
||||
|
||||
response = requests.get(url)
|
||||
init_image = Image.open(BytesIO(response.content)).convert("RGB")
|
||||
init_image = init_image.resize((768, 512))
|
||||
|
||||
prompt = "A fantasy landscape, trending on artstation"
|
||||
|
||||
images = pipe(prompt, init_image).images
|
||||
images[0].save("fantasy_landscape.png")
|
||||
```
|
||||
|
||||
### StableUnCLIPPipeline
|
||||
|
||||
[[autodoc]] StableUnCLIPPipeline
|
||||
- all
|
||||
- __call__
|
||||
- enable_attention_slicing
|
||||
- disable_attention_slicing
|
||||
- enable_vae_slicing
|
||||
- disable_vae_slicing
|
||||
- enable_xformers_memory_efficient_attention
|
||||
- disable_xformers_memory_efficient_attention
|
||||
|
||||
|
||||
### StableUnCLIPImg2ImgPipeline
|
||||
|
||||
[[autodoc]] StableUnCLIPImg2ImgPipeline
|
||||
- all
|
||||
- __call__
|
||||
- enable_attention_slicing
|
||||
- disable_attention_slicing
|
||||
- enable_vae_slicing
|
||||
- disable_vae_slicing
|
||||
- enable_xformers_memory_efficient_attention
|
||||
- disable_xformers_memory_efficient_attention
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,21 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Inverse Denoising Diffusion Implicit Models (DDIMInverse)
|
||||
|
||||
## Overview
|
||||
|
||||
This scheduler is the inverted scheduler of [Denoising Diffusion Implicit Models](https://arxiv.org/abs/2010.02502) (DDIM) by Jiaming Song, Chenlin Meng and Stefano Ermon.
|
||||
The implementation is mostly based on the DDIM inversion definition of [Null-text Inversion for Editing Real Images using Guided Diffusion Models](https://arxiv.org/pdf/2211.09794.pdf)
|
||||
|
||||
## DDIMInverseScheduler
|
||||
[[autodoc]] DDIMInverseScheduler
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -37,18 +37,16 @@ To this end, the design of schedulers is such that:
|
||||
|
||||
- Schedulers can be used interchangeably between diffusion models in inference to find the preferred trade-off between speed and generation quality.
|
||||
- Schedulers are currently by default in PyTorch, but are designed to be framework independent (partial Jax support currently exists).
|
||||
- Many diffusion pipelines, such as [`StableDiffusionPipeline`] and [`DiTPipeline`] can use any of [`KarrasDiffusionSchedulers`]
|
||||
|
||||
## Schedulers Summary
|
||||
|
||||
The following table summarizes all officially supported schedulers, their corresponding paper
|
||||
|
||||
|
||||
| Scheduler | Paper |
|
||||
|---|---|
|
||||
| [ddim](./ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) |
|
||||
| [ddim_inverse](./ddim_inverse) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) |
|
||||
| [ddpm](./ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) |
|
||||
| [deis](./deis) | [**DEISMultistepScheduler**](https://arxiv.org/abs/2204.13902) |
|
||||
| [singlestep_dpm_solver](./singlestep_dpm_solver) | [**Singlestep DPM-Solver**](https://arxiv.org/abs/2206.00927) |
|
||||
| [multistep_dpm_solver](./multistep_dpm_solver) | [**Multistep DPM-Solver**](https://arxiv.org/abs/2206.00927) |
|
||||
| [heun](./heun) | [**Heun scheduler inspired by Karras et. al paper**](https://arxiv.org/abs/2206.00364) |
|
||||
@@ -63,7 +61,6 @@ The following table summarizes all officially supported schedulers, their corres
|
||||
| [euler](./euler) | [**Euler scheduler**](https://arxiv.org/abs/2206.00364) |
|
||||
| [euler_ancestral](./euler_ancestral) | [**Euler Ancestral scheduler**](https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L72) |
|
||||
| [vq_diffusion](./vq_diffusion) | [**VQDiffusionScheduler**](https://arxiv.org/abs/2111.14822) |
|
||||
| [unipc](./unipc) | [**UniPCMultistepScheduler**](https://arxiv.org/abs/2302.04867) |
|
||||
| [repaint](./repaint) | [**RePaint scheduler**](https://arxiv.org/abs/2201.09865) |
|
||||
|
||||
## API
|
||||
@@ -83,10 +80,4 @@ The class [`SchedulerOutput`] contains the outputs from any schedulers `step(...
|
||||
|
||||
[[autodoc]] schedulers.scheduling_utils.SchedulerOutput
|
||||
|
||||
### KarrasDiffusionSchedulers
|
||||
|
||||
`KarrasDiffusionSchedulers` encompasses the main generalization of schedulers in Diffusers. The schedulers in this class are distinguished, at a high level, by their noise sampling strategy; the type of network and scaling; and finally the training strategy or how the loss is weighed.
|
||||
|
||||
The different schedulers, depending on the type of ODE solver, fall into the above taxonomy and provide a good abstraction for the design of the main schedulers implemented in Diffusers. The schedulers in this class are given below:
|
||||
|
||||
[[autodoc]] schedulers.scheduling_utils.KarrasDiffusionSchedulers
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,24 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# UniPC
|
||||
|
||||
## Overview
|
||||
|
||||
UniPC is a training-free framework designed for the fast sampling of diffusion models, which consists of a corrector (UniC) and a predictor (UniP) that share a unified analytical form and support arbitrary orders.
|
||||
|
||||
For more details about the method, please refer to the [[paper]](https://arxiv.org/abs/2302.04867) and the [[code]](https://github.com/wl-zhao/UniPC).
|
||||
|
||||
Fast Sampling of Diffusion Models with Exponential Integrator.
|
||||
|
||||
## UniPCMultistepScheduler
|
||||
[[autodoc]] UniPCMultistepScheduler
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -177,7 +177,7 @@ Follow these steps to start contributing ([supported Python versions](https://gi
|
||||
$ make style
|
||||
```
|
||||
|
||||
🧨 Diffusers also uses `ruff` and a few custom scripts to check for coding mistakes. Quality
|
||||
🧨 Diffusers also uses `flake8` and a few custom scripts to check for coding mistakes. Quality
|
||||
control runs in CI, however you can also run the same checks with:
|
||||
|
||||
```bash
|
||||
|
||||
@@ -1,49 +0,0 @@
|
||||
# 🧨 Diffusers’ Ethical Guidelines
|
||||
|
||||
## Preamble
|
||||
|
||||
[Diffusers](https://huggingface.co/docs/diffusers/index) provides pre-trained diffusion models and serves as a modular toolbox for inference and training.
|
||||
|
||||
Given its real case applications in the world and potential negative impacts on society, we think it is important to provide the project with ethical guidelines to guide the development, users’ contributions, and usage of the Diffusers library.
|
||||
|
||||
The risks associated with using this technology are still being examined, but to name a few: copyrights issues for artists; deep-fake exploitation; sexual content generation in inappropriate contexts; non-consensual impersonation; harmful social biases perpetuating the oppression of marginalized groups.
|
||||
We will keep tracking risks and adapt the following guidelines based on the community's responsiveness and valuable feedback.
|
||||
|
||||
|
||||
## Scope
|
||||
|
||||
The Diffusers community will apply the following ethical guidelines to the project’s development and help coordinate how the community will integrate the contributions, especially concerning sensitive topics related to ethical concerns.
|
||||
|
||||
|
||||
## Ethical guidelines
|
||||
|
||||
The following ethical guidelines apply generally, but we will primarily implement them when dealing with ethically sensitive issues while making a technical choice. Furthermore, we commit to adapting those ethical principles over time following emerging harms related to the state of the art of the technology in question.
|
||||
|
||||
- **Transparency**: we are committed to being transparent in managing PRs, explaining our choices to users, and making technical decisions.
|
||||
|
||||
- **Consistency**: we are committed to guaranteeing our users the same level of attention in project management, keeping it technically stable and consistent.
|
||||
|
||||
- **Simplicity**: with a desire to make it easy to use and exploit the Diffusers library, we are committed to keeping the project’s goals lean and coherent.
|
||||
|
||||
- **Accessibility**: the Diffusers project helps lower the entry bar for contributors who can help run it even without technical expertise. Doing so makes research artifacts more accessible to the community.
|
||||
|
||||
- **Reproducibility**: we aim to be transparent about the reproducibility of upstream code, models, and datasets when made available through the Diffusers library.
|
||||
|
||||
- **Responsibility**: as a community and through teamwork, we hold a collective responsibility to our users by anticipating and mitigating this technology's potential risks and dangers.
|
||||
|
||||
|
||||
## Examples of implementations: Safety features and Mechanisms
|
||||
|
||||
The team works daily to make the technical and non-technical tools available to deal with the potential ethical and social risks associated with diffusion technology. Moreover, the community's input is invaluable in ensuring these features' implementation and raising awareness with us.
|
||||
|
||||
- [**Community tab**](https://huggingface.co/docs/hub/repositories-pull-requests-discussions): it enables the community to discuss and better collaborate on a project.
|
||||
|
||||
- **Bias exploration and evaluation**: the Hugging Face team provides a [space](https://huggingface.co/spaces/society-ethics/DiffusionBiasExplorer) to demonstrate the biases in Stable Diffusion interactively. In this sense, we support and encourage bias explorers and evaluations.
|
||||
|
||||
- **Encouraging safety in deployment**
|
||||
|
||||
- [**Safe Stable Diffusion**](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion_safe): It mitigates the well-known issue that models, like Stable Diffusion, that are trained on unfiltered, web-crawled datasets tend to suffer from inappropriate degeneration. Related paper: [Safe Latent Diffusion: Mitigating Inappropriate Degeneration in Diffusion Models](https://arxiv.org/abs/2211.05105).
|
||||
|
||||
- **Staged released on the Hub**: in particularly sensitive situations, access to some repositories should be restricted. This staged release is an intermediary step that allows the repository’s authors to have more control over its use.
|
||||
|
||||
- **Licensing**: [OpenRAILs](https://huggingface.co/blog/open_rail), a new type of licensing, allow us to ensure free access while having a set of restrictions that ensure more responsible use.
|
||||
@@ -1,565 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Evaluating Diffusion Models
|
||||
|
||||
<a target="_blank" href="https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/evaluation.ipynb">
|
||||
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
|
||||
</a>
|
||||
|
||||
Evaluation of generative models like [Stable Diffusion](https://huggingface.co/docs/diffusers/stable_diffusion) is subjective in nature. But as practitioners and researchers, we often have to make careful choices amongst many different possibilities. So, when working with different generative models (like GANs, Diffusion, etc.), how do we choose one over the other?
|
||||
|
||||
Qualitative evaluation of such models can be error-prone and might incorrectly influence a decision.
|
||||
However, quantitative metrics don't necessarily correspond to image quality. So, usually, a combination
|
||||
of both qualitative and quantitative evaluations provides a stronger signal when choosing one model
|
||||
over the other.
|
||||
|
||||
In this document, we provide a non-exhaustive overview of qualitative and quantitative methods to evaluate Diffusion models. For quantitative methods, we specifically focus on how to implement them alongside `diffusers`.
|
||||
|
||||
The methods shown in this document can also be used to evaluate different [noise schedulers](https://huggingface.co/docs/diffusers/main/en/api/schedulers/overview) keeping the underlying generation model fixed.
|
||||
|
||||
## Scenarios
|
||||
|
||||
We cover Diffusion models with the following pipelines:
|
||||
|
||||
- Text-guided image generation (such as the [`StableDiffusionPipeline`](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/text2img)).
|
||||
- Text-guided image generation, additionally conditioned on an input image (such as the [`StableDiffusionImg2ImgPipeline`](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/img2img), and [`StableDiffusionInstructPix2PixPipeline`](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/pix2pix)).
|
||||
- Class-conditioned image generation models (such as the [`DiTPipeline`](https://huggingface.co/docs/diffusers/main/en/api/pipelines/dit)).
|
||||
|
||||
## Qualitative Evaluation
|
||||
|
||||
Qualitative evaluation typically involves human assessment of generated images. Quality is measured across aspects such as compositionality, image-text alignment, and spatial relations. Common prompts provide a degree of uniformity for subjective metrics. DrawBench and PartiPrompts are prompt datasets used for qualitative benchmarking. DrawBench and PartiPrompts were introduced by [Imagen](https://imagen.research.google/) and [Parti](https://parti.research.google/) respectively.
|
||||
|
||||
From the [official Parti website](https://parti.research.google/):
|
||||
|
||||
> PartiPrompts (P2) is a rich set of over 1600 prompts in English that we release as part of this work. P2 can be used to measure model capabilities across various categories and challenge aspects.
|
||||
|
||||

|
||||
|
||||
PartiPrompts has the following columns:
|
||||
|
||||
- Prompt
|
||||
- Category of the prompt (such as “Abstract”, “World Knowledge”, etc.)
|
||||
- Challenge reflecting the difficulty (such as “Basic”, “Complex”, “Writing & Symbols”, etc.)
|
||||
|
||||
These benchmarks allow for side-by-side human evaluation of different image generation models. Let’s see how we can use `diffusers` on a couple of PartiPrompts.
|
||||
|
||||
Below we show some prompts sampled across different challenges: Basic, Complex, Linguistic Structures, Imagination, and Writing & Symbols. Here we are using PartiPrompts as a [dataset](https://huggingface.co/datasets/nateraw/parti-prompts).
|
||||
|
||||
```python
|
||||
from datasets import load_dataset
|
||||
|
||||
# prompts = load_dataset("nateraw/parti-prompts", split="train")
|
||||
# prompts = prompts.shuffle()
|
||||
# sample_prompts = [prompts[i]["Prompt"] for i in range(5)]
|
||||
|
||||
# Fixing these sample prompts in the interest of reproducibility.
|
||||
sample_prompts = [
|
||||
"a corgi",
|
||||
"a hot air balloon with a yin-yang symbol, with the moon visible in the daytime sky",
|
||||
"a car with no windows",
|
||||
"a cube made of porcupine",
|
||||
'The saying "BE EXCELLENT TO EACH OTHER" written on a red brick wall with a graffiti image of a green alien wearing a tuxedo. A yellow fire hydrant is on a sidewalk in the foreground.',
|
||||
]
|
||||
```
|
||||
|
||||
Now we can use these prompts to generate some images using Stable Diffusion ([v1-4 checkpoint](https://huggingface.co/CompVis/stable-diffusion-v1-4)):
|
||||
|
||||
```python
|
||||
import torch
|
||||
|
||||
seed = 0
|
||||
generator = torch.manual_seed(seed)
|
||||
|
||||
images = sd_pipeline(sample_prompts, num_images_per_prompt=1, generator=generator, output_type="numpy").images
|
||||
```
|
||||
|
||||

|
||||
|
||||
We can also set `num_images_per_prompt` accordingly to compare different images for the same prompt. Running the same pipeline but with a different checkpoint ([v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5)), yields:
|
||||
|
||||

|
||||
|
||||
Once several images are generated from all the prompts using multiple models (under evaluation), these results are presented to human evaluators for scoring. For
|
||||
more details on the DrawBench and PartiPrompts benchmarks, refer to their respective papers.
|
||||
|
||||
<Tip>
|
||||
|
||||
It is useful to look at some inference samples while a model is training to measure the
|
||||
training progress. In our [training scripts](https://github.com/huggingface/diffusers/tree/main/examples/), we support this utility with additional support for
|
||||
logging to TensorBoard and Weights & Biases.
|
||||
|
||||
</Tip>
|
||||
|
||||
## Quantitative Evaluation
|
||||
|
||||
In this section, we will walk you through how to evaluate three different diffusion pipelines using:
|
||||
|
||||
- CLIP score
|
||||
- CLIP directional similarity
|
||||
- FID
|
||||
|
||||
### Text-guided image generation
|
||||
|
||||
[CLIP score](https://arxiv.org/abs/2104.08718) measures the compatibility of image-caption pairs. Higher CLIP scores imply higher compatibility 🔼. The CLIP score is a quantitative measurement of the qualitative concept "compatibility". Image-caption pair compatibility can also be thought of as the semantic similarity between the image and the caption. CLIP score was found to have high correlation with human judgement.
|
||||
|
||||
Let's first load a [`StableDiffusionPipeline`]:
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionPipeline
|
||||
import torch
|
||||
|
||||
model_ckpt = "CompVis/stable-diffusion-v1-4"
|
||||
sd_pipeline = StableDiffusionPipeline.from_pretrained(model_ckpt, torch_dtype=torch.float16).to("cuda")
|
||||
```
|
||||
|
||||
Generate some images with multiple prompts:
|
||||
|
||||
```python
|
||||
prompts = [
|
||||
"a photo of an astronaut riding a horse on mars",
|
||||
"A high tech solarpunk utopia in the Amazon rainforest",
|
||||
"A pikachu fine dining with a view to the Eiffel Tower",
|
||||
"A mecha robot in a favela in expressionist style",
|
||||
"an insect robot preparing a delicious meal",
|
||||
"A small cabin on top of a snowy mountain in the style of Disney, artstation",
|
||||
]
|
||||
|
||||
images = sd_pipeline(prompts, num_images_per_prompt=1, output_type="numpy").images
|
||||
|
||||
print(images.shape)
|
||||
# (6, 512, 512, 3)
|
||||
```
|
||||
|
||||
And then, we calculate the CLIP score.
|
||||
|
||||
```python
|
||||
from torchmetrics.functional.multimodal import clip_score
|
||||
from functools import partial
|
||||
|
||||
clip_score_fn = partial(clip_score, model_name_or_path="openai/clip-vit-base-patch16")
|
||||
|
||||
|
||||
def calculate_clip_score(images, prompts):
|
||||
images_int = (images * 255).astype("uint8")
|
||||
clip_score = clip_score_fn(torch.from_numpy(images_int).permute(0, 3, 1, 2), prompts).detach()
|
||||
return round(float(clip_score), 4)
|
||||
|
||||
|
||||
sd_clip_score = calculate_clip_score(images, prompts)
|
||||
print(f"CLIP score: {sd_clip_score}")
|
||||
# CLIP score: 35.7038
|
||||
```
|
||||
|
||||
In the above example, we generated one image per prompt. If we generated multiple images per prompt, we would have to take the average score from the generated images per prompt.
|
||||
|
||||
Now, if we wanted to compare two checkpoints compatible with the [`StableDiffusionPipeline`] we should pass a generator while calling the pipeline. First, we generate images with a
|
||||
fixed seed with the [v1-4 Stable Diffusion checkpoint](https://huggingface.co/CompVis/stable-diffusion-v1-4):
|
||||
|
||||
```python
|
||||
seed = 0
|
||||
generator = torch.manual_seed(seed)
|
||||
|
||||
images = sd_pipeline(prompts, num_images_per_prompt=1, generator=generator, output_type="numpy").images
|
||||
```
|
||||
|
||||
Then we load the [v1-5 checkpoint](https://huggingface.co/runwayml/stable-diffusion-v1-5) to generate images:
|
||||
|
||||
```python
|
||||
model_ckpt_1_5 = "runwayml/stable-diffusion-v1-5"
|
||||
sd_pipeline_1_5 = StableDiffusionPipeline.from_pretrained(model_ckpt_1_5, torch_dtype=weight_dtype).to(device)
|
||||
|
||||
images_1_5 = sd_pipeline_1_5(prompts, num_images_per_prompt=1, generator=generator, output_type="numpy").images
|
||||
```
|
||||
|
||||
And finally, we compare their CLIP scores:
|
||||
|
||||
```python
|
||||
sd_clip_score_1_4 = calculate_clip_score(images, prompts)
|
||||
print(f"CLIP Score with v-1-4: {sd_clip_score_1_4}")
|
||||
# CLIP Score with v-1-4: 34.9102
|
||||
|
||||
sd_clip_score_1_5 = calculate_clip_score(images_1_5, prompts)
|
||||
print(f"CLIP Score with v-1-5: {sd_clip_score_1_5}")
|
||||
# CLIP Score with v-1-5: 36.2137
|
||||
```
|
||||
|
||||
It seems like the [v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) checkpoint performs better than its predecessor. Note, however, that the number of prompts we used to compute the CLIP scores is quite low. For a more practical evaluation, this number should be way higher, and the prompts should be diverse.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
By construction, there are some limitations in this score. The captions in the training dataset
|
||||
were crawled from the web and extracted from `alt` and similar tags associated an image on the internet.
|
||||
They are not necessarily representative of what a human being would use to describe an image. Hence we
|
||||
had to "engineer" some prompts here.
|
||||
|
||||
</Tip>
|
||||
|
||||
### Image-conditioned text-to-image generation
|
||||
|
||||
In this case, we condition the generation pipeline with an input image as well as a text prompt. Let's take the [`StableDiffusionInstructPix2PixPipeline`], as an example. It takes an edit instruction as an input prompt and an input image to be edited.
|
||||
|
||||
Here is one example:
|
||||
|
||||

|
||||
|
||||
One strategy to evaluate such a model is to measure the consistency of the change between the two images (in [CLIP](https://huggingface.co/docs/transformers/model_doc/clip) space) with the change between the two image captions (as shown in [CLIP-Guided Domain Adaptation of Image Generators](https://arxiv.org/abs/2108.00946)). This is referred to as the "**CLIP directional similarity**".
|
||||
|
||||
- Caption 1 corresponds to the input image (image 1) that is to be edited.
|
||||
- Caption 2 corresponds to the edited image (image 2). It should reflect the edit instruction.
|
||||
|
||||
Following is a pictorial overview:
|
||||
|
||||

|
||||
|
||||
We have prepared a mini dataset to implement this metric. Let's first load the dataset.
|
||||
|
||||
```python
|
||||
from datasets import load_dataset
|
||||
|
||||
dataset = load_dataset("sayakpaul/instructpix2pix-demo", split="train")
|
||||
dataset.features
|
||||
```
|
||||
|
||||
```bash
|
||||
{'input': Value(dtype='string', id=None),
|
||||
'edit': Value(dtype='string', id=None),
|
||||
'output': Value(dtype='string', id=None),
|
||||
'image': Image(decode=True, id=None)}
|
||||
```
|
||||
|
||||
Here we have:
|
||||
|
||||
- `input` is a caption corresponding to the `image`.
|
||||
- `edit` denotes the edit instruction.
|
||||
- `output` denotes the modified caption reflecting the `edit` instruction.
|
||||
|
||||
Let's take a look at a sample.
|
||||
|
||||
```python
|
||||
idx = 0
|
||||
print(f"Original caption: {dataset[idx]['input']}")
|
||||
print(f"Edit instruction: {dataset[idx]['edit']}")
|
||||
print(f"Modified caption: {dataset[idx]['output']}")
|
||||
```
|
||||
|
||||
```bash
|
||||
Original caption: 2. FAROE ISLANDS: An archipelago of 18 mountainous isles in the North Atlantic Ocean between Norway and Iceland, the Faroe Islands has 'everything you could hope for', according to Big 7 Travel. It boasts 'crystal clear waterfalls, rocky cliffs that seem to jut out of nowhere and velvety green hills'
|
||||
Edit instruction: make the isles all white marble
|
||||
Modified caption: 2. WHITE MARBLE ISLANDS: An archipelago of 18 mountainous white marble isles in the North Atlantic Ocean between Norway and Iceland, the White Marble Islands has 'everything you could hope for', according to Big 7 Travel. It boasts 'crystal clear waterfalls, rocky cliffs that seem to jut out of nowhere and velvety green hills'
|
||||
```
|
||||
|
||||
And here is the image:
|
||||
|
||||
```python
|
||||
dataset[idx]["image"]
|
||||
```
|
||||
|
||||

|
||||
|
||||
We will first edit the images of our dataset with the edit instruction and compute the directional similarity.
|
||||
|
||||
Let's first load the [`StableDiffusionInstructPix2PixPipeline`]:
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionInstructPix2PixPipeline
|
||||
|
||||
instruct_pix2pix_pipeline = StableDiffusionInstructPix2PixPipeline.from_pretrained(
|
||||
"timbrooks/instruct-pix2pix", torch_dtype=torch.float16
|
||||
).to(device)
|
||||
```
|
||||
|
||||
Now, we perform the edits:
|
||||
|
||||
```python
|
||||
import numpy as np
|
||||
|
||||
|
||||
def edit_image(input_image, instruction):
|
||||
image = instruct_pix2pix_pipeline(
|
||||
instruction,
|
||||
image=input_image,
|
||||
output_type="numpy",
|
||||
generator=generator,
|
||||
).images[0]
|
||||
return image
|
||||
|
||||
|
||||
input_images = []
|
||||
original_captions = []
|
||||
modified_captions = []
|
||||
edited_images = []
|
||||
|
||||
for idx in range(len(dataset)):
|
||||
input_image = dataset[idx]["image"]
|
||||
edit_instruction = dataset[idx]["edit"]
|
||||
edited_image = edit_image(input_image, edit_instruction)
|
||||
|
||||
input_images.append(np.array(input_image))
|
||||
original_captions.append(dataset[idx]["input"])
|
||||
modified_captions.append(dataset[idx]["output"])
|
||||
edited_images.append(edited_image)
|
||||
```
|
||||
|
||||
To measure the directional similarity, we first load CLIP's image and text encoders.
|
||||
|
||||
```python
|
||||
from transformers import (
|
||||
CLIPTokenizer,
|
||||
CLIPTextModelWithProjection,
|
||||
CLIPVisionModelWithProjection,
|
||||
CLIPImageProcessor,
|
||||
)
|
||||
|
||||
clip_id = "openai/clip-vit-large-patch14"
|
||||
tokenizer = CLIPTokenizer.from_pretrained(clip_id)
|
||||
text_encoder = CLIPTextModelWithProjection.from_pretrained(clip_id).to(device)
|
||||
image_processor = CLIPImageProcessor.from_pretrained(clip_id)
|
||||
image_encoder = CLIPVisionModelWithProjection.from_pretrained(clip_id).to(device)
|
||||
```
|
||||
|
||||
Notice that we are using a particular CLIP checkpoint, i.e., `openai/clip-vit-large-patch14`. This is because the Stable Diffusion pre-training was performed with this CLIP variant. For more details, refer to the [documentation](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/pix2pix#diffusers.StableDiffusionInstructPix2PixPipeline.text_encoder).
|
||||
|
||||
Next, we prepare a PyTorch `nn.module` to compute directional similarity:
|
||||
|
||||
```python
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
|
||||
class DirectionalSimilarity(nn.Module):
|
||||
def __init__(self, tokenizer, text_encoder, image_processor, image_encoder):
|
||||
super().__init__()
|
||||
self.tokenizer = tokenizer
|
||||
self.text_encoder = text_encoder
|
||||
self.image_processor = image_processor
|
||||
self.image_encoder = image_encoder
|
||||
|
||||
def preprocess_image(self, image):
|
||||
image = self.image_processor(image, return_tensors="pt")["pixel_values"]
|
||||
return {"pixel_values": image.to(device)}
|
||||
|
||||
def tokenize_text(self, text):
|
||||
inputs = self.tokenizer(
|
||||
text,
|
||||
max_length=self.tokenizer.model_max_length,
|
||||
padding="max_length",
|
||||
truncation=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
return {"input_ids": inputs.input_ids.to(device)}
|
||||
|
||||
def encode_image(self, image):
|
||||
preprocessed_image = self.preprocess_image(image)
|
||||
image_features = self.image_encoder(**preprocessed_image).image_embeds
|
||||
image_features = image_features / image_features.norm(dim=1, keepdim=True)
|
||||
return image_features
|
||||
|
||||
def encode_text(self, text):
|
||||
tokenized_text = self.tokenize_text(text)
|
||||
text_features = self.text_encoder(**tokenized_text).text_embeds
|
||||
text_features = text_features / text_features.norm(dim=1, keepdim=True)
|
||||
return text_features
|
||||
|
||||
def compute_directional_similarity(self, img_feat_one, img_feat_two, text_feat_one, text_feat_two):
|
||||
sim_direction = F.cosine_similarity(img_feat_two - img_feat_one, text_feat_two - text_feat_one)
|
||||
return sim_direction
|
||||
|
||||
def forward(self, image_one, image_two, caption_one, caption_two):
|
||||
img_feat_one = self.encode_image(image_one)
|
||||
img_feat_two = self.encode_image(image_two)
|
||||
text_feat_one = self.encode_text(caption_one)
|
||||
text_feat_two = self.encode_text(caption_two)
|
||||
directional_similarity = self.compute_directional_similarity(
|
||||
img_feat_one, img_feat_two, text_feat_one, text_feat_two
|
||||
)
|
||||
return directional_similarity
|
||||
```
|
||||
|
||||
Let's put `DirectionalSimilarity` to use now.
|
||||
|
||||
```python
|
||||
dir_similarity = DirectionalSimilarity(tokenizer, text_encoder, image_processor, image_encoder)
|
||||
scores = []
|
||||
|
||||
for i in range(len(input_images)):
|
||||
original_image = input_images[i]
|
||||
original_caption = original_captions[i]
|
||||
edited_image = edited_images[i]
|
||||
modified_caption = modified_captions[i]
|
||||
|
||||
similarity_score = dir_similarity(original_image, edited_image, original_caption, modified_caption)
|
||||
scores.append(float(similarity_score.detach().cpu()))
|
||||
|
||||
print(f"CLIP directional similarity: {np.mean(scores)}")
|
||||
# CLIP directional similarity: 0.0797976553440094
|
||||
```
|
||||
|
||||
Like the CLIP Score, the higher the CLIP directional similarity, the better it is.
|
||||
|
||||
It should be noted that the `StableDiffusionInstructPix2PixPipeline` exposes two arguments, namely, `image_guidance_scale` and `guidance_scale` that let you control the quality of the final edited image. We encourage you to experiment with these two arguments and see the impact of that on the directional similarity.
|
||||
|
||||
We can extend the idea of this metric to measure how similar the original image and edited version are. To do that, we can just do `F.cosine_similarity(img_feat_two, img_feat_one)`. For these kinds of edits, we would still want the primary semantics of the images to be preserved as much as possible, i.e., a high similarity score.
|
||||
|
||||
We can use these metrics for similar pipelines such as the[`StableDiffusionPix2PixZeroPipeline`](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/pix2pix_zero#diffusers.StableDiffusionPix2PixZeroPipeline)`.
|
||||
|
||||
<Tip>
|
||||
|
||||
Both CLIP score and CLIP direction similarity rely on the CLIP model, which can make the evaluations biased.
|
||||
|
||||
</Tip>
|
||||
|
||||
***Extending metrics like IS, FID (discussed later), or KID can be difficult*** when the model under evaluation was pre-trained on a large image-captioning dataset (such as the [LAION-5B dataset](https://laion.ai/blog/laion-5b/)). This is because underlying these metrics is an InceptionNet (pre-trained on the ImageNet-1k dataset) used for extracting intermediate image features. The pre-training dataset of Stable Diffusion may have limited overlap with the pre-training dataset of InceptionNet, so it is not a good candidate here for feature extraction.
|
||||
|
||||
***Using the above metrics helps evaluate models that are class-conditioned. For example, [DiT](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/overview). It was pre-trained being conditioned on the ImageNet-1k classes.***
|
||||
|
||||
### Class-conditioned image generation
|
||||
|
||||
Class-conditioned generative models are usually pre-trained on a class-labeled dataset such as [ImageNet-1k](https://huggingface.co/datasets/imagenet-1k). Popular metrics for evaluating these models include Fréchet Inception Distance (FID), Kernel Inception Distance (KID), and Inception Score (IS). In this document, we focus on FID ([Heusel et al.](https://arxiv.org/abs/1706.08500)). We show how to compute it with the [`DiTPipeline`](https://huggingface.co/docs/diffusers/api/pipelines/dit), which uses the [DiT model](https://arxiv.org/abs/2212.09748) under the hood.
|
||||
|
||||
FID aims to measure how similar are two datasets of images. As per [this resource](https://mmgeneration.readthedocs.io/en/latest/quick_run.html#fid):
|
||||
|
||||
> Fréchet Inception Distance is a measure of similarity between two datasets of images. It was shown to correlate well with the human judgment of visual quality and is most often used to evaluate the quality of samples of Generative Adversarial Networks. FID is calculated by computing the Fréchet distance between two Gaussians fitted to feature representations of the Inception network.
|
||||
|
||||
These two datasets are essentially the dataset of real images and the dataset of fake images (generated images in our case). FID is usually calculated with two large datasets. However, for this document, we will work with two mini datasets.
|
||||
|
||||
Let's first download a few images from the ImageNet-1k training set:
|
||||
|
||||
```python
|
||||
from zipfile import ZipFile
|
||||
import requests
|
||||
|
||||
|
||||
def download(url, local_filepath):
|
||||
r = requests.get(url)
|
||||
with open(local_filepath, "wb") as f:
|
||||
f.write(r.content)
|
||||
return local_filepath
|
||||
|
||||
|
||||
dummy_dataset_url = "https://hf.co/datasets/sayakpaul/sample-datasets/resolve/main/sample-imagenet-images.zip"
|
||||
local_filepath = download(dummy_dataset_url, dummy_dataset_url.split("/")[-1])
|
||||
|
||||
with ZipFile(local_filepath, "r") as zipper:
|
||||
zipper.extractall(".")
|
||||
```
|
||||
|
||||
```python
|
||||
from PIL import Image
|
||||
import os
|
||||
|
||||
dataset_path = "sample-imagenet-images"
|
||||
image_paths = sorted([os.path.join(dataset_path, x) for x in os.listdir(dataset_path)])
|
||||
|
||||
real_images = [np.array(Image.open(path).convert("RGB")) for path in image_paths]
|
||||
```
|
||||
|
||||
These are 10 images from the following Imagenet-1k classes: "cassette_player", "chain_saw" (x2), "church", "gas_pump" (x3), "parachute" (x2), and "tench".
|
||||
|
||||
<p align="center">
|
||||
<img src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/evaluation_diffusion_models/real-images.png" alt="real-images"><br>
|
||||
<em>Real images.</em>
|
||||
</p>
|
||||
|
||||
Now that the images are loaded, let's apply some lightweight pre-processing on them to use them for FID calculation.
|
||||
|
||||
```python
|
||||
from torchvision.transforms import functional as F
|
||||
|
||||
|
||||
def preprocess_image(image):
|
||||
image = torch.tensor(image).unsqueeze(0)
|
||||
image = image.permute(0, 3, 1, 2) / 255.0
|
||||
return F.center_crop(image, (256, 256))
|
||||
|
||||
|
||||
real_images = torch.cat([preprocess_image(image) for image in real_images])
|
||||
print(real_images.shape)
|
||||
# torch.Size([10, 3, 256, 256])
|
||||
```
|
||||
|
||||
We now load the [`DiTPipeline`](https://huggingface.co/docs/diffusers/api/pipelines/dit) to generate images conditioned on the above-mentioned classes.
|
||||
|
||||
```python
|
||||
from diffusers import DiTPipeline, DPMSolverMultistepScheduler
|
||||
|
||||
dit_pipeline = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256", torch_dtype=torch.float16)
|
||||
dit_pipeline.scheduler = DPMSolverMultistepScheduler.from_config(dit_pipeline.scheduler.config)
|
||||
dit_pipeline = dit_pipeline.to("cuda")
|
||||
|
||||
words = [
|
||||
"cassette player",
|
||||
"chainsaw",
|
||||
"chainsaw",
|
||||
"church",
|
||||
"gas pump",
|
||||
"gas pump",
|
||||
"gas pump",
|
||||
"parachute",
|
||||
"parachute",
|
||||
"tench",
|
||||
]
|
||||
|
||||
class_ids = dit_pipeline.get_label_ids(words)
|
||||
output = dit_pipeline(class_labels=class_ids, generator=generator, output_type="numpy")
|
||||
|
||||
fake_images = output.images
|
||||
fake_images = torch.tensor(fake_images)
|
||||
fake_images = fake_images.permute(0, 3, 1, 2)
|
||||
print(fake_images.shape)
|
||||
# torch.Size([10, 3, 256, 256])
|
||||
```
|
||||
|
||||
Now, we can compute the FID using [`torchmetrics`](https://torchmetrics.readthedocs.io/).
|
||||
|
||||
```python
|
||||
from torchmetrics.image.fid import FrechetInceptionDistance
|
||||
|
||||
fid = FrechetInceptionDistance(normalize=True)
|
||||
fid.update(real_images, real=True)
|
||||
fid.update(fake_images, real=False)
|
||||
|
||||
print(f"FID: {float(fid.compute())}")
|
||||
# FID: 177.7147216796875
|
||||
```
|
||||
|
||||
The lower the FID, the better it is. Several things can influence FID here:
|
||||
|
||||
- Number of images (both real and fake)
|
||||
- Randomness induced in the diffusion process
|
||||
- Number of inference steps in the diffusion process
|
||||
- The scheduler being used in the diffusion process
|
||||
|
||||
For the last two points, it is, therefore, a good practice to run the evaluation across different seeds and inference steps, and then report an average result.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
FID results tend to be fragile as they depend on a lot of factors:
|
||||
|
||||
* The specific Inception model used during computation.
|
||||
* The implementation accuracy of the computation.
|
||||
* The image format (not the same if we start from PNGs vs JPGs).
|
||||
|
||||
Keeping that in mind, FID is often most useful when comparing similar runs, but it is
|
||||
hard to to reproduce paper results unless the authors carefully disclose the FID
|
||||
measurement code.
|
||||
|
||||
These points apply to other related metrics too, such as KID and IS.
|
||||
|
||||
</Tip>
|
||||
|
||||
As a final step, let's visually inspect the `fake_images`.
|
||||
|
||||
<p align="center">
|
||||
<img src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/evaluation_diffusion_models/fake-images.png" alt="fake-images"><br>
|
||||
<em>Fake images.</em>
|
||||
</p>
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -12,99 +12,6 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# Philosophy
|
||||
|
||||
🧨 Diffusers provides **state-of-the-art** pretrained diffusion models across multiple modalities.
|
||||
Its purpose is to serve as a **modular toolbox** for both inference and training.
|
||||
|
||||
We aim at building a library that stands the test of time and therefore take API design very seriously.
|
||||
|
||||
In a nutshell, Diffusers is built to be a natural extension of PyTorch. Therefore, most of our design choices are based on [PyTorch's Design Principles](https://pytorch.org/docs/stable/community/design.html#pytorch-design-philosophy). Let's go over the most important ones:
|
||||
|
||||
## Usability over Performance
|
||||
|
||||
- While Diffusers has many built-in performance-enhancing features (see [Memory and Speed](https://huggingface.co/docs/diffusers/optimization/fp16)), models are always loaded with the highest precision and lowest optimization. Therefore, by default diffusion pipelines are always instantiated on CPU with float32 precision if not otherwise defined by the user. This ensures usability across different platforms and accelerators and means that no complex installations are required to run the library.
|
||||
- Diffusers aim at being a **light-weight** package and therefore has very few required dependencies, but many soft dependencies that can improve performance (such as `accelerate`, `safetensors`, `onnx`, etc...). We strive to keep the library as lightweight as possible so that it can be added without much concern as a dependency on other packages.
|
||||
- Diffusers prefers simple, self-explainable code over condensed, magic code. This means that short-hand code syntaxes such as lambda functions, and advanced PyTorch operators are often not desired.
|
||||
|
||||
## Simple over easy
|
||||
|
||||
As PyTorch states, **explicit is better than implicit** and **simple is better than complex**. This design philosophy is reflected in multiple parts of the library:
|
||||
- We follow PyTorch's API with methods like [`DiffusionPipeline.to`](https://huggingface.co/docs/diffusers/main/en/api/diffusion_pipeline#diffusers.DiffusionPipeline.to) to let the user handle device management.
|
||||
- Raising concise error messages is preferred to silently correct erroneous input. Diffusers aims at teaching the user, rather than making the library as easy to use as possible.
|
||||
- Complex model vs. scheduler logic is exposed instead of magically handled inside. Schedulers/Samplers are separated from diffusion models with minimal dependencies on each other. This forces the user to write the unrolled denoising loop. However, the separation allows for easier debugging and gives the user more control over adapting the denoising process or switching out diffusion models or schedulers.
|
||||
- Separately trained components of the diffusion pipeline, *e.g.* the text encoder, the unet, and the variational autoencoder, each have their own model class. This forces the user to handle the interaction between the different model components, and the serialization format separates the model components into different files. However, this allows for easier debugging and customization. Dreambooth or textual inversion training
|
||||
is very simple thanks to diffusers' ability to separate single components of the diffusion pipeline.
|
||||
|
||||
## Tweakable, contributor-friendly over abstraction
|
||||
|
||||
For large parts of the library, Diffusers adopts an important design principle of the [Transformers library](https://github.com/huggingface/transformers), which is to prefer copy-pasted code over hasty abstractions. This design principle is very opinionated and stands in stark contrast to popular design principles such as [Don't repeat yourself (DRY)](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself).
|
||||
In short, just like Transformers does for modeling files, diffusers prefers to keep an extremely low level of abstraction and very self-contained code for pipelines and schedulers.
|
||||
Functions, long code blocks, and even classes can be copied across multiple files which at first can look like a bad, sloppy design choice that makes the library unmaintainable.
|
||||
**However**, this design has proven to be extremely successful for Transformers and makes a lot of sense for community-driven, open-source machine learning libraries because:
|
||||
- Machine Learning is an extremely fast-moving field in which paradigms, model architectures, and algorithms are changing rapidly, which therefore makes it very difficult to define long-lasting code abstractions.
|
||||
- Machine Learning practitioners like to be able to quickly tweak existing code for ideation and research and therefore prefer self-contained code over one that contains many abstractions.
|
||||
- Open-source libraries rely on community contributions and therefore must build a library that is easy to contribute to. The more abstract the code, the more dependencies, the harder to read, and the harder to contribute to. Contributors simply stop contributing to very abstract libraries out of fear of breaking vital functionality. If contributing to a library cannot break other fundamental code, not only is it more inviting for potential new contributors, but it is also easier to review and contribute to multiple parts in parallel.
|
||||
|
||||
At Hugging Face, we call this design the **single-file policy** which means that almost all of the code of a certain class should be written in a single, self-contained file. To read more about the philosophy, you can have a look
|
||||
at [this blog post](https://huggingface.co/blog/transformers-design-philosophy).
|
||||
|
||||
In diffusers, we follow this philosophy for both pipelines and schedulers, but only partly for diffusion models. The reason we don't follow this design fully for diffusion models is because almost all diffusion pipelines, such
|
||||
as [DDPM](https://huggingface.co/docs/diffusers/v0.12.0/en/api/pipelines/ddpm), [Stable Diffusion](https://huggingface.co/docs/diffusers/v0.12.0/en/api/pipelines/stable_diffusion/overview#stable-diffusion-pipelines), [UnCLIP (Dalle-2)](https://huggingface.co/docs/diffusers/v0.12.0/en/api/pipelines/unclip#overview) and [Imagen](https://imagen.research.google/) all rely on the same diffusion model, the [UNet](https://huggingface.co/docs/diffusers/api/models#diffusers.UNet2DConditionModel).
|
||||
|
||||
Great, now you should have generally understood why 🧨 Diffusers is designed the way it is 🤗.
|
||||
We try to apply these design principles consistently across the library. Nevertheless, there are some minor exceptions to the philosophy or some unlucky design choices. If you have feedback regarding the design, we would ❤️ to hear it [directly on GitHub](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=&template=feedback.md&title=).
|
||||
|
||||
## Design Philosophy in Details
|
||||
|
||||
Now, let's look a bit into the nitty-gritty details of the design philosophy. Diffusers essentially consist of three major classes, [pipelines](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines), [models](https://github.com/huggingface/diffusers/tree/main/src/diffusers/models), and [schedulers](https://github.com/huggingface/diffusers/tree/main/src/diffusers/schedulers).
|
||||
Let's walk through more in-detail design decisions for each class.
|
||||
|
||||
### Pipelines
|
||||
|
||||
Pipelines are designed to be easy to use (therefore do not follow [*Simple over easy*](#simple-over-easy) 100%)), are not feature complete, and should loosely be seen as examples of how to use [models](#models) and [schedulers](#schedulers) for inference.
|
||||
|
||||
The following design principles are followed:
|
||||
- Pipelines follow the single-file policy. All pipelines can be found in individual directories under src/diffusers/pipelines. One pipeline folder corresponds to one diffusion paper/project/release. Multiple pipeline files can be gathered in one pipeline folder, as it’s done for [`src/diffusers/pipelines/stable-diffusion`](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/stable_diffusion). If pipelines share similar functionality, one can make use of the [#Copied from mechanism](https://github.com/huggingface/diffusers/blob/125d783076e5bd9785beb05367a2d2566843a271/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py#L251).
|
||||
- Pipelines all inherit from [`DiffusionPipeline`]
|
||||
- Every pipeline consists of different model and scheduler components, that are documented in the [`model_index.json` file](https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/model_index.json), are accessible under the same name as attributes of the pipeline and can be shared between pipelines with [`DiffusionPipeline.components`](https://huggingface.co/docs/diffusers/main/en/api/diffusion_pipeline#diffusers.DiffusionPipeline.components) function.
|
||||
- Every pipeline should be loadable via the [`DiffusionPipeline.from_pretrained`](https://huggingface.co/docs/diffusers/main/en/api/diffusion_pipeline#diffusers.DiffusionPipeline.from_pretrained) function.
|
||||
- Pipelines should be used **only** for inference.
|
||||
- Pipelines should be very readable, self-explanatory, and easy to tweak.
|
||||
- Pipelines should be designed to build on top of each other and be easy to integrate into higher-level APIs.
|
||||
- Pipelines are **not** intended to be feature-complete user interfaces. For future complete user interfaces one should rather have a look at [InvokeAI](https://github.com/invoke-ai/InvokeAI), [Diffuzers](https://github.com/abhishekkrthakur/diffuzers), and [lama-cleaner](https://github.com/Sanster/lama-cleaner)
|
||||
- Every pipeline should have one and only one way to run it via a `__call__` method. The naming of the `__call__` arguments should be shared across all pipelines.
|
||||
- Pipelines should be named after the task they are intended to solve.
|
||||
- In almost all cases, novel diffusion pipelines shall be implemented in a new pipeline folder/file.
|
||||
|
||||
### Models
|
||||
|
||||
Models are designed as configurable toolboxes that are natural extensions of [PyTorch's Module class](https://pytorch.org/docs/stable/generated/torch.nn.Module.html). They only partly follow the **single-file policy**.
|
||||
|
||||
The following design principles are followed:
|
||||
- Models correspond to **a type of model architecture**. *E.g.* the [`UNet2DConditionModel`] class is used for all UNet variations that expect 2D image inputs and are conditioned on some context.
|
||||
- All models can be found in [`src/diffusers/models`](https://github.com/huggingface/diffusers/tree/main/src/diffusers/models) and every model architecture shall be defined in its file, e.g. [`unet_2d_condition.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unet_2d_condition.py), [`transformer_2d.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/transformer_2d.py), etc...
|
||||
- Models **do not** follow the single-file policy and should make use of smaller model building blocks, such as [`attention.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention.py), [`resnet.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/resnet.py), [`embeddings.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/embeddings.py), etc... **Note**: This is in stark contrast to Transformers' modeling files and shows that models do not really follow the single-file policy.
|
||||
- Models intend to expose complexity, just like PyTorch's module does, and give clear error messages.
|
||||
- Models all inherit from `ModelMixin` and `ConfigMixin`.
|
||||
- Models can be optimized for performance when it doesn’t demand major code changes, keeps backward compatibility, and gives significant memory or compute gain.
|
||||
- Models should by default have the highest precision and lowest performance setting.
|
||||
- To integrate new model checkpoints whose general architecture can be classified as an architecture that already exists in Diffusers, the existing model architecture shall be adapted to make it work with the new checkpoint. One should only create a new file if the model architecture is fundamentally different.
|
||||
- Models should be designed to be easily extendable to future changes. This can be achieved by limiting public function arguments, configuration arguments, and "foreseeing" future changes, *e.g.* it is usually better to add `string` "...type" arguments that can easily be extended to new future types instead of boolean `is_..._type` arguments. Only the minimum amount of changes shall be made to existing architectures to make a new model checkpoint work.
|
||||
- The model design is a difficult trade-off between keeping code readable and concise and supporting many model checkpoints. For most parts of the modeling code, classes shall be adapted for new model checkpoints, while there are some exceptions where it is preferred to add new classes to make sure the code is kept concise and
|
||||
readable longterm, such as [UNet blocks](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unet_2d_blocks.py) and [Attention processors](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
|
||||
|
||||
### Schedulers
|
||||
|
||||
Schedulers are responsible to guide the denoising process for inference as well as to define a noise schedule for training. They are designed as individual classes with loadable configuration files and strongly follow the **single-file policy**.
|
||||
|
||||
The following design principles are followed:
|
||||
- All schedulers are found in [`src/diffusers/schedulers`](https://github.com/huggingface/diffusers/tree/main/src/diffusers/schedulers).
|
||||
- Schedulers are **not** allowed to import from large utils files and shall be kept very self-contained.
|
||||
- One scheduler python file corresponds to one scheduler algorithm (as might be defined in a paper).
|
||||
- If schedulers share similar functionalities, we can make use of the `#Copied from` mechanism.
|
||||
- Schedulers all inherit from `SchedulerMixin` and `ConfigMixin`.
|
||||
- Schedulers can be easily swapped out with the [`ConfigMixin.from_config`](https://huggingface.co/docs/diffusers/main/en/api/configuration#diffusers.ConfigMixin.from_config) method as explained in detail [here](./using-diffusers/schedulers.mdx).
|
||||
- Every scheduler has to have a `set_num_inference_steps`, and a `step` function. `set_num_inference_steps(...)` has to be called before every denoising process, *i.e.* before `step(...)` is called.
|
||||
- Every scheduler exposes the timesteps to be "looped over" via a `timesteps` attribute, which is an array of timesteps the model will be called upon
|
||||
- The `step(...)` function takes a predicted model output and the "current" sample (x_t) and returns the "previous", slightly more denoised sample (x_t-1).
|
||||
- Given the complexity of diffusion schedulers, the `step` function does not expose all the complexity and can be a bit of a "black box".
|
||||
- In almost all cases, novel schedulers shall be implemented in a new scheduling file.
|
||||
- Readability and clarity are preferred over highly optimized code. A strong importance is put on providing readable, intuitive and elementary code design. *E.g.*, the provided [schedulers](https://github.com/huggingface/diffusers/tree/main/src/diffusers/schedulers) are separated from the provided [models](https://github.com/huggingface/diffusers/tree/main/src/diffusers/models) and use well-commented code that can be read alongside the original paper.
|
||||
- Diffusers is **modality independent** and focuses on providing pretrained models and tools to build systems that generate **continuous outputs**, *e.g.* vision and audio. This is one of the guiding goals even if the initial pipelines are devoted to vision tasks.
|
||||
- Diffusion models and schedulers are provided as concise, elementary building blocks. In contrast, diffusion pipelines are a collection of end-to-end diffusion systems that can be used out-of-the-box, should stay as close as possible to their original implementations and can include components of other libraries, such as text encoders. Examples of diffusion pipelines are [Glide](https://github.com/openai/glide-text2im), [Latent Diffusion](https://github.com/CompVis/latent-diffusion) and [Stable Diffusion](https://github.com/compvis/stable-diffusion).
|
||||
|
||||
+41
-68
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -16,76 +16,49 @@ specific language governing permissions and limitations under the License.
|
||||
<br>
|
||||
</p>
|
||||
|
||||
# Diffusers
|
||||
# 🧨 Diffusers
|
||||
|
||||
🤗 Diffusers is the go-to library for state-of-the-art pretrained diffusion models for generating images, audio, and even 3D structures of molecules. Whether you're looking for a simple inference solution or want to train your own diffusion model, 🤗 Diffusers is a modular toolbox that supports both. Our library is designed with a focus on [usability over performance](conceptual/philosophy#usability-over-performance), [simple over easy](conceptual/philosophy#simple-over-easy), and [customizability over abstractions](conceptual/philosophy#tweakable-contributorfriendly-over-abstraction).
|
||||
🤗 Diffusers provides pretrained vision and audio diffusion models, and serves as a modular toolbox for inference and training.
|
||||
|
||||
The library has three main components:
|
||||
More precisely, 🤗 Diffusers offers:
|
||||
|
||||
- State-of-the-art [diffusion pipelines](api/pipelines/overview) for inference with just a few lines of code.
|
||||
- Interchangeable [noise schedulers](api/schedulers/overview) for balancing trade-offs between generation speed and quality.
|
||||
- Pretrained [models](api/models) that can be used as building blocks, and combined with schedulers, for creating your own end-to-end diffusion systems.
|
||||
- State-of-the-art diffusion pipelines that can be run in inference with just a couple of lines of code (see [**Using Diffusers**](./using-diffusers/conditional_image_generation)) or have a look at [**Pipelines**](#pipelines) to get an overview of all supported pipelines and their corresponding papers.
|
||||
- Various noise schedulers that can be used interchangeably for the preferred speed vs. quality trade-off in inference. For more information see [**Schedulers**](./api/schedulers/overview).
|
||||
- Multiple types of models, such as UNet, can be used as building blocks in an end-to-end diffusion system. See [**Models**](./api/models) for more details
|
||||
- Training examples to show how to train the most popular diffusion model tasks. For more information see [**Training**](./training/overview).
|
||||
|
||||
<div class="mt-10">
|
||||
<div class="w-full flex flex-col space-y-4 md:space-y-0 md:grid md:grid-cols-2 md:gap-y-4 md:gap-x-5">
|
||||
<a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="./tutorials/tutorial_overview"
|
||||
><div class="w-full text-center bg-gradient-to-br from-blue-400 to-blue-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">Tutorials</div>
|
||||
<p class="text-gray-700">Learn the fundamental skills you need to start generating outputs, build your own diffusion system, and train a diffusion model. We recommend starting here if you're using 🤗 Diffusers for the first time!</p>
|
||||
</a>
|
||||
<a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="./using-diffusers/loading_overview"
|
||||
><div class="w-full text-center bg-gradient-to-br from-indigo-400 to-indigo-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">How-to guides</div>
|
||||
<p class="text-gray-700">Practical guides for helping you load pipelines, models, and schedulers. You'll also learn how to use pipelines for specific tasks, control how outputs are generated, optimize for inference speed, and different training techniques.</p>
|
||||
</a>
|
||||
<a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="./conceptual/philosophy"
|
||||
><div class="w-full text-center bg-gradient-to-br from-pink-400 to-pink-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">Conceptual guides</div>
|
||||
<p class="text-gray-700">Understand why the library was designed the way it was, and learn more about the ethical guidelines and safety implementations for using the library.</p>
|
||||
</a>
|
||||
<a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="./api/models"
|
||||
><div class="w-full text-center bg-gradient-to-br from-purple-400 to-purple-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">Reference</div>
|
||||
<p class="text-gray-700">Technical descriptions of how 🤗 Diffusers classes and methods work.</p>
|
||||
</a>
|
||||
</div>
|
||||
</div>
|
||||
## 🧨 Diffusers Pipelines
|
||||
|
||||
## Supported pipelines
|
||||
The following table summarizes all officially supported pipelines, their corresponding paper, and if
|
||||
available a colab notebook to directly try them out.
|
||||
|
||||
| Pipeline | Paper/Repository | Tasks |
|
||||
|---|---|:---:|
|
||||
| [alt_diffusion](./api/pipelines/alt_diffusion) | [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) | Image-to-Image Text-Guided Generation |
|
||||
| [audio_diffusion](./api/pipelines/audio_diffusion) | [Audio Diffusion](https://github.com/teticio/audio-diffusion.git) | Unconditional Audio Generation |
|
||||
| [controlnet](./api/pipelines/stable_diffusion/controlnet) | [Adding Conditional Control to Text-to-Image Diffusion Models](https://arxiv.org/abs/2302.05543) | Image-to-Image Text-Guided Generation |
|
||||
| [cycle_diffusion](./api/pipelines/cycle_diffusion) | [Unifying Diffusion Models' Latent Space, with Applications to CycleDiffusion and Guidance](https://arxiv.org/abs/2210.05559) | Image-to-Image Text-Guided Generation |
|
||||
| [dance_diffusion](./api/pipelines/dance_diffusion) | [Dance Diffusion](https://github.com/williamberman/diffusers.git) | Unconditional Audio Generation |
|
||||
| [ddpm](./api/pipelines/ddpm) | [Denoising Diffusion Probabilistic Models](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation |
|
||||
| [ddim](./api/pipelines/ddim) | [Denoising Diffusion Implicit Models](https://arxiv.org/abs/2010.02502) | Unconditional Image Generation |
|
||||
| [latent_diffusion](./api/pipelines/latent_diffusion) | [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752)| Text-to-Image Generation |
|
||||
| [latent_diffusion](./api/pipelines/latent_diffusion) | [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752)| Super Resolution Image-to-Image |
|
||||
| [latent_diffusion_uncond](./api/pipelines/latent_diffusion_uncond) | [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) | Unconditional Image Generation |
|
||||
| [paint_by_example](./api/pipelines/paint_by_example) | [Paint by Example: Exemplar-based Image Editing with Diffusion Models](https://arxiv.org/abs/2211.13227) | Image-Guided Image Inpainting |
|
||||
| [pndm](./api/pipelines/pndm) | [Pseudo Numerical Methods for Diffusion Models on Manifolds](https://arxiv.org/abs/2202.09778) | Unconditional Image Generation |
|
||||
| [score_sde_ve](./api/pipelines/score_sde_ve) | [Score-Based Generative Modeling through Stochastic Differential Equations](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation |
|
||||
| [score_sde_vp](./api/pipelines/score_sde_vp) | [Score-Based Generative Modeling through Stochastic Differential Equations](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation |
|
||||
| [semantic_stable_diffusion](./api/pipelines/semantic_stable_diffusion) | [Semantic Guidance](https://arxiv.org/abs/2301.12247) | Text-Guided Generation |
|
||||
| [stable_diffusion_text2img](./api/pipelines/stable_diffusion/text2img) | [Stable Diffusion](https://stability.ai/blog/stable-diffusion-public-release) | Text-to-Image Generation |
|
||||
| [stable_diffusion_img2img](./api/pipelines/stable_diffusion/img2img) | [Stable Diffusion](https://stability.ai/blog/stable-diffusion-public-release) | Image-to-Image Text-Guided Generation |
|
||||
| [stable_diffusion_inpaint](./api/pipelines/stable_diffusion/inpaint) | [Stable Diffusion](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting |
|
||||
| [stable_diffusion_panorama](./api/pipelines/stable_diffusion/panorama) | [MultiDiffusion](https://multidiffusion.github.io/) | Text-to-Panorama Generation |
|
||||
| [stable_diffusion_pix2pix](./api/pipelines/stable_diffusion/pix2pix) | [InstructPix2Pix: Learning to Follow Image Editing Instructions](https://arxiv.org/abs/2211.09800) | Text-Guided Image Editing|
|
||||
| [stable_diffusion_pix2pix_zero](./api/pipelines/stable_diffusion/pix2pix_zero) | [Zero-shot Image-to-Image Translation](https://pix2pixzero.github.io/) | Text-Guided Image Editing |
|
||||
| [stable_diffusion_attend_and_excite](./api/pipelines/stable_diffusion/attend_and_excite) | [Attend-and-Excite: Attention-Based Semantic Guidance for Text-to-Image Diffusion Models](https://arxiv.org/abs/2301.13826) | Text-to-Image Generation |
|
||||
| [stable_diffusion_self_attention_guidance](./api/pipelines/stable_diffusion/self_attention_guidance) | [Improving Sample Quality of Diffusion Models Using Self-Attention Guidance](https://arxiv.org/abs/2210.00939) | Text-to-Image Generation |
|
||||
| [stable_diffusion_image_variation](./stable_diffusion/image_variation) | [Stable Diffusion Image Variations](https://github.com/LambdaLabsML/lambda-diffusers#stable-diffusion-image-variations) | Image-to-Image Generation |
|
||||
| [stable_diffusion_latent_upscale](./stable_diffusion/latent_upscale) | [Stable Diffusion Latent Upscaler](https://twitter.com/StabilityAI/status/1590531958815064065) | Text-Guided Super Resolution Image-to-Image |
|
||||
| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [Stable Diffusion 2](https://stability.ai/blog/stable-diffusion-v2-release) | Text-to-Image Generation |
|
||||
| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [Stable Diffusion 2](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Image Inpainting |
|
||||
| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [Depth-Conditional Stable Diffusion](https://github.com/Stability-AI/stablediffusion#depth-conditional-stable-diffusion) | Depth-to-Image Generation |
|
||||
| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [Stable Diffusion 2](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Super Resolution Image-to-Image |
|
||||
| [stable_diffusion_safe](./api/pipelines/stable_diffusion_safe) | [Safe Stable Diffusion](https://arxiv.org/abs/2211.05105) | Text-Guided Generation |
|
||||
| [stable_unclip](./stable_unclip) | Stable unCLIP | Text-to-Image Generation |
|
||||
| [stable_unclip](./stable_unclip) | Stable unCLIP | Image-to-Image Text-Guided Generation |
|
||||
| [stochastic_karras_ve](./api/pipelines/stochastic_karras_ve) | [Elucidating the Design Space of Diffusion-Based Generative Models](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation |
|
||||
| [unclip](./api/pipelines/unclip) | [Hierarchical Text-Conditional Image Generation with CLIP Latents](https://arxiv.org/abs/2204.06125)(implementation by [kakaobrain](https://github.com/kakaobrain/karlo)) | Text-to-Image Generation |
|
||||
| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Text-to-Image Generation |
|
||||
| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Image Variations Generation |
|
||||
| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Dual Image and Text Guided Generation |
|
||||
| [vq_diffusion](./api/pipelines/vq_diffusion) | [Vector Quantized Diffusion Model for Text-to-Image Synthesis](https://arxiv.org/abs/2111.14822) | Text-to-Image Generation |
|
||||
| Pipeline | Paper | Tasks | Colab
|
||||
|---|---|:---:|:---:|
|
||||
| [alt_diffusion](./api/pipelines/alt_diffusion) | [**AltDiffusion**](https://arxiv.org/abs/2211.06679) | Image-to-Image Text-Guided Generation |
|
||||
| [audio_diffusion](./api/pipelines/audio_diffusion) | [**Audio Diffusion**](https://github.com/teticio/audio-diffusion.git) | Unconditional Audio Generation | [](https://colab.research.google.com/github/teticio/audio-diffusion/blob/master/notebooks/audio_diffusion_pipeline.ipynb)
|
||||
| [cycle_diffusion](./api/pipelines/cycle_diffusion) | [**Cycle Diffusion**](https://arxiv.org/abs/2210.05559) | Image-to-Image Text-Guided Generation |
|
||||
| [dance_diffusion](./api/pipelines/dance_diffusion) | [**Dance Diffusion**](https://github.com/williamberman/diffusers.git) | Unconditional Audio Generation |
|
||||
| [ddpm](./api/pipelines/ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation |
|
||||
| [ddim](./api/pipelines/ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) | Unconditional Image Generation |
|
||||
| [latent_diffusion](./api/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Text-to-Image Generation |
|
||||
| [latent_diffusion](./api/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Super Resolution Image-to-Image |
|
||||
| [latent_diffusion_uncond](./api/pipelines/latent_diffusion_uncond) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | Unconditional Image Generation |
|
||||
| [paint_by_example](./api/pipelines/paint_by_example) | [**Paint by Example: Exemplar-based Image Editing with Diffusion Models**](https://arxiv.org/abs/2211.13227) | Image-Guided Image Inpainting |
|
||||
| [pndm](./api/pipelines/pndm) | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778) | Unconditional Image Generation |
|
||||
| [score_sde_ve](./api/pipelines/score_sde_ve) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation |
|
||||
| [score_sde_vp](./api/pipelines/score_sde_vp) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation |
|
||||
| [stable_diffusion](./api/pipelines/stable_diffusion/text2img) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-to-Image Generation | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
|
||||
| [stable_diffusion](./api/pipelines/stable_diffusion/img2img) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Image-to-Image Text-Guided Generation | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
|
||||
| [stable_diffusion](./api/pipelines/stable_diffusion/inpaint) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
|
||||
| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-to-Image Generation |
|
||||
| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Image Inpainting |
|
||||
| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Super Resolution Image-to-Image |
|
||||
| [stable_diffusion_safe](./api/pipelines/stable_diffusion_safe) | [**Safe Stable Diffusion**](https://arxiv.org/abs/2211.05105) | Text-Guided Generation | [](https://colab.research.google.com/github/ml-research/safe-latent-diffusion/blob/main/examples/Safe%20Latent%20Diffusion.ipynb)
|
||||
| [stochastic_karras_ve](./api/pipelines/stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation |
|
||||
| [unclip](./api/pipelines/unclip) | [Hierarchical Text-Conditional Image Generation with CLIP Latents](https://arxiv.org/abs/2204.06125) | Text-to-Image Generation |
|
||||
| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Text-to-Image Generation |
|
||||
| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Image Variations Generation |
|
||||
| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Dual Image and Text Guided Generation |
|
||||
| [vq_diffusion](./api/pipelines/vq_diffusion) | [Vector Quantized Diffusion Model for Text-to-Image Synthesis](https://arxiv.org/abs/2111.14822) | Text-to-Image Generation |
|
||||
|
||||
**Note**: Pipelines are simple examples of how to play around with the diffusion systems as described in the corresponding papers.
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -20,6 +20,7 @@ We'll discuss how the following settings impact performance and memory.
|
||||
| ---------------- | ------- | ------- |
|
||||
| original | 9.50s | x1 |
|
||||
| cuDNN auto-tuner | 9.37s | x1.01 |
|
||||
| autocast (fp16) | 5.47s | x1.74 |
|
||||
| fp16 | 3.61s | x2.63 |
|
||||
| channels last | 3.30s | x2.88 |
|
||||
| traced UNet | 3.21s | x2.96 |
|
||||
@@ -53,9 +54,27 @@ import torch
|
||||
torch.backends.cuda.matmul.allow_tf32 = True
|
||||
```
|
||||
|
||||
## Automatic mixed precision (AMP)
|
||||
|
||||
If you use a CUDA GPU, you can take advantage of `torch.autocast` to perform inference roughly twice as fast at the cost of slightly lower precision. All you need to do is put your inference call inside an `autocast` context manager. The following example shows how to do it using Stable Diffusion text-to-image generation as an example:
|
||||
|
||||
```Python
|
||||
from torch import autocast
|
||||
from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
|
||||
pipe = pipe.to("cuda")
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
with autocast("cuda"):
|
||||
image = pipe(prompt).images[0]
|
||||
```
|
||||
|
||||
Despite the precision loss, in our experience the final image results look the same as the `float32` versions. Feel free to experiment and report back!
|
||||
|
||||
## Half precision weights
|
||||
|
||||
To save more GPU memory and get more speed, you can load and run the model weights directly in half precision. This involves loading the float16 version of the weights, which was saved to a branch named `fp16`, and telling PyTorch to use the `float16` type when loading them:
|
||||
To save more GPU memory and get even more speed, you can load and run the model weights directly in half precision. This involves loading the float16 version of the weights, which was saved to a branch named `fp16`, and telling PyTorch to use the `float16` type when loading them:
|
||||
|
||||
```Python
|
||||
pipe = StableDiffusionPipeline.from_pretrained(
|
||||
@@ -69,11 +88,6 @@ prompt = "a photo of an astronaut riding a horse on mars"
|
||||
image = pipe(prompt).images[0]
|
||||
```
|
||||
|
||||
<Tip warning={true}>
|
||||
It is strongly discouraged to make use of [`torch.autocast`](https://pytorch.org/docs/stable/amp.html#torch.autocast) in any of the pipelines as it can lead to black images and is always slower than using pure
|
||||
float16 precision.
|
||||
</Tip>
|
||||
|
||||
## Sliced attention for additional memory savings
|
||||
|
||||
For even additional memory savings, you can use a sliced version of attention that performs the computation in steps instead of all at once.
|
||||
@@ -133,38 +147,9 @@ images = pipe([prompt] * 32).images
|
||||
You may see a small performance boost in VAE decode on multi-image batches. There should be no performance impact on single-image batches.
|
||||
|
||||
|
||||
## Tiled VAE decode and encode for large images
|
||||
|
||||
Tiled VAE processing makes it possible to work with large images on limited VRAM. For example, generating 4k images in 8GB of VRAM. Tiled VAE decoder splits the image into overlapping tiles, decodes the tiles, and blends the outputs to make the final image.
|
||||
|
||||
You want to couple this with [`~StableDiffusionPipeline.enable_attention_slicing`] or [`~StableDiffusionPipeline.enable_xformers_memory_efficient_attention`] to further minimize memory use.
|
||||
|
||||
To use tiled VAE processing, invoke [`~StableDiffusionPipeline.enable_vae_tiling`] in your pipeline before inference. For example:
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import StableDiffusionPipeline, UniPCMultistepScheduler
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
|
||||
pipe = pipe.to("cuda")
|
||||
prompt = "a beautiful landscape photograph"
|
||||
pipe.enable_vae_tiling()
|
||||
pipe.enable_xformers_memory_efficient_attention()
|
||||
|
||||
image = pipe([prompt], width=3840, height=2224, num_inference_steps=20).images[0]
|
||||
```
|
||||
|
||||
The output image will have some tile-to-tile tone variation from the tiles having separate decoders, but you shouldn't see sharp seams between the tiles. The tiling is turned off for images that are 512x512 or smaller.
|
||||
|
||||
|
||||
<a name="sequential_offloading"></a>
|
||||
## Offloading to CPU with accelerate for memory savings
|
||||
|
||||
For additional memory savings, you can offload the weights to CPU and only load them to GPU when performing the forward pass.
|
||||
For additional memory savings, you can offload the weights to CPU and load them to GPU when performing the forward pass.
|
||||
|
||||
To perform CPU offloading, all you have to do is invoke [`~StableDiffusionPipeline.enable_sequential_cpu_offload`]:
|
||||
|
||||
@@ -177,21 +162,16 @@ pipe = StableDiffusionPipeline.from_pretrained(
|
||||
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
pipe = pipe.to("cuda")
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
pipe.enable_sequential_cpu_offload()
|
||||
image = pipe(prompt).images[0]
|
||||
```
|
||||
|
||||
And you can get the memory consumption to < 3GB.
|
||||
And you can get the memory consumption to < 2GB.
|
||||
|
||||
Note that this method works at the submodule level, not on whole models. This is the best way to minimize memory consumption, but inference is much slower due to the iterative nature of the process. The UNet component of the pipeline runs several times (as many as `num_inference_steps`); each time, the different submodules of the UNet are sequentially onloaded and then offloaded as they are needed, so the number of memory transfers is large.
|
||||
|
||||
<Tip>
|
||||
Consider using <a href="#model_offloading">model offloading</a> as another point in the optimization space: it will be much faster, but memory savings won't be as large.
|
||||
</Tip>
|
||||
|
||||
It is also possible to chain offloading with attention slicing for minimal memory consumption (< 2GB).
|
||||
If is also possible to chain it with attention slicing for minimal memory consumption, running it in as little as < 800mb of GPU vRAM:
|
||||
|
||||
```Python
|
||||
import torch
|
||||
@@ -202,6 +182,7 @@ pipe = StableDiffusionPipeline.from_pretrained(
|
||||
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
pipe = pipe.to("cuda")
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
pipe.enable_sequential_cpu_offload()
|
||||
@@ -210,57 +191,6 @@ pipe.enable_attention_slicing(1)
|
||||
image = pipe(prompt).images[0]
|
||||
```
|
||||
|
||||
**Note**: When using `enable_sequential_cpu_offload()`, it is important to **not** move the pipeline to CUDA beforehand or else the gain in memory consumption will only be minimal. See [this issue](https://github.com/huggingface/diffusers/issues/1934) for more information.
|
||||
|
||||
|
||||
<a name="model_offloading"></a>
|
||||
## Model offloading for fast inference and memory savings
|
||||
|
||||
[Sequential CPU offloading](#sequential_offloading), as discussed in the previous section, preserves a lot of memory but makes inference slower, because submodules are moved to GPU as needed, and immediately returned to CPU when a new module runs.
|
||||
|
||||
Full-model offloading is an alternative that moves whole models to the GPU, instead of handling each model's constituent _modules_. This results in a negligible impact on inference time (compared with moving the pipeline to `cuda`), while still providing some memory savings.
|
||||
|
||||
In this scenario, only one of the main components of the pipeline (typically: text encoder, unet and vae)
|
||||
will be in the GPU while the others wait in the CPU. Compoments like the UNet that run for multiple iterations will stay on GPU until they are no longer needed.
|
||||
|
||||
This feature can be enabled by invoking `enable_model_cpu_offload()` on the pipeline, as shown below.
|
||||
|
||||
```Python
|
||||
import torch
|
||||
from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
pipe.enable_model_cpu_offload()
|
||||
image = pipe(prompt).images[0]
|
||||
```
|
||||
|
||||
This is also compatible with attention slicing for additional memory savings.
|
||||
|
||||
```Python
|
||||
import torch
|
||||
from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
pipe.enable_model_cpu_offload()
|
||||
pipe.enable_attention_slicing(1)
|
||||
|
||||
image = pipe(prompt).images[0]
|
||||
```
|
||||
|
||||
<Tip>
|
||||
This feature requires `accelerate` version 0.17.0 or larger.
|
||||
</Tip>
|
||||
|
||||
## Using Channels Last memory format
|
||||
|
||||
Channels last memory format is an alternative way of ordering NCHW tensors in memory preserving dimensions ordering. Channels last tensors ordered in such a way that channels become the densest dimension (aka storing images pixel-per-pixel). Since not all operators currently support channels last format it may result in a worst performance, so it's better to try it and see if it works for your model.
|
||||
@@ -294,7 +224,6 @@ torch.set_grad_enabled(False)
|
||||
n_experiments = 2
|
||||
unet_runs_per_experiment = 50
|
||||
|
||||
|
||||
# load inputs
|
||||
def generate_inputs():
|
||||
sample = torch.randn(2, 4, 64, 64).half().cuda()
|
||||
@@ -373,8 +302,6 @@ pipe = StableDiffusionPipeline.from_pretrained(
|
||||
|
||||
# use jitted unet
|
||||
unet_traced = torch.jit.load("unet_traced.pt")
|
||||
|
||||
|
||||
# del pipe.unet
|
||||
class TracedUNet(torch.nn.Module):
|
||||
def __init__(self):
|
||||
@@ -430,4 +357,4 @@ with torch.inference_mode():
|
||||
|
||||
# optional: You can disable it via
|
||||
# pipe.disable_xformers_memory_efficient_attention()
|
||||
```
|
||||
```
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -21,13 +21,13 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
## Stable Diffusion Inference
|
||||
|
||||
The snippet below demonstrates how to use the ONNX runtime. You need to use `OnnxStableDiffusionPipeline` instead of `StableDiffusionPipeline`. You also need to download the weights from the `onnx` branch of the repository, and indicate the runtime provider you want to use.
|
||||
The snippet below demonstrates how to use the ONNX runtime. You need to use `StableDiffusionOnnxPipeline` instead of `StableDiffusionPipeline`. You also need to download the weights from the `onnx` branch of the repository, and indicate the runtime provider you want to use.
|
||||
|
||||
```python
|
||||
# make sure you're logged in with `huggingface-cli login`
|
||||
from diffusers import OnnxStableDiffusionPipeline
|
||||
from diffusers import StableDiffusionOnnxPipeline
|
||||
|
||||
pipe = OnnxStableDiffusionPipeline.from_pretrained(
|
||||
pipe = StableDiffusionOnnxPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
revision="onnx",
|
||||
provider="CUDAExecutionProvider",
|
||||
@@ -37,37 +37,6 @@ prompt = "a photo of an astronaut riding a horse on mars"
|
||||
image = pipe(prompt).images[0]
|
||||
```
|
||||
|
||||
The snippet below demonstrates how to use the ONNX runtime with the Stable Diffusion upscaling pipeline.
|
||||
|
||||
```python
|
||||
from diffusers import OnnxStableDiffusionPipeline, OnnxStableDiffusionUpscalePipeline
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
steps = 50
|
||||
|
||||
txt2img = OnnxStableDiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
revision="onnx",
|
||||
provider="CUDAExecutionProvider",
|
||||
)
|
||||
small_image = txt2img(
|
||||
prompt,
|
||||
num_inference_steps=steps,
|
||||
).images[0]
|
||||
|
||||
generator = torch.manual_seed(0)
|
||||
upscale = OnnxStableDiffusionUpscalePipeline.from_pretrained(
|
||||
"ssube/stable-diffusion-x4-upscaler-onnx",
|
||||
provider="CUDAExecutionProvider",
|
||||
)
|
||||
large_image = upscale(
|
||||
prompt,
|
||||
small_image,
|
||||
generator=generator,
|
||||
num_inference_steps=steps,
|
||||
).images[0]
|
||||
```
|
||||
|
||||
## Known Issues
|
||||
|
||||
- Generating multiple prompts in a batch seems to take too much memory. While we look into it, you may need to iterate instead of batching.
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -10,30 +10,6 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# OpenVINO
|
||||
|
||||
# How to use OpenVINO for inference
|
||||
|
||||
🤗 [Optimum](https://github.com/huggingface/optimum-intel) provides a Stable Diffusion pipeline compatible with OpenVINO. You can now easily perform inference with OpenVINO Runtime on a variety of Intel processors ([see](https://docs.openvino.ai/latest/openvino_docs_OV_UG_supported_plugins_Supported_Devices.html) the full list of supported devices).
|
||||
|
||||
## Installation
|
||||
|
||||
Install 🤗 Optimum Intel with the following command:
|
||||
|
||||
```
|
||||
pip install optimum["openvino"]
|
||||
```
|
||||
|
||||
## Stable Diffusion Inference
|
||||
|
||||
To load an OpenVINO model and run inference with OpenVINO Runtime, you need to replace `StableDiffusionPipeline` with `OVStableDiffusionPipeline`. In case you want to load a PyTorch model and convert it to the OpenVINO format on-the-fly, you can set `export=True`.
|
||||
|
||||
```python
|
||||
from optimum.intel.openvino import OVStableDiffusionPipeline
|
||||
|
||||
model_id = "runwayml/stable-diffusion-v1-5"
|
||||
pipe = OVStableDiffusionPipeline.from_pretrained(model_id, export=True)
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
images = pipe(prompt).images[0]
|
||||
```
|
||||
|
||||
You can find more examples in [optimum documentation](https://huggingface.co/docs/optimum/intel/inference#export-and-inference-of-stable-diffusion-models).
|
||||
Under construction 🚧
|
||||
|
||||
@@ -1,17 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Overview
|
||||
|
||||
Generating high-quality outputs is computationally intensive, especially during each iterative step where you go from a noisy output to a less noisy output. One of 🧨 Diffuser's goal is to make this technology widely accessible to everyone, which includes enabling fast inference on consumer and specialized hardware.
|
||||
|
||||
This section will cover tips and tricks - like half-precision weights and sliced attention - for optimizing inference speed and reducing memory-consumption. You can also learn how to speed up your PyTorch code with [`torch.compile`](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) or [ONNX Runtime](https://onnxruntime.ai/docs/), and enable memory-efficient attention with [xFormers](https://facebookresearch.github.io/xformers/). There are also guides for running inference on specific hardware like Apple Silicon, and Intel or Habana processors.
|
||||
@@ -1,208 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Accelerated PyTorch 2.0 support in Diffusers
|
||||
|
||||
Starting from version `0.13.0`, Diffusers supports the latest optimization from the upcoming [PyTorch 2.0](https://pytorch.org/get-started/pytorch-2.0/) release. These include:
|
||||
1. Support for accelerated transformers implementation with memory-efficient attention – no extra dependencies required.
|
||||
2. [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) support for extra performance boost when individual models are compiled.
|
||||
|
||||
|
||||
## Installation
|
||||
To benefit from the accelerated transformers implementation and `torch.compile`, we will need to install the nightly version of PyTorch, as the stable version is yet to be released. The first step is to install CUDA 11.7 or CUDA 11.8,
|
||||
as PyTorch 2.0 does not support the previous versions. Once CUDA is installed, torch nightly can be installed using:
|
||||
|
||||
```bash
|
||||
pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu117
|
||||
```
|
||||
|
||||
## Using accelerated transformers and torch.compile.
|
||||
|
||||
|
||||
1. **Accelerated Transformers implementation**
|
||||
|
||||
PyTorch 2.0 includes an optimized and memory-efficient attention implementation through the [`torch.nn.functional.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention) function, which automatically enables several optimizations depending on the inputs and the GPU type. This is similar to the `memory_efficient_attention` from [xFormers](https://github.com/facebookresearch/xformers), but built natively into PyTorch.
|
||||
|
||||
These optimizations will be enabled by default in Diffusers if PyTorch 2.0 is installed and if `torch.nn.functional.scaled_dot_product_attention` is available. To use it, just install `torch 2.0` as suggested above and simply use the pipeline. For example:
|
||||
|
||||
```Python
|
||||
import torch
|
||||
from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
|
||||
pipe = pipe.to("cuda")
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
image = pipe(prompt).images[0]
|
||||
```
|
||||
|
||||
If you want to enable it explicitly (which is not required), you can do so as shown below.
|
||||
|
||||
```Python
|
||||
import torch
|
||||
from diffusers import StableDiffusionPipeline
|
||||
from diffusers.models.attention_processor import AttnProcessor2_0
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")
|
||||
pipe.unet.set_attn_processor(AttnProcessor2_0())
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
image = pipe(prompt).images[0]
|
||||
```
|
||||
|
||||
This should be as fast and memory efficient as `xFormers`. More details [in our benchmark](#benchmark).
|
||||
|
||||
|
||||
2. **torch.compile**
|
||||
|
||||
To get an additional speedup, we can use the new `torch.compile` feature. To do so, we simply wrap our `unet` with `torch.compile`. For more information and different options, refer to the
|
||||
[torch compile docs](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html).
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to(
|
||||
"cuda"
|
||||
)
|
||||
pipe.unet = torch.compile(pipe.unet)
|
||||
|
||||
batch_size = 10
|
||||
prompt = "A photo of an astronaut riding a horse on marse."
|
||||
images = pipe(prompt, num_inference_steps=steps, num_images_per_prompt=batch_size).images
|
||||
```
|
||||
|
||||
Depending on the type of GPU, `compile()` can yield between 2-9% of _additional speed-up_ over the accelerated transformer optimizations. Note, however, that compilation is able to squeeze more performance improvements in more recent GPU architectures such as Ampere (A100, 3090), Ada (4090) and Hopper (H100).
|
||||
|
||||
Compilation takes some time to complete, so it is best suited for situations where you need to prepare your pipeline once and then perform the same type of inference operations multiple times.
|
||||
|
||||
|
||||
## Benchmark
|
||||
|
||||
We conducted a simple benchmark on different GPUs to compare vanilla attention, xFormers, `torch.nn.functional.scaled_dot_product_attention` and `torch.compile+torch.nn.functional.scaled_dot_product_attention`.
|
||||
For the benchmark we used the the [stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) model with 50 steps. The `xFormers` benchmark is done using the `torch==1.13.1` version, while the accelerated transformers optimizations are tested using nightly versions of PyTorch 2.0. The tables below summarize the results we got.
|
||||
|
||||
The `Speed over xformers` columns denote the speed-up gained over `xFormers` using the `torch.compile+torch.nn.functional.scaled_dot_product_attention`.
|
||||
|
||||
|
||||
### FP16 benchmark
|
||||
|
||||
The table below shows the benchmark results for inference using `fp16`. As we can see, `torch.nn.functional.scaled_dot_product_attention` is as fast as `xFormers` (sometimes slightly faster/slower) on all the GPUs we tested.
|
||||
And using `torch.compile` gives further speed-up of up of 10% over `xFormers`, but it's mostly noticeable on the A100 GPU.
|
||||
|
||||
___The time reported is in seconds.___
|
||||
|
||||
| GPU | Batch Size | Vanilla Attention | xFormers | PyTorch2.0 SDPA | SDPA + torch.compile | Speed over xformers (%) |
|
||||
| --- | --- | --- | --- | --- | --- | --- |
|
||||
| A100 | 10 | 12.02 | 8.7 | 8.79 | 7.89 | 9.31 |
|
||||
| A100 | 16 | 18.95 | 13.57 | 13.67 | 12.25 | 9.73 |
|
||||
| A100 | 32 (1) | OOM | 26.56 | 26.68 | 24.08 | 9.34 |
|
||||
| A100 | 64 | | 52.51 | 53.03 | 47.81 | 8.95 |
|
||||
| | | | | | | |
|
||||
| A10 | 4 | 13.94 | 9.81 | 10.01 | 9.35 | 4.69 |
|
||||
| A10 | 8 | 27.09 | 19 | 19.53 | 18.33 | 3.53 |
|
||||
| A10 | 10 | 33.69 | 23.53 | 24.19 | 22.52 | 4.29 |
|
||||
| A10 | 16 | OOM | 37.55 | 38.31 | 36.81 | 1.97 |
|
||||
| A10 | 32 (1) | | 77.19 | 78.43 | 76.64 | 0.71 |
|
||||
| A10 | 64 (1) | | 173.59 | 158.99 | 155.14 | 10.63 |
|
||||
| | | | | | | |
|
||||
| T4 | 4 | 38.81 | 30.09 | 29.74 | 27.55 | 8.44 |
|
||||
| T4 | 8 | OOM | 55.71 | 55.99 | 53.85 | 3.34 |
|
||||
| T4 | 10 | OOM | 68.96 | 69.86 | 65.35 | 5.23 |
|
||||
| T4 | 16 | OOM | 111.47 | 113.26 | 106.93 | 4.07 |
|
||||
| | | | | | | |
|
||||
| V100 | 4 | 9.84 | 8.16 | 8.09 | 7.65 | 6.25 |
|
||||
| V100 | 8 | OOM | 15.62 | 15.44 | 14.59 | 6.59 |
|
||||
| V100 | 10 | OOM | 19.52 | 19.28 | 18.18 | 6.86 |
|
||||
| V100 | 16 | OOM | 30.29 | 29.84 | 28.22 | 6.83 |
|
||||
| | | | | | | |
|
||||
| 3090 | 4 | 10.04 | 7.82 | 7.89 | 7.47 | 4.48 |
|
||||
| 3090 | 8 | 19.27 | 14.97 | 15.04 | 14.22 | 5.01 |
|
||||
| 3090 | 10| 24.08 | 18.7 | 18.7 | 17.69 | 5.40 |
|
||||
| 3090 | 16 | OOM | 29.06 | 29.06 | 28.2 | 2.96 |
|
||||
| 3090 | 32 (1) | | 58.05 | 58 | 54.88 | 5.46 |
|
||||
| 3090 | 64 (1) | | 126.54 | 126.03 | 117.33 | 7.28 |
|
||||
| | | | | | | |
|
||||
| 3090 Ti | 4 | 9.07 | 7.14 | 7.15 | 6.81 | 4.62 |
|
||||
| 3090 Ti | 8 | 17.51 | 13.65 | 13.72 | 12.99 | 4.84 |
|
||||
| 3090 Ti | 10 (2) | 21.79 | 16.85 | 16.93 | 16.02 | 4.93 |
|
||||
| 3090 Ti | 16 | OOM | 26.1 | 26.28 | 25.46 | 2.45 |
|
||||
| 3090 Ti | 32 (1) | | 51.78 | 52.04 | 49.15 | 5.08 |
|
||||
| 3090 Ti | 64 (1) | | 112.02 | 112.33 | 103.91 | 7.24 |
|
||||
| | | | | | | |
|
||||
| 4090 | 4 | 10.48 | 8.37 | 8.32 | 8.01 | 4.30 |
|
||||
| 4090 | 8 | 14.33 | 10.22 | 10.42 | 9.78 | 4.31 |
|
||||
| 4090 | 16 | | 17.07 | 17.46 | 17.15 | -0.47 |
|
||||
| 4090 | 32 (1) | | 39.03 | 39.86 | 37.97 | 2.72 |
|
||||
| 4090 | 64 (1) | | 77.29 | 79.44 | 77.67 | -0.49 |
|
||||
|
||||
|
||||
|
||||
### FP32 benchmark
|
||||
|
||||
The table below shows the benchmark results for inference using `fp32`. In this case, `torch.nn.functional.scaled_dot_product_attention` is faster than `xFormers` on all the GPUs we tested.
|
||||
|
||||
Using `torch.compile` in addition to the accelerated transformers implementation can yield up to 19% performance improvement over `xFormers` in Ampere and Ada cards, and up to 20% (Ampere) or 28% (Ada) over vanilla attention.
|
||||
|
||||
| GPU | Batch Size | Vanilla Attention | xFormers | PyTorch2.0 SDPA | SDPA + torch.compile | Speed over xformers (%) | Speed over vanilla (%) |
|
||||
| --- | --- | --- | --- | --- | --- | --- | --- |
|
||||
| A100 | 4 | 16.56 | 12.42 | 12.2 | 11.84 | 4.67 | 28.50 |
|
||||
| A100 | 10 | OOM | 29.93 | 29.44 | 28.5 | 4.78 | |
|
||||
| A100 | 16 | | 47.08 | 46.27 | 44.8 | 4.84 | |
|
||||
| A100 | 32 | | 92.89 | 91.34 | 88.35 | 4.89 | |
|
||||
| A100 | 64 | | 185.3 | 182.71 | 176.48 | 4.76 | |
|
||||
| | | | | | | |
|
||||
| A10 | 1 | 10.59 | 8.81 | 7.51 | 7.35 | 16.57 | 30.59 |
|
||||
| A10 | 4 | 34.77 | 27.63 | 22.77 | 22.07 | 20.12 | 36.53 |
|
||||
| A10 | 8 | | 56.19 | 43.53 | 43.86 | 21.94 | |
|
||||
| A10 | 16 | | 116.49 | 88.56 | 86.64 | 25.62 | |
|
||||
| A10 | 32 | | 221.95 | 175.74 | 168.18 | 24.23 | |
|
||||
| A10 | 48 | | 333.23 | 264.84 | | 20.52 | |
|
||||
| | | | | | | |
|
||||
| T4 | 1 | 28.2 | 24.49 | 23.93 | 23.56 | 3.80 | 16.45 |
|
||||
| T4 | 2 | 52.77 | 45.7 | 45.88 | 45.06 | 1.40 | 14.61 |
|
||||
| T4 | 4 | OOM | 85.72 | 85.78 | 84.48 | 1.45 | |
|
||||
| T4 | 8 | | 149.64 | 150.75 | 148.4 | 0.83 | |
|
||||
| | | | | | | |
|
||||
| V100 | 1 | 7.4 | 6.84 | 6.8 | 6.66 | 2.63 | 10.00 |
|
||||
| V100 | 2 | 13.85 | 12.81 | 12.66 | 12.35 | 3.59 | 10.83 |
|
||||
| V100 | 4 | OOM | 25.73 | 25.31 | 24.78 | 3.69 | |
|
||||
| V100 | 8 | | 43.95 | 43.37 | 42.25 | 3.87 | |
|
||||
| V100 | 16 | | 84.99 | 84.73 | 82.55 | 2.87 | |
|
||||
| | | | | | | |
|
||||
| 3090 | 1 | 7.09 | 6.78 | 6.11 | 6.03 | 11.06 | 14.95 |
|
||||
| 3090 | 4 | 22.69 | 21.45 | 18.67 | 18.09 | 15.66 | 20.27 |
|
||||
| 3090 | 8 | | 42.59 | 36.75 | 35.59 | 16.44 | |
|
||||
| 3090 | 16 | | 85.35 | 72.37 | 70.25 | 17.69 | |
|
||||
| 3090 | 32 (1) | | 162.05 | 138.99 | 134.53 | 16.98 | |
|
||||
| 3090 | 48 | | 241.91 | 207.75 | | 14.12 | |
|
||||
| | | | | | | |
|
||||
| 3090 Ti | 1 | 6.45 | 6.19 | 5.64 | 5.49 | 11.31 | 14.88 |
|
||||
| 3090 Ti | 4 | 20.32 | 19.31 | 16.9 | 16.37 | 15.23 | 19.44 |
|
||||
| 3090 Ti | 8 (2) | | 37.93 | 33.05 | 31.99 | 15.66 | |
|
||||
| 3090 Ti | 16 | | 75.37 | 65.25 | 64.32 | 14.66 | |
|
||||
| 3090 Ti | 32 (1) | | 142.55 | 124.44 | 120.74 | 15.30 | |
|
||||
| 3090 Ti | 48 | | 213.19 | 186.55 | | 12.50 | |
|
||||
| | | | | | | |
|
||||
| 4090 | 1 | 5.54 | 4.99 | 4.51 | 4.44 | 11.02 | 19.86 |
|
||||
| 4090 | 4 | 13.67 | 11.4 | 10.3 | 9.84 | 13.68 | 28.02 |
|
||||
| 4090 | 8 | | 19.79 | 17.13 | 16.19 | 18.19 | |
|
||||
| 4090 | 16 | | 38.62 | 33.14 | 32.31 | 16.34 | |
|
||||
| 4090 | 32 (1) | | 76.57 | 65.96 | 62.05 | 18.96 | |
|
||||
| 4090 | 48 | | 114.44 | 98.78 | | 13.68 | |
|
||||
|
||||
|
||||
|
||||
(1) Batch Size >= 32 requires enable_vae_slicing() because of https://github.com/pytorch/pytorch/issues/81665
|
||||
This is required for PyTorch 1.13.1, and also for PyTorch 2.0 and batch size of 64
|
||||
|
||||
For more details about how this benchmark was run, please refer to [this PR](https://github.com/huggingface/diffusers/pull/2303).
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -14,22 +14,13 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
We recommend the use of [xFormers](https://github.com/facebookresearch/xformers) for both inference and training. In our tests, the optimizations performed in the attention blocks allow for both faster speed and reduced memory consumption.
|
||||
|
||||
Starting from version `0.0.16` of xFormers, released on January 2023, installation can be easily performed using pre-built pip wheels:
|
||||
Installing xFormers has historically been a bit involved, as binary distributions were not always up to date. Fortunately, the project has [very recently](https://github.com/facebookresearch/xformers/pull/591) integrated a process to build pip wheels as part of the project's continuous integration, so this should improve a lot starting from xFormers version 0.0.16.
|
||||
|
||||
Until xFormers 0.0.16 is deployed, you can install pip wheels using [`TestPyPI`](https://test.pypi.org/project/formers/). These are the steps that worked for us in a Linux computer to install xFormers version 0.0.15:
|
||||
|
||||
```bash
|
||||
pip install xformers
|
||||
pip install pyre-extensions==0.0.23
|
||||
pip install -i https://test.pypi.org/simple/ formers==0.0.15.dev376
|
||||
```
|
||||
|
||||
<Tip>
|
||||
|
||||
The xFormers PIP package requires the latest version of PyTorch (1.13.1 as of xFormers 0.0.16). If you need to use a previous version of PyTorch, then we recommend you install xFormers from source using [the project instructions](https://github.com/facebookresearch/xformers#installing-xformers).
|
||||
|
||||
</Tip>
|
||||
|
||||
After xFormers is installed, you can use `enable_xformers_memory_efficient_attention()` for faster inference and reduced memory consumption, as discussed [here](fp16#memory-efficient-attention).
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
According to [this issue](https://github.com/huggingface/diffusers/issues/2234#issuecomment-1416931212), xFormers `v0.0.16` cannot be used for training (fine-tune or Dreambooth) in some GPUs. If you observe that problem, please install a development version as indicated in that comment.
|
||||
|
||||
</Tip>
|
||||
We'll update these instructions when the wheels are published to the official PyPI repository.
|
||||
|
||||
+49
-232
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -10,25 +10,10 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
# Quicktour
|
||||
|
||||
Diffusion models are trained to denoise random Gaussian noise step-by-step to generate a sample of interest, such as an image or audio. This has sparked a tremendous amount of interest in generative AI, and you have probably seen examples of diffusion generated images on the internet. 🧨 Diffusers is a library aimed at making diffusion models widely accessible to everyone.
|
||||
|
||||
Whether you're a developer or an everyday user, this quicktour will introduce you to 🧨 Diffusers and help you get up and generating quickly! There are three main components of the library to know about:
|
||||
|
||||
* The [`DiffusionPipeline`] is a high-level end-to-end class designed to rapidly generate samples from pretrained diffusion models for inference.
|
||||
* Popular pretrained [model](./api/models) architectures and modules that can be used as building blocks for creating diffusion systems.
|
||||
* Many different [schedulers](./api/schedulers/overview) - algorithms that control how noise is added for training, and how to generate denoised images during inference.
|
||||
|
||||
The quicktour will show you how to use the [`DiffusionPipeline`] for inference, and then walk you through how to combine a model and scheduler to replicate what's happening inside the [`DiffusionPipeline`].
|
||||
|
||||
<Tip>
|
||||
|
||||
The quicktour is a simplified version of the introductory 🧨 Diffusers [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/diffusers_intro.ipynb) to help you get started quickly. If you want to learn more about 🧨 Diffusers goal, design philosophy, and additional details about it's core API, check out the notebook!
|
||||
|
||||
</Tip>
|
||||
Get up and running with 🧨 Diffusers quickly!
|
||||
Whether you're a developer or an everyday user, this quick tour will help you get started and show you how to use [`DiffusionPipeline`] for inference.
|
||||
|
||||
Before you begin, make sure you have all the necessary libraries installed:
|
||||
|
||||
@@ -36,32 +21,32 @@ Before you begin, make sure you have all the necessary libraries installed:
|
||||
pip install --upgrade diffusers accelerate transformers
|
||||
```
|
||||
|
||||
- [🤗 Accelerate](https://huggingface.co/docs/accelerate/index) speeds up model loading for inference and training.
|
||||
- [🤗 Transformers](https://huggingface.co/docs/transformers/index) is required to run the most popular diffusion models, such as [Stable Diffusion](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/overview).
|
||||
- [`accelerate`](https://huggingface.co/docs/accelerate/index) speeds up model loading for inference and training
|
||||
- [`transformers`](https://huggingface.co/docs/transformers/index) is required to run the most popular diffusion models, such as [Stable Diffusion](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/overview)
|
||||
|
||||
## DiffusionPipeline
|
||||
|
||||
The [`DiffusionPipeline`] is the easiest way to use a pretrained diffusion system for inference. It is an end-to-end system containing the model and the scheduler. You can use the [`DiffusionPipeline`] out-of-the-box for many tasks. Take a look at the table below for some supported tasks, and for a complete list of supported tasks, check out the [🧨 Diffusers Summary](./api/pipelines/overview#diffusers-summary) table.
|
||||
The [`DiffusionPipeline`] is the easiest way to use a pre-trained diffusion system for inference. You can use the [`DiffusionPipeline`] out-of-the-box for many tasks across different modalities. Take a look at the table below for some supported tasks:
|
||||
|
||||
| **Task** | **Description** | **Pipeline**
|
||||
|------------------------------|--------------------------------------------------------------------------------------------------------------|-----------------|
|
||||
| Unconditional Image Generation | generate an image from Gaussian noise | [unconditional_image_generation](./using-diffusers/unconditional_image_generation) |
|
||||
| Unconditional Image Generation | generate an image from gaussian noise | [unconditional_image_generation](./using-diffusers/unconditional_image_generation`) |
|
||||
| Text-Guided Image Generation | generate an image given a text prompt | [conditional_image_generation](./using-diffusers/conditional_image_generation) |
|
||||
| Text-Guided Image-to-Image Translation | adapt an image guided by a text prompt | [img2img](./using-diffusers/img2img) |
|
||||
| Text-Guided Image-Inpainting | fill the masked part of an image given the image, the mask and a text prompt | [inpaint](./using-diffusers/inpaint) |
|
||||
| Text-Guided Depth-to-Image Translation | adapt parts of an image guided by a text prompt while preserving structure via depth estimation | [depth2img](./using-diffusers/depth2img) |
|
||||
| Text-Guided Depth-to-Image Translation | adapt parts of an image guided by a text prompt while preserving structure via depth estimation | [depth2image](./using-diffusers/depth2image) |
|
||||
|
||||
Start by creating an instance of a [`DiffusionPipeline`] and specify which pipeline checkpoint you would like to download.
|
||||
You can use the [`DiffusionPipeline`] for any [checkpoint](https://huggingface.co/models?library=diffusers&sort=downloads) stored on the Hugging Face Hub.
|
||||
In this quicktour, you'll load the [`stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5) checkpoint for text-to-image generation.
|
||||
For more in-detail information on how diffusion pipelines function for the different tasks, please have a look at the [**Using Diffusers**](./using-diffusers/overview) section.
|
||||
|
||||
<Tip warning={true}>
|
||||
As an example, start by creating an instance of [`DiffusionPipeline`] and specify which pipeline checkpoint you would like to download.
|
||||
You can use the [`DiffusionPipeline`] for any [Diffusers' checkpoint](https://huggingface.co/models?library=diffusers&sort=downloads).
|
||||
In this guide though, you'll use [`DiffusionPipeline`] for text-to-image generation with [Stable Diffusion](https://huggingface.co/CompVis/stable-diffusion).
|
||||
|
||||
For [Stable Diffusion](https://huggingface.co/CompVis/stable-diffusion) models, please carefully read the [license](https://huggingface.co/spaces/CompVis/stable-diffusion-license) first before running the model. 🧨 Diffusers implements a [`safety_checker`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/safety_checker.py) to prevent offensive or harmful content, but the model's improved image generation capabilities can still produce potentially harmful content.
|
||||
For [Stable Diffusion](https://huggingface.co/CompVis/stable-diffusion), please carefully read its [license](https://huggingface.co/spaces/CompVis/stable-diffusion-license) before running the model.
|
||||
This is due to the improved image generation capabilities of the model and the potentially harmful content that could be produced with it.
|
||||
Please, head over to your stable diffusion model of choice, *e.g.* [`runwayml/stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5), and read the license.
|
||||
|
||||
</Tip>
|
||||
|
||||
Load the model with the [`~DiffusionPipeline.from_pretrained`] method:
|
||||
You can load the model as follows:
|
||||
|
||||
```python
|
||||
>>> from diffusers import DiffusionPipeline
|
||||
@@ -69,245 +54,77 @@ Load the model with the [`~DiffusionPipeline.from_pretrained`] method:
|
||||
>>> pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
|
||||
```
|
||||
|
||||
The [`DiffusionPipeline`] downloads and caches all modeling, tokenization, and scheduling components. You'll see that the Stable Diffusion pipeline is composed of the [`UNet2DConditionModel`] and [`PNDMScheduler`] among other things:
|
||||
|
||||
```py
|
||||
>>> pipeline
|
||||
StableDiffusionPipeline {
|
||||
"_class_name": "StableDiffusionPipeline",
|
||||
"_diffusers_version": "0.13.1",
|
||||
...,
|
||||
"scheduler": [
|
||||
"diffusers",
|
||||
"PNDMScheduler"
|
||||
],
|
||||
...,
|
||||
"unet": [
|
||||
"diffusers",
|
||||
"UNet2DConditionModel"
|
||||
],
|
||||
"vae": [
|
||||
"diffusers",
|
||||
"AutoencoderKL"
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
We strongly recommend running the pipeline on a GPU because the model consists of roughly 1.4 billion parameters.
|
||||
You can move the generator object to a GPU, just like you would in PyTorch:
|
||||
The [`DiffusionPipeline`] downloads and caches all modeling, tokenization, and scheduling components.
|
||||
Because the model consists of roughly 1.4 billion parameters, we strongly recommend running it on GPU.
|
||||
You can move the generator object to GPU, just like you would in PyTorch.
|
||||
|
||||
```python
|
||||
>>> pipeline.to("cuda")
|
||||
```
|
||||
|
||||
Now you can pass a text prompt to the `pipeline` to generate an image, and then access the denoised image. By default, the image output is wrapped in a [`PIL.Image`](https://pillow.readthedocs.io/en/stable/reference/Image.html?highlight=image#the-image-class) object.
|
||||
Now you can use the `pipeline` on your text prompt:
|
||||
|
||||
```python
|
||||
>>> image = pipeline("An image of a squirrel in Picasso style").images[0]
|
||||
>>> image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/image_of_squirrel_painting.png"/>
|
||||
</div>
|
||||
The output is by default wrapped into a [PIL Image object](https://pillow.readthedocs.io/en/stable/reference/Image.html?highlight=image#the-image-class).
|
||||
|
||||
Save the image by calling `save`:
|
||||
You can save the image by simply calling:
|
||||
|
||||
```python
|
||||
>>> image.save("image_of_squirrel_painting.png")
|
||||
```
|
||||
|
||||
### Local pipeline
|
||||
|
||||
You can also use the pipeline locally. The only difference is you need to download the weights first:
|
||||
**Note**: You can also use the pipeline locally by downloading the weights via:
|
||||
|
||||
```
|
||||
git lfs install
|
||||
git clone https://huggingface.co/runwayml/stable-diffusion-v1-5
|
||||
```
|
||||
|
||||
Then load the saved weights into the pipeline:
|
||||
and then loading the saved weights into the pipeline.
|
||||
|
||||
```python
|
||||
>>> pipeline = DiffusionPipeline.from_pretrained("./stable-diffusion-v1-5")
|
||||
```
|
||||
|
||||
Now you can run the pipeline as you would in the section above.
|
||||
Running the pipeline is then identical to the code above as it's the same model architecture.
|
||||
|
||||
### Swapping schedulers
|
||||
```python
|
||||
>>> generator.to("cuda")
|
||||
>>> image = generator("An image of a squirrel in Picasso style").images[0]
|
||||
>>> image.save("image_of_squirrel_painting.png")
|
||||
```
|
||||
|
||||
Different schedulers come with different denoising speeds and quality trade-offs. The best way to find out which one works best for you is to try them out! One of the main features of 🧨 Diffusers is to allow you to easily switch between schedulers. For example, to replace the default [`PNDMScheduler`] with the [`EulerDiscreteScheduler`], load it with the [`~diffusers.ConfigMixin.from_config`] method:
|
||||
Diffusion systems can be used with multiple different [schedulers](./api/schedulers/overview) each with their
|
||||
pros and cons. By default, Stable Diffusion runs with [`PNDMScheduler`], but it's very simple to
|
||||
use a different scheduler. *E.g.* if you would instead like to use the [`EulerDiscreteScheduler`] scheduler,
|
||||
you could use it as follows:
|
||||
|
||||
```py
|
||||
```python
|
||||
>>> from diffusers import EulerDiscreteScheduler
|
||||
|
||||
>>> pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
|
||||
|
||||
>>> # change scheduler to Euler
|
||||
>>> pipeline.scheduler = EulerDiscreteScheduler.from_config(pipeline.scheduler.config)
|
||||
```
|
||||
|
||||
Try generating an image with the new scheduler and see if you notice a difference!
|
||||
For more in-detail information on how to change between schedulers, please refer to the [Using Schedulers](./using-diffusers/schedulers) guide.
|
||||
|
||||
In the next section, you'll take a closer look at the components - the model and scheduler - that make up the [`DiffusionPipeline`] and learn how to use these components to generate an image of a cat.
|
||||
[Stability AI's](https://stability.ai/) Stable Diffusion model is an impressive image generation model
|
||||
and can do much more than just generating images from text. We have dedicated a whole documentation page,
|
||||
just for Stable Diffusion [here](./conceptual/stable_diffusion).
|
||||
|
||||
## Models
|
||||
If you want to know how to optimize Stable Diffusion to run on less memory, higher inference speeds, on specific hardware, such as Mac, or with [ONNX Runtime](https://onnxruntime.ai/), please have a look at our
|
||||
optimization pages:
|
||||
|
||||
Most models take a noisy sample, and at each timestep it predicts the *noise residual* (other models learn to predict the previous sample directly or the velocity or [`v-prediction`](https://github.com/huggingface/diffusers/blob/5e5ce13e2f89ac45a0066cb3f369462a3cf1d9ef/src/diffusers/schedulers/scheduling_ddim.py#L110)), the difference between a less noisy image and the input image. You can mix and match models to create other diffusion systems.
|
||||
- [Optimized PyTorch on GPU](./optimization/fp16)
|
||||
- [Mac OS with PyTorch](./optimization/mps)
|
||||
- [ONNX](./optimization/onnx)
|
||||
- [OpenVINO](./optimization/open_vino)
|
||||
|
||||
Models are initiated with the [`~ModelMixin.from_pretrained`] method which also locally caches the model weights so it is faster the next time you load the model. For the quicktour, you'll load the [`UNet2DModel`], a basic unconditional image generation model with a checkpoint trained on cat images:
|
||||
If you want to fine-tune or train your diffusion model, please have a look at the [**training section**](./training/overview)
|
||||
|
||||
```py
|
||||
>>> from diffusers import UNet2DModel
|
||||
|
||||
>>> repo_id = "google/ddpm-cat-256"
|
||||
>>> model = UNet2DModel.from_pretrained(repo_id)
|
||||
```
|
||||
|
||||
To access the model parameters, call `model.config`:
|
||||
|
||||
```py
|
||||
>>> model.config
|
||||
```
|
||||
|
||||
The model configuration is a 🧊 frozen 🧊 dictionary, which means those parameters can't be changed after the model is created. This is intentional and ensures that the parameters used to define the model architecture at the start remain the same, while other parameters can still be adjusted during inference.
|
||||
|
||||
Some of the most important parameters are:
|
||||
|
||||
* `sample_size`: the height and width dimension of the input sample.
|
||||
* `in_channels`: the number of input channels of the input sample.
|
||||
* `down_block_types` and `up_block_types`: the type of down- and upsampling blocks used to create the UNet architecture.
|
||||
* `block_out_channels`: the number of output channels of the downsampling blocks; also used in reverse order for the number of input channels of the upsampling blocks.
|
||||
* `layers_per_block`: the number of ResNet blocks present in each UNet block.
|
||||
|
||||
To use the model for inference, create the image shape with random Gaussian noise. It should have a `batch` axis because the model can receive multiple random noises, a `channel` axis corresponding to the number of input channels, and a `sample_size` axis for the height and width of the image:
|
||||
|
||||
```py
|
||||
>>> import torch
|
||||
|
||||
>>> torch.manual_seed(0)
|
||||
|
||||
>>> noisy_sample = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
|
||||
>>> noisy_sample.shape
|
||||
torch.Size([1, 3, 256, 256])
|
||||
```
|
||||
|
||||
For inference, pass the noisy image to the model and a `timestep`. The `timestep` indicates how noisy the input image is, with more noise at the beginning and less at the end. This helps the model determine its position in the diffusion process, whether it is closer to the start or the end. Use the `sample` method to get the model output:
|
||||
|
||||
```py
|
||||
>>> with torch.no_grad():
|
||||
... noisy_residual = model(sample=noisy_sample, timestep=2).sample
|
||||
```
|
||||
|
||||
To generate actual examples though, you'll need a scheduler to guide the denoising process. In the next section, you'll learn how to couple a model with a scheduler.
|
||||
|
||||
## Schedulers
|
||||
|
||||
Schedulers manage going from a noisy sample to a less noisy sample given the model output - in this case, it is the `noisy_residual`.
|
||||
|
||||
<Tip>
|
||||
|
||||
🧨 Diffusers is a toolbox for building diffusion systems. While the [`DiffusionPipeline`] is a convenient way to get started with a pre-built diffusion system, you can also choose your own model and scheduler components separately to build a custom diffusion system.
|
||||
|
||||
</Tip>
|
||||
|
||||
For the quicktour, you'll instantiate the [`DDPMScheduler`] with it's [`~diffusers.ConfigMixin.from_config`] method:
|
||||
|
||||
```py
|
||||
>>> from diffusers import DDPMScheduler
|
||||
|
||||
>>> scheduler = DDPMScheduler.from_config(repo_id)
|
||||
>>> scheduler
|
||||
DDPMScheduler {
|
||||
"_class_name": "DDPMScheduler",
|
||||
"_diffusers_version": "0.13.1",
|
||||
"beta_end": 0.02,
|
||||
"beta_schedule": "linear",
|
||||
"beta_start": 0.0001,
|
||||
"clip_sample": true,
|
||||
"clip_sample_range": 1.0,
|
||||
"num_train_timesteps": 1000,
|
||||
"prediction_type": "epsilon",
|
||||
"trained_betas": null,
|
||||
"variance_type": "fixed_small"
|
||||
}
|
||||
```
|
||||
|
||||
<Tip>
|
||||
|
||||
💡 Notice how the scheduler is instantiated from a configuration. Unlike a model, a scheduler does not have trainable weights and is parameter-free!
|
||||
|
||||
</Tip>
|
||||
|
||||
Some of the most important parameters are:
|
||||
|
||||
* `num_train_timesteps`: the length of the denoising process or in other words, the number of timesteps required to process random Gaussian noise into a data sample.
|
||||
* `beta_schedule`: the type of noise schedule to use for inference and training.
|
||||
* `beta_start` and `beta_end`: the start and end noise values for the noise schedule.
|
||||
|
||||
To predict a slightly less noisy image, pass the following to the scheduler's [`~diffusers.DDPMScheduler.step`] method: model output, `timestep`, and current `sample`.
|
||||
|
||||
```py
|
||||
>>> less_noisy_sample = scheduler.step(model_output=noisy_residual, timestep=2, sample=noisy_sample).prev_sample
|
||||
>>> less_noisy_sample.shape
|
||||
```
|
||||
|
||||
The `less_noisy_sample` can be passed to the next `timestep` where it'll get even less noisier! Let's bring it all together now and visualize the entire denoising process.
|
||||
|
||||
First, create a function that postprocesses and displays the denoised image as a `PIL.Image`:
|
||||
|
||||
```py
|
||||
>>> import PIL.Image
|
||||
>>> import numpy as np
|
||||
|
||||
|
||||
>>> def display_sample(sample, i):
|
||||
... image_processed = sample.cpu().permute(0, 2, 3, 1)
|
||||
... image_processed = (image_processed + 1.0) * 127.5
|
||||
... image_processed = image_processed.numpy().astype(np.uint8)
|
||||
|
||||
... image_pil = PIL.Image.fromarray(image_processed[0])
|
||||
... display(f"Image at step {i}")
|
||||
... display(image_pil)
|
||||
```
|
||||
|
||||
To speed up the denoising process, move the input and model to a GPU:
|
||||
|
||||
```py
|
||||
>>> model.to("cuda")
|
||||
>>> noisy_sample = noisy_sample.to("cuda")
|
||||
```
|
||||
|
||||
Now create a denoising loop that predicts the residual of the less noisy sample, and computes the less noisy sample with the scheduler:
|
||||
|
||||
```py
|
||||
>>> import tqdm
|
||||
|
||||
>>> sample = noisy_sample
|
||||
|
||||
>>> for i, t in enumerate(tqdm.tqdm(scheduler.timesteps)):
|
||||
... # 1. predict noise residual
|
||||
... with torch.no_grad():
|
||||
... residual = model(sample, t).sample
|
||||
|
||||
... # 2. compute less noisy image and set x_t -> x_t-1
|
||||
... sample = scheduler.step(residual, t, sample).prev_sample
|
||||
|
||||
... # 3. optionally look at image
|
||||
... if (i + 1) % 50 == 0:
|
||||
... display_sample(sample, i + 1)
|
||||
```
|
||||
|
||||
Sit back and watch as a cat is generated from nothing but noise! 😻
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/diffusion-quicktour.png"/>
|
||||
</div>
|
||||
|
||||
## Next steps
|
||||
|
||||
Hopefully you generated some cool images with 🧨 Diffusers in this quicktour! For your next steps, you can:
|
||||
|
||||
* Train or finetune a model to generate your own images in the [training](./tutorials/basic_training) tutorial.
|
||||
* See example official and community [training or finetuning scripts](https://github.com/huggingface/diffusers/tree/main/examples#-diffusers-examples) for a variety of use cases.
|
||||
* Learn more about loading, accessing, changing and comparing schedulers in the [Using different Schedulers](./using-diffusers/schedulers) guide.
|
||||
* Explore prompt engineering, speed and memory optimizations, and tips and tricks for generating higher quality images with the [Stable Diffusion](./stable_diffusion) guide.
|
||||
* Dive deeper into speeding up 🧨 Diffusers with guides on [optimized PyTorch on a GPU](./optimization/fp16), and inference guides for running [Stable Diffusion on Apple Silicon (M1/M2)](./optimization/mps) and [ONNX Runtime](./optimization/onnx).
|
||||
Finally, please be considerate when distributing generated images publicly 🤗.
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
@@ -10,67 +10,55 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# DreamBooth
|
||||
# DreamBooth fine-tuning example
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
[DreamBooth](https://arxiv.org/abs/2208.12242) is a method to personalize text-to-image models like Stable Diffusion given just a few (3-5) images of a subject. It allows the model to generate contextualized images of the subject in different scenes, poses, and views.
|
||||
[DreamBooth](https://arxiv.org/abs/2208.12242) is a method to personalize text-to-image models like stable diffusion given just a few (3~5) images of a subject.
|
||||
|
||||

|
||||
<small>Dreambooth examples from the <a href="https://dreambooth.github.io">project's blog.</a></small>
|
||||
_Dreambooth examples from the [project's blog](https://dreambooth.github.io)._
|
||||
|
||||
This guide will show you how to finetune DreamBooth with the [`CompVis/stable-diffusion-v1-4`](https://huggingface.co/CompVis/stable-diffusion-v1-4) model for various GPU sizes, and with Flax. All the training scripts for DreamBooth used in this guide can be found [here](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth) if you're interested in digging deeper and seeing how things work.
|
||||
The [Dreambooth training script](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth) shows how to implement this training procedure on a pre-trained Stable Diffusion model.
|
||||
|
||||
Before running the scripts, make sure you install the library's training dependencies. We also recommend installing 🧨 Diffusers from the `main` GitHub branch:
|
||||
<Tip warning={true}>
|
||||
|
||||
Dreambooth fine-tuning is very sensitive to hyperparameters and easy to overfit. We recommend you take a look at our [in-depth analysis](https://huggingface.co/blog/dreambooth) with recommended settings for different subjects, and go from there.
|
||||
|
||||
</Tip>
|
||||
|
||||
## Training locally
|
||||
|
||||
### Installing the dependencies
|
||||
|
||||
Before running the scripts, make sure to install the library's training dependencies. We also recommend to install `diffusers` from the `main` github branch.
|
||||
|
||||
```bash
|
||||
pip install git+https://github.com/huggingface/diffusers
|
||||
pip install -U -r diffusers/examples/dreambooth/requirements.txt
|
||||
```
|
||||
|
||||
xFormers is not part of the training requirements, but we recommend you [install](../optimization/xformers) it if you can because it could make your training faster and less memory intensive.
|
||||
xFormers is not part of the training requirements, but [we recommend you install it if you can](../optimization/xformers). It could make your training faster and less memory intensive.
|
||||
|
||||
After all the dependencies have been set up, initialize a [🤗 Accelerate](https://github.com/huggingface/accelerate/) environment with:
|
||||
After all dependencies have been set up you can configure a [🤗 Accelerate](https://github.com/huggingface/accelerate/) environment with:
|
||||
|
||||
```bash
|
||||
accelerate config
|
||||
```
|
||||
|
||||
To setup a default 🤗 Accelerate environment without choosing any configurations:
|
||||
In this example we'll use model version `v1-4`, so please visit [its card](https://huggingface.co/CompVis/stable-diffusion-v1-4) and carefully read the license before proceeding.
|
||||
|
||||
```bash
|
||||
accelerate config default
|
||||
```
|
||||
The command below will download and cache the model weights from the Hub because we use the model's Hub id `CompVis/stable-diffusion-v1-4`. You may also clone the repo locally and use the local path in your system where the checkout was saved.
|
||||
|
||||
Or if your environment doesn't support an interactive shell like a notebook, you can use:
|
||||
### Dog toy example
|
||||
|
||||
```py
|
||||
from accelerate.utils import write_basic_config
|
||||
In this example we'll use [these images](https://drive.google.com/drive/folders/1BO_dyz-p65qhBRRMRA4TbZ8qW4rB99JZ) to add a new concept to Stable Diffusion using the Dreambooth process. They will be our training data. Please, download them and place them somewhere in your system.
|
||||
|
||||
write_basic_config()
|
||||
```
|
||||
|
||||
## Finetuning
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
DreamBooth finetuning is very sensitive to hyperparameters and easy to overfit. We recommend you take a look at our [in-depth analysis](https://huggingface.co/blog/dreambooth) with recommended settings for different subjects to help you choose the appropriate hyperparameters.
|
||||
|
||||
</Tip>
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
Let's try DreamBooth with a [few images of a dog](https://drive.google.com/drive/folders/1BO_dyz-p65qhBRRMRA4TbZ8qW4rB99JZ); download and save them to a directory and then set the `INSTANCE_DIR` environment variable to that path:
|
||||
Then you can launch the training script using:
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
|
||||
export INSTANCE_DIR="path_to_training_images"
|
||||
export OUTPUT_DIR="path_to_saved_model"
|
||||
```
|
||||
|
||||
Then you can launch the training script (you can find the full training script [here](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth.py)) with the following command:
|
||||
|
||||
```bash
|
||||
accelerate launch train_dreambooth.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--instance_data_dir=$INSTANCE_DIR \
|
||||
@@ -84,44 +72,13 @@ accelerate launch train_dreambooth.py \
|
||||
--lr_warmup_steps=0 \
|
||||
--max_train_steps=400
|
||||
```
|
||||
</pt>
|
||||
<jax>
|
||||
If you have access to TPUs or want to train even faster, you can try out the [Flax training script](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth_flax.py). The Flax training script doesn't support gradient checkpointing or gradient accumulation, so you'll need a GPU with at least 30GB of memory.
|
||||
|
||||
Before running the script, make sure you have the requirements installed:
|
||||
### Training with a prior-preserving loss
|
||||
|
||||
```bash
|
||||
pip install -U -r requirements.txt
|
||||
```
|
||||
Prior preservation is used to avoid overfitting and language-drift. Please, refer to the paper to learn more about it if you are interested. For prior preservation, we use other images of the same class as part of the training process. The nice thing is that we can generate those images using the Stable Diffusion model itself! The training script will save the generated images to a local path we specify.
|
||||
|
||||
Now you can launch the training script with the following command:
|
||||
According to the paper, it's recommended to generate `num_epochs * num_samples` images for prior preservation. 200-300 works well for most cases.
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
|
||||
export INSTANCE_DIR="path-to-instance-images"
|
||||
export OUTPUT_DIR="path-to-save-model"
|
||||
|
||||
python train_dreambooth_flax.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--instance_data_dir=$INSTANCE_DIR \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
--instance_prompt="a photo of sks dog" \
|
||||
--resolution=512 \
|
||||
--train_batch_size=1 \
|
||||
--learning_rate=5e-6 \
|
||||
--max_train_steps=400
|
||||
```
|
||||
</jax>
|
||||
</frameworkcontent>
|
||||
|
||||
## Finetuning with prior-preserving loss
|
||||
|
||||
Prior preservation is used to avoid overfitting and language-drift (check out the [paper](https://arxiv.org/abs/2208.12242) to learn more if you're interested). For prior preservation, you use other images of the same class as part of the training process. The nice thing is that you can generate those images using the Stable Diffusion model itself! The training script will save the generated images to a local path you specify.
|
||||
|
||||
The author's recommend generating `num_epochs * num_samples` images for prior preservation. In most cases, 200-300 images work well.
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
```bash
|
||||
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
|
||||
export INSTANCE_DIR="path_to_training_images"
|
||||
@@ -145,148 +102,32 @@ accelerate launch train_dreambooth.py \
|
||||
--num_class_images=200 \
|
||||
--max_train_steps=800
|
||||
```
|
||||
</pt>
|
||||
<jax>
|
||||
```bash
|
||||
export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
|
||||
export INSTANCE_DIR="path-to-instance-images"
|
||||
export CLASS_DIR="path-to-class-images"
|
||||
export OUTPUT_DIR="path-to-save-model"
|
||||
|
||||
python train_dreambooth_flax.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--instance_data_dir=$INSTANCE_DIR \
|
||||
--class_data_dir=$CLASS_DIR \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
--with_prior_preservation --prior_loss_weight=1.0 \
|
||||
--instance_prompt="a photo of sks dog" \
|
||||
--class_prompt="a photo of dog" \
|
||||
--resolution=512 \
|
||||
--train_batch_size=1 \
|
||||
--learning_rate=5e-6 \
|
||||
--num_class_images=200 \
|
||||
--max_train_steps=800
|
||||
```
|
||||
</jax>
|
||||
</frameworkcontent>
|
||||
### Saving checkpoints while training
|
||||
|
||||
## Finetuning the text encoder and UNet
|
||||
|
||||
The script also allows you to finetune the `text_encoder` along with the `unet`. In our experiments (check out the [Training Stable Diffusion with DreamBooth using 🧨 Diffusers](https://huggingface.co/blog/dreambooth) post for more details), this yields much better results, especially when generating images of faces.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Training the text encoder requires additional memory and it won't fit on a 16GB GPU. You'll need at least 24GB VRAM to use this option.
|
||||
|
||||
</Tip>
|
||||
|
||||
Pass the `--train_text_encoder` argument to the training script to enable finetuning the `text_encoder` and `unet`:
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
```bash
|
||||
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
|
||||
export INSTANCE_DIR="path_to_training_images"
|
||||
export CLASS_DIR="path_to_class_images"
|
||||
export OUTPUT_DIR="path_to_saved_model"
|
||||
|
||||
accelerate launch train_dreambooth.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--train_text_encoder \
|
||||
--instance_data_dir=$INSTANCE_DIR \
|
||||
--class_data_dir=$CLASS_DIR \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
--with_prior_preservation --prior_loss_weight=1.0 \
|
||||
--instance_prompt="a photo of sks dog" \
|
||||
--class_prompt="a photo of dog" \
|
||||
--resolution=512 \
|
||||
--train_batch_size=1 \
|
||||
--use_8bit_adam
|
||||
--gradient_checkpointing \
|
||||
--learning_rate=2e-6 \
|
||||
--lr_scheduler="constant" \
|
||||
--lr_warmup_steps=0 \
|
||||
--num_class_images=200 \
|
||||
--max_train_steps=800
|
||||
```
|
||||
</pt>
|
||||
<jax>
|
||||
```bash
|
||||
export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
|
||||
export INSTANCE_DIR="path-to-instance-images"
|
||||
export CLASS_DIR="path-to-class-images"
|
||||
export OUTPUT_DIR="path-to-save-model"
|
||||
|
||||
python train_dreambooth_flax.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--train_text_encoder \
|
||||
--instance_data_dir=$INSTANCE_DIR \
|
||||
--class_data_dir=$CLASS_DIR \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
--with_prior_preservation --prior_loss_weight=1.0 \
|
||||
--instance_prompt="a photo of sks dog" \
|
||||
--class_prompt="a photo of dog" \
|
||||
--resolution=512 \
|
||||
--train_batch_size=1 \
|
||||
--learning_rate=2e-6 \
|
||||
--num_class_images=200 \
|
||||
--max_train_steps=800
|
||||
```
|
||||
</jax>
|
||||
</frameworkcontent>
|
||||
|
||||
## Finetuning with LoRA
|
||||
|
||||
You can also use Low-Rank Adaptation of Large Language Models (LoRA), a fine-tuning technique for accelerating training large models, on DreamBooth. For more details, take a look at the [LoRA training](training/lora#dreambooth) guide.
|
||||
|
||||
## Saving checkpoints while training
|
||||
|
||||
It's easy to overfit while training with Dreambooth, so sometimes it's useful to save regular checkpoints during the training process. One of the intermediate checkpoints might actually work better than the final model! Pass the following argument to the training script to enable saving checkpoints:
|
||||
It's easy to overfit while training with Dreambooth, so sometimes it's useful to save regular checkpoints during the process. One of the intermediate checkpoints might work better than the final model! To use this feature you need to pass the following argument to the training script:
|
||||
|
||||
```bash
|
||||
--checkpointing_steps=500
|
||||
```
|
||||
|
||||
This saves the full training state in subfolders of your `output_dir`. Subfolder names begin with the prefix `checkpoint-`, followed by the number of steps performed so far; for example, `checkpoint-1500` would be a checkpoint saved after 1500 training steps.
|
||||
This will save the full training state in subfolders of your `output_dir`. Subfolder names begin with the prefix `checkpoint-`, and then the number of steps performed so far; for example: `checkpoint-1500` would be a checkpoint saved after 1500 training steps.
|
||||
|
||||
### Resume training from a saved checkpoint
|
||||
#### Resuming training from a saved checkpoint
|
||||
|
||||
If you want to resume training from any of the saved checkpoints, you can pass the argument `--resume_from_checkpoint` to the script and specify the name of the checkpoint you want to use. You can also use the special string `"latest"` to resume from the last saved checkpoint (the one with the largest number of steps). For example, the following would resume training from the checkpoint saved after 1500 steps:
|
||||
If you want to resume training from any of the saved checkpoints, you can pass the argument `--resume_from_checkpoint` and then indicate the name of the checkpoint you want to use. You can also use the special string `"latest"` to resume from the last checkpoint saved (i.e., the one with the largest number of steps). For example, the following would resume training from the checkpoint saved after 1500 steps:
|
||||
|
||||
```bash
|
||||
--resume_from_checkpoint="checkpoint-1500"
|
||||
```
|
||||
|
||||
This is a good opportunity to tweak some of your hyperparameters if you wish.
|
||||
This would be a good opportunity to tweak some of your hyperparameters if you wish.
|
||||
|
||||
### Inference from a saved checkpoint
|
||||
#### Performing inference using a saved checkpoint
|
||||
|
||||
Saved checkpoints are stored in a format suitable for resuming training. They not only include the model weights, but also the state of the optimizer, data loaders, and learning rate.
|
||||
Saved checkpoints are stored in a format suitable for resuming training. They not only include the model weights, but also the state of the optimizer, data loaders and learning rate.
|
||||
|
||||
If you have **`"accelerate>=0.16.0"`** installed, use the following code to run
|
||||
inference from an intermediate checkpoint.
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline, UNet2DConditionModel
|
||||
from transformers import CLIPTextModel
|
||||
import torch
|
||||
|
||||
# Load the pipeline with the same arguments (model, revision) that were used for training
|
||||
model_id = "CompVis/stable-diffusion-v1-4"
|
||||
|
||||
unet = UNet2DConditionModel.from_pretrained("/sddata/dreambooth/daruma-v2-1/checkpoint-100/unet")
|
||||
|
||||
# if you have trained with `--args.train_text_encoder` make sure to also load the text encoder
|
||||
text_encoder = CLIPTextModel.from_pretrained("/sddata/dreambooth/daruma-v2-1/checkpoint-100/text_encoder")
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(model_id, unet=unet, text_encoder=text_encoder, dtype=torch.float16)
|
||||
pipeline.to("cuda")
|
||||
|
||||
# Perform inference, or save, or push to the hub
|
||||
pipeline.save_pretrained("dreambooth-pipeline")
|
||||
```
|
||||
|
||||
If you have **`"accelerate<0.16.0"`** installed, you need to convert it to an inference pipeline first:
|
||||
You can use a checkpoint for inference, but first you need to convert it to an inference pipeline. This is how you could do it:
|
||||
|
||||
```python
|
||||
from accelerate import Accelerator
|
||||
@@ -315,37 +156,15 @@ pipeline = DiffusionPipeline.from_pretrained(
|
||||
pipeline.save_pretrained("dreambooth-pipeline")
|
||||
```
|
||||
|
||||
## Optimizations for different GPU sizes
|
||||
### Training on a 16GB GPU
|
||||
|
||||
Depending on your hardware, there are a few different ways to optimize DreamBooth on GPUs from 16GB to just 8GB!
|
||||
|
||||
### xFormers
|
||||
|
||||
[xFormers](https://github.com/facebookresearch/xformers) is a toolbox for optimizing Transformers, and it include a [memory-efficient attention](https://facebookresearch.github.io/xformers/components/ops.html#module-xformers.ops) mechanism that is used in 🧨 Diffusers. You'll need to [install xFormers](./optimization/xformers) and then add the following argument to your training script:
|
||||
|
||||
```bash
|
||||
--enable_xformers_memory_efficient_attention
|
||||
```
|
||||
|
||||
xFormers is not available in Flax.
|
||||
|
||||
### Set gradients to none
|
||||
|
||||
Another way you can lower your memory footprint is to [set the gradients](https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html) to `None` instead of zero. However, this may change certain behaviors, so if you run into any issues, try removing this argument. Add the following argument to your training script to set the gradients to `None`:
|
||||
|
||||
```bash
|
||||
--set_grads_to_none
|
||||
```
|
||||
|
||||
### 16GB GPU
|
||||
|
||||
With the help of gradient checkpointing and [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) 8-bit optimizer, it's possible to train DreamBooth on a 16GB GPU. Make sure you have bitsandbytes installed:
|
||||
With the help of gradient checkpointing and the 8-bit optimizer from [bitsandbytes](https://github.com/TimDettmers/bitsandbytes), it's possible to train dreambooth on a 16GB GPU.
|
||||
|
||||
```bash
|
||||
pip install bitsandbytes
|
||||
```
|
||||
|
||||
Then pass the `--use_8bit_adam` option to the training script:
|
||||
Then pass the `--use_8bit_adam` option to the training script.
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
|
||||
@@ -372,18 +191,25 @@ accelerate launch train_dreambooth.py \
|
||||
--max_train_steps=800
|
||||
```
|
||||
|
||||
### 12GB GPU
|
||||
### Fine-tune the text encoder in addition to the UNet
|
||||
|
||||
To run DreamBooth on a 12GB GPU, you'll need to enable gradient checkpointing, the 8-bit optimizer, xFormers, and set the gradients to `None`:
|
||||
The script also allows to fine-tune the `text_encoder` along with the `unet`. It has been observed experimentally that this gives much better results, especially on faces. Please, refer to [our blog](https://huggingface.co/blog/dreambooth) for more details.
|
||||
|
||||
To enable this option, pass the `--train_text_encoder` argument to the training script.
|
||||
|
||||
<Tip>
|
||||
Training the text encoder requires additional memory, so training won't fit on a 16GB GPU. You'll need at least 24GB VRAM to use this option.
|
||||
</Tip>
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
|
||||
export INSTANCE_DIR="path-to-instance-images"
|
||||
export CLASS_DIR="path-to-class-images"
|
||||
export OUTPUT_DIR="path-to-save-model"
|
||||
export INSTANCE_DIR="path_to_training_images"
|
||||
export CLASS_DIR="path_to_class_images"
|
||||
export OUTPUT_DIR="path_to_saved_model"
|
||||
|
||||
accelerate launch train_dreambooth.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--train_text_encoder \
|
||||
--instance_data_dir=$INSTANCE_DIR \
|
||||
--class_data_dir=$CLASS_DIR \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
@@ -392,10 +218,8 @@ accelerate launch train_dreambooth.py \
|
||||
--class_prompt="a photo of dog" \
|
||||
--resolution=512 \
|
||||
--train_batch_size=1 \
|
||||
--gradient_accumulation_steps=1 --gradient_checkpointing \
|
||||
--use_8bit_adam \
|
||||
--enable_xformers_memory_efficient_attention \
|
||||
--set_grads_to_none \
|
||||
--use_8bit_adam
|
||||
--gradient_checkpointing \
|
||||
--learning_rate=2e-6 \
|
||||
--lr_scheduler="constant" \
|
||||
--lr_warmup_steps=0 \
|
||||
@@ -403,25 +227,19 @@ accelerate launch train_dreambooth.py \
|
||||
--max_train_steps=800
|
||||
```
|
||||
|
||||
### 8 GB GPU
|
||||
### Training on a 8 GB GPU:
|
||||
|
||||
For 8GB GPUs, you'll need the help of [DeepSpeed](https://www.deepspeed.ai/) to offload some
|
||||
tensors from the VRAM to either the CPU or NVME, enabling training with less GPU memory.
|
||||
Using [DeepSpeed](https://www.deepspeed.ai/) it's even possible to offload some
|
||||
tensors from VRAM to either CPU or NVME, allowing training to proceed with less GPU memory.
|
||||
|
||||
Run the following command to configure your 🤗 Accelerate environment:
|
||||
DeepSpeed needs to be enabled with `accelerate config`. During configuration,
|
||||
answer yes to "Do you want to use DeepSpeed?". Combining DeepSpeed stage 2, fp16
|
||||
mixed precision, and offloading both the model parameters and the optimizer state to CPU, it's
|
||||
possible to train on under 8 GB VRAM. The drawback is that this requires more system RAM (about 25 GB). See [the DeepSpeed documentation](https://huggingface.co/docs/accelerate/usage_guides/deepspeed) for more configuration options.
|
||||
|
||||
```bash
|
||||
accelerate config
|
||||
```
|
||||
|
||||
During configuration, confirm that you want to use DeepSpeed. Now it's possible to train on under 8GB VRAM by combining DeepSpeed stage 2, fp16 mixed precision, and offloading the model parameters and the optimizer state to the CPU. The drawback is that this requires more system RAM, about 25 GB. See [the DeepSpeed documentation](https://huggingface.co/docs/accelerate/usage_guides/deepspeed) for more configuration options.
|
||||
|
||||
You should also change the default Adam optimizer to DeepSpeed's optimized version of Adam
|
||||
[`deepspeed.ops.adam.DeepSpeedCPUAdam`](https://deepspeed.readthedocs.io/en/latest/optimizers.html#adam-cpu) for a substantial speedup. Enabling `DeepSpeedCPUAdam` requires your system's CUDA toolchain version to be the same as the one installed with PyTorch.
|
||||
|
||||
8-bit optimizers don't seem to be compatible with DeepSpeed at the moment.
|
||||
|
||||
Launch training with the following command:
|
||||
Changing the default Adam optimizer to DeepSpeed's special version of Adam
|
||||
`deepspeed.ops.adam.DeepSpeedCPUAdam` gives a substantial speedup, but enabling
|
||||
it requires the system's CUDA toolchain version to be the same as the one installed with PyTorch. 8-bit optimizers don't seem to be compatible with DeepSpeed at the moment.
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
|
||||
@@ -451,10 +269,7 @@ accelerate launch train_dreambooth.py \
|
||||
|
||||
## Inference
|
||||
|
||||
Once you have trained a model, specify the path to where the model is saved, and use it for inference in the [`StableDiffusionPipeline`]. Make sure your prompts include the special `identifier` used during training (`sks` in the previous examples).
|
||||
|
||||
If you have **`"accelerate>=0.16.0"`** installed, you can use the following code to run
|
||||
inference from an intermediate checkpoint:
|
||||
Once you have trained a model, inference can be done using the `StableDiffusionPipeline`, by simply indicating the path where the model was saved. Make sure that your prompts include the special `identifier` used during training (`sks` in the previous examples).
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionPipeline
|
||||
@@ -469,4 +284,4 @@ image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]
|
||||
image.save("dog-bucket.png")
|
||||
```
|
||||
|
||||
You may also run inference from any of the [saved training checkpoints](#inference-from-a-saved-checkpoint).
|
||||
You may also run inference from [any of the saved training checkpoints](#performing-inference-using-a-saved-checkpoint).
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user