Compare commits
30 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 4719b8f5f9 | |||
| a53c17b698 | |||
| 41d6ca0058 | |||
| a28a77bfe9 | |||
| fbb8b34716 | |||
| bc2ba004c6 | |||
| 3d7eaf83d7 | |||
| bf406ea886 | |||
| 2fd46405cd | |||
| 43346adc1f | |||
| 6110d7c95f | |||
| 65ef7a0c5c | |||
| 6e68c71503 | |||
| 17528afcba | |||
| 78be400761 | |||
| c803a8f8c0 | |||
| d384265df7 | |||
| 11c125667b | |||
| 69996938cf | |||
| 9ae90593c0 | |||
| 7942bb8dc2 | |||
| aab6de22c3 | |||
| 1dc231d14a | |||
| 84cd9e8d01 | |||
| a8523bffa8 | |||
| 97c8199dbb | |||
| 414d7c4991 | |||
| 8ca179a0a9 | |||
| 71f56c771a | |||
| 6a89a6c93a |
@@ -28,6 +28,5 @@ jobs:
|
||||
pip install pytest
|
||||
- name: Check for soft dependencies
|
||||
run: |
|
||||
python utils/print_env.py
|
||||
python -c "from diffusers import UNet2DConditionModel; print(type(UNet2DConditionModel))"
|
||||
pytest tests/others/test_dependencies.py
|
||||
|
||||
@@ -0,0 +1,34 @@
|
||||
name: Run Flax dependency tests
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
check_flax_dependencies:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.8"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install -e .
|
||||
pip install "jax[cpu]>=0.2.16,!=0.3.2"
|
||||
pip install "flax>=0.4.1"
|
||||
pip install "jaxlib>=0.1.65"
|
||||
pip install pytest
|
||||
- name: Check for soft dependencies
|
||||
run: |
|
||||
pytest tests/others/test_dependencies.py
|
||||
@@ -72,7 +72,7 @@ jobs:
|
||||
run: |
|
||||
apt-get update && apt-get install libsndfile1-dev libgl1 -y
|
||||
python -m pip install -e .[quality,test]
|
||||
python -m pip install git+https://github.com/huggingface/accelerate.git
|
||||
python -m pip install accelerate
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
@@ -115,7 +115,7 @@ jobs:
|
||||
run: |
|
||||
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
examples/test_examples.py
|
||||
examples/test_examples.py
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
|
||||
@@ -0,0 +1,32 @@
|
||||
name: Run Torch dependency tests
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
check_torch_dependencies:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.8"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install -e .
|
||||
pip install torch torchvision torchaudio
|
||||
pip install pytest
|
||||
- name: Check for soft dependencies
|
||||
run: |
|
||||
pytest tests/others/test_dependencies.py
|
||||
@@ -72,6 +72,8 @@
|
||||
title: Overview
|
||||
- local: using-diffusers/sdxl
|
||||
title: Stable Diffusion XL
|
||||
- local: using-diffusers/lcm
|
||||
title: Latent Consistency Models
|
||||
- local: using-diffusers/kandinsky
|
||||
title: Kandinsky
|
||||
- local: using-diffusers/controlnet
|
||||
@@ -200,6 +202,8 @@
|
||||
title: AsymmetricAutoencoderKL
|
||||
- local: api/models/autoencoder_tiny
|
||||
title: Tiny AutoEncoder
|
||||
- local: api/models/consistency_decoder_vae
|
||||
title: ConsistencyDecoderVAE
|
||||
- local: api/models/transformer2d
|
||||
title: Transformer2D
|
||||
- local: api/models/transformer_temporal
|
||||
@@ -344,6 +348,8 @@
|
||||
title: Overview
|
||||
- local: api/schedulers/cm_stochastic_iterative
|
||||
title: CMStochasticIterativeScheduler
|
||||
- local: api/schedulers/consistency_decoder
|
||||
title: ConsistencyDecoderScheduler
|
||||
- local: api/schedulers/ddim_inverse
|
||||
title: DDIMInverseScheduler
|
||||
- local: api/schedulers/ddim
|
||||
|
||||
@@ -0,0 +1,18 @@
|
||||
# Consistency Decoder
|
||||
|
||||
Consistency decoder can be used to decode the latents from the denoising UNet in the [`StableDiffusionPipeline`]. This decoder was introduced in the [DALL-E 3 technical report](https://openai.com/dall-e-3).
|
||||
|
||||
The original codebase can be found at [openai/consistencydecoder](https://github.com/openai/consistencydecoder).
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Inference is only supported for 2 iterations as of now.
|
||||
|
||||
</Tip>
|
||||
|
||||
The pipeline could not have been contributed without the help of [madebyollin](https://github.com/madebyollin) and [mrsteyk](https://github.com/mrsteyk) from [this issue](https://github.com/openai/consistencydecoder/issues/1).
|
||||
|
||||
## ConsistencyDecoderVAE
|
||||
[[autodoc]] ConsistencyDecoderVAE
|
||||
- all
|
||||
- decode
|
||||
@@ -0,0 +1,9 @@
|
||||
# ConsistencyDecoderScheduler
|
||||
|
||||
This scheduler is a part of the [`ConsistencyDecoderPipeline`] and was introduced in [DALL-E 3](https://openai.com/dall-e-3).
|
||||
|
||||
The original codebase can be found at [openai/consistency_models](https://github.com/openai/consistency_models).
|
||||
|
||||
|
||||
## ConsistencyDecoderScheduler
|
||||
[[autodoc]] schedulers.scheduling_consistency_decoder.ConsistencyDecoderScheduler
|
||||
@@ -12,9 +12,9 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
# Inference with PEFT
|
||||
# Load LoRAs for inference
|
||||
|
||||
There are many adapters trained in different styles to achieve different effects. You can even combine multiple adapters to create new and unique images. With the 🤗 [PEFT](https://huggingface.co/docs/peft/index) integration in 🤗 Diffusers, it is really easy to load and manage adapters for inference. In this guide, you'll learn how to use different adapters with [Stable Diffusion XL (SDXL)](../api/pipelines/stable_diffusion/stable_diffusion_xl) for inference.
|
||||
There are many adapters (with LoRAs being the most common type) trained in different styles to achieve different effects. You can even combine multiple adapters to create new and unique images. With the 🤗 [PEFT](https://huggingface.co/docs/peft/index) integration in 🤗 Diffusers, it is really easy to load and manage adapters for inference. In this guide, you'll learn how to use different adapters with [Stable Diffusion XL (SDXL)](../api/pipelines/stable_diffusion/stable_diffusion_xl) for inference.
|
||||
|
||||
Throughout this guide, you'll use LoRA as the main adapter technique, so we'll use the terms LoRA and adapter interchangeably. You should have some familiarity with LoRA, and if you don't, we welcome you to check out the [LoRA guide](https://huggingface.co/docs/peft/conceptual_guides/lora).
|
||||
|
||||
@@ -22,9 +22,8 @@ Let's first install all the required libraries.
|
||||
|
||||
```bash
|
||||
!pip install -q transformers accelerate
|
||||
# Will be updated once the stable releases are done.
|
||||
!pip install -q git+https://github.com/huggingface/peft.git
|
||||
!pip install -q git+https://github.com/huggingface/diffusers.git
|
||||
!pip install peft
|
||||
!pip install diffusers
|
||||
```
|
||||
|
||||
Now, let's load a pipeline with a SDXL checkpoint:
|
||||
@@ -165,3 +164,22 @@ list_adapters_component_wise = pipe.get_list_adapters()
|
||||
list_adapters_component_wise
|
||||
{"text_encoder": ["toy", "pixel"], "unet": ["toy", "pixel"], "text_encoder_2": ["toy", "pixel"]}
|
||||
```
|
||||
|
||||
## Fusing adapters into the model
|
||||
|
||||
You can use PEFT to easily fuse/unfuse multiple adapters directly into the model weights (both UNet and text encoder) using the [`~diffusers.loaders.LoraLoaderMixin.fuse_lora`] method, which can lead to a speed-up in inference and lower VRAM usage.
|
||||
|
||||
```py
|
||||
pipe.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
|
||||
pipe.load_lora_weights("CiroN2022/toy-face", weight_name="toy_face_sdxl.safetensors", adapter_name="toy")
|
||||
|
||||
pipe.set_adapters(["pixel", "toy"], adapter_weights=[0.5, 1.0])
|
||||
# Fuses the LoRAs into the Unet
|
||||
pipe.fuse_lora()
|
||||
|
||||
prompt = "toy_face of a hacker with a hoodie, pixel art"
|
||||
image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
|
||||
|
||||
# Gets the Unet back to the original state
|
||||
pipe.unfuse_lora()
|
||||
```
|
||||
|
||||
@@ -30,6 +30,7 @@ You can generate images from a prompt in 🤗 Diffusers in two steps:
|
||||
|
||||
```py
|
||||
from diffusers import AutoPipelineForText2Image
|
||||
import torch
|
||||
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, variant="fp16"
|
||||
@@ -42,6 +43,7 @@ pipeline = AutoPipelineForText2Image.from_pretrained(
|
||||
image = pipeline(
|
||||
"stained glass of darth vader, backlight, centered composition, masterpiece, photorealistic, 8k"
|
||||
).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
@@ -65,6 +67,7 @@ pipeline = AutoPipelineForText2Image.from_pretrained(
|
||||
).to("cuda")
|
||||
generator = torch.Generator("cuda").manual_seed(31)
|
||||
image = pipeline("Astronaut in a jungle, cold color palette, muted colors, detailed, 8k", generator=generator).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
### Stable Diffusion XL
|
||||
@@ -80,6 +83,7 @@ pipeline = AutoPipelineForText2Image.from_pretrained(
|
||||
).to("cuda")
|
||||
generator = torch.Generator("cuda").manual_seed(31)
|
||||
image = pipeline("Astronaut in a jungle, cold color palette, muted colors, detailed, 8k", generator=generator).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
### Kandinsky 2.2
|
||||
@@ -93,15 +97,16 @@ from diffusers import AutoPipelineForText2Image
|
||||
import torch
|
||||
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained(
|
||||
"kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16, variant="fp16"
|
||||
"kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
generator = torch.Generator("cuda").manual_seed(31)
|
||||
image = pipeline("Astronaut in a jungle, cold color palette, muted colors, detailed, 8k", generator=generator).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
### ControlNet
|
||||
|
||||
ControlNet are auxiliary models or adapters that are finetuned on top of text-to-image models, such as [Stable Diffusion V1.5](https://huggingface.co/runwayml/stable-diffusion-v1-5). Using ControlNet models in combination with text-to-image models offers diverse options for more explicit control over how to generate an image. With ControlNet's, you add an additional conditioning input image to the model. For example, if you provide an image of a human pose (usually represented as multiple keypoints that are connected into a skeleton) as a conditioning input, the model generates an image that follows the pose of the image. Check out the more in-depth [ControlNet](controlnet) guide to learn more about other conditioning inputs and how to use them.
|
||||
ControlNet models are auxiliary models or adapters that are finetuned on top of text-to-image models, such as [Stable Diffusion v1.5](https://huggingface.co/runwayml/stable-diffusion-v1-5). Using ControlNet models in combination with text-to-image models offers diverse options for more explicit control over how to generate an image. With ControlNet, you add an additional conditioning input image to the model. For example, if you provide an image of a human pose (usually represented as multiple keypoints that are connected into a skeleton) as a conditioning input, the model generates an image that follows the pose of the image. Check out the more in-depth [ControlNet](controlnet) guide to learn more about other conditioning inputs and how to use them.
|
||||
|
||||
In this example, let's condition the ControlNet with a human pose estimation image. Load the ControlNet model pretrained on human pose estimations:
|
||||
|
||||
@@ -124,6 +129,7 @@ pipeline = AutoPipelineForText2Image.from_pretrained(
|
||||
).to("cuda")
|
||||
generator = torch.Generator("cuda").manual_seed(31)
|
||||
image = pipeline("Astronaut in a jungle, cold color palette, muted colors, detailed, 8k", image=pose_image, generator=generator).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
@@ -163,6 +169,7 @@ pipeline = AutoPipelineForText2Image.from_pretrained(
|
||||
image = pipeline(
|
||||
"Astronaut in a jungle, cold color palette, muted colors, detailed, 8k", height=768, width=512
|
||||
).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
@@ -171,7 +178,7 @@ image = pipeline(
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Other models may have different default image sizes depending on the image size's in the training dataset. For example, SDXL's default image size is 1024x1024 and using lower `height` and `width` values may result in lower quality images. Make sure you check the model's API reference first!
|
||||
Other models may have different default image sizes depending on the image sizes in the training dataset. For example, SDXL's default image size is 1024x1024 and using lower `height` and `width` values may result in lower quality images. Make sure you check the model's API reference first!
|
||||
|
||||
</Tip>
|
||||
|
||||
@@ -189,6 +196,7 @@ pipeline = AutoPipelineForText2Image.from_pretrained(
|
||||
image = pipeline(
|
||||
"Astronaut in a jungle, cold color palette, muted colors, detailed, 8k", guidance_scale=3.5
|
||||
).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
@@ -221,16 +229,17 @@ image = pipeline(
|
||||
prompt="Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
|
||||
negative_prompt="ugly, deformed, disfigured, poor details, bad anatomy",
|
||||
).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/text2img-neg-prompt-1.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">negative prompt = "ugly, deformed, disfigured, poor details, bad anatomy"</figcaption>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">negative_prompt = "ugly, deformed, disfigured, poor details, bad anatomy"</figcaption>
|
||||
</div>
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/text2img-neg-prompt-2.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">negative prompt = "astronaut"</figcaption>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">negative_prompt = "astronaut"</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -252,6 +261,7 @@ image = pipeline(
|
||||
"Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
|
||||
generator=generator,
|
||||
).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
## Control image generation
|
||||
@@ -278,14 +288,14 @@ pipeline = AutoPipelineForText2Image.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
image = pipeline(
|
||||
prompt_emebds=prompt_embeds, # generated from Compel
|
||||
prompt_embeds=prompt_embeds, # generated from Compel
|
||||
negative_prompt_embeds=negative_prompt_embeds, # generated from Compel
|
||||
).images[0]
|
||||
```
|
||||
|
||||
### ControlNet
|
||||
|
||||
As you saw in the [ControlNet](#controlnet) section, these models offer a more flexible and accurate way to generate images by incorporating an additional conditioning image input. Each ControlNet model is pretrained on a particular type of conditioning image to generate new images that resemble it. For example, if you take a ControlNet pretrained on depth maps, you can give the model a depth map as a conditioning input and it'll generate an image that preserves the spatial information in it. This is quicker and easier than specifying the depth information in a prompt. You can even combine multiple conditioning inputs with a [MultiControlNet](controlnet#multicontrolnet)!
|
||||
As you saw in the [ControlNet](#controlnet) section, these models offer a more flexible and accurate way to generate images by incorporating an additional conditioning image input. Each ControlNet model is pretrained on a particular type of conditioning image to generate new images that resemble it. For example, if you take a ControlNet model pretrained on depth maps, you can give the model a depth map as a conditioning input and it'll generate an image that preserves the spatial information in it. This is quicker and easier than specifying the depth information in a prompt. You can even combine multiple conditioning inputs with a [MultiControlNet](controlnet#multicontrolnet)!
|
||||
|
||||
There are many types of conditioning inputs you can use, and 🤗 Diffusers supports ControlNet for Stable Diffusion and SDXL models. Take a look at the more comprehensive [ControlNet](controlnet) guide to learn how you can use these models.
|
||||
|
||||
@@ -300,7 +310,7 @@ from diffusers import AutoPipelineForText2Image
|
||||
import torch
|
||||
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, variant="fp16").to("cuda")
|
||||
pipeline.unet = torch.compile(pipeline.unet, mode="reduce-overheard", fullgraph=True)
|
||||
pipeline.unet = torch.compile(pipeline.unet, mode="reduce-overhead", fullgraph=True)
|
||||
```
|
||||
|
||||
For more tips on how to optimize your code to save memory and speed up inference, read the [Memory and speed](../optimization/fp16) and [Torch 2.0](../optimization/torch2.0) guides.
|
||||
For more tips on how to optimize your code to save memory and speed up inference, read the [Memory and speed](../optimization/fp16) and [Torch 2.0](../optimization/torch2.0) guides.
|
||||
|
||||
@@ -20,12 +20,10 @@ Start by creating an instance of the [`StableDiffusionDepth2ImgPipeline`]:
|
||||
|
||||
```python
|
||||
import torch
|
||||
import requests
|
||||
from PIL import Image
|
||||
|
||||
from diffusers import StableDiffusionDepth2ImgPipeline
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
pipe = StableDiffusionDepth2ImgPipeline.from_pretrained(
|
||||
pipeline = StableDiffusionDepth2ImgPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-2-depth",
|
||||
torch_dtype=torch.float16,
|
||||
use_safetensors=True,
|
||||
@@ -36,22 +34,13 @@ Now pass your prompt to the pipeline. You can also pass a `negative_prompt` to p
|
||||
|
||||
```python
|
||||
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
||||
init_image = Image.open(requests.get(url, stream=True).raw)
|
||||
init_image = load_image(url)
|
||||
prompt = "two tigers"
|
||||
n_prompt = "bad, deformed, ugly, bad anatomy"
|
||||
image = pipe(prompt=prompt, image=init_image, negative_prompt=n_prompt, strength=0.7).images[0]
|
||||
image
|
||||
negative_prompt = "bad, deformed, ugly, bad anatomy"
|
||||
image = pipeline(prompt=prompt, image=init_image, negative_prompt=negative_prompt, strength=0.7).images[0]
|
||||
make_image_grid([init_image, image], rows=1, cols=2)
|
||||
```
|
||||
|
||||
| Input | Output |
|
||||
|---------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/coco-cats.png" width="500"/> | <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/depth2img-tigers.png" width="500"/> |
|
||||
|
||||
Play around with the Spaces below and see if you notice a difference between generated images with and without a depth map!
|
||||
|
||||
<iframe
|
||||
src="https://radames-stable-diffusion-depth2img.hf.space"
|
||||
frameborder="0"
|
||||
width="850"
|
||||
height="500"
|
||||
></iframe>
|
||||
|
||||
@@ -21,13 +21,15 @@ With 🤗 Diffusers, this is as easy as 1-2-3:
|
||||
1. Load a checkpoint into the [`AutoPipelineForImage2Image`] class; this pipeline automatically handles loading the correct pipeline class based on the checkpoint:
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AutoPipelineForImage2Image
|
||||
from diffusers.utils import load_image
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
pipeline = AutoPipelineForImage2Image.from_pretrained(
|
||||
"kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
|
||||
"kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16, use_safetensors=True
|
||||
).to("cuda")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
|
||||
pipeline.enable_xformers_memory_efficient_attention()
|
||||
```
|
||||
|
||||
@@ -48,7 +50,7 @@ init_image = load_image("https://huggingface.co/datasets/huggingface/documentati
|
||||
```py
|
||||
prompt = "cat wizard, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney, 8k"
|
||||
image = pipeline(prompt, image=init_image).images[0]
|
||||
image
|
||||
make_image_grid([init_image, image], rows=1, cols=2)
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
@@ -72,27 +74,25 @@ Stable Diffusion v1.5 is a latent diffusion model initialized from an earlier ch
|
||||
|
||||
```py
|
||||
import torch
|
||||
import requests
|
||||
from PIL import Image
|
||||
from io import BytesIO
|
||||
from diffusers import AutoPipelineForImage2Image
|
||||
from diffusers.utils import make_image_grid, load_image
|
||||
|
||||
pipeline = AutoPipelineForImage2Image.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
|
||||
).to("cuda")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
|
||||
pipeline.enable_xformers_memory_efficient_attention()
|
||||
|
||||
# prepare image
|
||||
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png"
|
||||
response = requests.get(url)
|
||||
init_image = Image.open(BytesIO(response.content)).convert("RGB")
|
||||
init_image = load_image(url)
|
||||
|
||||
prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
|
||||
|
||||
# pass prompt and image to pipeline
|
||||
image = pipeline(prompt, image=init_image).images[0]
|
||||
image
|
||||
make_image_grid([init_image, image], rows=1, cols=2)
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
@@ -112,27 +112,25 @@ SDXL is a more powerful version of the Stable Diffusion model. It uses a larger
|
||||
|
||||
```py
|
||||
import torch
|
||||
import requests
|
||||
from PIL import Image
|
||||
from io import BytesIO
|
||||
from diffusers import AutoPipelineForImage2Image
|
||||
from diffusers.utils import make_image_grid, load_image
|
||||
|
||||
pipeline = AutoPipelineForImage2Image.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-refiner-1.0", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
|
||||
).to("cuda")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
|
||||
pipeline.enable_xformers_memory_efficient_attention()
|
||||
|
||||
# prepare image
|
||||
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-sdxl-init.png"
|
||||
response = requests.get(url)
|
||||
init_image = Image.open(BytesIO(response.content)).convert("RGB")
|
||||
init_image = load_image(url)
|
||||
|
||||
prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
|
||||
|
||||
# pass prompt and image to pipeline
|
||||
image = pipeline(prompt, image=init_image, strength=0.5).images[0]
|
||||
image
|
||||
make_image_grid([init_image, image], rows=1, cols=2)
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
@@ -154,27 +152,25 @@ The simplest way to use Kandinsky 2.2 is:
|
||||
|
||||
```py
|
||||
import torch
|
||||
import requests
|
||||
from PIL import Image
|
||||
from io import BytesIO
|
||||
from diffusers import AutoPipelineForImage2Image
|
||||
from diffusers.utils import make_image_grid, load_image
|
||||
|
||||
pipeline = AutoPipelineForImage2Image.from_pretrained(
|
||||
"kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
|
||||
"kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16, use_safetensors=True
|
||||
).to("cuda")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
|
||||
pipeline.enable_xformers_memory_efficient_attention()
|
||||
|
||||
# prepare image
|
||||
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png"
|
||||
response = requests.get(url)
|
||||
init_image = Image.open(BytesIO(response.content)).convert("RGB")
|
||||
init_image = load_image(url)
|
||||
|
||||
prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
|
||||
|
||||
# pass prompt and image to pipeline
|
||||
image = pipeline(prompt, image=init_image).images[0]
|
||||
image
|
||||
make_image_grid([init_image, image], rows=1, cols=2)
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
@@ -199,32 +195,29 @@ There are several important parameters you can configure in the pipeline that'll
|
||||
- 📈 a higher `strength` value gives the model more "creativity" to generate an image that's different from the initial image; a `strength` value of 1.0 means the initial image is more or less ignored
|
||||
- 📉 a lower `strength` value means the generated image is more similar to the initial image
|
||||
|
||||
The `strength` and `num_inference_steps` parameter are related because `strength` determines the number of noise steps to add. For example, if the `num_inference_steps` is 50 and `strength` is 0.8, then this means adding 40 (50 * 0.8) steps of noise to the initial image and then denoising for 40 steps to get the newly generated image.
|
||||
The `strength` and `num_inference_steps` parameters are related because `strength` determines the number of noise steps to add. For example, if the `num_inference_steps` is 50 and `strength` is 0.8, then this means adding 40 (50 * 0.8) steps of noise to the initial image and then denoising for 40 steps to get the newly generated image.
|
||||
|
||||
```py
|
||||
import torch
|
||||
import requests
|
||||
from PIL import Image
|
||||
from io import BytesIO
|
||||
from diffusers import AutoPipelineForImage2Image
|
||||
from diffusers.utils import make_image_grid, load_image
|
||||
|
||||
pipeline = AutoPipelineForImage2Image.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
|
||||
).to("cuda")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
|
||||
pipeline.enable_xformers_memory_efficient_attention()
|
||||
|
||||
# prepare image
|
||||
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png"
|
||||
response = requests.get(url)
|
||||
init_image = Image.open(BytesIO(response.content)).convert("RGB")
|
||||
init_image = load_image(url)
|
||||
|
||||
prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
|
||||
image = init_image
|
||||
|
||||
# pass prompt and image to pipeline
|
||||
image = pipeline(prompt, image=init_image, strength=0.8).images[0]
|
||||
image
|
||||
make_image_grid([init_image, image], rows=1, cols=2)
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
@@ -250,27 +243,25 @@ You can combine `guidance_scale` with `strength` for even more precise control o
|
||||
|
||||
```py
|
||||
import torch
|
||||
import requests
|
||||
from PIL import Image
|
||||
from io import BytesIO
|
||||
from diffusers import AutoPipelineForImage2Image
|
||||
from diffusers.utils import make_image_grid, load_image
|
||||
|
||||
pipeline = AutoPipelineForImage2Image.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
|
||||
).to("cuda")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
|
||||
pipeline.enable_xformers_memory_efficient_attention()
|
||||
|
||||
# prepare image
|
||||
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png"
|
||||
response = requests.get(url)
|
||||
init_image = Image.open(BytesIO(response.content)).convert("RGB")
|
||||
init_image = load_image(url)
|
||||
|
||||
prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
|
||||
|
||||
# pass prompt and image to pipeline
|
||||
image = pipeline(prompt, image=init_image, guidance_scale=8.0).images[0]
|
||||
image
|
||||
make_image_grid([init_image, image], rows=1, cols=2)
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
@@ -294,38 +285,36 @@ A negative prompt conditions the model to *not* include things in an image, and
|
||||
|
||||
```py
|
||||
import torch
|
||||
import requests
|
||||
from PIL import Image
|
||||
from io import BytesIO
|
||||
from diffusers import AutoPipelineForImage2Image
|
||||
from diffusers.utils import make_image_grid, load_image
|
||||
|
||||
pipeline = AutoPipelineForImage2Image.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-refiner-1.0", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
|
||||
).to("cuda")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
|
||||
pipeline.enable_xformers_memory_efficient_attention()
|
||||
|
||||
# prepare image
|
||||
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png"
|
||||
response = requests.get(url)
|
||||
init_image = Image.open(BytesIO(response.content)).convert("RGB")
|
||||
init_image = load_image(url)
|
||||
|
||||
prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
|
||||
negative_prompt = "ugly, deformed, disfigured, poor details, bad anatomy"
|
||||
|
||||
# pass prompt and image to pipeline
|
||||
image = pipeline(prompt, negative_prompt=negative_prompt, image=init_image).images[0]
|
||||
image
|
||||
make_image_grid([init_image, image], rows=1, cols=2)
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-negative-1.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">negative prompt = "ugly, deformed, disfigured, poor details, bad anatomy"</figcaption>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">negative_prompt = "ugly, deformed, disfigured, poor details, bad anatomy"</figcaption>
|
||||
</div>
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-negative-2.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">negative prompt = "jungle"</figcaption>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">negative_prompt = "jungle"</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -342,52 +331,54 @@ Start by generating an image with the text-to-image pipeline:
|
||||
```py
|
||||
from diffusers import AutoPipelineForText2Image, AutoPipelineForImage2Image
|
||||
import torch
|
||||
from diffusers.utils import make_image_grid
|
||||
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
|
||||
).to("cuda")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
|
||||
pipeline.enable_xformers_memory_efficient_attention()
|
||||
|
||||
image = pipeline("Astronaut in a jungle, cold color palette, muted colors, detailed, 8k").images[0]
|
||||
text2image = pipeline("Astronaut in a jungle, cold color palette, muted colors, detailed, 8k").images[0]
|
||||
text2image
|
||||
```
|
||||
|
||||
Now you can pass this generated image to the image-to-image pipeline:
|
||||
|
||||
```py
|
||||
pipeline = AutoPipelineForImage2Image.from_pretrained(
|
||||
"kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
|
||||
"kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16, use_safetensors=True
|
||||
).to("cuda")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
|
||||
pipeline.enable_xformers_memory_efficient_attention()
|
||||
|
||||
image = pipeline("Astronaut in a jungle, cold color palette, muted colors, detailed, 8k", image=image).images[0]
|
||||
image
|
||||
image2image = pipeline("Astronaut in a jungle, cold color palette, muted colors, detailed, 8k", image=text2image).images[0]
|
||||
make_image_grid([text2image, image2image], rows=1, cols=2)
|
||||
```
|
||||
|
||||
### Image-to-image-to-image
|
||||
|
||||
You can also chain multiple image-to-image pipelines together to create more interesting images. This can be useful for iteratively performing style transfer on an image, generate short GIFs, restore color to an image, or restore missing areas of an image.
|
||||
You can also chain multiple image-to-image pipelines together to create more interesting images. This can be useful for iteratively performing style transfer on an image, generating short GIFs, restoring color to an image, or restoring missing areas of an image.
|
||||
|
||||
Start by generating an image:
|
||||
|
||||
```py
|
||||
import torch
|
||||
import requests
|
||||
from PIL import Image
|
||||
from io import BytesIO
|
||||
from diffusers import AutoPipelineForImage2Image
|
||||
from diffusers.utils import make_image_grid, load_image
|
||||
|
||||
pipeline = AutoPipelineForImage2Image.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
|
||||
).to("cuda")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
|
||||
pipeline.enable_xformers_memory_efficient_attention()
|
||||
|
||||
# prepare image
|
||||
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png"
|
||||
response = requests.get(url)
|
||||
init_image = Image.open(BytesIO(response.content)).convert("RGB")
|
||||
init_image = load_image(url)
|
||||
|
||||
prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
|
||||
|
||||
@@ -404,10 +395,11 @@ It is important to specify `output_type="latent"` in the pipeline to keep all th
|
||||
Pass the latent output from this pipeline to the next pipeline to generate an image in a [comic book art style](https://huggingface.co/ogkalu/Comic-Diffusion):
|
||||
|
||||
```py
|
||||
pipelne = AutoPipelineForImage2Image.from_pretrained(
|
||||
"ogkalu/Comic-Diffusion", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
|
||||
pipeline = AutoPipelineForImage2Image.from_pretrained(
|
||||
"ogkalu/Comic-Diffusion", torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
|
||||
pipeline.enable_xformers_memory_efficient_attention()
|
||||
|
||||
# need to include the token "charliebo artstyle" in the prompt to use this checkpoint
|
||||
@@ -418,14 +410,15 @@ Repeat one more time to generate the final image in a [pixel art style](https://
|
||||
|
||||
```py
|
||||
pipeline = AutoPipelineForImage2Image.from_pretrained(
|
||||
"kohbanye/pixel-art-style", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
|
||||
"kohbanye/pixel-art-style", torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
|
||||
pipeline.enable_xformers_memory_efficient_attention()
|
||||
|
||||
# need to include the token "pixelartstyle" in the prompt to use this checkpoint
|
||||
image = pipeline("Astronaut in a jungle, pixelartstyle", image=image).images[0]
|
||||
image
|
||||
make_image_grid([init_image, image], rows=1, cols=2)
|
||||
```
|
||||
|
||||
### Image-to-upscaler-to-super-resolution
|
||||
@@ -436,21 +429,19 @@ Start with an image-to-image pipeline:
|
||||
|
||||
```py
|
||||
import torch
|
||||
import requests
|
||||
from PIL import Image
|
||||
from io import BytesIO
|
||||
from diffusers import AutoPipelineForImage2Image
|
||||
from diffusers.utils import make_image_grid, load_image
|
||||
|
||||
pipeline = AutoPipelineForImage2Image.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
|
||||
).to("cuda")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
|
||||
pipeline.enable_xformers_memory_efficient_attention()
|
||||
|
||||
# prepare image
|
||||
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png"
|
||||
response = requests.get(url)
|
||||
init_image = Image.open(BytesIO(response.content)).convert("RGB")
|
||||
init_image = load_image(url)
|
||||
|
||||
prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
|
||||
|
||||
@@ -467,7 +458,9 @@ It is important to specify `output_type="latent"` in the pipeline to keep all th
|
||||
Chain it to an upscaler pipeline to increase the image resolution:
|
||||
|
||||
```py
|
||||
upscaler = AutoPipelineForImage2Image.from_pretrained(
|
||||
from diffusers import StableDiffusionLatentUpscalePipeline
|
||||
|
||||
upscaler = StableDiffusionLatentUpscalePipeline.from_pretrained(
|
||||
"stabilityai/sd-x2-latent-upscaler", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
|
||||
).to("cuda")
|
||||
upscaler.enable_model_cpu_offload()
|
||||
@@ -479,14 +472,16 @@ image_2 = upscaler(prompt, image=image_1, output_type="latent").images[0]
|
||||
Finally, chain it to a super-resolution pipeline to further enhance the resolution:
|
||||
|
||||
```py
|
||||
super_res = AutoPipelineForImage2Image.from_pretrained(
|
||||
from diffusers import StableDiffusionUpscalePipeline
|
||||
|
||||
super_res = StableDiffusionUpscalePipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-x4-upscaler", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
|
||||
).to("cuda")
|
||||
super_res.enable_model_cpu_offload()
|
||||
super_res.enable_xformers_memory_efficient_attention()
|
||||
|
||||
image_3 = upscaler(prompt, image=image_2).images[0]
|
||||
image_3
|
||||
image_3 = super_res(prompt, image=image_2).images[0]
|
||||
make_image_grid([init_image, image_3.resize((512, 512))], rows=1, cols=2)
|
||||
```
|
||||
|
||||
## Control image generation
|
||||
@@ -504,13 +499,14 @@ from diffusers import AutoPipelineForImage2Image
|
||||
import torch
|
||||
|
||||
pipeline = AutoPipelineForImage2Image.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
|
||||
"runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
|
||||
).to("cuda")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
|
||||
pipeline.enable_xformers_memory_efficient_attention()
|
||||
|
||||
image = pipeline(prompt_emebds=prompt_embeds, # generated from Compel
|
||||
negative_prompt_embeds, # generated from Compel
|
||||
image = pipeline(prompt_embeds=prompt_embeds, # generated from Compel
|
||||
negative_prompt_embeds=negative_prompt_embeds, # generated from Compel
|
||||
image=init_image,
|
||||
).images[0]
|
||||
```
|
||||
@@ -522,19 +518,20 @@ ControlNets provide a more flexible and accurate way to control image generation
|
||||
For example, let's condition an image with a depth map to keep the spatial information in the image.
|
||||
|
||||
```py
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
# prepare image
|
||||
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png"
|
||||
response = requests.get(url)
|
||||
init_image = Image.open(BytesIO(response.content)).convert("RGB")
|
||||
init_image = load_image(url)
|
||||
init_image = init_image.resize((958, 960)) # resize to depth image dimensions
|
||||
depth_image = load_image("https://huggingface.co/lllyasviel/control_v11f1p_sd15_depth/resolve/main/images/control.png")
|
||||
make_image_grid([init_image, depth_image], rows=1, cols=2)
|
||||
```
|
||||
|
||||
Load a ControlNet model conditioned on depth maps and the [`AutoPipelineForImage2Image`]:
|
||||
|
||||
```py
|
||||
from diffusers import ControlNetModel, AutoPipelineForImage2Image
|
||||
from diffusers.utils import load_image
|
||||
import torch
|
||||
|
||||
controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11f1p_sd15_depth", torch_dtype=torch.float16, variant="fp16", use_safetensors=True)
|
||||
@@ -542,6 +539,7 @@ pipeline = AutoPipelineForImage2Image.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16, variant="fp16", use_safetensors=True
|
||||
).to("cuda")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
|
||||
pipeline.enable_xformers_memory_efficient_attention()
|
||||
```
|
||||
|
||||
@@ -549,8 +547,8 @@ Now generate a new image conditioned on the depth map, initial image, and prompt
|
||||
|
||||
```py
|
||||
prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
|
||||
image = pipeline(prompt, image=init_image, control_image=depth_image).images[0]
|
||||
image
|
||||
image_control_net = pipeline(prompt, image=init_image, control_image=depth_image).images[0]
|
||||
make_image_grid([init_image, depth_image, image_control_net], rows=1, cols=3)
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
@@ -575,13 +573,14 @@ pipeline = AutoPipelineForImage2Image.from_pretrained(
|
||||
"nitrosocke/elden-ring-diffusion", torch_dtype=torch.float16,
|
||||
).to("cuda")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
|
||||
pipeline.enable_xformers_memory_efficient_attention()
|
||||
|
||||
prompt = "elden ring style astronaut in a jungle" # include the token "elden ring style" in the prompt
|
||||
negative_prompt = "ugly, deformed, disfigured, poor details, bad anatomy"
|
||||
|
||||
image = pipeline(prompt, negative_prompt=negative_prompt, image=init_image, strength=0.45, guidance_scale=10.5).images[0]
|
||||
image
|
||||
image_elden_ring = pipeline(prompt, negative_prompt=negative_prompt, image=image_control_net, strength=0.45, guidance_scale=10.5).images[0]
|
||||
make_image_grid([init_image, depth_image, image_control_net, image_elden_ring], rows=2, cols=2)
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
@@ -597,10 +596,10 @@ Running diffusion models is computationally expensive and intensive, but with a
|
||||
+ pipeline.enable_xformers_memory_efficient_attention()
|
||||
```
|
||||
|
||||
With [`torch.compile`](../optimization/torch2.0#torch.compile), you can boost your inference speed even more by wrapping your UNet with it:
|
||||
With [`torch.compile`](../optimization/torch2.0#torchcompile), you can boost your inference speed even more by wrapping your UNet with it:
|
||||
|
||||
```py
|
||||
pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
|
||||
pipeline.unet = torch.compile(pipeline.unet, mode="reduce-overhead", fullgraph=True)
|
||||
```
|
||||
|
||||
To learn more, take a look at the [Reduce memory usage](../optimization/memory) and [Torch 2.0](../optimization/torch2.0) guides.
|
||||
|
||||
@@ -23,12 +23,13 @@ With 🤗 Diffusers, here is how you can do inpainting:
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AutoPipelineForInpainting
|
||||
from diffusers.utils import load_image
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
pipeline = AutoPipelineForInpainting.from_pretrained(
|
||||
"kandinsky-community/kandinsky-2-2-decoder-inpaint", torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
|
||||
pipeline.enable_xformers_memory_efficient_attention()
|
||||
```
|
||||
|
||||
@@ -41,8 +42,8 @@ You'll notice throughout the guide, we use [`~DiffusionPipeline.enable_model_cpu
|
||||
2. Load the base and mask images:
|
||||
|
||||
```py
|
||||
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png").convert("RGB")
|
||||
mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png").convert("RGB")
|
||||
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png")
|
||||
mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png")
|
||||
```
|
||||
|
||||
3. Create a prompt to inpaint the image with and pass it to the pipeline with the base and mask images:
|
||||
@@ -51,6 +52,7 @@ mask_image = load_image("https://huggingface.co/datasets/huggingface/documentati
|
||||
prompt = "a black cat with glowing eyes, cute, adorable, disney, pixar, highly detailed, 8k"
|
||||
negative_prompt = "bad anatomy, deformed, ugly, disfigured"
|
||||
image = pipeline(prompt=prompt, negative_prompt=negative_prompt, image=init_image, mask_image=mask_image).images[0]
|
||||
make_image_grid([init_image, mask_image, image], rows=1, cols=3)
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
@@ -58,6 +60,10 @@ image = pipeline(prompt=prompt, negative_prompt=negative_prompt, image=init_imag
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">base image</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">mask image</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint-cat.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">generated image</figcaption>
|
||||
@@ -79,7 +85,7 @@ Upload a base image to inpaint on and use the sketch tool to draw a mask. Once y
|
||||
|
||||
## Popular models
|
||||
|
||||
[Stable Diffusion Inpainting](https://huggingface.co/runwayml/stable-diffusion-inpainting), [Stable Diffusion XL (SDXL) Inpainting](https://huggingface.co/diffusers/stable-diffusion-xl-1.0-inpainting-0.1), and [Kandinsky 2.2](https://huggingface.co/kandinsky-community/kandinsky-2-2-decoder-inpaint) are among the most popular models for inpainting. SDXL typically produces higher resolution images than Stable Diffusion v1.5, and Kandinsky 2.2 is also capable of generating high-quality images.
|
||||
[Stable Diffusion Inpainting](https://huggingface.co/runwayml/stable-diffusion-inpainting), [Stable Diffusion XL (SDXL) Inpainting](https://huggingface.co/diffusers/stable-diffusion-xl-1.0-inpainting-0.1), and [Kandinsky 2.2 Inpainting](https://huggingface.co/kandinsky-community/kandinsky-2-2-decoder-inpaint) are among the most popular models for inpainting. SDXL typically produces higher resolution images than Stable Diffusion v1.5, and Kandinsky 2.2 is also capable of generating high-quality images.
|
||||
|
||||
### Stable Diffusion Inpainting
|
||||
|
||||
@@ -88,21 +94,23 @@ Stable Diffusion Inpainting is a latent diffusion model finetuned on 512x512 ima
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AutoPipelineForInpainting
|
||||
from diffusers.utils import load_image
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
pipeline = AutoPipelineForInpainting.from_pretrained(
|
||||
"runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16, variant="fp16"
|
||||
).to("cuda")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
|
||||
pipeline.enable_xformers_memory_efficient_attention()
|
||||
|
||||
# load base and mask image
|
||||
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png").convert("RGB")
|
||||
mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png").convert("RGB")
|
||||
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png")
|
||||
mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png")
|
||||
|
||||
generator = torch.Generator("cuda").manual_seed(92)
|
||||
prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k"
|
||||
image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image, generator=generator).images[0]
|
||||
make_image_grid([init_image, mask_image, image], rows=1, cols=3)
|
||||
```
|
||||
|
||||
### Stable Diffusion XL (SDXL) Inpainting
|
||||
@@ -112,21 +120,23 @@ SDXL is a larger and more powerful version of Stable Diffusion v1.5. This model
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AutoPipelineForInpainting
|
||||
from diffusers.utils import load_image
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
pipeline = AutoPipelineForInpainting.from_pretrained(
|
||||
"diffusers/stable-diffusion-xl-1.0-inpainting-0.1", torch_dtype=torch.float16, variant="fp16"
|
||||
).to("cuda")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
|
||||
pipeline.enable_xformers_memory_efficient_attention()
|
||||
|
||||
# load base and mask image
|
||||
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png").convert("RGB")
|
||||
mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png").convert("RGB")
|
||||
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png")
|
||||
mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png")
|
||||
|
||||
generator = torch.Generator("cuda").manual_seed(92)
|
||||
prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k"
|
||||
image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image, generator=generator).images[0]
|
||||
make_image_grid([init_image, mask_image, image], rows=1, cols=3)
|
||||
```
|
||||
|
||||
### Kandinsky 2.2 Inpainting
|
||||
@@ -136,21 +146,23 @@ The Kandinsky model family is similar to SDXL because it uses two models as well
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AutoPipelineForInpainting
|
||||
from diffusers.utils import load_image
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
pipeline = AutoPipelineForInpainting.from_pretrained(
|
||||
"kandinsky-community/kandinsky-2-2-decoder-inpaint", torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
|
||||
pipeline.enable_xformers_memory_efficient_attention()
|
||||
|
||||
# load base and mask image
|
||||
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png").convert("RGB")
|
||||
mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png").convert("RGB")
|
||||
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png")
|
||||
mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png")
|
||||
|
||||
generator = torch.Generator("cuda").manual_seed(92)
|
||||
prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k"
|
||||
image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image, generator=generator).images[0]
|
||||
make_image_grid([init_image, mask_image, image], rows=1, cols=3)
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
@@ -186,20 +198,22 @@ Image features - like quality and "creativity" - are dependent on pipeline param
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AutoPipelineForInpainting
|
||||
from diffusers.utils import load_image
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
pipeline = AutoPipelineForInpainting.from_pretrained(
|
||||
"runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16, variant="fp16"
|
||||
).to("cuda")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
|
||||
pipeline.enable_xformers_memory_efficient_attention()
|
||||
|
||||
# load base and mask image
|
||||
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png").convert("RGB")
|
||||
mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png").convert("RGB")
|
||||
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png")
|
||||
mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png")
|
||||
|
||||
prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k"
|
||||
image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image, strength=0.6).images[0]
|
||||
make_image_grid([init_image, mask_image, image], rows=1, cols=3)
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
@@ -229,20 +243,22 @@ You can use `strength` and `guidance_scale` together for more control over how e
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AutoPipelineForInpainting
|
||||
from diffusers.utils import load_image
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
pipeline = AutoPipelineForInpainting.from_pretrained(
|
||||
"runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16, variant="fp16"
|
||||
).to("cuda")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
|
||||
pipeline.enable_xformers_memory_efficient_attention()
|
||||
|
||||
# load base and mask image
|
||||
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png").convert("RGB")
|
||||
mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png").convert("RGB")
|
||||
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png")
|
||||
mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png")
|
||||
|
||||
prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k"
|
||||
image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image, guidance_scale=2.5).images[0]
|
||||
make_image_grid([init_image, mask_image, image], rows=1, cols=3)
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
@@ -267,22 +283,23 @@ A negative prompt assumes the opposite role of a prompt; it guides the model awa
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AutoPipelineForInpainting
|
||||
from diffusers.utils import load_image
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
pipeline = AutoPipelineForInpainting.from_pretrained(
|
||||
"runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16, variant="fp16"
|
||||
).to("cuda")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
|
||||
pipeline.enable_xformers_memory_efficient_attention()
|
||||
|
||||
# load base and mask image
|
||||
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png").convert("RGB")
|
||||
mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png").convert("RGB")
|
||||
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png")
|
||||
mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png")
|
||||
|
||||
prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k"
|
||||
negative_prompt = "bad architecture, unstable, poor details, blurry"
|
||||
image = pipeline(prompt=prompt, negative_prompt=negative_prompt, image=init_image, mask_image=mask_image).images[0]
|
||||
image
|
||||
make_image_grid([init_image, mask_image, image], rows=1, cols=3)
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
@@ -302,7 +319,7 @@ import numpy as np
|
||||
import torch
|
||||
|
||||
from diffusers import AutoPipelineForInpainting
|
||||
from diffusers.utils import load_image
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
device = "cuda"
|
||||
pipeline = AutoPipelineForInpainting.from_pretrained(
|
||||
@@ -334,6 +351,7 @@ mask_image_arr[mask_image_arr >= 0.5] = 1
|
||||
unmasked_unchanged_image_arr = (1 - mask_image_arr) * init_image + mask_image_arr * repainted_image
|
||||
unmasked_unchanged_image = PIL.Image.fromarray(unmasked_unchanged_image_arr.round().astype("uint8"))
|
||||
unmasked_unchanged_image.save("force_unmasked_unchanged.png")
|
||||
make_image_grid([init_image, mask_image, repainted_image, unmasked_unchanged_image], rows=2, cols=2)
|
||||
```
|
||||
|
||||
## Chained inpainting pipelines
|
||||
@@ -349,35 +367,37 @@ Start with the text-to-image pipeline to create a castle:
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AutoPipelineForText2Image, AutoPipelineForInpainting
|
||||
from diffusers.utils import load_image
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
|
||||
).to("cuda")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
|
||||
pipeline.enable_xformers_memory_efficient_attention()
|
||||
|
||||
image = pipeline("concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k").images[0]
|
||||
text2image = pipeline("concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k").images[0]
|
||||
```
|
||||
|
||||
Load the mask image of the output from above:
|
||||
|
||||
```py
|
||||
mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_text-chain-mask.png").convert("RGB")
|
||||
mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_text-chain-mask.png")
|
||||
```
|
||||
|
||||
And let's inpaint the masked area with a waterfall:
|
||||
|
||||
```py
|
||||
pipeline = AutoPipelineForInpainting.from_pretrained(
|
||||
"kandinsky-community/kandinsky-2-2-decoder-inpaint", torch_dtype=torch.float16, variant="fp16"
|
||||
"kandinsky-community/kandinsky-2-2-decoder-inpaint", torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
|
||||
pipeline.enable_xformers_memory_efficient_attention()
|
||||
|
||||
prompt = "digital painting of a fantasy waterfall, cloudy"
|
||||
image = pipeline(prompt=prompt, image=image, mask_image=mask_image).images[0]
|
||||
image
|
||||
image = pipeline(prompt=prompt, image=text2image, mask_image=mask_image).images[0]
|
||||
make_image_grid([text2image, mask_image, image], rows=1, cols=3)
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
@@ -391,7 +411,6 @@ image
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
### Inpaint-to-image-to-image
|
||||
|
||||
You can also chain an inpainting pipeline before another pipeline like image-to-image or an upscaler to improve the quality.
|
||||
@@ -401,23 +420,24 @@ Begin by inpainting an image:
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AutoPipelineForInpainting, AutoPipelineForImage2Image
|
||||
from diffusers.utils import load_image
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
pipeline = AutoPipelineForInpainting.from_pretrained(
|
||||
"runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16, variant="fp16"
|
||||
).to("cuda")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
|
||||
pipeline.enable_xformers_memory_efficient_attention()
|
||||
|
||||
# load base and mask image
|
||||
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png").convert("RGB")
|
||||
mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png").convert("RGB")
|
||||
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png")
|
||||
mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png")
|
||||
|
||||
prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k"
|
||||
image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image).images[0]
|
||||
image_inpainting = pipeline(prompt=prompt, image=init_image, mask_image=mask_image).images[0]
|
||||
|
||||
# resize image to 1024x1024 for SDXL
|
||||
image = image.resize((1024, 1024))
|
||||
image_inpainting = image_inpainting.resize((1024, 1024))
|
||||
```
|
||||
|
||||
Now let's pass the image to another inpainting pipeline with SDXL's refiner model to enhance the image details and quality:
|
||||
@@ -427,9 +447,10 @@ pipeline = AutoPipelineForInpainting.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-refiner-1.0", torch_dtype=torch.float16, variant="fp16"
|
||||
).to("cuda")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
|
||||
pipeline.enable_xformers_memory_efficient_attention()
|
||||
|
||||
image = pipeline(prompt=prompt, image=image, mask_image=mask_image, output_type="latent").images[0]
|
||||
image = pipeline(prompt=prompt, image=image_inpainting, mask_image=mask_image, output_type="latent").images[0]
|
||||
```
|
||||
|
||||
<Tip>
|
||||
@@ -442,9 +463,11 @@ Finally, you can pass this image to an image-to-image pipeline to put the finish
|
||||
|
||||
```py
|
||||
pipeline = AutoPipelineForImage2Image.from_pipe(pipeline)
|
||||
# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
|
||||
pipeline.enable_xformers_memory_efficient_attention()
|
||||
|
||||
image = pipeline(prompt=prompt, image=image).images[0]
|
||||
make_image_grid([init_image, mask_image, image_inpainting, image], rows=2, cols=2)
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
@@ -477,18 +500,21 @@ Once you've generated the embeddings, pass them to the `prompt_embeds` (and `neg
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AutoPipelineForInpainting
|
||||
from diffusers.utils import make_image_grid
|
||||
|
||||
pipeline = AutoPipelineForInpainting.from_pretrained(
|
||||
"runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16,
|
||||
).to("cuda")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
|
||||
pipeline.enable_xformers_memory_efficient_attention()
|
||||
|
||||
image = pipeline(prompt_emebds=prompt_embeds, # generated from Compel
|
||||
negative_prompt_embeds, # generated from Compel
|
||||
image = pipeline(prompt_embeds=prompt_embeds, # generated from Compel
|
||||
negative_prompt_embeds=negative_prompt_embeds, # generated from Compel
|
||||
image=init_image,
|
||||
mask_image=mask_image
|
||||
).images[0]
|
||||
make_image_grid([init_image, mask_image, image], rows=1, cols=3)
|
||||
```
|
||||
|
||||
### ControlNet
|
||||
@@ -501,7 +527,7 @@ For example, let's condition an image with a ControlNet pretrained on inpaint im
|
||||
import torch
|
||||
import numpy as np
|
||||
from diffusers import ControlNetModel, StableDiffusionControlNetInpaintPipeline
|
||||
from diffusers.utils import load_image
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
# load ControlNet
|
||||
controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_inpaint", torch_dtype=torch.float16, variant="fp16")
|
||||
@@ -511,11 +537,12 @@ pipeline = StableDiffusionControlNetInpaintPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-inpainting", controlnet=controlnet, torch_dtype=torch.float16, variant="fp16"
|
||||
).to("cuda")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
|
||||
pipeline.enable_xformers_memory_efficient_attention()
|
||||
|
||||
# load base and mask image
|
||||
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png").convert("RGB")
|
||||
mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png").convert("RGB")
|
||||
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png")
|
||||
mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png")
|
||||
|
||||
# prepare control image
|
||||
def make_inpaint_condition(init_image, mask_image):
|
||||
@@ -536,7 +563,7 @@ Now generate an image from the base, mask and control images. You'll notice feat
|
||||
```py
|
||||
prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k"
|
||||
image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image, control_image=control_image).images[0]
|
||||
image
|
||||
make_image_grid([init_image, mask_image, PIL.Image.fromarray(np.uint8(control_image[0][0])).convert('RGB'), image], rows=2, cols=2)
|
||||
```
|
||||
|
||||
You can take this a step further and chain it with an image-to-image pipeline to apply a new [style](https://huggingface.co/nitrosocke/elden-ring-diffusion):
|
||||
@@ -548,13 +575,14 @@ pipeline = AutoPipelineForImage2Image.from_pretrained(
|
||||
"nitrosocke/elden-ring-diffusion", torch_dtype=torch.float16,
|
||||
).to("cuda")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
|
||||
pipeline.enable_xformers_memory_efficient_attention()
|
||||
|
||||
prompt = "elden ring style castle" # include the token "elden ring style" in the prompt
|
||||
negative_prompt = "bad architecture, deformed, disfigured, poor details"
|
||||
|
||||
image = pipeline(prompt, negative_prompt=negative_prompt, image=image).images[0]
|
||||
image
|
||||
image_elden_ring = pipeline(prompt, negative_prompt=negative_prompt, image=image).images[0]
|
||||
make_image_grid([init_image, mask_image, image, image_elden_ring], rows=2, cols=2)
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
@@ -576,17 +604,17 @@ image
|
||||
|
||||
It can be difficult and slow to run diffusion models if you're resource constrained, but it doesn't have to be with a few optimization tricks. One of the biggest (and easiest) optimizations you can enable is switching to memory-efficient attention. If you're using PyTorch 2.0, [scaled-dot product attention](../optimization/torch2.0#scaled-dot-product-attention) is automatically enabled and you don't need to do anything else. For non-PyTorch 2.0 users, you can install and use [xFormers](../optimization/xformers)'s implementation of memory-efficient attention. Both options reduce memory usage and accelerate inference.
|
||||
|
||||
You can also offload the model to the GPU to save even more memory:
|
||||
You can also offload the model to the CPU to save even more memory:
|
||||
|
||||
```diff
|
||||
+ pipeline.enable_xformers_memory_efficient_attention()
|
||||
+ pipeline.enable_model_cpu_offload()
|
||||
```
|
||||
|
||||
To speed-up your inference code even more, use [`torch_compile`](../optimization/torch2.0#torch.compile). You should wrap `torch.compile` around the most intensive component in the pipeline which is typically the UNet:
|
||||
To speed-up your inference code even more, use [`torch_compile`](../optimization/torch2.0#torchcompile). You should wrap `torch.compile` around the most intensive component in the pipeline which is typically the UNet:
|
||||
|
||||
```py
|
||||
pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
|
||||
pipeline.unet = torch.compile(pipeline.unet, mode="reduce-overhead", fullgraph=True)
|
||||
```
|
||||
|
||||
Learn more in the [Reduce memory usage](../optimization/memory) and [Torch 2.0](../optimization/torch2.0) guides.
|
||||
Learn more in the [Reduce memory usage](../optimization/memory) and [Torch 2.0](../optimization/torch2.0) guides.
|
||||
|
||||
@@ -0,0 +1,154 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Performing inference with LCM
|
||||
|
||||
Latent Consistency Models (LCM) enable quality image generation in typically 2-4 steps making it possible to use diffusion models in almost real-time settings.
|
||||
|
||||
From the [official website](https://latent-consistency-models.github.io/):
|
||||
|
||||
> LCMs can be distilled from any pre-trained Stable Diffusion (SD) in only 4,000 training steps (~32 A100 GPU Hours) for generating high quality 768 x 768 resolution images in 2~4 steps or even one step, significantly accelerating text-to-image generation. We employ LCM to distill the Dreamshaper-V7 version of SD in just 4,000 training iterations.
|
||||
|
||||
For a more technical overview of LCMs, refer to [the paper](https://huggingface.co/papers/2310.04378).
|
||||
|
||||
This guide shows how to perform inference with LCMs for text-to-image and image-to-image generation tasks. It will also cover performing inference with LoRA checkpoints.
|
||||
|
||||
## Text-to-image
|
||||
|
||||
You'll use the [`StableDiffusionXLPipeline`] here changing the `unet`. The UNet was distilled from the SDXL UNet using the framework introduced in LCM. Another important component is the scheduler: [`LCMScheduler`]. Together with the distilled UNet and the scheduler, LCM enables a fast inference workflow overcoming the slow iterative nature of diffusion models.
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline, UNet2DConditionModel, LCMScheduler
|
||||
import torch
|
||||
|
||||
unet = UNet2DConditionModel.from_pretrained(
|
||||
"latent-consistency/lcm-sdxl",
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
)
|
||||
pipe = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0", unet=unet, torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
prompt = "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k"
|
||||
|
||||
generator = torch.manual_seed(0)
|
||||
image = pipe(
|
||||
prompt=prompt, num_inference_steps=4, generator=generator, guidance_scale=8.0
|
||||
).images[0]
|
||||
```
|
||||
|
||||

|
||||
|
||||
Notice that we use only 4 steps for generation which is way less than what's typically used for standard SDXL.
|
||||
|
||||
Some details to keep in mind:
|
||||
|
||||
* To perform classifier-free guidance, batch size is usually doubled inside the pipeline. LCM, however, applies guidance using guidance embeddings, so the batch size does not have to be doubled in this case. This leads to a faster inference time, with the drawback that negative prompts don't have any effect on the denoising process.
|
||||
* The UNet was trained using the [3., 13.] guidance scale range. So, that is the ideal range for `guidance_scale`. However, disabling `guidance_scale` using a value of 1.0 is also effective in most cases.
|
||||
|
||||
## Image-to-image
|
||||
|
||||
The findings above apply to image-to-image tasks too. Let's look at how we can perform image-to-image generation with LCMs:
|
||||
|
||||
```python
|
||||
from diffusers import AutoPipelineForImage2Image, UNet2DConditionModel, LCMScheduler
|
||||
from diffusers.utils import load_image
|
||||
import torch
|
||||
|
||||
unet = UNet2DConditionModel.from_pretrained(
|
||||
"latent-consistency/lcm-sdxl",
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
)
|
||||
pipe = AutoPipelineForImage2Image.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0", unet=unet, torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
prompt = "High altitude snowy mountains"
|
||||
image = load_image(
|
||||
"https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/snowy_mountains.jpeg"
|
||||
)
|
||||
|
||||
generator = torch.manual_seed(0)
|
||||
image = pipe(
|
||||
prompt=prompt,
|
||||
image=image,
|
||||
num_inference_steps=4,
|
||||
generator=generator,
|
||||
guidance_scale=8.0,
|
||||
).images[0]
|
||||
```
|
||||

|
||||
|
||||
## LoRA
|
||||
|
||||
It is possible to generalize the LCM framework to use with [LoRA](../training/lora.md). It effectively eliminates the need to conduct expensive fine-tuning runs as LoRA training concerns just a few number of parameters compared to full fine-tuning. During inference, the [`LCMScheduler`] comes to the advantage as it enables very few-steps inference without compromising the quality.
|
||||
|
||||
We recommend to disable `guidance_scale` by setting it 0. The model is trained to follow prompts accurately
|
||||
even without using guidance scale. You can however, still use guidance scale in which case we recommend
|
||||
using values between 1.0 and 2.0.
|
||||
|
||||
### Text-to-image
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline, LCMScheduler
|
||||
import torch
|
||||
|
||||
model_id = "stabilityai/stable-diffusion-xl-base-1.0"
|
||||
lcm_lora_id = "latent-consistency/lcm-lora-sdxl"
|
||||
|
||||
pipe = DiffusionPipeline.from_pretrained(model_id, variant="fp16", torch_dtype=torch.float16).to("cuda")
|
||||
|
||||
pipe.load_lora_weights(lcm_lora_id)
|
||||
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
prompt = "close-up photography of old man standing in the rain at night, in a street lit by lamps, leica 35mm summilux"
|
||||
image = pipe(
|
||||
prompt=prompt,
|
||||
num_inference_steps=4,
|
||||
guidance_scale=0, # set guidance scale to 0 to disable it
|
||||
).images[0]
|
||||
```
|
||||

|
||||
|
||||
### Image-to-image
|
||||
|
||||
Extending LCM LoRA to image-to-image is possible:
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionXLImg2ImgPipeline, LCMScheduler
|
||||
from diffusers.utils import load_image
|
||||
import torch
|
||||
|
||||
model_id = "stabilityai/stable-diffusion-xl-base-1.0"
|
||||
lcm_lora_id = "latent-consistency/lcm-lora-sdxl"
|
||||
|
||||
pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained(model_id, variant="fp16", torch_dtype=torch.float16).to("cuda")
|
||||
|
||||
pipe.load_lora_weights(lcm_lora_id)
|
||||
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
prompt = "close-up photography of old man standing in the rain at night, in a street lit by lamps, leica 35mm summilux"
|
||||
|
||||
image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lora_lcm.png")
|
||||
|
||||
image = pipe(
|
||||
prompt=prompt,
|
||||
image=image,
|
||||
num_inference_steps=4,
|
||||
guidance_scale=0, # set guidance scale to 0 to disable it
|
||||
).images[0]
|
||||
```
|
||||

|
||||
@@ -23,16 +23,16 @@ You can use any of the 🧨 Diffusers [checkpoints](https://huggingface.co/model
|
||||
|
||||
<Tip>
|
||||
|
||||
💡 Want to train your own unconditional image generation model? Take a look at the training [guide](training/unconditional_training) to learn how to generate your own images.
|
||||
💡 Want to train your own unconditional image generation model? Take a look at the training [guide](../training/unconditional_training) to learn how to generate your own images.
|
||||
|
||||
</Tip>
|
||||
|
||||
In this guide, you'll use [`DiffusionPipeline`] for unconditional image generation with [DDPM](https://arxiv.org/abs/2006.11239):
|
||||
|
||||
```python
|
||||
>>> from diffusers import DiffusionPipeline
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
>>> generator = DiffusionPipeline.from_pretrained("anton-l/ddpm-butterflies-128", use_safetensors=True)
|
||||
generator = DiffusionPipeline.from_pretrained("anton-l/ddpm-butterflies-128", use_safetensors=True)
|
||||
```
|
||||
|
||||
The [`DiffusionPipeline`] downloads and caches all modeling, tokenization, and scheduling components.
|
||||
@@ -40,13 +40,14 @@ Because the model consists of roughly 1.4 billion parameters, we strongly recomm
|
||||
You can move the generator object to a GPU, just like you would in PyTorch:
|
||||
|
||||
```python
|
||||
>>> generator.to("cuda")
|
||||
generator.to("cuda")
|
||||
```
|
||||
|
||||
Now you can use the `generator` to generate an image:
|
||||
|
||||
```python
|
||||
>>> image = generator().images[0]
|
||||
image = generator().images[0]
|
||||
image
|
||||
```
|
||||
|
||||
The output is by default wrapped into a [`PIL.Image`](https://pillow.readthedocs.io/en/stable/reference/Image.html?highlight=image#the-image-class) object.
|
||||
@@ -54,7 +55,7 @@ The output is by default wrapped into a [`PIL.Image`](https://pillow.readthedocs
|
||||
You can save the image by calling:
|
||||
|
||||
```python
|
||||
>>> image.save("generated_image.png")
|
||||
image.save("generated_image.png")
|
||||
```
|
||||
|
||||
Try out the Spaces below, and feel free to play around with the inference steps parameter to see how it affects the image quality!
|
||||
@@ -65,5 +66,3 @@ Try out the Spaces below, and feel free to play around with the inference steps
|
||||
width="850"
|
||||
height="500"
|
||||
></iframe>
|
||||
|
||||
|
||||
|
||||
@@ -56,7 +56,7 @@ if is_wandb_available():
|
||||
import wandb
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.23.0.dev0")
|
||||
check_min_version("0.23.0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@@ -59,7 +59,7 @@ if is_wandb_available():
|
||||
import wandb
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.23.0.dev0")
|
||||
check_min_version("0.23.0")
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -58,7 +58,7 @@ if is_wandb_available():
|
||||
import wandb
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.23.0.dev0")
|
||||
check_min_version("0.23.0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@@ -62,7 +62,7 @@ from diffusers.utils.import_utils import is_xformers_available
|
||||
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.23.0.dev0")
|
||||
check_min_version("0.23.0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@@ -61,7 +61,7 @@ if is_wandb_available():
|
||||
import wandb
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.23.0.dev0")
|
||||
check_min_version("0.23.0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@@ -35,7 +35,7 @@ from diffusers.utils import check_min_version
|
||||
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.23.0.dev0")
|
||||
check_min_version("0.23.0")
|
||||
|
||||
# Cache compiled models across invocations of this script.
|
||||
cc.initialize_cache(os.path.expanduser("~/.cache/jax/compilation_cache"))
|
||||
|
||||
@@ -68,7 +68,7 @@ from diffusers.utils.import_utils import is_xformers_available
|
||||
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.23.0.dev0")
|
||||
check_min_version("0.23.0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@@ -58,7 +58,7 @@ from diffusers.utils.import_utils import is_xformers_available
|
||||
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.23.0.dev0")
|
||||
check_min_version("0.23.0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@@ -52,7 +52,7 @@ from diffusers.utils.import_utils import is_xformers_available
|
||||
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.23.0.dev0")
|
||||
check_min_version("0.23.0")
|
||||
|
||||
logger = get_logger(__name__, log_level="INFO")
|
||||
|
||||
|
||||
@@ -55,7 +55,7 @@ from diffusers.utils.import_utils import is_xformers_available
|
||||
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.23.0.dev0")
|
||||
check_min_version("0.23.0")
|
||||
|
||||
logger = get_logger(__name__, log_level="INFO")
|
||||
|
||||
|
||||
@@ -52,7 +52,7 @@ if is_wandb_available():
|
||||
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.23.0.dev0")
|
||||
check_min_version("0.23.0")
|
||||
|
||||
logger = get_logger(__name__, log_level="INFO")
|
||||
|
||||
|
||||
@@ -46,7 +46,7 @@ from diffusers.utils import check_min_version, is_wandb_available
|
||||
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.23.0.dev0")
|
||||
check_min_version("0.23.0")
|
||||
|
||||
logger = get_logger(__name__, log_level="INFO")
|
||||
|
||||
|
||||
@@ -46,7 +46,7 @@ from diffusers.utils import check_min_version, is_wandb_available
|
||||
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.23.0.dev0")
|
||||
check_min_version("0.23.0")
|
||||
|
||||
logger = get_logger(__name__, log_level="INFO")
|
||||
|
||||
|
||||
@@ -51,7 +51,7 @@ if is_wandb_available():
|
||||
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.23.0.dev0")
|
||||
check_min_version("0.23.0")
|
||||
|
||||
logger = get_logger(__name__, log_level="INFO")
|
||||
|
||||
|
||||
@@ -58,7 +58,7 @@ if is_wandb_available():
|
||||
import wandb
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.23.0.dev0")
|
||||
check_min_version("0.23.0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@@ -53,7 +53,7 @@ if is_wandb_available():
|
||||
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.23.0.dev0")
|
||||
check_min_version("0.23.0")
|
||||
|
||||
logger = get_logger(__name__, log_level="INFO")
|
||||
|
||||
|
||||
@@ -33,7 +33,7 @@ from diffusers.utils import check_min_version
|
||||
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.23.0.dev0")
|
||||
check_min_version("0.23.0")
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -49,7 +49,7 @@ from diffusers.utils.import_utils import is_xformers_available
|
||||
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.23.0.dev0")
|
||||
check_min_version("0.23.0")
|
||||
|
||||
logger = get_logger(__name__, log_level="INFO")
|
||||
|
||||
|
||||
@@ -58,7 +58,7 @@ from diffusers.utils.import_utils import is_xformers_available
|
||||
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.23.0.dev0")
|
||||
check_min_version("0.23.0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@@ -57,7 +57,7 @@ from diffusers.utils.import_utils import is_xformers_available
|
||||
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.23.0.dev0")
|
||||
check_min_version("0.23.0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@@ -79,7 +79,7 @@ else:
|
||||
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.23.0.dev0")
|
||||
check_min_version("0.23.0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@@ -56,7 +56,7 @@ else:
|
||||
# ------------------------------------------------------------------------------
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.23.0.dev0")
|
||||
check_min_version("0.23.0")
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -29,7 +29,7 @@ from diffusers.utils.import_utils import is_xformers_available
|
||||
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.23.0.dev0")
|
||||
check_min_version("0.23.0")
|
||||
|
||||
logger = get_logger(__name__, log_level="INFO")
|
||||
|
||||
|
||||
@@ -50,7 +50,7 @@ if is_wandb_available():
|
||||
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.23.0.dev0")
|
||||
check_min_version("0.23.0")
|
||||
|
||||
logger = get_logger(__name__, log_level="INFO")
|
||||
|
||||
|
||||
@@ -51,7 +51,7 @@ if is_wandb_available():
|
||||
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.23.0.dev0")
|
||||
check_min_version("0.23.0")
|
||||
|
||||
logger = get_logger(__name__, log_level="INFO")
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -112,10 +112,12 @@ _deps = [
|
||||
"numpy",
|
||||
"omegaconf",
|
||||
"parameterized",
|
||||
"peft<=0.6.2",
|
||||
"protobuf>=3.20.3,<4",
|
||||
"pytest",
|
||||
"pytest-timeout",
|
||||
"pytest-xdist",
|
||||
"python>=3.8.0",
|
||||
"ruff==0.0.280",
|
||||
"safetensors>=0.3.1",
|
||||
"sentencepiece>=0.1.91,!=0.1.92",
|
||||
@@ -244,7 +246,7 @@ install_requires = [
|
||||
|
||||
setup(
|
||||
name="diffusers",
|
||||
version="0.23.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
|
||||
version="0.23.1", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
|
||||
description="State-of-the-art diffusion in PyTorch and JAX.",
|
||||
long_description=open("README.md", "r", encoding="utf-8").read(),
|
||||
long_description_content_type="text/markdown",
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__version__ = "0.23.0.dev0"
|
||||
__version__ = "0.23.1"
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
@@ -77,6 +77,7 @@ else:
|
||||
"AsymmetricAutoencoderKL",
|
||||
"AutoencoderKL",
|
||||
"AutoencoderTiny",
|
||||
"ConsistencyDecoderVAE",
|
||||
"ControlNetModel",
|
||||
"ModelMixin",
|
||||
"MotionAdapter",
|
||||
@@ -443,6 +444,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
AsymmetricAutoencoderKL,
|
||||
AutoencoderKL,
|
||||
AutoencoderTiny,
|
||||
ConsistencyDecoderVAE,
|
||||
ControlNetModel,
|
||||
ModelMixin,
|
||||
MotionAdapter,
|
||||
|
||||
@@ -23,21 +23,9 @@ from .utils.versions import require_version, require_version_core
|
||||
# order specific notes:
|
||||
# - tqdm must be checked before tokenizers
|
||||
|
||||
pkgs_to_check_at_runtime = "python tqdm regex requests packaging filelock numpy tokenizers".split()
|
||||
if sys.version_info < (3, 7):
|
||||
pkgs_to_check_at_runtime.append("dataclasses")
|
||||
if sys.version_info < (3, 8):
|
||||
pkgs_to_check_at_runtime.append("importlib_metadata")
|
||||
|
||||
pkgs_to_check_at_runtime = "python requests filelock numpy".split()
|
||||
for pkg in pkgs_to_check_at_runtime:
|
||||
if pkg in deps:
|
||||
if pkg == "tokenizers":
|
||||
# must be loaded here, or else tqdm check may fail
|
||||
from .utils import is_tokenizers_available
|
||||
|
||||
if not is_tokenizers_available():
|
||||
continue # not required, check version only if installed
|
||||
|
||||
require_version_core(deps[pkg])
|
||||
else:
|
||||
raise ValueError(f"can't find {pkg} in {deps.keys()}, check dependency_versions_table.py")
|
||||
|
||||
@@ -25,10 +25,12 @@ deps = {
|
||||
"numpy": "numpy",
|
||||
"omegaconf": "omegaconf",
|
||||
"parameterized": "parameterized",
|
||||
"peft": "peft<=0.6.2",
|
||||
"protobuf": "protobuf>=3.20.3,<4",
|
||||
"pytest": "pytest",
|
||||
"pytest-timeout": "pytest-timeout",
|
||||
"pytest-xdist": "pytest-xdist",
|
||||
"python": "python>=3.8.0",
|
||||
"ruff": "ruff==0.0.280",
|
||||
"safetensors": "safetensors>=0.3.1",
|
||||
"sentencepiece": "sentencepiece>=0.1.91,!=0.1.92",
|
||||
|
||||
@@ -1411,6 +1411,11 @@ class LoraLoaderMixin:
|
||||
filter(lambda x: all(substring not in x for substring in unallowed_substrings), targeted_files)
|
||||
)
|
||||
|
||||
if any(f.endswith(LORA_WEIGHT_NAME) for f in targeted_files):
|
||||
targeted_files = list(filter(lambda x: x.endswith(LORA_WEIGHT_NAME), targeted_files))
|
||||
elif any(f.endswith(LORA_WEIGHT_NAME_SAFE) for f in targeted_files):
|
||||
targeted_files = list(filter(lambda x: x.endswith(LORA_WEIGHT_NAME_SAFE), targeted_files))
|
||||
|
||||
if len(targeted_files) > 1:
|
||||
raise ValueError(
|
||||
f"Provided path contains more than one weights file in the {file_extension} format. Either specify `weight_name` in `load_lora_weights` or make sure there's only one `.safetensors` or `.bin` file in {pretrained_model_name_or_path_or_dict}."
|
||||
@@ -2390,7 +2395,7 @@ class LoraLoaderMixin:
|
||||
def set_adapters_for_text_encoder(
|
||||
self,
|
||||
adapter_names: Union[List[str], str],
|
||||
text_encoder: Optional[PreTrainedModel] = None,
|
||||
text_encoder: Optional["PreTrainedModel"] = None, # noqa: F821
|
||||
text_encoder_weights: List[float] = None,
|
||||
):
|
||||
"""
|
||||
@@ -2429,7 +2434,7 @@ class LoraLoaderMixin:
|
||||
)
|
||||
set_weights_and_activate_adapters(text_encoder, adapter_names, text_encoder_weights)
|
||||
|
||||
def disable_lora_for_text_encoder(self, text_encoder: Optional[PreTrainedModel] = None):
|
||||
def disable_lora_for_text_encoder(self, text_encoder: Optional["PreTrainedModel"] = None):
|
||||
"""
|
||||
Disables the LoRA layers for the text encoder.
|
||||
|
||||
@@ -2446,7 +2451,7 @@ class LoraLoaderMixin:
|
||||
raise ValueError("Text Encoder not found.")
|
||||
set_adapter_layers(text_encoder, enabled=False)
|
||||
|
||||
def enable_lora_for_text_encoder(self, text_encoder: Optional[PreTrainedModel] = None):
|
||||
def enable_lora_for_text_encoder(self, text_encoder: Optional["PreTrainedModel"] = None):
|
||||
"""
|
||||
Enables the LoRA layers for the text encoder.
|
||||
|
||||
|
||||
@@ -24,6 +24,7 @@ if is_torch_available():
|
||||
_import_structure["autoencoder_asym_kl"] = ["AsymmetricAutoencoderKL"]
|
||||
_import_structure["autoencoder_kl"] = ["AutoencoderKL"]
|
||||
_import_structure["autoencoder_tiny"] = ["AutoencoderTiny"]
|
||||
_import_structure["consistency_decoder_vae"] = ["ConsistencyDecoderVAE"]
|
||||
_import_structure["controlnet"] = ["ControlNetModel"]
|
||||
_import_structure["dual_transformer_2d"] = ["DualTransformer2DModel"]
|
||||
_import_structure["modeling_utils"] = ["ModelMixin"]
|
||||
@@ -50,6 +51,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
from .autoencoder_asym_kl import AsymmetricAutoencoderKL
|
||||
from .autoencoder_kl import AutoencoderKL
|
||||
from .autoencoder_tiny import AutoencoderTiny
|
||||
from .consistency_decoder_vae import ConsistencyDecoderVAE
|
||||
from .controlnet import ControlNetModel
|
||||
from .dual_transformer_2d import DualTransformer2DModel
|
||||
from .modeling_utils import ModelMixin
|
||||
|
||||
@@ -287,7 +287,7 @@ class BasicTransformerBlock(nn.Module):
|
||||
else:
|
||||
raise ValueError("Incorrect norm")
|
||||
|
||||
if self.pos_embed is not None and self.use_ada_layer_norm_single is None:
|
||||
if self.pos_embed is not None and self.use_ada_layer_norm_single is False:
|
||||
norm_hidden_states = self.pos_embed(norm_hidden_states)
|
||||
|
||||
attn_output = self.attn2(
|
||||
|
||||
@@ -378,7 +378,7 @@ class Attention(nn.Module):
|
||||
_remove_lora (`bool`, *optional*, defaults to `False`):
|
||||
Set to `True` to remove LoRA layers from the model.
|
||||
"""
|
||||
if hasattr(self, "processor") and _remove_lora and self.to_q.lora_layer is not None:
|
||||
if not USE_PEFT_BACKEND and hasattr(self, "processor") and _remove_lora and self.to_q.lora_layer is not None:
|
||||
deprecate(
|
||||
"set_processor to offload LoRA",
|
||||
"0.26.0",
|
||||
@@ -879,6 +879,9 @@ class AttnAddedKVProcessor:
|
||||
scale: float = 1.0,
|
||||
) -> torch.Tensor:
|
||||
residual = hidden_states
|
||||
|
||||
args = () if USE_PEFT_BACKEND else (scale,)
|
||||
|
||||
hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2)
|
||||
batch_size, sequence_length, _ = hidden_states.shape
|
||||
|
||||
@@ -891,17 +894,17 @@ class AttnAddedKVProcessor:
|
||||
|
||||
hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
|
||||
|
||||
query = attn.to_q(hidden_states, scale=scale)
|
||||
query = attn.to_q(hidden_states, *args)
|
||||
query = attn.head_to_batch_dim(query)
|
||||
|
||||
encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states, scale=scale)
|
||||
encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states, scale=scale)
|
||||
encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states, *args)
|
||||
encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states, *args)
|
||||
encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj)
|
||||
encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj)
|
||||
|
||||
if not attn.only_cross_attention:
|
||||
key = attn.to_k(hidden_states, scale=scale)
|
||||
value = attn.to_v(hidden_states, scale=scale)
|
||||
key = attn.to_k(hidden_states, *args)
|
||||
value = attn.to_v(hidden_states, *args)
|
||||
key = attn.head_to_batch_dim(key)
|
||||
value = attn.head_to_batch_dim(value)
|
||||
key = torch.cat([encoder_hidden_states_key_proj, key], dim=1)
|
||||
@@ -915,7 +918,7 @@ class AttnAddedKVProcessor:
|
||||
hidden_states = attn.batch_to_head_dim(hidden_states)
|
||||
|
||||
# linear proj
|
||||
hidden_states = attn.to_out[0](hidden_states, scale=scale)
|
||||
hidden_states = attn.to_out[0](hidden_states, *args)
|
||||
# dropout
|
||||
hidden_states = attn.to_out[1](hidden_states)
|
||||
|
||||
@@ -946,6 +949,9 @@ class AttnAddedKVProcessor2_0:
|
||||
scale: float = 1.0,
|
||||
) -> torch.Tensor:
|
||||
residual = hidden_states
|
||||
|
||||
args = () if USE_PEFT_BACKEND else (scale,)
|
||||
|
||||
hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2)
|
||||
batch_size, sequence_length, _ = hidden_states.shape
|
||||
|
||||
@@ -958,7 +964,7 @@ class AttnAddedKVProcessor2_0:
|
||||
|
||||
hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
|
||||
|
||||
query = attn.to_q(hidden_states, scale=scale)
|
||||
query = attn.to_q(hidden_states, *args)
|
||||
query = attn.head_to_batch_dim(query, out_dim=4)
|
||||
|
||||
encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
|
||||
@@ -967,8 +973,8 @@ class AttnAddedKVProcessor2_0:
|
||||
encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj, out_dim=4)
|
||||
|
||||
if not attn.only_cross_attention:
|
||||
key = attn.to_k(hidden_states, scale=scale)
|
||||
value = attn.to_v(hidden_states, scale=scale)
|
||||
key = attn.to_k(hidden_states, *args)
|
||||
value = attn.to_v(hidden_states, *args)
|
||||
key = attn.head_to_batch_dim(key, out_dim=4)
|
||||
value = attn.head_to_batch_dim(value, out_dim=4)
|
||||
key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
|
||||
@@ -985,7 +991,7 @@ class AttnAddedKVProcessor2_0:
|
||||
hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, residual.shape[1])
|
||||
|
||||
# linear proj
|
||||
hidden_states = attn.to_out[0](hidden_states, scale=scale)
|
||||
hidden_states = attn.to_out[0](hidden_states, *args)
|
||||
# dropout
|
||||
hidden_states = attn.to_out[1](hidden_states)
|
||||
|
||||
@@ -1177,6 +1183,8 @@ class AttnProcessor2_0:
|
||||
) -> torch.FloatTensor:
|
||||
residual = hidden_states
|
||||
|
||||
args = () if USE_PEFT_BACKEND else (scale,)
|
||||
|
||||
if attn.spatial_norm is not None:
|
||||
hidden_states = attn.spatial_norm(hidden_states, temb)
|
||||
|
||||
@@ -1207,12 +1215,8 @@ class AttnProcessor2_0:
|
||||
elif attn.norm_cross:
|
||||
encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
|
||||
|
||||
key = (
|
||||
attn.to_k(encoder_hidden_states, scale=scale) if not USE_PEFT_BACKEND else attn.to_k(encoder_hidden_states)
|
||||
)
|
||||
value = (
|
||||
attn.to_v(encoder_hidden_states, scale=scale) if not USE_PEFT_BACKEND else attn.to_v(encoder_hidden_states)
|
||||
)
|
||||
key = attn.to_k(encoder_hidden_states, *args)
|
||||
value = attn.to_v(encoder_hidden_states, *args)
|
||||
|
||||
inner_dim = key.shape[-1]
|
||||
head_dim = inner_dim // attn.heads
|
||||
@@ -1232,9 +1236,7 @@ class AttnProcessor2_0:
|
||||
hidden_states = hidden_states.to(query.dtype)
|
||||
|
||||
# linear proj
|
||||
hidden_states = (
|
||||
attn.to_out[0](hidden_states, scale=scale) if not USE_PEFT_BACKEND else attn.to_out[0](hidden_states)
|
||||
)
|
||||
hidden_states = attn.to_out[0](hidden_states, *args)
|
||||
# dropout
|
||||
hidden_states = attn.to_out[1](hidden_states)
|
||||
|
||||
@@ -1361,6 +1363,7 @@ class CustomDiffusionXFormersAttnProcessor(nn.Module):
|
||||
hidden_states = attn.to_out[0](hidden_states)
|
||||
# dropout
|
||||
hidden_states = attn.to_out[1](hidden_states)
|
||||
|
||||
return hidden_states
|
||||
|
||||
|
||||
@@ -1433,8 +1436,11 @@ class CustomDiffusionAttnProcessor2_0(nn.Module):
|
||||
encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
|
||||
|
||||
if self.train_kv:
|
||||
key = self.to_k_custom_diffusion(encoder_hidden_states)
|
||||
value = self.to_v_custom_diffusion(encoder_hidden_states)
|
||||
key = self.to_k_custom_diffusion(encoder_hidden_states.to(self.to_k_custom_diffusion.weight.dtype))
|
||||
value = self.to_v_custom_diffusion(encoder_hidden_states.to(self.to_v_custom_diffusion.weight.dtype))
|
||||
key = key.to(attn.to_q.weight.dtype)
|
||||
value = value.to(attn.to_q.weight.dtype)
|
||||
|
||||
else:
|
||||
key = attn.to_k(encoder_hidden_states)
|
||||
value = attn.to_v(encoder_hidden_states)
|
||||
|
||||
@@ -138,6 +138,7 @@ class AsymmetricAutoencoderKL(ModelMixin, ConfigMixin):
|
||||
def decode(
|
||||
self,
|
||||
z: torch.FloatTensor,
|
||||
generator: Optional[torch.Generator] = None,
|
||||
image: Optional[torch.FloatTensor] = None,
|
||||
mask: Optional[torch.FloatTensor] = None,
|
||||
return_dict: bool = True,
|
||||
|
||||
@@ -294,7 +294,9 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalVAEMixin):
|
||||
return DecoderOutput(sample=dec)
|
||||
|
||||
@apply_forward_hook
|
||||
def decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
|
||||
def decode(
|
||||
self, z: torch.FloatTensor, return_dict: bool = True, generator=None
|
||||
) -> Union[DecoderOutput, torch.FloatTensor]:
|
||||
"""
|
||||
Decode a batch of images.
|
||||
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Tuple, Union
|
||||
from typing import Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
|
||||
@@ -307,7 +307,9 @@ class AutoencoderTiny(ModelMixin, ConfigMixin):
|
||||
return AutoencoderTinyOutput(latents=output)
|
||||
|
||||
@apply_forward_hook
|
||||
def decode(self, x: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]:
|
||||
def decode(
|
||||
self, x: torch.FloatTensor, generator: Optional[torch.Generator] = None, return_dict: bool = True
|
||||
) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]:
|
||||
if self.use_slicing and x.shape[0] > 1:
|
||||
output = [self._tiled_decode(x_slice) if self.use_tiling else self.decoder(x) for x_slice in x.split(1)]
|
||||
output = torch.cat(output)
|
||||
|
||||
@@ -0,0 +1,430 @@
|
||||
# Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torch import nn
|
||||
|
||||
from ..configuration_utils import ConfigMixin, register_to_config
|
||||
from ..schedulers import ConsistencyDecoderScheduler
|
||||
from ..utils import BaseOutput
|
||||
from ..utils.accelerate_utils import apply_forward_hook
|
||||
from ..utils.torch_utils import randn_tensor
|
||||
from .attention_processor import (
|
||||
ADDED_KV_ATTENTION_PROCESSORS,
|
||||
CROSS_ATTENTION_PROCESSORS,
|
||||
AttentionProcessor,
|
||||
AttnAddedKVProcessor,
|
||||
AttnProcessor,
|
||||
)
|
||||
from .modeling_utils import ModelMixin
|
||||
from .unet_2d import UNet2DModel
|
||||
from .vae import DecoderOutput, DiagonalGaussianDistribution, Encoder
|
||||
|
||||
|
||||
@dataclass
|
||||
class ConsistencyDecoderVAEOutput(BaseOutput):
|
||||
"""
|
||||
Output of encoding method.
|
||||
|
||||
Args:
|
||||
latent_dist (`DiagonalGaussianDistribution`):
|
||||
Encoded outputs of `Encoder` represented as the mean and logvar of `DiagonalGaussianDistribution`.
|
||||
`DiagonalGaussianDistribution` allows for sampling latents from the distribution.
|
||||
"""
|
||||
|
||||
latent_dist: "DiagonalGaussianDistribution"
|
||||
|
||||
|
||||
class ConsistencyDecoderVAE(ModelMixin, ConfigMixin):
|
||||
r"""
|
||||
The consistency decoder used with DALL-E 3.
|
||||
|
||||
Examples:
|
||||
```py
|
||||
>>> import torch
|
||||
>>> from diffusers import DiffusionPipeline, ConsistencyDecoderVAE
|
||||
|
||||
>>> vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder", torch_dtype=pipe.torch_dtype)
|
||||
>>> pipe = StableDiffusionPipeline.from_pretrained(
|
||||
... "runwayml/stable-diffusion-v1-5", vae=vae, torch_dtype=torch.float16
|
||||
... ).to("cuda")
|
||||
|
||||
>>> pipe("horse", generator=torch.manual_seed(0)).images
|
||||
```
|
||||
"""
|
||||
|
||||
@register_to_config
|
||||
def __init__(
|
||||
self,
|
||||
scaling_factor=0.18215,
|
||||
latent_channels=4,
|
||||
encoder_act_fn="silu",
|
||||
encoder_block_out_channels=(128, 256, 512, 512),
|
||||
encoder_double_z=True,
|
||||
encoder_down_block_types=(
|
||||
"DownEncoderBlock2D",
|
||||
"DownEncoderBlock2D",
|
||||
"DownEncoderBlock2D",
|
||||
"DownEncoderBlock2D",
|
||||
),
|
||||
encoder_in_channels=3,
|
||||
encoder_layers_per_block=2,
|
||||
encoder_norm_num_groups=32,
|
||||
encoder_out_channels=4,
|
||||
decoder_add_attention=False,
|
||||
decoder_block_out_channels=(320, 640, 1024, 1024),
|
||||
decoder_down_block_types=(
|
||||
"ResnetDownsampleBlock2D",
|
||||
"ResnetDownsampleBlock2D",
|
||||
"ResnetDownsampleBlock2D",
|
||||
"ResnetDownsampleBlock2D",
|
||||
),
|
||||
decoder_downsample_padding=1,
|
||||
decoder_in_channels=7,
|
||||
decoder_layers_per_block=3,
|
||||
decoder_norm_eps=1e-05,
|
||||
decoder_norm_num_groups=32,
|
||||
decoder_num_train_timesteps=1024,
|
||||
decoder_out_channels=6,
|
||||
decoder_resnet_time_scale_shift="scale_shift",
|
||||
decoder_time_embedding_type="learned",
|
||||
decoder_up_block_types=(
|
||||
"ResnetUpsampleBlock2D",
|
||||
"ResnetUpsampleBlock2D",
|
||||
"ResnetUpsampleBlock2D",
|
||||
"ResnetUpsampleBlock2D",
|
||||
),
|
||||
):
|
||||
super().__init__()
|
||||
self.encoder = Encoder(
|
||||
act_fn=encoder_act_fn,
|
||||
block_out_channels=encoder_block_out_channels,
|
||||
double_z=encoder_double_z,
|
||||
down_block_types=encoder_down_block_types,
|
||||
in_channels=encoder_in_channels,
|
||||
layers_per_block=encoder_layers_per_block,
|
||||
norm_num_groups=encoder_norm_num_groups,
|
||||
out_channels=encoder_out_channels,
|
||||
)
|
||||
|
||||
self.decoder_unet = UNet2DModel(
|
||||
add_attention=decoder_add_attention,
|
||||
block_out_channels=decoder_block_out_channels,
|
||||
down_block_types=decoder_down_block_types,
|
||||
downsample_padding=decoder_downsample_padding,
|
||||
in_channels=decoder_in_channels,
|
||||
layers_per_block=decoder_layers_per_block,
|
||||
norm_eps=decoder_norm_eps,
|
||||
norm_num_groups=decoder_norm_num_groups,
|
||||
num_train_timesteps=decoder_num_train_timesteps,
|
||||
out_channels=decoder_out_channels,
|
||||
resnet_time_scale_shift=decoder_resnet_time_scale_shift,
|
||||
time_embedding_type=decoder_time_embedding_type,
|
||||
up_block_types=decoder_up_block_types,
|
||||
)
|
||||
self.decoder_scheduler = ConsistencyDecoderScheduler()
|
||||
self.register_to_config(block_out_channels=encoder_block_out_channels)
|
||||
self.register_buffer(
|
||||
"means",
|
||||
torch.tensor([0.38862467, 0.02253063, 0.07381133, -0.0171294])[None, :, None, None],
|
||||
persistent=False,
|
||||
)
|
||||
self.register_buffer(
|
||||
"stds", torch.tensor([0.9654121, 1.0440036, 0.76147926, 0.77022034])[None, :, None, None], persistent=False
|
||||
)
|
||||
|
||||
self.quant_conv = nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1)
|
||||
|
||||
self.use_slicing = False
|
||||
self.use_tiling = False
|
||||
|
||||
# Copied from diffusers.models.autoencoder_kl.AutoencoderKL.enable_tiling
|
||||
def enable_tiling(self, use_tiling: bool = True):
|
||||
r"""
|
||||
Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
|
||||
compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
|
||||
processing larger images.
|
||||
"""
|
||||
self.use_tiling = use_tiling
|
||||
|
||||
# Copied from diffusers.models.autoencoder_kl.AutoencoderKL.disable_tiling
|
||||
def disable_tiling(self):
|
||||
r"""
|
||||
Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
|
||||
decoding in one step.
|
||||
"""
|
||||
self.enable_tiling(False)
|
||||
|
||||
# Copied from diffusers.models.autoencoder_kl.AutoencoderKL.enable_slicing
|
||||
def enable_slicing(self):
|
||||
r"""
|
||||
Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
|
||||
compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
|
||||
"""
|
||||
self.use_slicing = True
|
||||
|
||||
# Copied from diffusers.models.autoencoder_kl.AutoencoderKL.disable_slicing
|
||||
def disable_slicing(self):
|
||||
r"""
|
||||
Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
|
||||
decoding in one step.
|
||||
"""
|
||||
self.use_slicing = False
|
||||
|
||||
@property
|
||||
# Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
|
||||
def attn_processors(self) -> Dict[str, AttentionProcessor]:
|
||||
r"""
|
||||
Returns:
|
||||
`dict` of attention processors: A dictionary containing all attention processors used in the model with
|
||||
indexed by its weight name.
|
||||
"""
|
||||
# set recursively
|
||||
processors = {}
|
||||
|
||||
def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
|
||||
if hasattr(module, "get_processor"):
|
||||
processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
|
||||
|
||||
for sub_name, child in module.named_children():
|
||||
fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
|
||||
|
||||
return processors
|
||||
|
||||
for name, module in self.named_children():
|
||||
fn_recursive_add_processors(name, module, processors)
|
||||
|
||||
return processors
|
||||
|
||||
# Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
|
||||
def set_attn_processor(
|
||||
self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False
|
||||
):
|
||||
r"""
|
||||
Sets the attention processor to use to compute attention.
|
||||
|
||||
Parameters:
|
||||
processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
|
||||
The instantiated processor class or a dictionary of processor classes that will be set as the processor
|
||||
for **all** `Attention` layers.
|
||||
|
||||
If `processor` is a dict, the key needs to define the path to the corresponding cross attention
|
||||
processor. This is strongly recommended when setting trainable attention processors.
|
||||
|
||||
"""
|
||||
count = len(self.attn_processors.keys())
|
||||
|
||||
if isinstance(processor, dict) and len(processor) != count:
|
||||
raise ValueError(
|
||||
f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
|
||||
f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
|
||||
)
|
||||
|
||||
def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
|
||||
if hasattr(module, "set_processor"):
|
||||
if not isinstance(processor, dict):
|
||||
module.set_processor(processor, _remove_lora=_remove_lora)
|
||||
else:
|
||||
module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora)
|
||||
|
||||
for sub_name, child in module.named_children():
|
||||
fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
|
||||
|
||||
for name, module in self.named_children():
|
||||
fn_recursive_attn_processor(name, module, processor)
|
||||
|
||||
# Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
|
||||
def set_default_attn_processor(self):
|
||||
"""
|
||||
Disables custom attention processors and sets the default attention implementation.
|
||||
"""
|
||||
if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
|
||||
processor = AttnAddedKVProcessor()
|
||||
elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
|
||||
processor = AttnProcessor()
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
|
||||
)
|
||||
|
||||
self.set_attn_processor(processor, _remove_lora=True)
|
||||
|
||||
@apply_forward_hook
|
||||
def encode(
|
||||
self, x: torch.FloatTensor, return_dict: bool = True
|
||||
) -> Union[ConsistencyDecoderVAEOutput, Tuple[DiagonalGaussianDistribution]]:
|
||||
"""
|
||||
Encode a batch of images into latents.
|
||||
|
||||
Args:
|
||||
x (`torch.FloatTensor`): Input batch of images.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether to return a [`~models.consistecy_decoder_vae.ConsistencyDecoderOoutput`] instead of a plain
|
||||
tuple.
|
||||
|
||||
Returns:
|
||||
The latent representations of the encoded images. If `return_dict` is True, a
|
||||
[`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] is returned, otherwise a plain `tuple`
|
||||
is returned.
|
||||
"""
|
||||
if self.use_tiling and (x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size):
|
||||
return self.tiled_encode(x, return_dict=return_dict)
|
||||
|
||||
if self.use_slicing and x.shape[0] > 1:
|
||||
encoded_slices = [self.encoder(x_slice) for x_slice in x.split(1)]
|
||||
h = torch.cat(encoded_slices)
|
||||
else:
|
||||
h = self.encoder(x)
|
||||
|
||||
moments = self.quant_conv(h)
|
||||
posterior = DiagonalGaussianDistribution(moments)
|
||||
|
||||
if not return_dict:
|
||||
return (posterior,)
|
||||
|
||||
return ConsistencyDecoderVAEOutput(latent_dist=posterior)
|
||||
|
||||
@apply_forward_hook
|
||||
def decode(
|
||||
self,
|
||||
z: torch.FloatTensor,
|
||||
generator: Optional[torch.Generator] = None,
|
||||
return_dict: bool = True,
|
||||
num_inference_steps=2,
|
||||
) -> Union[DecoderOutput, torch.FloatTensor]:
|
||||
z = (z * self.config.scaling_factor - self.means) / self.stds
|
||||
|
||||
scale_factor = 2 ** (len(self.config.block_out_channels) - 1)
|
||||
z = F.interpolate(z, mode="nearest", scale_factor=scale_factor)
|
||||
|
||||
batch_size, _, height, width = z.shape
|
||||
|
||||
self.decoder_scheduler.set_timesteps(num_inference_steps, device=self.device)
|
||||
|
||||
x_t = self.decoder_scheduler.init_noise_sigma * randn_tensor(
|
||||
(batch_size, 3, height, width), generator=generator, dtype=z.dtype, device=z.device
|
||||
)
|
||||
|
||||
for t in self.decoder_scheduler.timesteps:
|
||||
model_input = torch.concat([self.decoder_scheduler.scale_model_input(x_t, t), z], dim=1)
|
||||
model_output = self.decoder_unet(model_input, t).sample[:, :3, :, :]
|
||||
prev_sample = self.decoder_scheduler.step(model_output, t, x_t, generator).prev_sample
|
||||
x_t = prev_sample
|
||||
|
||||
x_0 = x_t
|
||||
|
||||
if not return_dict:
|
||||
return (x_0,)
|
||||
|
||||
return DecoderOutput(sample=x_0)
|
||||
|
||||
# Copied from diffusers.models.autoencoder_kl.AutoencoderKL.blend_v
|
||||
def blend_v(self, a, b, blend_extent):
|
||||
blend_extent = min(a.shape[2], b.shape[2], blend_extent)
|
||||
for y in range(blend_extent):
|
||||
b[:, :, y, :] = a[:, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, y, :] * (y / blend_extent)
|
||||
return b
|
||||
|
||||
# Copied from diffusers.models.autoencoder_kl.AutoencoderKL.blend_h
|
||||
def blend_h(self, a, b, blend_extent):
|
||||
blend_extent = min(a.shape[3], b.shape[3], blend_extent)
|
||||
for x in range(blend_extent):
|
||||
b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent)
|
||||
return b
|
||||
|
||||
def tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True) -> ConsistencyDecoderVAEOutput:
|
||||
r"""Encode a batch of images using a tiled encoder.
|
||||
|
||||
When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
|
||||
steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is
|
||||
different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the
|
||||
tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the
|
||||
output, but they should be much less noticeable.
|
||||
|
||||
Args:
|
||||
x (`torch.FloatTensor`): Input batch of images.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to return a [`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] instead of a
|
||||
plain tuple.
|
||||
|
||||
Returns:
|
||||
[`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] or `tuple`:
|
||||
If return_dict is True, a [`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] is returned,
|
||||
otherwise a plain `tuple` is returned.
|
||||
"""
|
||||
overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor))
|
||||
blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor)
|
||||
row_limit = self.tile_latent_min_size - blend_extent
|
||||
|
||||
# Split the image into 512x512 tiles and encode them separately.
|
||||
rows = []
|
||||
for i in range(0, x.shape[2], overlap_size):
|
||||
row = []
|
||||
for j in range(0, x.shape[3], overlap_size):
|
||||
tile = x[:, :, i : i + self.tile_sample_min_size, j : j + self.tile_sample_min_size]
|
||||
tile = self.encoder(tile)
|
||||
tile = self.quant_conv(tile)
|
||||
row.append(tile)
|
||||
rows.append(row)
|
||||
result_rows = []
|
||||
for i, row in enumerate(rows):
|
||||
result_row = []
|
||||
for j, tile in enumerate(row):
|
||||
# blend the above tile and the left tile
|
||||
# to the current tile and add the current tile to the result row
|
||||
if i > 0:
|
||||
tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
|
||||
if j > 0:
|
||||
tile = self.blend_h(row[j - 1], tile, blend_extent)
|
||||
result_row.append(tile[:, :, :row_limit, :row_limit])
|
||||
result_rows.append(torch.cat(result_row, dim=3))
|
||||
|
||||
moments = torch.cat(result_rows, dim=2)
|
||||
posterior = DiagonalGaussianDistribution(moments)
|
||||
|
||||
if not return_dict:
|
||||
return (posterior,)
|
||||
|
||||
return ConsistencyDecoderVAEOutput(latent_dist=posterior)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
sample: torch.FloatTensor,
|
||||
sample_posterior: bool = False,
|
||||
return_dict: bool = True,
|
||||
generator: Optional[torch.Generator] = None,
|
||||
) -> Union[DecoderOutput, torch.FloatTensor]:
|
||||
r"""
|
||||
Args:
|
||||
sample (`torch.FloatTensor`): Input sample.
|
||||
sample_posterior (`bool`, *optional*, defaults to `False`):
|
||||
Whether to sample from the posterior.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
|
||||
"""
|
||||
x = sample
|
||||
posterior = self.encode(x).latent_dist
|
||||
if sample_posterior:
|
||||
z = posterior.sample(generator=generator)
|
||||
else:
|
||||
z = posterior.mode()
|
||||
dec = self.decode(z, generator=generator).sample
|
||||
|
||||
if not return_dict:
|
||||
return (dec,)
|
||||
|
||||
return DecoderOutput(sample=dec)
|
||||
@@ -778,16 +778,22 @@ class Conv1dBlock(nn.Module):
|
||||
out_channels (`int`): Number of output channels.
|
||||
kernel_size (`int` or `tuple`): Size of the convolving kernel.
|
||||
n_groups (`int`, default `8`): Number of groups to separate the channels into.
|
||||
activation (`str`, defaults `mish`): Name of the activation function.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, inp_channels: int, out_channels: int, kernel_size: Union[int, Tuple[int, int]], n_groups: int = 8
|
||||
self,
|
||||
inp_channels: int,
|
||||
out_channels: int,
|
||||
kernel_size: Union[int, Tuple[int, int]],
|
||||
n_groups: int = 8,
|
||||
activation: str = "mish",
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
self.conv1d = nn.Conv1d(inp_channels, out_channels, kernel_size, padding=kernel_size // 2)
|
||||
self.group_norm = nn.GroupNorm(n_groups, out_channels)
|
||||
self.mish = nn.Mish()
|
||||
self.mish = get_activation(activation)
|
||||
|
||||
def forward(self, inputs: torch.Tensor) -> torch.Tensor:
|
||||
intermediate_repr = self.conv1d(inputs)
|
||||
@@ -808,16 +814,22 @@ class ResidualTemporalBlock1D(nn.Module):
|
||||
out_channels (`int`): Number of output channels.
|
||||
embed_dim (`int`): Embedding dimension.
|
||||
kernel_size (`int` or `tuple`): Size of the convolving kernel.
|
||||
activation (`str`, defaults `mish`): It is possible to choose the right activation function.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, inp_channels: int, out_channels: int, embed_dim: int, kernel_size: Union[int, Tuple[int, int]] = 5
|
||||
self,
|
||||
inp_channels: int,
|
||||
out_channels: int,
|
||||
embed_dim: int,
|
||||
kernel_size: Union[int, Tuple[int, int]] = 5,
|
||||
activation: str = "mish",
|
||||
):
|
||||
super().__init__()
|
||||
self.conv_in = Conv1dBlock(inp_channels, out_channels, kernel_size)
|
||||
self.conv_out = Conv1dBlock(out_channels, out_channels, kernel_size)
|
||||
|
||||
self.time_emb_act = nn.Mish()
|
||||
self.time_emb_act = get_activation(activation)
|
||||
self.time_emb = nn.Linear(embed_dim, out_channels)
|
||||
|
||||
self.residual_conv = (
|
||||
|
||||
@@ -339,6 +339,7 @@ class Transformer2DModel(ModelMixin, ConfigMixin):
|
||||
elif self.is_input_vectorized:
|
||||
hidden_states = self.latent_image_embedding(hidden_states)
|
||||
elif self.is_input_patches:
|
||||
height, width = hidden_states.shape[-2] // self.patch_size, hidden_states.shape[-1] // self.patch_size
|
||||
hidden_states = self.pos_embed(hidden_states)
|
||||
|
||||
if self.adaln_single is not None:
|
||||
@@ -425,7 +426,8 @@ class Transformer2DModel(ModelMixin, ConfigMixin):
|
||||
hidden_states = hidden_states.squeeze(1)
|
||||
|
||||
# unpatchify
|
||||
height = width = int(hidden_states.shape[1] ** 0.5)
|
||||
if self.adaln_single is None:
|
||||
height = width = int(hidden_states.shape[1] ** 0.5)
|
||||
hidden_states = hidden_states.reshape(
|
||||
shape=(-1, height, width, self.patch_size, self.patch_size, self.out_channels)
|
||||
)
|
||||
|
||||
@@ -117,6 +117,7 @@ class UNet2DModel(ModelMixin, ConfigMixin):
|
||||
add_attention: bool = True,
|
||||
class_embed_type: Optional[str] = None,
|
||||
num_class_embeds: Optional[int] = None,
|
||||
num_train_timesteps: Optional[int] = None,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
@@ -144,6 +145,9 @@ class UNet2DModel(ModelMixin, ConfigMixin):
|
||||
elif time_embedding_type == "positional":
|
||||
self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
|
||||
timestep_input_dim = block_out_channels[0]
|
||||
elif time_embedding_type == "learned":
|
||||
self.time_proj = nn.Embedding(num_train_timesteps, block_out_channels[0])
|
||||
timestep_input_dim = block_out_channels[0]
|
||||
|
||||
self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
|
||||
|
||||
|
||||
@@ -162,8 +162,8 @@ class VQModel(ModelMixin, ConfigMixin):
|
||||
If return_dict is True, a [`~models.vq_model.VQEncoderOutput`] is returned, otherwise a plain `tuple`
|
||||
is returned.
|
||||
"""
|
||||
x = sample
|
||||
h = self.encode(x).latents
|
||||
|
||||
h = self.encode(sample).latents
|
||||
dec = self.decode(h).sample
|
||||
|
||||
if not return_dict:
|
||||
|
||||
@@ -588,6 +588,34 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL
|
||||
"""Disables the FreeU mechanism if enabled."""
|
||||
self.unet.disable_freeu()
|
||||
|
||||
def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
|
||||
"""
|
||||
See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
|
||||
|
||||
Args:
|
||||
timesteps (`torch.Tensor`):
|
||||
generate embedding vectors at these timesteps
|
||||
embedding_dim (`int`, *optional*, defaults to 512):
|
||||
dimension of the embeddings to generate
|
||||
dtype:
|
||||
data type of the generated embeddings
|
||||
|
||||
Returns:
|
||||
`torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
|
||||
"""
|
||||
assert len(w.shape) == 1
|
||||
w = w * 1000.0
|
||||
|
||||
half_dim = embedding_dim // 2
|
||||
emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
|
||||
emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
|
||||
emb = w.to(dtype)[:, None] * emb[None, :]
|
||||
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
|
||||
if embedding_dim % 2 == 1: # zero pad
|
||||
emb = torch.nn.functional.pad(emb, (0, 1))
|
||||
assert emb.shape == (w.shape[0], embedding_dim)
|
||||
return emb
|
||||
|
||||
@property
|
||||
def guidance_scale(self):
|
||||
return self._guidance_scale
|
||||
@@ -605,7 +633,7 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL
|
||||
# corresponds to doing no classifier free guidance.
|
||||
@property
|
||||
def do_classifier_free_guidance(self):
|
||||
return self._guidance_scale > 1
|
||||
return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
|
||||
|
||||
@property
|
||||
def cross_attention_kwargs(self):
|
||||
@@ -804,6 +832,14 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL
|
||||
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
|
||||
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
|
||||
|
||||
# 6.5 Optionally get Guidance Scale Embedding
|
||||
timestep_cond = None
|
||||
if self.unet.config.time_cond_proj_dim is not None:
|
||||
guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
|
||||
timestep_cond = self.get_guidance_scale_embedding(
|
||||
guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
|
||||
).to(device=device, dtype=latents.dtype)
|
||||
|
||||
# 7. Denoising loop
|
||||
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
|
||||
self._num_timesteps = len(timesteps)
|
||||
@@ -818,6 +854,7 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL
|
||||
latent_model_input,
|
||||
t,
|
||||
encoder_hidden_states=prompt_embeds,
|
||||
timestep_cond=timestep_cond,
|
||||
cross_attention_kwargs=self.cross_attention_kwargs,
|
||||
return_dict=False,
|
||||
)[0]
|
||||
@@ -852,7 +889,9 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL
|
||||
callback(step_idx, t, latents)
|
||||
|
||||
if not output_type == "latent":
|
||||
image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
|
||||
image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
|
||||
0
|
||||
]
|
||||
image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
|
||||
else:
|
||||
image = latents
|
||||
|
||||
@@ -646,6 +646,34 @@ class AltDiffusionImg2ImgPipeline(
|
||||
"""Disables the FreeU mechanism if enabled."""
|
||||
self.unet.disable_freeu()
|
||||
|
||||
def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
|
||||
"""
|
||||
See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
|
||||
|
||||
Args:
|
||||
timesteps (`torch.Tensor`):
|
||||
generate embedding vectors at these timesteps
|
||||
embedding_dim (`int`, *optional*, defaults to 512):
|
||||
dimension of the embeddings to generate
|
||||
dtype:
|
||||
data type of the generated embeddings
|
||||
|
||||
Returns:
|
||||
`torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
|
||||
"""
|
||||
assert len(w.shape) == 1
|
||||
w = w * 1000.0
|
||||
|
||||
half_dim = embedding_dim // 2
|
||||
emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
|
||||
emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
|
||||
emb = w.to(dtype)[:, None] * emb[None, :]
|
||||
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
|
||||
if embedding_dim % 2 == 1: # zero pad
|
||||
emb = torch.nn.functional.pad(emb, (0, 1))
|
||||
assert emb.shape == (w.shape[0], embedding_dim)
|
||||
return emb
|
||||
|
||||
@property
|
||||
def guidance_scale(self):
|
||||
return self._guidance_scale
|
||||
@@ -659,7 +687,7 @@ class AltDiffusionImg2ImgPipeline(
|
||||
# corresponds to doing no classifier free guidance.
|
||||
@property
|
||||
def do_classifier_free_guidance(self):
|
||||
return self._guidance_scale > 1
|
||||
return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
|
||||
|
||||
@property
|
||||
def cross_attention_kwargs(self):
|
||||
@@ -849,6 +877,14 @@ class AltDiffusionImg2ImgPipeline(
|
||||
# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
|
||||
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
|
||||
|
||||
# 7.5 Optionally get Guidance Scale Embedding
|
||||
timestep_cond = None
|
||||
if self.unet.config.time_cond_proj_dim is not None:
|
||||
guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
|
||||
timestep_cond = self.get_guidance_scale_embedding(
|
||||
guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
|
||||
).to(device=device, dtype=latents.dtype)
|
||||
|
||||
# 8. Denoising loop
|
||||
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
|
||||
self._num_timesteps = len(timesteps)
|
||||
@@ -863,6 +899,7 @@ class AltDiffusionImg2ImgPipeline(
|
||||
latent_model_input,
|
||||
t,
|
||||
encoder_hidden_states=prompt_embeds,
|
||||
timestep_cond=timestep_cond,
|
||||
cross_attention_kwargs=self.cross_attention_kwargs,
|
||||
return_dict=False,
|
||||
)[0]
|
||||
@@ -893,7 +930,9 @@ class AltDiffusionImg2ImgPipeline(
|
||||
callback(step_idx, t, latents)
|
||||
|
||||
if not output_type == "latent":
|
||||
image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
|
||||
image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
|
||||
0
|
||||
]
|
||||
image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
|
||||
else:
|
||||
image = latents
|
||||
|
||||
@@ -498,7 +498,7 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLo
|
||||
@torch.no_grad()
|
||||
def __call__(
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
prompt: Union[str, List[str]] = None,
|
||||
num_frames: Optional[int] = 16,
|
||||
height: Optional[int] = None,
|
||||
width: Optional[int] = None,
|
||||
|
||||
@@ -6,7 +6,9 @@ from ...utils import (
|
||||
)
|
||||
|
||||
|
||||
_import_structure = {"pipeline_consistency_models": ["ConsistencyModelPipeline"]}
|
||||
_import_structure = {
|
||||
"pipeline_consistency_models": ["ConsistencyModelPipeline"],
|
||||
}
|
||||
|
||||
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
from .pipeline_consistency_models import ConsistencyModelPipeline
|
||||
|
||||
@@ -1,3 +1,17 @@
|
||||
# Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import Callable, List, Optional, Union
|
||||
|
||||
import torch
|
||||
|
||||
@@ -1058,7 +1058,9 @@ class StableDiffusionControlNetPipeline(
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
if not output_type == "latent":
|
||||
image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
|
||||
image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
|
||||
0
|
||||
]
|
||||
image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
|
||||
else:
|
||||
image = latents
|
||||
|
||||
@@ -1138,7 +1138,9 @@ class StableDiffusionControlNetImg2ImgPipeline(
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
if not output_type == "latent":
|
||||
image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
|
||||
image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
|
||||
0
|
||||
]
|
||||
image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
|
||||
else:
|
||||
image = latents
|
||||
|
||||
@@ -1405,7 +1405,9 @@ class StableDiffusionControlNetInpaintPipeline(
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
if not output_type == "latent":
|
||||
image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
|
||||
image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
|
||||
0
|
||||
]
|
||||
image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
|
||||
else:
|
||||
image = latents
|
||||
|
||||
@@ -1109,8 +1109,6 @@ class IFInpaintingSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin):
|
||||
nsfw_detected = None
|
||||
watermark_detected = None
|
||||
|
||||
if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None:
|
||||
self.unet_offload_hook.offload()
|
||||
else:
|
||||
# 10. Post-processing
|
||||
image = (image / 2 + 0.5).clamp(0, 1)
|
||||
@@ -1119,9 +1117,7 @@ class IFInpaintingSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin):
|
||||
# 11. Run safety checker
|
||||
image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype)
|
||||
|
||||
# Offload last model to CPU
|
||||
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
|
||||
self.final_offload_hook.offload()
|
||||
self.maybe_free_model_hooks()
|
||||
|
||||
if not return_dict:
|
||||
return (image, nsfw_detected, watermark_detected)
|
||||
|
||||
@@ -388,6 +388,8 @@ class KandinskyPipeline(DiffusionPipeline):
|
||||
# post-processing
|
||||
image = self.movq.decode(latents, force_not_quantize=True)["sample"]
|
||||
|
||||
self.maybe_free_model_hooks()
|
||||
|
||||
if output_type not in ["pt", "np", "pil"]:
|
||||
raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}")
|
||||
|
||||
|
||||
@@ -321,6 +321,9 @@ class KandinskyCombinedPipeline(DiffusionPipeline):
|
||||
callback_steps=callback_steps,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
self.maybe_free_model_hooks()
|
||||
|
||||
return outputs
|
||||
|
||||
|
||||
@@ -558,6 +561,9 @@ class KandinskyImg2ImgCombinedPipeline(DiffusionPipeline):
|
||||
callback_steps=callback_steps,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
self.maybe_free_model_hooks()
|
||||
|
||||
return outputs
|
||||
|
||||
|
||||
@@ -593,7 +599,7 @@ class KandinskyInpaintCombinedPipeline(DiffusionPipeline):
|
||||
"""
|
||||
|
||||
_load_connected_pipes = True
|
||||
model_cpu_offload_seq = "prior_text_encoder->prior_image_encoder->prior_prior->" "text_encoder->unet->movq"
|
||||
model_cpu_offload_seq = "prior_text_encoder->prior_image_encoder->prior_prior->text_encoder->unet->movq"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -802,4 +808,7 @@ class KandinskyInpaintCombinedPipeline(DiffusionPipeline):
|
||||
callback_steps=callback_steps,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
self.maybe_free_model_hooks()
|
||||
|
||||
return outputs
|
||||
|
||||
@@ -481,6 +481,8 @@ class KandinskyImg2ImgPipeline(DiffusionPipeline):
|
||||
# 7. post-processing
|
||||
image = self.movq.decode(latents, force_not_quantize=True)["sample"]
|
||||
|
||||
self.maybe_free_model_hooks()
|
||||
|
||||
if output_type not in ["pt", "np", "pil"]:
|
||||
raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}")
|
||||
|
||||
|
||||
@@ -616,6 +616,8 @@ class KandinskyInpaintPipeline(DiffusionPipeline):
|
||||
# post-processing
|
||||
image = self.movq.decode(latents, force_not_quantize=True)["sample"]
|
||||
|
||||
self.maybe_free_model_hooks()
|
||||
|
||||
if output_type not in ["pt", "np", "pil"]:
|
||||
raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}")
|
||||
|
||||
|
||||
@@ -527,7 +527,7 @@ class KandinskyPriorPipeline(DiffusionPipeline):
|
||||
if negative_prompt is None:
|
||||
zero_embeds = self.get_zero_embed(latents.shape[0], device=latents.device)
|
||||
|
||||
self.maybe_free_model_hooks
|
||||
self.maybe_free_model_hooks()
|
||||
else:
|
||||
image_embeddings, zero_embeds = image_embeddings.chunk(2)
|
||||
|
||||
|
||||
@@ -326,6 +326,8 @@ class KandinskyV22CombinedPipeline(DiffusionPipeline):
|
||||
callback_on_step_end=callback_on_step_end,
|
||||
callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
|
||||
)
|
||||
self.maybe_free_model_hooks()
|
||||
|
||||
return outputs
|
||||
|
||||
|
||||
@@ -572,6 +574,8 @@ class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline):
|
||||
callback_on_step_end=callback_on_step_end,
|
||||
callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
|
||||
)
|
||||
|
||||
self.maybe_free_model_hooks()
|
||||
return outputs
|
||||
|
||||
|
||||
@@ -842,4 +846,6 @@ class KandinskyV22InpaintCombinedPipeline(DiffusionPipeline):
|
||||
callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
|
||||
**kwargs,
|
||||
)
|
||||
self.maybe_free_model_hooks()
|
||||
|
||||
return outputs
|
||||
|
||||
@@ -531,14 +531,10 @@ class KandinskyV22PriorPipeline(DiffusionPipeline):
|
||||
# if negative prompt has been defined, we retrieve split the image embedding into two
|
||||
if negative_prompt is None:
|
||||
zero_embeds = self.get_zero_embed(latents.shape[0], device=latents.device)
|
||||
|
||||
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
|
||||
self.final_offload_hook.offload()
|
||||
else:
|
||||
image_embeddings, zero_embeds = image_embeddings.chunk(2)
|
||||
|
||||
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
|
||||
self.prior_hook.offload()
|
||||
self.maybe_free_model_hooks()
|
||||
|
||||
if output_type not in ["pt", "np"]:
|
||||
raise ValueError(f"Only the output types `pt` and `np` are supported not output_type={output_type}")
|
||||
|
||||
@@ -545,12 +545,10 @@ class KandinskyV22PriorEmb2EmbPipeline(DiffusionPipeline):
|
||||
# if negative prompt has been defined, we retrieve split the image embedding into two
|
||||
if negative_prompt is None:
|
||||
zero_embeds = self.get_zero_embed(latents.shape[0], device=latents.device)
|
||||
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
|
||||
self.final_offload_hook.offload()
|
||||
else:
|
||||
image_embeddings, zero_embeds = image_embeddings.chunk(2)
|
||||
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
|
||||
self.prior_hook.offload()
|
||||
|
||||
self.maybe_free_model_hooks()
|
||||
|
||||
if output_type not in ["pt", "np"]:
|
||||
raise ValueError(f"Only the output types `pt` and `np` are supported not output_type={output_type}")
|
||||
|
||||
@@ -1,19 +1,40 @@
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from ...utils import (
|
||||
DIFFUSERS_SLOW_IMPORT,
|
||||
OptionalDependencyNotAvailable,
|
||||
_LazyModule,
|
||||
get_objects_from_module,
|
||||
is_torch_available,
|
||||
is_transformers_available,
|
||||
)
|
||||
|
||||
|
||||
_import_structure = {
|
||||
"pipeline_latent_consistency_img2img": ["LatentConsistencyModelImg2ImgPipeline"],
|
||||
"pipeline_latent_consistency_text2img": ["LatentConsistencyModelPipeline"],
|
||||
}
|
||||
_dummy_objects = {}
|
||||
_import_structure = {}
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .pipeline_latent_consistency_img2img import LatentConsistencyModelImg2ImgPipeline
|
||||
from .pipeline_latent_consistency_text2img import LatentConsistencyModelPipeline
|
||||
try:
|
||||
if not (is_transformers_available() and is_torch_available()):
|
||||
raise OptionalDependencyNotAvailable()
|
||||
except OptionalDependencyNotAvailable:
|
||||
from ...utils import dummy_torch_and_transformers_objects # noqa F403
|
||||
|
||||
_dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
|
||||
else:
|
||||
_import_structure["pipeline_latent_consistency_img2img"] = ["LatentConsistencyModelImg2ImgPipeline"]
|
||||
_import_structure["pipeline_latent_consistency_text2img"] = ["LatentConsistencyModelPipeline"]
|
||||
|
||||
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
try:
|
||||
if not (is_transformers_available() and is_torch_available()):
|
||||
raise OptionalDependencyNotAvailable()
|
||||
|
||||
except OptionalDependencyNotAvailable:
|
||||
from ...utils.dummy_torch_and_transformers_objects import *
|
||||
else:
|
||||
from .pipeline_latent_consistency_img2img import LatentConsistencyModelImg2ImgPipeline
|
||||
from .pipeline_latent_consistency_text2img import LatentConsistencyModelPipeline
|
||||
|
||||
else:
|
||||
import sys
|
||||
@@ -24,3 +45,6 @@ else:
|
||||
_import_structure,
|
||||
module_spec=__spec__,
|
||||
)
|
||||
|
||||
for name, value in _dummy_objects.items():
|
||||
setattr(sys.modules[__name__], name, value)
|
||||
|
||||
+2
-2
@@ -60,7 +60,7 @@ EXAMPLE_DOC_STRING = """
|
||||
>>> import torch
|
||||
>>> import PIL
|
||||
|
||||
>>> pipe = DiffusionPipeline.from_pretrained("SimianLuo/LCM_Dreamshaper_v7")
|
||||
>>> pipe = AutoPipelineForImage2Image.from_pretrained("SimianLuo/LCM_Dreamshaper_v7")
|
||||
>>> # To save GPU memory, torch.float16 can be used, but it may compromise image quality.
|
||||
>>> pipe.to(torch_device="cuda", torch_dtype=torch.float32)
|
||||
|
||||
@@ -738,7 +738,7 @@ class LatentConsistencyModelImg2ImgPipeline(
|
||||
if original_inference_steps is not None
|
||||
else self.scheduler.config.original_inference_steps
|
||||
)
|
||||
latent_timestep = torch.tensor(int(strength * original_inference_steps))
|
||||
latent_timestep = timesteps[:1]
|
||||
latents = self.prepare_latents(
|
||||
image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator
|
||||
)
|
||||
|
||||
@@ -158,9 +158,9 @@ def is_safetensors_compatible(filenames, variant=None, passed_components=None) -
|
||||
continue
|
||||
|
||||
if extension == ".bin":
|
||||
pt_filenames.append(filename)
|
||||
pt_filenames.append(os.path.normpath(filename))
|
||||
elif extension == ".safetensors":
|
||||
sf_filenames.add(filename)
|
||||
sf_filenames.add(os.path.normpath(filename))
|
||||
|
||||
for filename in pt_filenames:
|
||||
# filename = 'foo/bar/baz.bam' -> path = 'foo/bar', filename = 'baz', extention = '.bam'
|
||||
@@ -172,9 +172,8 @@ def is_safetensors_compatible(filenames, variant=None, passed_components=None) -
|
||||
else:
|
||||
filename = filename
|
||||
|
||||
expected_sf_filename = os.path.join(path, filename)
|
||||
expected_sf_filename = os.path.normpath(os.path.join(path, filename))
|
||||
expected_sf_filename = f"{expected_sf_filename}.safetensors"
|
||||
|
||||
if expected_sf_filename not in sf_filenames:
|
||||
logger.warning(f"{expected_sf_filename} not found")
|
||||
return False
|
||||
@@ -1774,7 +1773,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
|
||||
)
|
||||
):
|
||||
raise EnvironmentError(
|
||||
f"Could not found the necessary `safetensors` weights in {model_filenames} (variant={variant})"
|
||||
f"Could not find the necessary `safetensors` weights in {model_filenames} (variant={variant})"
|
||||
)
|
||||
if from_flax:
|
||||
ignore_patterns = ["*.bin", "*.safetensors", "*.onnx", "*.pb"]
|
||||
|
||||
@@ -1 +1,48 @@
|
||||
from .pipeline_pixart_alpha import PixArtAlphaPipeline
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from ...utils import (
|
||||
DIFFUSERS_SLOW_IMPORT,
|
||||
OptionalDependencyNotAvailable,
|
||||
_LazyModule,
|
||||
get_objects_from_module,
|
||||
is_torch_available,
|
||||
is_transformers_available,
|
||||
)
|
||||
|
||||
|
||||
_dummy_objects = {}
|
||||
_import_structure = {}
|
||||
|
||||
|
||||
try:
|
||||
if not (is_transformers_available() and is_torch_available()):
|
||||
raise OptionalDependencyNotAvailable()
|
||||
except OptionalDependencyNotAvailable:
|
||||
from ...utils import dummy_torch_and_transformers_objects # noqa F403
|
||||
|
||||
_dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
|
||||
else:
|
||||
_import_structure["pipeline_pixart_alpha"] = ["PixArtAlphaPipeline"]
|
||||
|
||||
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
try:
|
||||
if not (is_transformers_available() and is_torch_available()):
|
||||
raise OptionalDependencyNotAvailable()
|
||||
|
||||
except OptionalDependencyNotAvailable:
|
||||
from ...utils.dummy_torch_and_transformers_objects import *
|
||||
else:
|
||||
from .pipeline_pixart_alpha import PixArtAlphaPipeline
|
||||
|
||||
else:
|
||||
import sys
|
||||
|
||||
sys.modules[__name__] = _LazyModule(
|
||||
__name__,
|
||||
globals()["__file__"],
|
||||
_import_structure,
|
||||
module_spec=__spec__,
|
||||
)
|
||||
|
||||
for name, value in _dummy_objects.items():
|
||||
setattr(sys.modules[__name__], name, value)
|
||||
|
||||
@@ -156,6 +156,8 @@ class PixArtAlphaPipeline(DiffusionPipeline):
|
||||
mask_feature: (bool, defaults to `True`):
|
||||
If `True`, the function will mask the text embeddings.
|
||||
"""
|
||||
embeds_initially_provided = prompt_embeds is not None and negative_prompt_embeds is not None
|
||||
|
||||
if device is None:
|
||||
device = self._execution_device
|
||||
|
||||
@@ -253,7 +255,7 @@ class PixArtAlphaPipeline(DiffusionPipeline):
|
||||
negative_prompt_embeds = None
|
||||
|
||||
# Perform additional masking.
|
||||
if mask_feature:
|
||||
if mask_feature and not embeds_initially_provided:
|
||||
prompt_embeds = prompt_embeds.unsqueeze(1)
|
||||
masked_prompt_embeds, keep_indices = self.mask_text_embeddings(prompt_embeds, prompt_embeds_attention_mask)
|
||||
masked_prompt_embeds = masked_prompt_embeds.squeeze(1)
|
||||
|
||||
@@ -918,6 +918,7 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor
|
||||
do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
|
||||
|
||||
image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
|
||||
self.maybe_free_model_hooks()
|
||||
|
||||
if not return_dict:
|
||||
return (image, has_nsfw_concept)
|
||||
|
||||
@@ -576,6 +576,35 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo
|
||||
"""Disables the FreeU mechanism if enabled."""
|
||||
self.unet.disable_freeu()
|
||||
|
||||
# Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
|
||||
def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
|
||||
"""
|
||||
See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
|
||||
|
||||
Args:
|
||||
timesteps (`torch.Tensor`):
|
||||
generate embedding vectors at these timesteps
|
||||
embedding_dim (`int`, *optional*, defaults to 512):
|
||||
dimension of the embeddings to generate
|
||||
dtype:
|
||||
data type of the generated embeddings
|
||||
|
||||
Returns:
|
||||
`torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
|
||||
"""
|
||||
assert len(w.shape) == 1
|
||||
w = w * 1000.0
|
||||
|
||||
half_dim = embedding_dim // 2
|
||||
emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
|
||||
emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
|
||||
emb = w.to(dtype)[:, None] * emb[None, :]
|
||||
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
|
||||
if embedding_dim % 2 == 1: # zero pad
|
||||
emb = torch.nn.functional.pad(emb, (0, 1))
|
||||
assert emb.shape == (w.shape[0], embedding_dim)
|
||||
return emb
|
||||
|
||||
@property
|
||||
def guidance_scale(self):
|
||||
return self._guidance_scale
|
||||
@@ -593,7 +622,7 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo
|
||||
# corresponds to doing no classifier free guidance.
|
||||
@property
|
||||
def do_classifier_free_guidance(self):
|
||||
return self._guidance_scale > 1
|
||||
return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
|
||||
|
||||
@property
|
||||
def cross_attention_kwargs(self):
|
||||
@@ -790,6 +819,14 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo
|
||||
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
|
||||
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
|
||||
|
||||
# 6.5 Optionally get Guidance Scale Embedding
|
||||
timestep_cond = None
|
||||
if self.unet.config.time_cond_proj_dim is not None:
|
||||
guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
|
||||
timestep_cond = self.get_guidance_scale_embedding(
|
||||
guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
|
||||
).to(device=device, dtype=latents.dtype)
|
||||
|
||||
# 7. Denoising loop
|
||||
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
|
||||
self._num_timesteps = len(timesteps)
|
||||
@@ -804,6 +841,7 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo
|
||||
latent_model_input,
|
||||
t,
|
||||
encoder_hidden_states=prompt_embeds,
|
||||
timestep_cond=timestep_cond,
|
||||
cross_attention_kwargs=self.cross_attention_kwargs,
|
||||
return_dict=False,
|
||||
)[0]
|
||||
@@ -838,7 +876,9 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo
|
||||
callback(step_idx, t, latents)
|
||||
|
||||
if not output_type == "latent":
|
||||
image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
|
||||
image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
|
||||
0
|
||||
]
|
||||
image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
|
||||
else:
|
||||
image = latents
|
||||
|
||||
+1
@@ -1027,6 +1027,7 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, TextualInversion
|
||||
do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
|
||||
|
||||
image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
|
||||
self.maybe_free_model_hooks()
|
||||
|
||||
if not return_dict:
|
||||
return (image, has_nsfw_concept)
|
||||
|
||||
@@ -846,6 +846,7 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoader
|
||||
image = latents
|
||||
|
||||
image = self.image_processor.postprocess(image, output_type=output_type)
|
||||
self.maybe_free_model_hooks()
|
||||
|
||||
if not return_dict:
|
||||
return (image,)
|
||||
|
||||
@@ -439,6 +439,8 @@ class StableDiffusionImageVariationPipeline(DiffusionPipeline):
|
||||
|
||||
image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
|
||||
|
||||
self.maybe_free_model_hooks()
|
||||
|
||||
if not return_dict:
|
||||
return (image, has_nsfw_concept)
|
||||
|
||||
|
||||
@@ -640,6 +640,35 @@ class StableDiffusionImg2ImgPipeline(
|
||||
"""Disables the FreeU mechanism if enabled."""
|
||||
self.unet.disable_freeu()
|
||||
|
||||
# Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
|
||||
def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
|
||||
"""
|
||||
See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
|
||||
|
||||
Args:
|
||||
timesteps (`torch.Tensor`):
|
||||
generate embedding vectors at these timesteps
|
||||
embedding_dim (`int`, *optional*, defaults to 512):
|
||||
dimension of the embeddings to generate
|
||||
dtype:
|
||||
data type of the generated embeddings
|
||||
|
||||
Returns:
|
||||
`torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
|
||||
"""
|
||||
assert len(w.shape) == 1
|
||||
w = w * 1000.0
|
||||
|
||||
half_dim = embedding_dim // 2
|
||||
emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
|
||||
emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
|
||||
emb = w.to(dtype)[:, None] * emb[None, :]
|
||||
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
|
||||
if embedding_dim % 2 == 1: # zero pad
|
||||
emb = torch.nn.functional.pad(emb, (0, 1))
|
||||
assert emb.shape == (w.shape[0], embedding_dim)
|
||||
return emb
|
||||
|
||||
@property
|
||||
def guidance_scale(self):
|
||||
return self._guidance_scale
|
||||
@@ -653,7 +682,7 @@ class StableDiffusionImg2ImgPipeline(
|
||||
# corresponds to doing no classifier free guidance.
|
||||
@property
|
||||
def do_classifier_free_guidance(self):
|
||||
return self._guidance_scale > 1
|
||||
return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
|
||||
|
||||
@property
|
||||
def cross_attention_kwargs(self):
|
||||
@@ -841,6 +870,14 @@ class StableDiffusionImg2ImgPipeline(
|
||||
# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
|
||||
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
|
||||
|
||||
# 7.5 Optionally get Guidance Scale Embedding
|
||||
timestep_cond = None
|
||||
if self.unet.config.time_cond_proj_dim is not None:
|
||||
guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
|
||||
timestep_cond = self.get_guidance_scale_embedding(
|
||||
guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
|
||||
).to(device=device, dtype=latents.dtype)
|
||||
|
||||
# 8. Denoising loop
|
||||
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
|
||||
self._num_timesteps = len(timesteps)
|
||||
@@ -855,6 +892,7 @@ class StableDiffusionImg2ImgPipeline(
|
||||
latent_model_input,
|
||||
t,
|
||||
encoder_hidden_states=prompt_embeds,
|
||||
timestep_cond=timestep_cond,
|
||||
cross_attention_kwargs=self.cross_attention_kwargs,
|
||||
return_dict=False,
|
||||
)[0]
|
||||
@@ -885,7 +923,9 @@ class StableDiffusionImg2ImgPipeline(
|
||||
callback(step_idx, t, latents)
|
||||
|
||||
if not output_type == "latent":
|
||||
image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
|
||||
image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
|
||||
0
|
||||
]
|
||||
image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
|
||||
else:
|
||||
image = latents
|
||||
|
||||
@@ -765,6 +765,35 @@ class StableDiffusionInpaintPipeline(
|
||||
"""Disables the FreeU mechanism if enabled."""
|
||||
self.unet.disable_freeu()
|
||||
|
||||
# Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
|
||||
def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
|
||||
"""
|
||||
See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
|
||||
|
||||
Args:
|
||||
timesteps (`torch.Tensor`):
|
||||
generate embedding vectors at these timesteps
|
||||
embedding_dim (`int`, *optional*, defaults to 512):
|
||||
dimension of the embeddings to generate
|
||||
dtype:
|
||||
data type of the generated embeddings
|
||||
|
||||
Returns:
|
||||
`torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
|
||||
"""
|
||||
assert len(w.shape) == 1
|
||||
w = w * 1000.0
|
||||
|
||||
half_dim = embedding_dim // 2
|
||||
emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
|
||||
emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
|
||||
emb = w.to(dtype)[:, None] * emb[None, :]
|
||||
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
|
||||
if embedding_dim % 2 == 1: # zero pad
|
||||
emb = torch.nn.functional.pad(emb, (0, 1))
|
||||
assert emb.shape == (w.shape[0], embedding_dim)
|
||||
return emb
|
||||
|
||||
@property
|
||||
def guidance_scale(self):
|
||||
return self._guidance_scale
|
||||
@@ -778,7 +807,7 @@ class StableDiffusionInpaintPipeline(
|
||||
# corresponds to doing no classifier free guidance.
|
||||
@property
|
||||
def do_classifier_free_guidance(self):
|
||||
return self._guidance_scale > 1
|
||||
return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
|
||||
|
||||
@property
|
||||
def cross_attention_kwargs(self):
|
||||
@@ -1087,6 +1116,14 @@ class StableDiffusionInpaintPipeline(
|
||||
# 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
|
||||
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
|
||||
|
||||
# 9.5 Optionally get Guidance Scale Embedding
|
||||
timestep_cond = None
|
||||
if self.unet.config.time_cond_proj_dim is not None:
|
||||
guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
|
||||
timestep_cond = self.get_guidance_scale_embedding(
|
||||
guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
|
||||
).to(device=device, dtype=latents.dtype)
|
||||
|
||||
# 10. Denoising loop
|
||||
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
|
||||
self._num_timesteps = len(timesteps)
|
||||
@@ -1106,6 +1143,7 @@ class StableDiffusionInpaintPipeline(
|
||||
latent_model_input,
|
||||
t,
|
||||
encoder_hidden_states=prompt_embeds,
|
||||
timestep_cond=timestep_cond,
|
||||
cross_attention_kwargs=self.cross_attention_kwargs,
|
||||
return_dict=False,
|
||||
)[0]
|
||||
@@ -1159,7 +1197,9 @@ class StableDiffusionInpaintPipeline(
|
||||
init_image = self._encode_vae_image(init_image, generator=generator)
|
||||
mask_condition = mask_condition.to(device=device, dtype=masked_image_latents.dtype)
|
||||
condition_kwargs = {"image": init_image_condition, "mask": mask_condition}
|
||||
image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, **condition_kwargs)[0]
|
||||
image = self.vae.decode(
|
||||
latents / self.vae.config.scaling_factor, return_dict=False, generator=generator, **condition_kwargs
|
||||
)[0]
|
||||
image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
|
||||
else:
|
||||
image = latents
|
||||
|
||||
@@ -511,6 +511,8 @@ class StableDiffusionLatentUpscalePipeline(DiffusionPipeline, FromSingleFileMixi
|
||||
|
||||
image = self.image_processor.postprocess(image, output_type=output_type)
|
||||
|
||||
self.maybe_free_model_hooks()
|
||||
|
||||
if not return_dict:
|
||||
return (image,)
|
||||
|
||||
|
||||
@@ -802,6 +802,8 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM
|
||||
|
||||
image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
|
||||
|
||||
self.maybe_free_model_hooks()
|
||||
|
||||
if not return_dict:
|
||||
return (image, has_nsfw_concept)
|
||||
|
||||
|
||||
@@ -741,6 +741,8 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin)
|
||||
|
||||
image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
|
||||
|
||||
self.maybe_free_model_hooks()
|
||||
|
||||
if not return_dict:
|
||||
return (image, has_nsfw_concept)
|
||||
|
||||
|
||||
@@ -206,17 +206,15 @@ class StableUnCLIPPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL
|
||||
prior_text_encoder_output = self.prior_text_encoder(text_input_ids.to(device))
|
||||
|
||||
prompt_embeds = prior_text_encoder_output.text_embeds
|
||||
prior_text_encoder_hidden_states = prior_text_encoder_output.last_hidden_state
|
||||
text_enc_hid_states = prior_text_encoder_output.last_hidden_state
|
||||
|
||||
else:
|
||||
batch_size = text_model_output[0].shape[0]
|
||||
prompt_embeds, prior_text_encoder_hidden_states = text_model_output[0], text_model_output[1]
|
||||
prompt_embeds, text_enc_hid_states = text_model_output[0], text_model_output[1]
|
||||
text_mask = text_attention_mask
|
||||
|
||||
prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
|
||||
prior_text_encoder_hidden_states = prior_text_encoder_hidden_states.repeat_interleave(
|
||||
num_images_per_prompt, dim=0
|
||||
)
|
||||
text_enc_hid_states = text_enc_hid_states.repeat_interleave(num_images_per_prompt, dim=0)
|
||||
text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0)
|
||||
|
||||
if do_classifier_free_guidance:
|
||||
@@ -235,9 +233,7 @@ class StableUnCLIPPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL
|
||||
)
|
||||
|
||||
negative_prompt_embeds = negative_prompt_embeds_prior_text_encoder_output.text_embeds
|
||||
uncond_prior_text_encoder_hidden_states = (
|
||||
negative_prompt_embeds_prior_text_encoder_output.last_hidden_state
|
||||
)
|
||||
uncond_text_enc_hid_states = negative_prompt_embeds_prior_text_encoder_output.last_hidden_state
|
||||
|
||||
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
|
||||
|
||||
@@ -245,11 +241,9 @@ class StableUnCLIPPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL
|
||||
negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt)
|
||||
negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len)
|
||||
|
||||
seq_len = uncond_prior_text_encoder_hidden_states.shape[1]
|
||||
uncond_prior_text_encoder_hidden_states = uncond_prior_text_encoder_hidden_states.repeat(
|
||||
1, num_images_per_prompt, 1
|
||||
)
|
||||
uncond_prior_text_encoder_hidden_states = uncond_prior_text_encoder_hidden_states.view(
|
||||
seq_len = uncond_text_enc_hid_states.shape[1]
|
||||
uncond_text_enc_hid_states = uncond_text_enc_hid_states.repeat(1, num_images_per_prompt, 1)
|
||||
uncond_text_enc_hid_states = uncond_text_enc_hid_states.view(
|
||||
batch_size * num_images_per_prompt, seq_len, -1
|
||||
)
|
||||
uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0)
|
||||
@@ -260,13 +254,11 @@ class StableUnCLIPPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL
|
||||
# Here we concatenate the unconditional and text embeddings into a single batch
|
||||
# to avoid doing two forward passes
|
||||
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
|
||||
prior_text_encoder_hidden_states = torch.cat(
|
||||
[uncond_prior_text_encoder_hidden_states, prior_text_encoder_hidden_states]
|
||||
)
|
||||
text_enc_hid_states = torch.cat([uncond_text_enc_hid_states, text_enc_hid_states])
|
||||
|
||||
text_mask = torch.cat([uncond_text_mask, text_mask])
|
||||
|
||||
return prompt_embeds, prior_text_encoder_hidden_states, text_mask
|
||||
return prompt_embeds, text_enc_hid_states, text_mask
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
|
||||
def _encode_prompt(
|
||||
|
||||
@@ -636,6 +636,35 @@ class StableDiffusionXLPipeline(
|
||||
"""Disables the FreeU mechanism if enabled."""
|
||||
self.unet.disable_freeu()
|
||||
|
||||
# Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
|
||||
def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
|
||||
"""
|
||||
See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
|
||||
|
||||
Args:
|
||||
timesteps (`torch.Tensor`):
|
||||
generate embedding vectors at these timesteps
|
||||
embedding_dim (`int`, *optional*, defaults to 512):
|
||||
dimension of the embeddings to generate
|
||||
dtype:
|
||||
data type of the generated embeddings
|
||||
|
||||
Returns:
|
||||
`torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
|
||||
"""
|
||||
assert len(w.shape) == 1
|
||||
w = w * 1000.0
|
||||
|
||||
half_dim = embedding_dim // 2
|
||||
emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
|
||||
emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
|
||||
emb = w.to(dtype)[:, None] * emb[None, :]
|
||||
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
|
||||
if embedding_dim % 2 == 1: # zero pad
|
||||
emb = torch.nn.functional.pad(emb, (0, 1))
|
||||
assert emb.shape == (w.shape[0], embedding_dim)
|
||||
return emb
|
||||
|
||||
@property
|
||||
def guidance_scale(self):
|
||||
return self._guidance_scale
|
||||
@@ -653,7 +682,7 @@ class StableDiffusionXLPipeline(
|
||||
# corresponds to doing no classifier free guidance.
|
||||
@property
|
||||
def do_classifier_free_guidance(self):
|
||||
return self._guidance_scale > 1
|
||||
return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
|
||||
|
||||
@property
|
||||
def cross_attention_kwargs(self):
|
||||
@@ -989,6 +1018,14 @@ class StableDiffusionXLPipeline(
|
||||
num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
|
||||
timesteps = timesteps[:num_inference_steps]
|
||||
|
||||
# 9. Optionally get Guidance Scale Embedding
|
||||
timestep_cond = None
|
||||
if self.unet.config.time_cond_proj_dim is not None:
|
||||
guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
|
||||
timestep_cond = self.get_guidance_scale_embedding(
|
||||
guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
|
||||
).to(device=device, dtype=latents.dtype)
|
||||
|
||||
self._num_timesteps = len(timesteps)
|
||||
with self.progress_bar(total=num_inference_steps) as progress_bar:
|
||||
for i, t in enumerate(timesteps):
|
||||
@@ -1003,6 +1040,7 @@ class StableDiffusionXLPipeline(
|
||||
latent_model_input,
|
||||
t,
|
||||
encoder_hidden_states=prompt_embeds,
|
||||
timestep_cond=timestep_cond,
|
||||
cross_attention_kwargs=self.cross_attention_kwargs,
|
||||
added_cond_kwargs=added_cond_kwargs,
|
||||
return_dict=False,
|
||||
|
||||
+40
-1
@@ -763,6 +763,35 @@ class StableDiffusionXLImg2ImgPipeline(
|
||||
"""Disables the FreeU mechanism if enabled."""
|
||||
self.unet.disable_freeu()
|
||||
|
||||
# Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
|
||||
def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
|
||||
"""
|
||||
See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
|
||||
|
||||
Args:
|
||||
timesteps (`torch.Tensor`):
|
||||
generate embedding vectors at these timesteps
|
||||
embedding_dim (`int`, *optional*, defaults to 512):
|
||||
dimension of the embeddings to generate
|
||||
dtype:
|
||||
data type of the generated embeddings
|
||||
|
||||
Returns:
|
||||
`torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
|
||||
"""
|
||||
assert len(w.shape) == 1
|
||||
w = w * 1000.0
|
||||
|
||||
half_dim = embedding_dim // 2
|
||||
emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
|
||||
emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
|
||||
emb = w.to(dtype)[:, None] * emb[None, :]
|
||||
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
|
||||
if embedding_dim % 2 == 1: # zero pad
|
||||
emb = torch.nn.functional.pad(emb, (0, 1))
|
||||
assert emb.shape == (w.shape[0], embedding_dim)
|
||||
return emb
|
||||
|
||||
@property
|
||||
def guidance_scale(self):
|
||||
return self._guidance_scale
|
||||
@@ -780,7 +809,7 @@ class StableDiffusionXLImg2ImgPipeline(
|
||||
# corresponds to doing no classifier free guidance.
|
||||
@property
|
||||
def do_classifier_free_guidance(self):
|
||||
return self._guidance_scale > 1
|
||||
return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
|
||||
|
||||
@property
|
||||
def cross_attention_kwargs(self):
|
||||
@@ -1156,6 +1185,15 @@ class StableDiffusionXLImg2ImgPipeline(
|
||||
)
|
||||
num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
|
||||
timesteps = timesteps[:num_inference_steps]
|
||||
|
||||
# 9.2 Optionally get Guidance Scale Embedding
|
||||
timestep_cond = None
|
||||
if self.unet.config.time_cond_proj_dim is not None:
|
||||
guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
|
||||
timestep_cond = self.get_guidance_scale_embedding(
|
||||
guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
|
||||
).to(device=device, dtype=latents.dtype)
|
||||
|
||||
self._num_timesteps = len(timesteps)
|
||||
with self.progress_bar(total=num_inference_steps) as progress_bar:
|
||||
for i, t in enumerate(timesteps):
|
||||
@@ -1170,6 +1208,7 @@ class StableDiffusionXLImg2ImgPipeline(
|
||||
latent_model_input,
|
||||
t,
|
||||
encoder_hidden_states=prompt_embeds,
|
||||
timestep_cond=timestep_cond,
|
||||
cross_attention_kwargs=self.cross_attention_kwargs,
|
||||
added_cond_kwargs=added_cond_kwargs,
|
||||
return_dict=False,
|
||||
|
||||
+39
-1
@@ -982,6 +982,35 @@ class StableDiffusionXLInpaintPipeline(
|
||||
"""Disables the FreeU mechanism if enabled."""
|
||||
self.unet.disable_freeu()
|
||||
|
||||
# Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
|
||||
def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
|
||||
"""
|
||||
See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
|
||||
|
||||
Args:
|
||||
timesteps (`torch.Tensor`):
|
||||
generate embedding vectors at these timesteps
|
||||
embedding_dim (`int`, *optional*, defaults to 512):
|
||||
dimension of the embeddings to generate
|
||||
dtype:
|
||||
data type of the generated embeddings
|
||||
|
||||
Returns:
|
||||
`torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
|
||||
"""
|
||||
assert len(w.shape) == 1
|
||||
w = w * 1000.0
|
||||
|
||||
half_dim = embedding_dim // 2
|
||||
emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
|
||||
emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
|
||||
emb = w.to(dtype)[:, None] * emb[None, :]
|
||||
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
|
||||
if embedding_dim % 2 == 1: # zero pad
|
||||
emb = torch.nn.functional.pad(emb, (0, 1))
|
||||
assert emb.shape == (w.shape[0], embedding_dim)
|
||||
return emb
|
||||
|
||||
@property
|
||||
def guidance_scale(self):
|
||||
return self._guidance_scale
|
||||
@@ -999,7 +1028,7 @@ class StableDiffusionXLInpaintPipeline(
|
||||
# corresponds to doing no classifier free guidance.
|
||||
@property
|
||||
def do_classifier_free_guidance(self):
|
||||
return self._guidance_scale > 1
|
||||
return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
|
||||
|
||||
@property
|
||||
def cross_attention_kwargs(self):
|
||||
@@ -1464,6 +1493,14 @@ class StableDiffusionXLInpaintPipeline(
|
||||
num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
|
||||
timesteps = timesteps[:num_inference_steps]
|
||||
|
||||
# 11.1 Optionally get Guidance Scale Embedding
|
||||
timestep_cond = None
|
||||
if self.unet.config.time_cond_proj_dim is not None:
|
||||
guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
|
||||
timestep_cond = self.get_guidance_scale_embedding(
|
||||
guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
|
||||
).to(device=device, dtype=latents.dtype)
|
||||
|
||||
self._num_timesteps = len(timesteps)
|
||||
with self.progress_bar(total=num_inference_steps) as progress_bar:
|
||||
for i, t in enumerate(timesteps):
|
||||
@@ -1482,6 +1519,7 @@ class StableDiffusionXLInpaintPipeline(
|
||||
latent_model_input,
|
||||
t,
|
||||
encoder_hidden_states=prompt_embeds,
|
||||
timestep_cond=timestep_cond,
|
||||
cross_attention_kwargs=self.cross_attention_kwargs,
|
||||
added_cond_kwargs=added_cond_kwargs,
|
||||
return_dict=False,
|
||||
|
||||
@@ -156,15 +156,15 @@ class UnCLIPPipeline(DiffusionPipeline):
|
||||
text_encoder_output = self.text_encoder(text_input_ids.to(device))
|
||||
|
||||
prompt_embeds = text_encoder_output.text_embeds
|
||||
text_encoder_hidden_states = text_encoder_output.last_hidden_state
|
||||
text_enc_hid_states = text_encoder_output.last_hidden_state
|
||||
|
||||
else:
|
||||
batch_size = text_model_output[0].shape[0]
|
||||
prompt_embeds, text_encoder_hidden_states = text_model_output[0], text_model_output[1]
|
||||
prompt_embeds, text_enc_hid_states = text_model_output[0], text_model_output[1]
|
||||
text_mask = text_attention_mask
|
||||
|
||||
prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
|
||||
text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
|
||||
text_enc_hid_states = text_enc_hid_states.repeat_interleave(num_images_per_prompt, dim=0)
|
||||
text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0)
|
||||
|
||||
if do_classifier_free_guidance:
|
||||
@@ -181,7 +181,7 @@ class UnCLIPPipeline(DiffusionPipeline):
|
||||
negative_prompt_embeds_text_encoder_output = self.text_encoder(uncond_input.input_ids.to(device))
|
||||
|
||||
negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.text_embeds
|
||||
uncond_text_encoder_hidden_states = negative_prompt_embeds_text_encoder_output.last_hidden_state
|
||||
uncond_text_enc_hid_states = negative_prompt_embeds_text_encoder_output.last_hidden_state
|
||||
|
||||
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
|
||||
|
||||
@@ -189,9 +189,9 @@ class UnCLIPPipeline(DiffusionPipeline):
|
||||
negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt)
|
||||
negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len)
|
||||
|
||||
seq_len = uncond_text_encoder_hidden_states.shape[1]
|
||||
uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1)
|
||||
uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view(
|
||||
seq_len = uncond_text_enc_hid_states.shape[1]
|
||||
uncond_text_enc_hid_states = uncond_text_enc_hid_states.repeat(1, num_images_per_prompt, 1)
|
||||
uncond_text_enc_hid_states = uncond_text_enc_hid_states.view(
|
||||
batch_size * num_images_per_prompt, seq_len, -1
|
||||
)
|
||||
uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0)
|
||||
@@ -202,11 +202,11 @@ class UnCLIPPipeline(DiffusionPipeline):
|
||||
# Here we concatenate the unconditional and text embeddings into a single batch
|
||||
# to avoid doing two forward passes
|
||||
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
|
||||
text_encoder_hidden_states = torch.cat([uncond_text_encoder_hidden_states, text_encoder_hidden_states])
|
||||
text_enc_hid_states = torch.cat([uncond_text_enc_hid_states, text_enc_hid_states])
|
||||
|
||||
text_mask = torch.cat([uncond_text_mask, text_mask])
|
||||
|
||||
return prompt_embeds, text_encoder_hidden_states, text_mask
|
||||
return prompt_embeds, text_enc_hid_states, text_mask
|
||||
|
||||
@torch.no_grad()
|
||||
def __call__(
|
||||
@@ -293,7 +293,7 @@ class UnCLIPPipeline(DiffusionPipeline):
|
||||
|
||||
do_classifier_free_guidance = prior_guidance_scale > 1.0 or decoder_guidance_scale > 1.0
|
||||
|
||||
prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt(
|
||||
prompt_embeds, text_enc_hid_states, text_mask = self._encode_prompt(
|
||||
prompt, device, num_images_per_prompt, do_classifier_free_guidance, text_model_output, text_attention_mask
|
||||
)
|
||||
|
||||
@@ -321,7 +321,7 @@ class UnCLIPPipeline(DiffusionPipeline):
|
||||
latent_model_input,
|
||||
timestep=t,
|
||||
proj_embedding=prompt_embeds,
|
||||
encoder_hidden_states=text_encoder_hidden_states,
|
||||
encoder_hidden_states=text_enc_hid_states,
|
||||
attention_mask=text_mask,
|
||||
).predicted_image_embedding
|
||||
|
||||
@@ -352,10 +352,10 @@ class UnCLIPPipeline(DiffusionPipeline):
|
||||
|
||||
# decoder
|
||||
|
||||
text_encoder_hidden_states, additive_clip_time_embeddings = self.text_proj(
|
||||
text_enc_hid_states, additive_clip_time_embeddings = self.text_proj(
|
||||
image_embeddings=image_embeddings,
|
||||
prompt_embeds=prompt_embeds,
|
||||
text_encoder_hidden_states=text_encoder_hidden_states,
|
||||
text_encoder_hidden_states=text_enc_hid_states,
|
||||
do_classifier_free_guidance=do_classifier_free_guidance,
|
||||
)
|
||||
|
||||
@@ -377,7 +377,7 @@ class UnCLIPPipeline(DiffusionPipeline):
|
||||
|
||||
decoder_latents = self.prepare_latents(
|
||||
(batch_size, num_channels_latents, height, width),
|
||||
text_encoder_hidden_states.dtype,
|
||||
text_enc_hid_states.dtype,
|
||||
device,
|
||||
generator,
|
||||
decoder_latents,
|
||||
@@ -391,7 +391,7 @@ class UnCLIPPipeline(DiffusionPipeline):
|
||||
noise_pred = self.decoder(
|
||||
sample=latent_model_input,
|
||||
timestep=t,
|
||||
encoder_hidden_states=text_encoder_hidden_states,
|
||||
encoder_hidden_states=text_enc_hid_states,
|
||||
class_labels=additive_clip_time_embeddings,
|
||||
attention_mask=decoder_text_mask,
|
||||
).sample
|
||||
|
||||
@@ -1494,7 +1494,6 @@ class ResnetBlockFlat(nn.Module):
|
||||
return output_tensor
|
||||
|
||||
|
||||
# Copied from diffusers.models.unet_2d_blocks.DownBlock2D with DownBlock2D->DownBlockFlat, ResnetBlock2D->ResnetBlockFlat, Downsample2D->LinearMultiDim
|
||||
class DownBlockFlat(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
@@ -1583,7 +1582,6 @@ class DownBlockFlat(nn.Module):
|
||||
return hidden_states, output_states
|
||||
|
||||
|
||||
# Copied from diffusers.models.unet_2d_blocks.CrossAttnDownBlock2D with CrossAttnDownBlock2D->CrossAttnDownBlockFlat, ResnetBlock2D->ResnetBlockFlat, Downsample2D->LinearMultiDim
|
||||
class CrossAttnDownBlockFlat(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -38,6 +38,7 @@ except OptionalDependencyNotAvailable:
|
||||
_dummy_modules.update(get_objects_from_module(dummy_pt_objects))
|
||||
|
||||
else:
|
||||
_import_structure["scheduling_consistency_decoder"] = ["ConsistencyDecoderScheduler"]
|
||||
_import_structure["scheduling_consistency_models"] = ["CMStochasticIterativeScheduler"]
|
||||
_import_structure["scheduling_ddim"] = ["DDIMScheduler"]
|
||||
_import_structure["scheduling_ddim_inverse"] = ["DDIMInverseScheduler"]
|
||||
@@ -128,6 +129,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
except OptionalDependencyNotAvailable:
|
||||
from ..utils.dummy_pt_objects import * # noqa F403
|
||||
else:
|
||||
from .scheduling_consistency_decoder import ConsistencyDecoderScheduler
|
||||
from .scheduling_consistency_models import CMStochasticIterativeScheduler
|
||||
from .scheduling_ddim import DDIMScheduler
|
||||
from .scheduling_ddim_inverse import DDIMInverseScheduler
|
||||
|
||||
@@ -0,0 +1,180 @@
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
|
||||
from ..configuration_utils import ConfigMixin, register_to_config
|
||||
from ..utils import BaseOutput
|
||||
from ..utils.torch_utils import randn_tensor
|
||||
from .scheduling_utils import SchedulerMixin
|
||||
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
|
||||
Returns:
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
def alpha_bar_fn(t):
|
||||
return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
|
||||
|
||||
elif alpha_transform_type == "exp":
|
||||
|
||||
def alpha_bar_fn(t):
|
||||
return math.exp(t * -12.0)
|
||||
|
||||
else:
|
||||
raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")
|
||||
|
||||
betas = []
|
||||
for i in range(num_diffusion_timesteps):
|
||||
t1 = i / num_diffusion_timesteps
|
||||
t2 = (i + 1) / num_diffusion_timesteps
|
||||
betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
|
||||
return torch.tensor(betas, dtype=torch.float32)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ConsistencyDecoderSchedulerOutput(BaseOutput):
|
||||
"""
|
||||
Output class for the scheduler's `step` function.
|
||||
|
||||
Args:
|
||||
prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
|
||||
Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
|
||||
denoising loop.
|
||||
"""
|
||||
|
||||
prev_sample: torch.FloatTensor
|
||||
|
||||
|
||||
class ConsistencyDecoderScheduler(SchedulerMixin, ConfigMixin):
|
||||
order = 1
|
||||
|
||||
@register_to_config
|
||||
def __init__(
|
||||
self,
|
||||
num_train_timesteps: int = 1024,
|
||||
sigma_data: float = 0.5,
|
||||
):
|
||||
betas = betas_for_alpha_bar(num_train_timesteps)
|
||||
|
||||
alphas = 1.0 - betas
|
||||
alphas_cumprod = torch.cumprod(alphas, dim=0)
|
||||
|
||||
self.sqrt_alphas_cumprod = torch.sqrt(alphas_cumprod)
|
||||
self.sqrt_one_minus_alphas_cumprod = torch.sqrt(1.0 - alphas_cumprod)
|
||||
|
||||
sigmas = torch.sqrt(1.0 / alphas_cumprod - 1)
|
||||
|
||||
sqrt_recip_alphas_cumprod = torch.sqrt(1.0 / alphas_cumprod)
|
||||
|
||||
self.c_skip = sqrt_recip_alphas_cumprod * sigma_data**2 / (sigmas**2 + sigma_data**2)
|
||||
self.c_out = sigmas * sigma_data / (sigmas**2 + sigma_data**2) ** 0.5
|
||||
self.c_in = sqrt_recip_alphas_cumprod / (sigmas**2 + sigma_data**2) ** 0.5
|
||||
|
||||
def set_timesteps(
|
||||
self,
|
||||
num_inference_steps: Optional[int] = None,
|
||||
device: Union[str, torch.device] = None,
|
||||
):
|
||||
if num_inference_steps != 2:
|
||||
raise ValueError("Currently more than 2 inference steps are not supported.")
|
||||
|
||||
self.timesteps = torch.tensor([1008, 512], dtype=torch.long, device=device)
|
||||
self.sqrt_alphas_cumprod = self.sqrt_alphas_cumprod.to(device)
|
||||
self.sqrt_one_minus_alphas_cumprod = self.sqrt_one_minus_alphas_cumprod.to(device)
|
||||
self.c_skip = self.c_skip.to(device)
|
||||
self.c_out = self.c_out.to(device)
|
||||
self.c_in = self.c_in.to(device)
|
||||
|
||||
@property
|
||||
def init_noise_sigma(self):
|
||||
return self.sqrt_one_minus_alphas_cumprod[self.timesteps[0]]
|
||||
|
||||
def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor:
|
||||
"""
|
||||
Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
|
||||
current timestep.
|
||||
|
||||
Args:
|
||||
sample (`torch.FloatTensor`):
|
||||
The input sample.
|
||||
timestep (`int`, *optional*):
|
||||
The current timestep in the diffusion chain.
|
||||
|
||||
Returns:
|
||||
`torch.FloatTensor`:
|
||||
A scaled input sample.
|
||||
"""
|
||||
return sample * self.c_in[timestep]
|
||||
|
||||
def step(
|
||||
self,
|
||||
model_output: torch.FloatTensor,
|
||||
timestep: Union[float, torch.FloatTensor],
|
||||
sample: torch.FloatTensor,
|
||||
generator: Optional[torch.Generator] = None,
|
||||
return_dict: bool = True,
|
||||
) -> Union[ConsistencyDecoderSchedulerOutput, Tuple]:
|
||||
"""
|
||||
Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
|
||||
process from the learned model outputs (most often the predicted noise).
|
||||
|
||||
Args:
|
||||
model_output (`torch.FloatTensor`):
|
||||
The direct output from the learned diffusion model.
|
||||
timestep (`float`):
|
||||
The current timestep in the diffusion chain.
|
||||
sample (`torch.FloatTensor`):
|
||||
A current instance of a sample created by the diffusion process.
|
||||
generator (`torch.Generator`, *optional*):
|
||||
A random number generator.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to return a
|
||||
[`~schedulers.scheduling_consistency_models.ConsistencyDecoderSchedulerOutput`] or `tuple`.
|
||||
|
||||
Returns:
|
||||
[`~schedulers.scheduling_consistency_models.ConsistencyDecoderSchedulerOutput`] or `tuple`:
|
||||
If return_dict is `True`,
|
||||
[`~schedulers.scheduling_consistency_models.ConsistencyDecoderSchedulerOutput`] is returned, otherwise
|
||||
a tuple is returned where the first element is the sample tensor.
|
||||
"""
|
||||
x_0 = self.c_out[timestep] * model_output + self.c_skip[timestep] * sample
|
||||
|
||||
timestep_idx = torch.where(self.timesteps == timestep)[0]
|
||||
|
||||
if timestep_idx == len(self.timesteps) - 1:
|
||||
prev_sample = x_0
|
||||
else:
|
||||
noise = randn_tensor(x_0.shape, generator=generator, dtype=x_0.dtype, device=x_0.device)
|
||||
prev_sample = (
|
||||
self.sqrt_alphas_cumprod[self.timesteps[timestep_idx + 1]].to(x_0.dtype) * x_0
|
||||
+ self.sqrt_one_minus_alphas_cumprod[self.timesteps[timestep_idx + 1]].to(x_0.dtype) * noise
|
||||
)
|
||||
|
||||
if not return_dict:
|
||||
return (prev_sample,)
|
||||
|
||||
return ConsistencyDecoderSchedulerOutput(prev_sample=prev_sample)
|
||||
@@ -182,6 +182,10 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
timestep_spacing (`str`, defaults to `"leading"`):
|
||||
The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
|
||||
Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
|
||||
timestep_scaling (`float`, defaults to 10.0):
|
||||
The factor the timesteps will be multiplied by when calculating the consistency model boundary conditions
|
||||
`c_skip` and `c_out`. Increasing this will decrease the approximation error (although the approximation
|
||||
error at the default of `10.0` is already pretty small).
|
||||
rescale_betas_zero_snr (`bool`, defaults to `False`):
|
||||
Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
|
||||
dark samples instead of limiting it to samples with medium brightness. Loosely related to
|
||||
@@ -208,6 +212,7 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
dynamic_thresholding_ratio: float = 0.995,
|
||||
sample_max_value: float = 1.0,
|
||||
timestep_spacing: str = "leading",
|
||||
timestep_scaling: float = 10.0,
|
||||
rescale_betas_zero_snr: bool = False,
|
||||
):
|
||||
if trained_betas is not None:
|
||||
@@ -380,12 +385,12 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
self._step_index = None
|
||||
|
||||
def get_scalings_for_boundary_condition_discrete(self, t):
|
||||
def get_scalings_for_boundary_condition_discrete(self, timestep):
|
||||
self.sigma_data = 0.5 # Default: 0.5
|
||||
scaled_timestep = timestep * self.config.timestep_scaling
|
||||
|
||||
# By dividing 0.1: This is almost a delta function at t=0.
|
||||
c_skip = self.sigma_data**2 / ((t / 0.1) ** 2 + self.sigma_data**2)
|
||||
c_out = (t / 0.1) / ((t / 0.1) ** 2 + self.sigma_data**2) ** 0.5
|
||||
c_skip = self.sigma_data**2 / (scaled_timestep**2 + self.sigma_data**2)
|
||||
c_out = scaled_timestep / (scaled_timestep**2 + self.sigma_data**2) ** 0.5
|
||||
return c_skip, c_out
|
||||
|
||||
def step(
|
||||
@@ -466,9 +471,12 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
denoised = c_out * predicted_original_sample + c_skip * sample
|
||||
|
||||
# 7. Sample and inject noise z ~ N(0, I) for MultiStep Inference
|
||||
# Noise is not used for one-step sampling.
|
||||
if len(self.timesteps) > 1:
|
||||
noise = randn_tensor(model_output.shape, generator=generator, device=model_output.device)
|
||||
# Noise is not used on the final timestep of the timestep schedule.
|
||||
# This also means that noise is not used for one-step sampling.
|
||||
if self.step_index != self.num_inference_steps - 1:
|
||||
noise = randn_tensor(
|
||||
model_output.shape, generator=generator, device=model_output.device, dtype=denoised.dtype
|
||||
)
|
||||
prev_sample = alpha_prod_t_prev.sqrt() * denoised + beta_prod_t_prev.sqrt() * noise
|
||||
else:
|
||||
prev_sample = denoised
|
||||
|
||||
@@ -18,6 +18,7 @@ from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE, hf_cache_home
|
||||
from packaging import version
|
||||
|
||||
from .import_utils import is_peft_available, is_transformers_available
|
||||
from ..dependency_versions_check import dep_version_check
|
||||
|
||||
|
||||
default_cache_path = HUGGINGFACE_HUB_CACHE
|
||||
@@ -50,3 +51,6 @@ _required_transformers_version = is_transformers_available() and version.parse(
|
||||
) > version.parse(MIN_TRANSFORMERS_VERSION)
|
||||
|
||||
USE_PEFT_BACKEND = _required_peft_version and _required_transformers_version
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
dep_version_check("peft")
|
||||
|
||||
@@ -47,6 +47,21 @@ class AutoencoderTiny(metaclass=DummyObject):
|
||||
requires_backends(cls, ["torch"])
|
||||
|
||||
|
||||
class ConsistencyDecoderVAE(metaclass=DummyObject):
|
||||
_backends = ["torch"]
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch"])
|
||||
|
||||
@classmethod
|
||||
def from_config(cls, *args, **kwargs):
|
||||
requires_backends(cls, ["torch"])
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, *args, **kwargs):
|
||||
requires_backends(cls, ["torch"])
|
||||
|
||||
|
||||
class ControlNetModel(metaclass=DummyObject):
|
||||
_backends = ["torch"]
|
||||
|
||||
|
||||
@@ -0,0 +1,117 @@
|
||||
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
Utilities for working with package versions
|
||||
"""
|
||||
|
||||
import importlib.metadata
|
||||
import operator
|
||||
import re
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
from packaging import version
|
||||
|
||||
|
||||
ops = {
|
||||
"<": operator.lt,
|
||||
"<=": operator.le,
|
||||
"==": operator.eq,
|
||||
"!=": operator.ne,
|
||||
">=": operator.ge,
|
||||
">": operator.gt,
|
||||
}
|
||||
|
||||
|
||||
def _compare_versions(op, got_ver, want_ver, requirement, pkg, hint):
|
||||
if got_ver is None or want_ver is None:
|
||||
raise ValueError(
|
||||
f"Unable to compare versions for {requirement}: need={want_ver} found={got_ver}. This is unusual. Consider"
|
||||
f" reinstalling {pkg}."
|
||||
)
|
||||
if not ops[op](version.parse(got_ver), version.parse(want_ver)):
|
||||
raise ImportError(
|
||||
f"{requirement} is required for a normal functioning of this module, but found {pkg}=={got_ver}.{hint}"
|
||||
)
|
||||
|
||||
|
||||
def require_version(requirement: str, hint: Optional[str] = None) -> None:
|
||||
"""
|
||||
Perform a runtime check of the dependency versions, using the exact same syntax used by pip.
|
||||
|
||||
The installed module version comes from the *site-packages* dir via *importlib.metadata*.
|
||||
|
||||
Args:
|
||||
requirement (`str`): pip style definition, e.g., "tokenizers==0.9.4", "tqdm>=4.27", "numpy"
|
||||
hint (`str`, *optional*): what suggestion to print in case of requirements not being met
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
require_version("pandas>1.1.2")
|
||||
require_version("numpy>1.18.5", "this is important to have for whatever reason")
|
||||
```"""
|
||||
|
||||
hint = f"\n{hint}" if hint is not None else ""
|
||||
|
||||
# non-versioned check
|
||||
if re.match(r"^[\w_\-\d]+$", requirement):
|
||||
pkg, op, want_ver = requirement, None, None
|
||||
else:
|
||||
match = re.findall(r"^([^!=<>\s]+)([\s!=<>]{1,2}.+)", requirement)
|
||||
if not match:
|
||||
raise ValueError(
|
||||
"requirement needs to be in the pip package format, .e.g., package_a==1.23, or package_b>=1.23, but"
|
||||
f" got {requirement}"
|
||||
)
|
||||
pkg, want_full = match[0]
|
||||
want_range = want_full.split(",") # there could be multiple requirements
|
||||
wanted = {}
|
||||
for w in want_range:
|
||||
match = re.findall(r"^([\s!=<>]{1,2})(.+)", w)
|
||||
if not match:
|
||||
raise ValueError(
|
||||
"requirement needs to be in the pip package format, .e.g., package_a==1.23, or package_b>=1.23,"
|
||||
f" but got {requirement}"
|
||||
)
|
||||
op, want_ver = match[0]
|
||||
wanted[op] = want_ver
|
||||
if op not in ops:
|
||||
raise ValueError(f"{requirement}: need one of {list(ops.keys())}, but got {op}")
|
||||
|
||||
# special case
|
||||
if pkg == "python":
|
||||
got_ver = ".".join([str(x) for x in sys.version_info[:3]])
|
||||
for op, want_ver in wanted.items():
|
||||
_compare_versions(op, got_ver, want_ver, requirement, pkg, hint)
|
||||
return
|
||||
|
||||
# check if any version is installed
|
||||
try:
|
||||
got_ver = importlib.metadata.version(pkg)
|
||||
except importlib.metadata.PackageNotFoundError:
|
||||
raise importlib.metadata.PackageNotFoundError(
|
||||
f"The '{requirement}' distribution was not found and is required by this application. {hint}"
|
||||
)
|
||||
|
||||
# check that the right version is installed if version number or a range was provided
|
||||
if want_ver is not None:
|
||||
for op, want_ver in wanted.items():
|
||||
_compare_versions(op, got_ver, want_ver, requirement, pkg, hint)
|
||||
|
||||
|
||||
def require_version_core(requirement):
|
||||
"""require_version wrapper which emits a core-specific hint on failure"""
|
||||
hint = "Try: pip install transformers -U or pip install -e '.[dev]' if you're working with git main"
|
||||
return require_version(requirement, hint)
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user