Compare commits
4 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 7ef1df4a6f | |||
| b979ee94b8 | |||
| e59f69c8df | |||
| 8770d24815 |
@@ -79,7 +79,7 @@ jobs:
|
||||
python utils/print_env.py
|
||||
- name: Pipeline CUDA Test
|
||||
env:
|
||||
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
@@ -139,7 +139,7 @@ jobs:
|
||||
- name: Run nightly PyTorch CUDA tests for non-pipeline modules
|
||||
if: ${{ matrix.module != 'examples'}}
|
||||
env:
|
||||
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
@@ -152,7 +152,7 @@ jobs:
|
||||
- name: Run nightly example tests with Torch
|
||||
if: ${{ matrix.module == 'examples' }}
|
||||
env:
|
||||
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
@@ -209,7 +209,7 @@ jobs:
|
||||
|
||||
- name: Run nightly Flax TPU tests
|
||||
env:
|
||||
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
run: |
|
||||
python -m pytest -n 0 \
|
||||
-s -v -k "Flax" \
|
||||
@@ -264,7 +264,7 @@ jobs:
|
||||
|
||||
- name: Run Nightly ONNXRuntime CUDA tests
|
||||
env:
|
||||
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
run: |
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "Onnx" \
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
name: Fast GPU Tests on main
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
|
||||
@@ -1,7 +1,4 @@
|
||||
# Duplicate workflow to push_tests.yml that is meant to run on release/patch branches as a final check
|
||||
# Creating a duplicate workflow here is simpler than adding complex path/branch parsing logic to push_tests.yml
|
||||
# Needs to be updated if push_tests.yml updated
|
||||
name: (Release) Fast GPU Tests on main
|
||||
name: Release Fast GPU Tests on main
|
||||
|
||||
on:
|
||||
push:
|
||||
@@ -34,7 +34,7 @@ from utils import ( # noqa: E402
|
||||
|
||||
|
||||
RESOLUTION_MAPPING = {
|
||||
"Lykon/DreamShaper": (512, 512),
|
||||
"runwayml/stable-diffusion-v1-5": (512, 512),
|
||||
"lllyasviel/sd-controlnet-canny": (512, 512),
|
||||
"diffusers/controlnet-canny-sdxl-1.0": (1024, 1024),
|
||||
"TencentARC/t2iadapter_canny_sd14v1": (512, 512),
|
||||
@@ -268,7 +268,7 @@ class IPAdapterTextToImageBenchmark(TextToImageBenchmark):
|
||||
class ControlNetBenchmark(TextToImageBenchmark):
|
||||
pipeline_class = StableDiffusionControlNetPipeline
|
||||
aux_network_class = ControlNetModel
|
||||
root_ckpt = "Lykon/DreamShaper"
|
||||
root_ckpt = "runwayml/stable-diffusion-v1-5"
|
||||
|
||||
url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_image_condition.png"
|
||||
image = load_image(url).convert("RGB")
|
||||
@@ -311,7 +311,7 @@ class ControlNetSDXLBenchmark(ControlNetBenchmark):
|
||||
class T2IAdapterBenchmark(ControlNetBenchmark):
|
||||
pipeline_class = StableDiffusionAdapterPipeline
|
||||
aux_network_class = T2IAdapter
|
||||
root_ckpt = "Lykon/DreamShaper"
|
||||
root_ckpt = "CompVis/stable-diffusion-v1-4"
|
||||
|
||||
url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_for_adapter.png"
|
||||
image = load_image(url).convert("L")
|
||||
|
||||
@@ -7,8 +7,7 @@ from base_classes import IPAdapterTextToImageBenchmark # noqa: E402
|
||||
|
||||
|
||||
IP_ADAPTER_CKPTS = {
|
||||
# because original SD v1.5 has been taken down.
|
||||
"Lykon/DreamShaper": ("h94/IP-Adapter", "ip-adapter_sd15.bin"),
|
||||
"runwayml/stable-diffusion-v1-5": ("h94/IP-Adapter", "ip-adapter_sd15.bin"),
|
||||
"stabilityai/stable-diffusion-xl-base-1.0": ("h94/IP-Adapter", "ip-adapter_sdxl.bin"),
|
||||
}
|
||||
|
||||
@@ -18,7 +17,7 @@ if __name__ == "__main__":
|
||||
parser.add_argument(
|
||||
"--ckpt",
|
||||
type=str,
|
||||
default="rstabilityai/stable-diffusion-xl-base-1.0",
|
||||
default="runwayml/stable-diffusion-v1-5",
|
||||
choices=list(IP_ADAPTER_CKPTS.keys()),
|
||||
)
|
||||
parser.add_argument("--batch_size", type=int, default=1)
|
||||
|
||||
@@ -11,9 +11,9 @@ if __name__ == "__main__":
|
||||
parser.add_argument(
|
||||
"--ckpt",
|
||||
type=str,
|
||||
default="Lykon/DreamShaper",
|
||||
default="runwayml/stable-diffusion-v1-5",
|
||||
choices=[
|
||||
"Lykon/DreamShaper",
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
"stabilityai/stable-diffusion-2-1",
|
||||
"stabilityai/stable-diffusion-xl-refiner-1.0",
|
||||
"stabilityai/sdxl-turbo",
|
||||
|
||||
@@ -11,9 +11,9 @@ if __name__ == "__main__":
|
||||
parser.add_argument(
|
||||
"--ckpt",
|
||||
type=str,
|
||||
default="Lykon/DreamShaper",
|
||||
default="runwayml/stable-diffusion-v1-5",
|
||||
choices=[
|
||||
"Lykon/DreamShaper",
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
"stabilityai/stable-diffusion-2-1",
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
],
|
||||
|
||||
@@ -7,7 +7,7 @@ from base_classes import TextToImageBenchmark, TurboTextToImageBenchmark # noqa
|
||||
|
||||
|
||||
ALL_T2I_CKPTS = [
|
||||
"Lykon/DreamShaper",
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
"segmind/SSD-1B",
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
"kandinsky-community/kandinsky-2-2-decoder",
|
||||
@@ -21,7 +21,7 @@ if __name__ == "__main__":
|
||||
parser.add_argument(
|
||||
"--ckpt",
|
||||
type=str,
|
||||
default="Lykon/DreamShaper",
|
||||
default="runwayml/stable-diffusion-v1-5",
|
||||
choices=ALL_T2I_CKPTS,
|
||||
)
|
||||
parser.add_argument("--batch_size", type=int, default=1)
|
||||
|
||||
@@ -161,8 +161,6 @@
|
||||
title: DeepCache
|
||||
- local: optimization/tgate
|
||||
title: TGATE
|
||||
- local: optimization/xdit
|
||||
title: xDiT
|
||||
- sections:
|
||||
- local: using-diffusers/stable_diffusion_jax_how_to
|
||||
title: JAX/Flax
|
||||
@@ -228,8 +226,6 @@
|
||||
- sections:
|
||||
- local: api/models/controlnet
|
||||
title: ControlNetModel
|
||||
- local: api/models/controlnet_flux
|
||||
title: FluxControlNetModel
|
||||
- local: api/models/controlnet_hunyuandit
|
||||
title: HunyuanDiT2DControlNetModel
|
||||
- local: api/models/controlnet_sd3
|
||||
@@ -324,8 +320,6 @@
|
||||
title: Consistency Models
|
||||
- local: api/pipelines/controlnet
|
||||
title: ControlNet
|
||||
- local: api/pipelines/controlnet_flux
|
||||
title: ControlNet with Flux.1
|
||||
- local: api/pipelines/controlnet_hunyuandit
|
||||
title: ControlNet with Hunyuan-DiT
|
||||
- local: api/pipelines/controlnet_sd3
|
||||
|
||||
@@ -1,45 +0,0 @@
|
||||
<!--Copyright 2024 The HuggingFace Team and The InstantX Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# FluxControlNetModel
|
||||
|
||||
FluxControlNetModel is an implementation of ControlNet for Flux.1.
|
||||
|
||||
The ControlNet model was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) by Lvmin Zhang, Anyi Rao, Maneesh Agrawala. It provides a greater degree of control over text-to-image generation by conditioning the model on additional inputs such as edge maps, depth maps, segmentation maps, and keypoints for pose detection.
|
||||
|
||||
The abstract from the paper is:
|
||||
|
||||
*We present ControlNet, a neural network architecture to add spatial conditioning controls to large, pretrained text-to-image diffusion models. ControlNet locks the production-ready large diffusion models, and reuses their deep and robust encoding layers pretrained with billions of images as a strong backbone to learn a diverse set of conditional controls. The neural architecture is connected with "zero convolutions" (zero-initialized convolution layers) that progressively grow the parameters from zero and ensure that no harmful noise could affect the finetuning. We test various conditioning controls, eg, edges, depth, segmentation, human pose, etc, with Stable Diffusion, using single or multiple conditions, with or without prompts. We show that the training of ControlNets is robust with small (<50k) and large (>1m) datasets. Extensive results show that ControlNet may facilitate wider applications to control image diffusion models.*
|
||||
|
||||
## Loading from the original format
|
||||
|
||||
By default the [`FluxControlNetModel`] should be loaded with [`~ModelMixin.from_pretrained`].
|
||||
|
||||
```py
|
||||
from diffusers import FluxControlNetPipeline
|
||||
from diffusers.models import FluxControlNetModel, FluxMultiControlNetModel
|
||||
|
||||
controlnet = FluxControlNetModel.from_pretrained("InstantX/FLUX.1-dev-Controlnet-Canny")
|
||||
pipe = FluxControlNetPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", controlnet=controlnet)
|
||||
|
||||
controlnet = FluxControlNetModel.from_pretrained("InstantX/FLUX.1-dev-Controlnet-Canny")
|
||||
controlnet = FluxMultiControlNetModel([controlnet])
|
||||
pipe = FluxControlNetPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", controlnet=controlnet)
|
||||
```
|
||||
|
||||
## FluxControlNetModel
|
||||
|
||||
[[autodoc]] FluxControlNetModel
|
||||
|
||||
## FluxControlNetOutput
|
||||
|
||||
[[autodoc]] models.controlnet_flux.FluxControlNetOutput
|
||||
@@ -29,7 +29,6 @@ The abstract of the paper is the following:
|
||||
| [AnimateDiffSparseControlNetPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py) | *Controlled Video-to-Video Generation with AnimateDiff using SparseCtrl* |
|
||||
| [AnimateDiffSDXLPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py) | *Video-to-Video Generation with AnimateDiff* |
|
||||
| [AnimateDiffVideoToVideoPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py) | *Video-to-Video Generation with AnimateDiff* |
|
||||
| [AnimateDiffVideoToVideoControlNetPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py) | *Video-to-Video Generation with AnimateDiff using ControlNet* |
|
||||
|
||||
## Available checkpoints
|
||||
|
||||
@@ -519,97 +518,6 @@ Here are some sample outputs:
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
|
||||
|
||||
### AnimateDiffVideoToVideoControlNetPipeline
|
||||
|
||||
AnimateDiff can be used together with ControlNets to enhance video-to-video generation by allowing for precise control over the output. ControlNet was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) by Lvmin Zhang, Anyi Rao, and Maneesh Agrawala, and allows you to condition Stable Diffusion with an additional control image to ensure that the spatial information is preserved throughout the video.
|
||||
|
||||
This pipeline allows you to condition your generation both on the original video and on a sequence of control images.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from PIL import Image
|
||||
from tqdm.auto import tqdm
|
||||
|
||||
from controlnet_aux.processor import OpenposeDetector
|
||||
from diffusers import AnimateDiffVideoToVideoControlNetPipeline
|
||||
from diffusers.utils import export_to_gif, load_video
|
||||
from diffusers import AutoencoderKL, ControlNetModel, MotionAdapter, LCMScheduler
|
||||
|
||||
# Load the ControlNet
|
||||
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose", torch_dtype=torch.float16)
|
||||
# Load the motion adapter
|
||||
motion_adapter = MotionAdapter.from_pretrained("wangfuyun/AnimateLCM")
|
||||
# Load SD 1.5 based finetuned model
|
||||
vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16)
|
||||
pipe = AnimateDiffVideoToVideoControlNetPipeline.from_pretrained(
|
||||
"SG161222/Realistic_Vision_V5.1_noVAE",
|
||||
motion_adapter=motion_adapter,
|
||||
controlnet=controlnet,
|
||||
vae=vae,
|
||||
).to(device="cuda", dtype=torch.float16)
|
||||
|
||||
# Enable LCM to speed up inference
|
||||
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config, beta_schedule="linear")
|
||||
pipe.load_lora_weights("wangfuyun/AnimateLCM", weight_name="AnimateLCM_sd15_t2v_lora.safetensors", adapter_name="lcm-lora")
|
||||
pipe.set_adapters(["lcm-lora"], [0.8])
|
||||
|
||||
video = load_video("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/dance.gif")
|
||||
video = [frame.convert("RGB") for frame in video]
|
||||
|
||||
prompt = "astronaut in space, dancing"
|
||||
negative_prompt = "bad quality, worst quality, jpeg artifacts, ugly"
|
||||
|
||||
# Create controlnet preprocessor
|
||||
open_pose = OpenposeDetector.from_pretrained("lllyasviel/Annotators").to("cuda")
|
||||
|
||||
# Preprocess controlnet images
|
||||
conditioning_frames = []
|
||||
for frame in tqdm(video):
|
||||
conditioning_frames.append(open_pose(frame))
|
||||
|
||||
strength = 0.8
|
||||
with torch.inference_mode():
|
||||
video = pipe(
|
||||
video=video,
|
||||
prompt=prompt,
|
||||
negative_prompt=negative_prompt,
|
||||
num_inference_steps=10,
|
||||
guidance_scale=2.0,
|
||||
controlnet_conditioning_scale=0.75,
|
||||
conditioning_frames=conditioning_frames,
|
||||
strength=strength,
|
||||
generator=torch.Generator().manual_seed(42),
|
||||
).frames[0]
|
||||
|
||||
video = [frame.resize(conditioning_frames[0].size) for frame in video]
|
||||
export_to_gif(video, f"animatediff_vid2vid_controlnet.gif", fps=8)
|
||||
```
|
||||
|
||||
Here are some sample outputs:
|
||||
|
||||
<table align="center">
|
||||
<tr>
|
||||
<th align="center">Source Video</th>
|
||||
<th align="center">Output Video</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td align="center">
|
||||
anime girl, dancing
|
||||
<br />
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/dance.gif" alt="anime girl, dancing" />
|
||||
</td>
|
||||
<td align="center">
|
||||
astronaut in space, dancing
|
||||
<br/>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff_vid2vid_controlnet.gif" alt="astronaut in space, dancing" />
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
**The lights and composition were transferred from the Source Video.**
|
||||
|
||||
## Using Motion LoRAs
|
||||
|
||||
Motion LoRAs are a collection of LoRAs that work with the `guoyww/animatediff-motion-adapter-v1-5-2` checkpoint. These LoRAs are responsible for adding specific types of motion to the animations.
|
||||
@@ -958,12 +866,6 @@ pipe = AnimateDiffPipeline.from_pretrained("emilianJR/epiCRealism", motion_adapt
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## AnimateDiffVideoToVideoControlNetPipeline
|
||||
|
||||
[[autodoc]] AnimateDiffVideoToVideoControlNetPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## AnimateDiffPipelineOutput
|
||||
|
||||
[[autodoc]] pipelines.animatediff.AnimateDiffPipelineOutput
|
||||
|
||||
@@ -77,33 +77,16 @@ CogVideoX-2b requires about 19 GB of GPU memory to decode 49 frames (6 seconds o
|
||||
- `pipe.enable_model_cpu_offload()`:
|
||||
- Without enabling cpu offloading, memory usage is `33 GB`
|
||||
- With enabling cpu offloading, memory usage is `19 GB`
|
||||
- `pipe.enable_sequential_cpu_offload()`:
|
||||
- Similar to `enable_model_cpu_offload` but can significantly reduce memory usage at the cost of slow inference
|
||||
- When enabled, memory usage is under `4 GB`
|
||||
- `pipe.vae.enable_tiling()`:
|
||||
- With enabling cpu offloading and tiling, memory usage is `11 GB`
|
||||
- `pipe.vae.enable_slicing()`
|
||||
|
||||
### Quantized inference
|
||||
|
||||
[torchao](https://github.com/pytorch/ao) and [optimum-quanto](https://github.com/huggingface/optimum-quanto/) can be used to quantize the text encoder, transformer and VAE modules to lower the memory requirements. This makes it possible to run the model on a free-tier T4 Colab or lower VRAM GPUs!
|
||||
|
||||
It is also worth noting that torchao quantization is fully compatible with [torch.compile](/optimization/torch2.0#torchcompile), which allows for much faster inference speed. Additionally, models can be serialized and stored in a quantized datatype to save disk space with torchao. Find examples and benchmarks in the gists below.
|
||||
- [torchao](https://gist.github.com/a-r-r-o-w/4d9732d17412888c885480c6521a9897)
|
||||
- [quanto](https://gist.github.com/a-r-r-o-w/31be62828b00a9292821b85c1017effa)
|
||||
|
||||
## CogVideoXPipeline
|
||||
|
||||
[[autodoc]] CogVideoXPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## CogVideoXVideoToVideoPipeline
|
||||
|
||||
[[autodoc]] CogVideoXVideoToVideoPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## CogVideoXPipelineOutput
|
||||
|
||||
[[autodoc]] pipelines.cogvideo.pipeline_output.CogVideoXPipelineOutput
|
||||
[[autodoc]] pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipelineOutput
|
||||
|
||||
@@ -1,48 +0,0 @@
|
||||
<!--Copyright 2024 The HuggingFace Team and The InstantX Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# ControlNet with Flux.1
|
||||
|
||||
FluxControlNetPipeline is an implementation of ControlNet for Flux.1.
|
||||
|
||||
ControlNet was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) by Lvmin Zhang, Anyi Rao, and Maneesh Agrawala.
|
||||
|
||||
With a ControlNet model, you can provide an additional control image to condition and control Stable Diffusion generation. For example, if you provide a depth map, the ControlNet model generates an image that'll preserve the spatial information from the depth map. It is a more flexible and accurate way to control the image generation process.
|
||||
|
||||
The abstract from the paper is:
|
||||
|
||||
*We present ControlNet, a neural network architecture to add spatial conditioning controls to large, pretrained text-to-image diffusion models. ControlNet locks the production-ready large diffusion models, and reuses their deep and robust encoding layers pretrained with billions of images as a strong backbone to learn a diverse set of conditional controls. The neural architecture is connected with "zero convolutions" (zero-initialized convolution layers) that progressively grow the parameters from zero and ensure that no harmful noise could affect the finetuning. We test various conditioning controls, eg, edges, depth, segmentation, human pose, etc, with Stable Diffusion, using single or multiple conditions, with or without prompts. We show that the training of ControlNets is robust with small (<50k) and large (>1m) datasets. Extensive results show that ControlNet may facilitate wider applications to control image diffusion models.*
|
||||
|
||||
This controlnet code is implemented by [The InstantX Team](https://huggingface.co/InstantX). You can find pre-trained checkpoints for Flux-ControlNet in the table below:
|
||||
|
||||
|
||||
| ControlNet type | Developer | Link |
|
||||
| -------- | ---------- | ---- |
|
||||
| Canny | [The InstantX Team](https://huggingface.co/InstantX) | [Link](https://huggingface.co/InstantX/FLUX.1-dev-Controlnet-Canny) |
|
||||
| Depth | [The InstantX Team](https://huggingface.co/InstantX) | [Link](https://huggingface.co/Shakker-Labs/FLUX.1-dev-ControlNet-Depth) |
|
||||
| Union | [The InstantX Team](https://huggingface.co/InstantX) | [Link](https://huggingface.co/InstantX/FLUX.1-dev-Controlnet-Union) |
|
||||
|
||||
|
||||
<Tip>
|
||||
|
||||
Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
|
||||
|
||||
</Tip>
|
||||
|
||||
## FluxControlNetPipeline
|
||||
[[autodoc]] FluxControlNetPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
|
||||
## FluxPipelineOutput
|
||||
[[autodoc]] pipelines.flux.pipeline_output.FluxPipelineOutput
|
||||
@@ -163,15 +163,3 @@ image.save("flux-fp8-dev.png")
|
||||
[[autodoc]] FluxPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## FluxImg2ImgPipeline
|
||||
|
||||
[[autodoc]] FluxImg2ImgPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## FluxInpaintPipeline
|
||||
|
||||
[[autodoc]] FluxInpaintPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
@@ -105,11 +105,3 @@ image.save("kolors_ipa_sample.png")
|
||||
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## KolorsImg2ImgPipeline
|
||||
|
||||
[[autodoc]] KolorsImg2ImgPipeline
|
||||
|
||||
- all
|
||||
- __call__
|
||||
|
||||
|
||||
@@ -30,64 +30,63 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
|
||||
|
||||
| Pipeline | Tasks |
|
||||
|---|---|
|
||||
| [aMUSEd](amused) | text2image |
|
||||
| [AltDiffusion](alt_diffusion) | image2image |
|
||||
| [AnimateDiff](animatediff) | text2video |
|
||||
| [Attend-and-Excite](attend_and_excite) | text2image |
|
||||
| [Audio Diffusion](audio_diffusion) | image2audio |
|
||||
| [AudioLDM](audioldm) | text2audio |
|
||||
| [AudioLDM2](audioldm2) | text2audio |
|
||||
| [AuraFlow](auraflow) | text2image |
|
||||
| [BLIP Diffusion](blip_diffusion) | text2image |
|
||||
| [CogVideoX](cogvideox) | text2video |
|
||||
| [Consistency Models](consistency_models) | unconditional image generation |
|
||||
| [ControlNet](controlnet) | text2image, image2image, inpainting |
|
||||
| [ControlNet with Flux.1](controlnet_flux) | text2image |
|
||||
| [ControlNet with Hunyuan-DiT](controlnet_hunyuandit) | text2image |
|
||||
| [ControlNet with Stable Diffusion 3](controlnet_sd3) | text2image |
|
||||
| [ControlNet with Stable Diffusion XL](controlnet_sdxl) | text2image |
|
||||
| [ControlNet-XS](controlnetxs) | text2image |
|
||||
| [ControlNet-XS with Stable Diffusion XL](controlnetxs_sdxl) | text2image |
|
||||
| [Cycle Diffusion](cycle_diffusion) | image2image |
|
||||
| [Dance Diffusion](dance_diffusion) | unconditional audio generation |
|
||||
| [DDIM](ddim) | unconditional image generation |
|
||||
| [DDPM](ddpm) | unconditional image generation |
|
||||
| [DeepFloyd IF](deepfloyd_if) | text2image, image2image, inpainting, super-resolution |
|
||||
| [DiffEdit](diffedit) | inpainting |
|
||||
| [DiT](dit) | text2image |
|
||||
| [Flux](flux) | text2image |
|
||||
| [Hunyuan-DiT](hunyuandit) | text2image |
|
||||
| [I2VGen-XL](i2vgenxl) | text2video |
|
||||
| [GLIGEN](stable_diffusion/gligen) | text2image |
|
||||
| [InstructPix2Pix](pix2pix) | image editing |
|
||||
| [Kandinsky 2.1](kandinsky) | text2image, image2image, inpainting, interpolation |
|
||||
| [Kandinsky 2.2](kandinsky_v22) | text2image, image2image, inpainting |
|
||||
| [Kandinsky 3](kandinsky3) | text2image, image2image |
|
||||
| [Kolors](kolors) | text2image |
|
||||
| [Latent Consistency Models](latent_consistency_models) | text2image |
|
||||
| [Latent Diffusion](latent_diffusion) | text2image, super-resolution |
|
||||
| [Latte](latte) | text2image |
|
||||
| [LDM3D](stable_diffusion/ldm3d_diffusion) | text2image, text-to-3D, text-to-pano, upscaling |
|
||||
| [LEDITS++](ledits_pp) | image editing |
|
||||
| [Lumina-T2X](lumina) | text2image |
|
||||
| [Marigold](marigold) | depth |
|
||||
| [MultiDiffusion](panorama) | text2image |
|
||||
| [MusicLDM](musicldm) | text2audio |
|
||||
| [PAG](pag) | text2image |
|
||||
| [Paint by Example](paint_by_example) | inpainting |
|
||||
| [PIA](pia) | image2video |
|
||||
| [ParaDiGMS](paradigms) | text2image |
|
||||
| [Pix2Pix Zero](pix2pix_zero) | image editing |
|
||||
| [PixArt-α](pixart) | text2image |
|
||||
| [PixArt-Σ](pixart_sigma) | text2image |
|
||||
| [PNDM](pndm) | unconditional image generation |
|
||||
| [RePaint](repaint) | inpainting |
|
||||
| [Score SDE VE](score_sde_ve) | unconditional image generation |
|
||||
| [Self-Attention Guidance](self_attention_guidance) | text2image |
|
||||
| [Semantic Guidance](semantic_stable_diffusion) | text2image |
|
||||
| [Shap-E](shap_e) | text-to-3D, image-to-3D |
|
||||
| [Spectrogram Diffusion](spectrogram_diffusion) | |
|
||||
| [Stable Audio](stable_audio) | text2audio |
|
||||
| [Stable Cascade](stable_cascade) | text2image |
|
||||
| [Stable Diffusion](stable_diffusion/overview) | text2image, image2image, depth2image, inpainting, image variation, latent upscaler, super-resolution |
|
||||
| [Stable Diffusion Model Editing](model_editing) | model editing |
|
||||
| [Stable Diffusion XL](stable_diffusion/stable_diffusion_xl) | text2image, image2image, inpainting |
|
||||
| [Stable Diffusion XL Turbo](stable_diffusion/sdxl_turbo) | text2image, image2image, inpainting |
|
||||
| [Stable unCLIP](stable_unclip) | text2image, image variation |
|
||||
| [Stochastic Karras VE](stochastic_karras_ve) | unconditional image generation |
|
||||
| [T2I-Adapter](stable_diffusion/adapter) | text2image |
|
||||
| [Text2Video](text_to_video) | text2video, video2video |
|
||||
| [Text2Video-Zero](text_to_video_zero) | text2video |
|
||||
| [unCLIP](unclip) | text2image, image variation |
|
||||
| [Unconditional Latent Diffusion](latent_diffusion_uncond) | unconditional image generation |
|
||||
| [UniDiffuser](unidiffuser) | text2image, image2text, image variation, text variation, unconditional image generation, unconditional audio generation |
|
||||
| [Value-guided planning](value_guided_sampling) | value guided sampling |
|
||||
| [Versatile Diffusion](versatile_diffusion) | text2image, image variation |
|
||||
| [VQ Diffusion](vq_diffusion) | text2image |
|
||||
| [Wuerstchen](wuerstchen) | text2image |
|
||||
|
||||
## DiffusionPipeline
|
||||
|
||||
@@ -1,122 +0,0 @@
|
||||
# xDiT
|
||||
|
||||
[xDiT](https://github.com/xdit-project/xDiT) is an inference engine designed for the large scale parallel deployment of Diffusion Transformers (DiTs). xDiT provides a suite of efficient parallel approaches for Diffusion Models, as well as GPU kernel accelerations.
|
||||
|
||||
There are four parallel methods supported in xDiT, including [Unified Sequence Parallelism](https://arxiv.org/abs/2405.07719), [PipeFusion](https://arxiv.org/abs/2405.14430), CFG parallelism and data parallelism. The four parallel methods in xDiT can be configured in a hybrid manner, optimizing communication patterns to best suit the underlying network hardware.
|
||||
|
||||
Optimization orthogonal to parallelization focuses on accelerating single GPU performance. In addition to utilizing well-known Attention optimization libraries, we leverage compilation acceleration technologies such as torch.compile and onediff.
|
||||
|
||||
The overview of xDiT is shown as follows.
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://github.com/xdit-project/xDiT/raw/main/assets/methods/xdit_overview.png">
|
||||
</div>
|
||||
You can install xDiT using the following command:
|
||||
|
||||
|
||||
```bash
|
||||
pip install xfuser
|
||||
```
|
||||
|
||||
Here's an example of using xDiT to accelerate inference of a Diffusers model.
|
||||
|
||||
```diff
|
||||
import torch
|
||||
from diffusers import StableDiffusion3Pipeline
|
||||
|
||||
from xfuser import xFuserArgs, xDiTParallel
|
||||
from xfuser.config import FlexibleArgumentParser
|
||||
from xfuser.core.distributed import get_world_group
|
||||
|
||||
def main():
|
||||
+ parser = FlexibleArgumentParser(description="xFuser Arguments")
|
||||
+ args = xFuserArgs.add_cli_args(parser).parse_args()
|
||||
+ engine_args = xFuserArgs.from_cli_args(args)
|
||||
+ engine_config, input_config = engine_args.create_config()
|
||||
|
||||
local_rank = get_world_group().local_rank
|
||||
pipe = StableDiffusion3Pipeline.from_pretrained(
|
||||
pretrained_model_name_or_path=engine_config.model_config.model,
|
||||
torch_dtype=torch.float16,
|
||||
).to(f"cuda:{local_rank}")
|
||||
|
||||
# do anything you want with pipeline here
|
||||
|
||||
+ pipe = xDiTParallel(pipe, engine_config, input_config)
|
||||
|
||||
pipe(
|
||||
height=input_config.height,
|
||||
width=input_config.height,
|
||||
prompt=input_config.prompt,
|
||||
num_inference_steps=input_config.num_inference_steps,
|
||||
output_type=input_config.output_type,
|
||||
generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
|
||||
)
|
||||
|
||||
+ if input_config.output_type == "pil":
|
||||
+ pipe.save("results", "stable_diffusion_3")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
```
|
||||
|
||||
As you can see, we only need to use xFuserArgs from xDiT to get configuration parameters, and pass these parameters along with the pipeline object from the Diffusers library into xDiTParallel to complete the parallelization of a specific pipeline in Diffusers.
|
||||
|
||||
xDiT runtime parameters can be viewed in the command line using `-h`, and you can refer to this [usage](https://github.com/xdit-project/xDiT?tab=readme-ov-file#2-usage) example for more details.
|
||||
|
||||
xDiT needs to be launched using torchrun to support its multi-node, multi-GPU parallel capabilities. For example, the following command can be used for 8-GPU parallel inference:
|
||||
|
||||
```bash
|
||||
torchrun --nproc_per_node=8 ./inference.py --model models/FLUX.1-dev --data_parallel_degree 2 --ulysses_degree 2 --ring_degree 2 --prompt "A snowy mountain" "A small dog" --num_inference_steps 50
|
||||
```
|
||||
|
||||
## Supported models
|
||||
|
||||
A subset of Diffusers models are supported in xDiT, such as Flux.1, Stable Diffusion 3, etc. The latest supported models can be found [here](https://github.com/xdit-project/xDiT?tab=readme-ov-file#-supported-dits).
|
||||
|
||||
## Benchmark
|
||||
We tested different models on various machines, and here is some of the benchmark data.
|
||||
|
||||
|
||||
### Flux.1-schnell
|
||||
<div class="flex justify-center">
|
||||
<img src="https://github.com/xdit-project/xDiT/raw/main/assets/performance/flux/Flux-2k-L40.png">
|
||||
</div>
|
||||
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://github.com/xdit-project/xDiT/raw/main/assets/performance/flux/Flux-2K-A100.png">
|
||||
</div>
|
||||
|
||||
### Stable Diffusion 3
|
||||
<div class="flex justify-center">
|
||||
<img src="https://github.com/xdit-project/xDiT/raw/main/assets/performance/sd3/L40-SD3.png">
|
||||
</div>
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://github.com/xdit-project/xDiT/raw/main/assets/performance/sd3/A100-SD3.png">
|
||||
</div>
|
||||
|
||||
### HunyuanDiT
|
||||
<div class="flex justify-center">
|
||||
<img src="https://github.com/xdit-project/xDiT/raw/main/assets/performance/hunuyuandit/L40-HunyuanDiT.png">
|
||||
</div>
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://github.com/xdit-project/xDiT/raw/main/assets/performance/hunuyuandit/A100-HunyuanDiT.png">
|
||||
</div>
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://github.com/xdit-project/xDiT/raw/main/assets/performance/hunuyuandit/T4-HunyuanDiT.png">
|
||||
</div>
|
||||
|
||||
More detailed performance metric can be found on our [github page](https://github.com/xdit-project/xDiT?tab=readme-ov-file#perf).
|
||||
|
||||
## Reference
|
||||
|
||||
[xDiT-project](https://github.com/xdit-project/xDiT)
|
||||
|
||||
[USP: A Unified Sequence Parallelism Approach for Long Context Generative AI](https://arxiv.org/abs/2405.07719)
|
||||
|
||||
[PipeFusion: Displaced Patch Pipeline Parallelism for Inference of Diffusion Transformer Models](https://arxiv.org/abs/2405.14430)
|
||||
@@ -1,152 +0,0 @@
|
||||
# ControlNet training example for Stable Diffusion 3 (SD3)
|
||||
|
||||
The `train_controlnet_sd3.py` script shows how to implement the ControlNet training procedure and adapt it for [Stable Diffusion 3](https://arxiv.org/abs/2403.03206).
|
||||
|
||||
## Running locally with PyTorch
|
||||
|
||||
### Installing the dependencies
|
||||
|
||||
Before running the scripts, make sure to install the library's training dependencies:
|
||||
|
||||
**Important**
|
||||
|
||||
To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/huggingface/diffusers
|
||||
cd diffusers
|
||||
pip install -e .
|
||||
```
|
||||
|
||||
Then cd in the `examples/controlnet` folder and run
|
||||
```bash
|
||||
pip install -r requirements_sd3.txt
|
||||
```
|
||||
|
||||
And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
|
||||
|
||||
```bash
|
||||
accelerate config
|
||||
```
|
||||
|
||||
Or for a default accelerate configuration without answering questions about your environment
|
||||
|
||||
```bash
|
||||
accelerate config default
|
||||
```
|
||||
|
||||
Or if your environment doesn't support an interactive shell (e.g., a notebook)
|
||||
|
||||
```python
|
||||
from accelerate.utils import write_basic_config
|
||||
write_basic_config()
|
||||
```
|
||||
|
||||
When running `accelerate config`, if we specify torch compile mode to True there can be dramatic speedups.
|
||||
|
||||
## Circle filling dataset
|
||||
|
||||
The original dataset is hosted in the [ControlNet repo](https://huggingface.co/lllyasviel/ControlNet/blob/main/training/fill50k.zip). We re-uploaded it to be compatible with `datasets` [here](https://huggingface.co/datasets/fusing/fill50k). Note that `datasets` handles dataloading within the training script.
|
||||
Please download the dataset and unzip it in the directory `fill50k` in the `examples/controlnet` folder.
|
||||
|
||||
## Training
|
||||
|
||||
First download the SD3 model from [Hugging Face Hub](https://huggingface.co/stabilityai/stable-diffusion-3-medium). We will use it as a base model for the ControlNet training.
|
||||
> [!NOTE]
|
||||
> As the model is gated, before using it with diffusers you first need to go to the [Stable Diffusion 3 Medium Hugging Face page](https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers), fill in the form and accept the gate. Once you are in, you need to log in so that your system knows you’ve accepted the gate. Use the command below to log in:
|
||||
|
||||
```bash
|
||||
huggingface-cli login
|
||||
```
|
||||
|
||||
This will also allow us to push the trained model parameters to the Hugging Face Hub platform.
|
||||
|
||||
|
||||
Our training examples use two test conditioning images. They can be downloaded by running
|
||||
|
||||
```sh
|
||||
wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_1.png
|
||||
|
||||
wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_2.png
|
||||
```
|
||||
|
||||
Then run the following commands to train a ControlNet model.
|
||||
|
||||
```bash
|
||||
export MODEL_DIR="stabilityai/stable-diffusion-3-medium-diffusers"
|
||||
export OUTPUT_DIR="sd3-controlnet-out"
|
||||
|
||||
accelerate launch train_controlnet_sd3.py \
|
||||
--pretrained_model_name_or_path=$MODEL_DIR \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
--train_data_dir="fill50k" \
|
||||
--resolution=1024 \
|
||||
--learning_rate=1e-5 \
|
||||
--max_train_steps=15000 \
|
||||
--validation_image "./conditioning_image_1.png" "./conditioning_image_2.png" \
|
||||
--validation_prompt "red circle with blue background" "cyan circle with brown floral background" \
|
||||
--validation_steps=100 \
|
||||
--train_batch_size=1 \
|
||||
--gradient_accumulation_steps=4
|
||||
```
|
||||
|
||||
To better track our training experiments, we're using flags `validation_image`, `validation_prompt`, and `validation_steps` to allow the script to do a few validation inference runs. This allows us to qualitatively check if the training is progressing as expected.
|
||||
|
||||
Our experiments were conducted on a single 40GB A100 GPU.
|
||||
|
||||
### Inference
|
||||
|
||||
Once training is done, we can perform inference like so:
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusion3ControlNetPipeline, SD3ControlNetModel
|
||||
from diffusers.utils import load_image
|
||||
import torch
|
||||
|
||||
base_model_path = "stabilityai/stable-diffusion-3-medium-diffusers"
|
||||
controlnet_path = "sd3-controlnet-out/checkpoint-6500/controlnet"
|
||||
|
||||
controlnet = SD3ControlNetModel.from_pretrained(controlnet_path, torch_dtype=torch.float16)
|
||||
pipe = StableDiffusion3ControlNetPipeline.from_pretrained(
|
||||
base_model_path, controlnet=controlnet
|
||||
)
|
||||
pipe.to("cuda", torch.float16)
|
||||
|
||||
|
||||
control_image = load_image("./conditioning_image_1.png").resize((1024, 1024))
|
||||
prompt = "pale golden rod circle with old lace background"
|
||||
|
||||
# generate image
|
||||
generator = torch.manual_seed(0)
|
||||
image = pipe(
|
||||
prompt, num_inference_steps=20, generator=generator, control_image=control_image
|
||||
).images[0]
|
||||
image.save("./output.png")
|
||||
```
|
||||
|
||||
## Notes
|
||||
|
||||
### GPU usage
|
||||
|
||||
SD3 is a large model and requires a lot of GPU memory.
|
||||
We recommend using one GPU with at least 80GB of memory.
|
||||
Make sure to use the right GPU when configuring the [accelerator](https://huggingface.co/docs/transformers/en/accelerate).
|
||||
|
||||
|
||||
## Example results
|
||||
|
||||
#### After 500 steps with batch size 8
|
||||
|
||||
| | |
|
||||
|-------------------|:-------------------------:|
|
||||
|| pale golden rod circle with old lace background |
|
||||
 |  |
|
||||
|
||||
|
||||
#### After 6500 steps with batch size 8:
|
||||
|
||||
| | |
|
||||
|-------------------|:-------------------------:|
|
||||
|| pale golden rod circle with old lace background |
|
||||
 |  |
|
||||
|
||||
@@ -1,8 +0,0 @@
|
||||
accelerate>=0.16.0
|
||||
torchvision
|
||||
transformers>=4.25.1
|
||||
ftfy
|
||||
tensorboard
|
||||
Jinja2
|
||||
datasets
|
||||
wandb
|
||||
@@ -115,24 +115,3 @@ class ControlNetSDXL(ExamplesTestsAccelerate):
|
||||
run_command(self._launch_args + test_args)
|
||||
|
||||
self.assertTrue(os.path.isfile(os.path.join(tmpdir, "diffusion_pytorch_model.safetensors")))
|
||||
|
||||
|
||||
class ControlNetSD3(ExamplesTestsAccelerate):
|
||||
def test_controlnet_sd3(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
test_args = f"""
|
||||
examples/controlnet/train_controlnet_sd3.py
|
||||
--pretrained_model_name_or_path=DavyMorgan/tiny-sd3-pipe
|
||||
--dataset_name=hf-internal-testing/fill10
|
||||
--output_dir={tmpdir}
|
||||
--resolution=64
|
||||
--train_batch_size=1
|
||||
--gradient_accumulation_steps=1
|
||||
--controlnet_model_name_or_path=DavyMorgan/tiny-controlnet-sd3
|
||||
--max_train_steps=4
|
||||
--checkpointing_steps=2
|
||||
""".split()
|
||||
|
||||
run_command(self._launch_args + test_args)
|
||||
|
||||
self.assertTrue(os.path.isfile(os.path.join(tmpdir, "diffusion_pytorch_model.safetensors")))
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -314,12 +314,11 @@ def save_new_embed(text_encoder, modifier_token_id, accelerator, args, output_di
|
||||
for x, y in zip(modifier_token_id, args.modifier_token):
|
||||
learned_embeds_dict = {}
|
||||
learned_embeds_dict[y] = learned_embeds[x]
|
||||
filename = f"{output_dir}/{y}.bin"
|
||||
|
||||
if safe_serialization:
|
||||
filename = f"{output_dir}/{y}.safetensors"
|
||||
safetensors.torch.save_file(learned_embeds_dict, filename, metadata={"format": "pt"})
|
||||
else:
|
||||
filename = f"{output_dir}/{y}.bin"
|
||||
torch.save(learned_embeds_dict, filename)
|
||||
|
||||
|
||||
@@ -1041,22 +1040,17 @@ def main(args):
|
||||
)
|
||||
|
||||
# Scheduler and math around the number of training steps.
|
||||
# Check the PR https://github.com/huggingface/diffusers/pull/8312 for detailed explanation.
|
||||
num_warmup_steps_for_scheduler = args.lr_warmup_steps * accelerator.num_processes
|
||||
overrode_max_train_steps = False
|
||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||
if args.max_train_steps is None:
|
||||
len_train_dataloader_after_sharding = math.ceil(len(train_dataloader) / accelerator.num_processes)
|
||||
num_update_steps_per_epoch = math.ceil(len_train_dataloader_after_sharding / args.gradient_accumulation_steps)
|
||||
num_training_steps_for_scheduler = (
|
||||
args.num_train_epochs * num_update_steps_per_epoch * accelerator.num_processes
|
||||
)
|
||||
else:
|
||||
num_training_steps_for_scheduler = args.max_train_steps * accelerator.num_processes
|
||||
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
||||
overrode_max_train_steps = True
|
||||
|
||||
lr_scheduler = get_scheduler(
|
||||
args.lr_scheduler,
|
||||
optimizer=optimizer,
|
||||
num_warmup_steps=num_warmup_steps_for_scheduler,
|
||||
num_training_steps=num_training_steps_for_scheduler,
|
||||
num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
|
||||
num_training_steps=args.max_train_steps * accelerator.num_processes,
|
||||
)
|
||||
|
||||
# Prepare everything with our `accelerator`.
|
||||
@@ -1071,14 +1065,8 @@ def main(args):
|
||||
|
||||
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
|
||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||
if args.max_train_steps is None:
|
||||
if overrode_max_train_steps:
|
||||
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
||||
if num_training_steps_for_scheduler != args.max_train_steps * accelerator.num_processes:
|
||||
logger.warning(
|
||||
f"The length of the 'train_dataloader' after 'accelerator.prepare' ({len(train_dataloader)}) does not match "
|
||||
f"the expected length ({len_train_dataloader_after_sharding}) when the learning rate scheduler was created. "
|
||||
f"This inconsistency may result in the learning rate scheduler not functioning properly."
|
||||
)
|
||||
# Afterwards we recalculate our number of training epochs
|
||||
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
|
||||
|
||||
|
||||
@@ -8,10 +8,8 @@ The `train_dreambooth_flux.py` script shows how to implement the training proced
|
||||
>
|
||||
> Flux can be quite expensive to run on consumer hardware devices and as a result finetuning it comes with high memory requirements -
|
||||
> a LoRA with a rank of 16 (w/ all components trained) can exceed 40GB of VRAM for training.
|
||||
> For more tips & guidance on training on a resource-constrained device please visit [`@bghira`'s guide](https://github.com/bghira/SimpleTuner/blob/main/documentation/quickstart/FLUX.md)
|
||||
|
||||
> For more tips & guidance on training on a resource-constrained device and general good practices please check out these great guides and trainers for FLUX:
|
||||
> 1) [`@bghira`'s guide](https://github.com/bghira/SimpleTuner/blob/main/documentation/quickstart/FLUX.md)
|
||||
> 2) [`ostris`'s guide](https://github.com/ostris/ai-toolkit?tab=readme-ov-file#flux1-training)
|
||||
|
||||
> [!NOTE]
|
||||
> **Gated model**
|
||||
@@ -102,10 +100,8 @@ accelerate launch train_dreambooth_flux.py \
|
||||
--instance_prompt="a photo of sks dog" \
|
||||
--resolution=1024 \
|
||||
--train_batch_size=1 \
|
||||
--guidance_scale=1 \
|
||||
--gradient_accumulation_steps=4 \
|
||||
--optimizer="prodigy" \
|
||||
--learning_rate=1. \
|
||||
--learning_rate=1e-4 \
|
||||
--report_to="wandb" \
|
||||
--lr_scheduler="constant" \
|
||||
--lr_warmup_steps=0 \
|
||||
@@ -124,23 +120,15 @@ To better track our training experiments, we're using the following flags in the
|
||||
> [!NOTE]
|
||||
> If you want to train using long prompts with the T5 text encoder, you can use `--max_sequence_length` to set the token limit. The default is 77, but it can be increased to as high as 512. Note that this will use more resources and may slow down the training in some cases.
|
||||
|
||||
> [!TIP]
|
||||
> You can pass `--use_8bit_adam` to reduce the memory requirements of training. Make sure to install `bitsandbytes` if you want to do so.
|
||||
|
||||
## LoRA + DreamBooth
|
||||
|
||||
[LoRA](https://huggingface.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora) is a popular parameter-efficient fine-tuning technique that allows you to achieve full-finetuning like performance but with a fraction of learnable parameters.
|
||||
|
||||
Note also that we use PEFT library as backend for LoRA training, make sure to have `peft>=0.6.0` installed in your environment.
|
||||
|
||||
### Prodigy Optimizer
|
||||
Prodigy is an adaptive optimizer that dynamically adjusts the learning rate learned parameters based on past gradients, allowing for more efficient convergence.
|
||||
By using prodigy we can "eliminate" the need for manual learning rate tuning. read more [here](https://huggingface.co/blog/sdxl_lora_advanced_script#adaptive-optimizers).
|
||||
|
||||
to use prodigy, specify
|
||||
```bash
|
||||
--optimizer="prodigy"
|
||||
```
|
||||
> [!TIP]
|
||||
> When using prodigy it's generally good practice to set- `--learning_rate=1.0`
|
||||
|
||||
To perform DreamBooth with LoRA, run:
|
||||
|
||||
```bash
|
||||
@@ -156,10 +144,8 @@ accelerate launch train_dreambooth_lora_flux.py \
|
||||
--instance_prompt="a photo of sks dog" \
|
||||
--resolution=512 \
|
||||
--train_batch_size=1 \
|
||||
--guidance_scale=1 \
|
||||
--gradient_accumulation_steps=4 \
|
||||
--optimizer="prodigy" \
|
||||
--learning_rate=1. \
|
||||
--learning_rate=1e-5 \
|
||||
--report_to="wandb" \
|
||||
--lr_scheduler="constant" \
|
||||
--lr_warmup_steps=0 \
|
||||
@@ -176,7 +162,6 @@ Alongside the transformer, fine-tuning of the CLIP text encoder is also supporte
|
||||
To do so, just specify `--train_text_encoder` while launching training. Please keep the following points in mind:
|
||||
|
||||
> [!NOTE]
|
||||
> This is still an experimental feature.
|
||||
> FLUX.1 has 2 text encoders (CLIP L/14 and T5-v1.1-XXL).
|
||||
By enabling `--train_text_encoder`, fine-tuning of the **CLIP encoder** is performed.
|
||||
> At the moment, T5 fine-tuning is not supported and weights remain frozen when text encoder training is enabled.
|
||||
@@ -195,10 +180,8 @@ accelerate launch train_dreambooth_lora_flux.py \
|
||||
--instance_prompt="a photo of sks dog" \
|
||||
--resolution=512 \
|
||||
--train_batch_size=1 \
|
||||
--guidance_scale=1 \
|
||||
--gradient_accumulation_steps=4 \
|
||||
--optimizer="prodigy" \
|
||||
--learning_rate=1. \
|
||||
--learning_rate=1e-5 \
|
||||
--report_to="wandb" \
|
||||
--lr_scheduler="constant" \
|
||||
--lr_warmup_steps=0 \
|
||||
@@ -208,21 +191,5 @@ accelerate launch train_dreambooth_lora_flux.py \
|
||||
--push_to_hub
|
||||
```
|
||||
|
||||
## Memory Optimizations
|
||||
As mentioned, Flux Dreambooth LoRA training is very memory intensive Here are some options (some still experimental) for a more memory efficient training.
|
||||
### Image Resolution
|
||||
An easy way to mitigate some of the memory requirements is through `--resolution`. `--resolution` refers to the resolution for input images, all the images in the train/validation dataset are resized to this.
|
||||
Note that by default, images are resized to resolution of 512, but it's good to keep in mind in case you're accustomed to training on higher resolutions.
|
||||
### Gradient Checkpointing and Accumulation
|
||||
* `--gradient accumulation` refers to the number of updates steps to accumulate before performing a backward/update pass.
|
||||
by passing a value > 1 you can reduce the amount of backward/update passes and hence also memory reqs.
|
||||
* with `--gradient checkpointing` we can save memory by not storing all intermediate activations during the forward pass.
|
||||
Instead, only a subset of these activations (the checkpoints) are stored and the rest is recomputed as needed during the backward pass. Note that this comes at the expanse of a slower backward pass.
|
||||
### 8-bit-Adam Optimizer
|
||||
When training with `AdamW`(doesn't apply to `prodigy`) You can pass `--use_8bit_adam` to reduce the memory requirements of training.
|
||||
Make sure to install `bitsandbytes` if you want to do so.
|
||||
### latent caching
|
||||
When training w/o validation runs, we can pre-encode the training images with the vae, and then delete it to free up some memory.
|
||||
to enable `latent_caching`, first, use the version in [this PR](https://github.com/huggingface/diffusers/blob/1b195933d04e4c8281a2634128c0d2d380893f73/examples/dreambooth/train_dreambooth_lora_flux.py), and then pass `--cache_latents`
|
||||
## Other notes
|
||||
Thanks to `bghira` and `ostris` for their help with reviewing & insight sharing ♥️
|
||||
Thanks to `bghira` for their help with reviewing & insight sharing ♥️
|
||||
@@ -842,7 +842,7 @@ class PromptDataset(Dataset):
|
||||
return example
|
||||
|
||||
|
||||
def tokenize_prompt(tokenizer, prompt, max_sequence_length):
|
||||
def tokenize_prompt(tokenizer, prompt, max_sequence_length=512):
|
||||
text_inputs = tokenizer(
|
||||
prompt,
|
||||
padding="max_length",
|
||||
@@ -863,26 +863,20 @@ def _encode_prompt_with_t5(
|
||||
prompt=None,
|
||||
num_images_per_prompt=1,
|
||||
device=None,
|
||||
text_input_ids=None,
|
||||
):
|
||||
prompt = [prompt] if isinstance(prompt, str) else prompt
|
||||
batch_size = len(prompt)
|
||||
|
||||
if tokenizer is not None:
|
||||
text_inputs = tokenizer(
|
||||
prompt,
|
||||
padding="max_length",
|
||||
max_length=max_sequence_length,
|
||||
truncation=True,
|
||||
return_length=False,
|
||||
return_overflowing_tokens=False,
|
||||
return_tensors="pt",
|
||||
)
|
||||
text_input_ids = text_inputs.input_ids
|
||||
else:
|
||||
if text_input_ids is None:
|
||||
raise ValueError("text_input_ids must be provided when the tokenizer is not specified")
|
||||
|
||||
text_inputs = tokenizer(
|
||||
prompt,
|
||||
padding="max_length",
|
||||
max_length=max_sequence_length,
|
||||
truncation=True,
|
||||
return_length=False,
|
||||
return_overflowing_tokens=False,
|
||||
return_tensors="pt",
|
||||
)
|
||||
text_input_ids = text_inputs.input_ids
|
||||
prompt_embeds = text_encoder(text_input_ids.to(device))[0]
|
||||
|
||||
dtype = text_encoder.dtype
|
||||
@@ -902,28 +896,22 @@ def _encode_prompt_with_clip(
|
||||
tokenizer,
|
||||
prompt: str,
|
||||
device=None,
|
||||
text_input_ids=None,
|
||||
num_images_per_prompt: int = 1,
|
||||
):
|
||||
prompt = [prompt] if isinstance(prompt, str) else prompt
|
||||
batch_size = len(prompt)
|
||||
|
||||
if tokenizer is not None:
|
||||
text_inputs = tokenizer(
|
||||
prompt,
|
||||
padding="max_length",
|
||||
max_length=77,
|
||||
truncation=True,
|
||||
return_overflowing_tokens=False,
|
||||
return_length=False,
|
||||
return_tensors="pt",
|
||||
)
|
||||
|
||||
text_input_ids = text_inputs.input_ids
|
||||
else:
|
||||
if text_input_ids is None:
|
||||
raise ValueError("text_input_ids must be provided when the tokenizer is not specified")
|
||||
text_inputs = tokenizer(
|
||||
prompt,
|
||||
padding="max_length",
|
||||
max_length=77,
|
||||
truncation=True,
|
||||
return_overflowing_tokens=False,
|
||||
return_length=False,
|
||||
return_tensors="pt",
|
||||
)
|
||||
|
||||
text_input_ids = text_inputs.input_ids
|
||||
prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=False)
|
||||
|
||||
# Use pooled output of CLIPTextModel
|
||||
@@ -944,19 +932,17 @@ def encode_prompt(
|
||||
max_sequence_length,
|
||||
device=None,
|
||||
num_images_per_prompt: int = 1,
|
||||
text_input_ids_list=None,
|
||||
):
|
||||
prompt = [prompt] if isinstance(prompt, str) else prompt
|
||||
batch_size = len(prompt)
|
||||
dtype = text_encoders[0].dtype
|
||||
device = device if device is not None else text_encoders[1].device
|
||||
|
||||
pooled_prompt_embeds = _encode_prompt_with_clip(
|
||||
text_encoder=text_encoders[0],
|
||||
tokenizer=tokenizers[0],
|
||||
prompt=prompt,
|
||||
device=device,
|
||||
device=device if device is not None else text_encoders[0].device,
|
||||
num_images_per_prompt=num_images_per_prompt,
|
||||
text_input_ids=text_input_ids_list[0] if text_input_ids_list else None,
|
||||
)
|
||||
|
||||
prompt_embeds = _encode_prompt_with_t5(
|
||||
@@ -965,8 +951,7 @@ def encode_prompt(
|
||||
max_sequence_length=max_sequence_length,
|
||||
prompt=prompt,
|
||||
num_images_per_prompt=num_images_per_prompt,
|
||||
device=device,
|
||||
text_input_ids=text_input_ids_list[1] if text_input_ids_list else None,
|
||||
device=device if device is not None else text_encoders[1].device,
|
||||
)
|
||||
|
||||
text_ids = torch.zeros(batch_size, prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
|
||||
@@ -1514,25 +1499,7 @@ def main(args):
|
||||
)
|
||||
else:
|
||||
tokens_one = tokenize_prompt(tokenizer_one, prompts, max_sequence_length=77)
|
||||
tokens_two = tokenize_prompt(
|
||||
tokenizer_two, prompts, max_sequence_length=args.max_sequence_length
|
||||
)
|
||||
prompt_embeds, pooled_prompt_embeds, text_ids = encode_prompt(
|
||||
text_encoders=[text_encoder_one, text_encoder_two],
|
||||
tokenizers=[None, None],
|
||||
text_input_ids_list=[tokens_one, tokens_two],
|
||||
max_sequence_length=args.max_sequence_length,
|
||||
prompt=prompts,
|
||||
)
|
||||
else:
|
||||
if args.train_text_encoder:
|
||||
prompt_embeds, pooled_prompt_embeds, text_ids = encode_prompt(
|
||||
text_encoders=[text_encoder_one, text_encoder_two],
|
||||
tokenizers=[None, None],
|
||||
text_input_ids_list=[tokens_one, tokens_two],
|
||||
max_sequence_length=args.max_sequence_length,
|
||||
prompt=args.instance_prompt,
|
||||
)
|
||||
tokens_two = tokenize_prompt(tokenizer_two, prompts, max_sequence_length=512)
|
||||
|
||||
# Convert images to latent space
|
||||
model_input = vae.encode(pixel_values).latent_dist.sample()
|
||||
@@ -1586,22 +1553,41 @@ def main(args):
|
||||
guidance = None
|
||||
|
||||
# Predict the noise residual
|
||||
model_pred = transformer(
|
||||
hidden_states=packed_noisy_model_input,
|
||||
# YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transforme rmodel (we should not keep it but I want to keep the inputs same for the model for testing)
|
||||
timestep=timesteps / 1000,
|
||||
guidance=guidance,
|
||||
pooled_projections=pooled_prompt_embeds,
|
||||
encoder_hidden_states=prompt_embeds,
|
||||
txt_ids=text_ids,
|
||||
img_ids=latent_image_ids,
|
||||
return_dict=False,
|
||||
)[0]
|
||||
# upscaling height & width as discussed in https://github.com/huggingface/diffusers/pull/9257#discussion_r1731108042
|
||||
if not args.train_text_encoder:
|
||||
model_pred = transformer(
|
||||
hidden_states=packed_noisy_model_input,
|
||||
# YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transforme rmodel (we should not keep it but I want to keep the inputs same for the model for testing)
|
||||
timestep=timesteps / 1000,
|
||||
guidance=guidance,
|
||||
pooled_projections=pooled_prompt_embeds,
|
||||
encoder_hidden_states=prompt_embeds,
|
||||
txt_ids=text_ids,
|
||||
img_ids=latent_image_ids,
|
||||
return_dict=False,
|
||||
)[0]
|
||||
else:
|
||||
prompt_embeds, pooled_prompt_embeds, text_ids = encode_prompt(
|
||||
text_encoders=[text_encoder_one, text_encoder_two],
|
||||
tokenizers=None,
|
||||
prompt=None,
|
||||
text_input_ids_list=[tokens_one, tokens_two],
|
||||
)
|
||||
model_pred = transformer(
|
||||
hidden_states=packed_noisy_model_input,
|
||||
# YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transforme rmodel (we should not keep it but I want to keep the inputs same for the model for testing)
|
||||
timestep=timesteps / 1000,
|
||||
guidance=guidance,
|
||||
pooled_projections=pooled_prompt_embeds,
|
||||
encoder_hidden_states=prompt_embeds,
|
||||
txt_ids=text_ids,
|
||||
img_ids=latent_image_ids,
|
||||
return_dict=False,
|
||||
)[0]
|
||||
|
||||
model_pred = FluxPipeline._unpack_latents(
|
||||
model_pred,
|
||||
height=int(model_input.shape[2] * vae_scale_factor / 2),
|
||||
width=int(model_input.shape[3] * vae_scale_factor / 2),
|
||||
height=int(model_input.shape[2]),
|
||||
width=int(model_input.shape[3]),
|
||||
vae_scale_factor=vae_scale_factor,
|
||||
)
|
||||
|
||||
|
||||
@@ -1597,7 +1597,6 @@ def main(args):
|
||||
tokenizers=[None, None],
|
||||
text_input_ids_list=[tokens_one, tokens_two],
|
||||
max_sequence_length=args.max_sequence_length,
|
||||
device=accelerator.device,
|
||||
prompt=prompts,
|
||||
)
|
||||
else:
|
||||
@@ -1607,7 +1606,6 @@ def main(args):
|
||||
tokenizers=[None, None],
|
||||
text_input_ids_list=[tokens_one, tokens_two],
|
||||
max_sequence_length=args.max_sequence_length,
|
||||
device=accelerator.device,
|
||||
prompt=args.instance_prompt,
|
||||
)
|
||||
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
|
||||
import argparse
|
||||
import copy
|
||||
import gc
|
||||
import itertools
|
||||
import logging
|
||||
import math
|
||||
@@ -55,7 +56,6 @@ from diffusers.optimization import get_scheduler
|
||||
from diffusers.training_utils import (
|
||||
_set_state_dict_into_text_encoder,
|
||||
cast_training_params,
|
||||
clear_objs_and_retain_memory,
|
||||
compute_density_for_timestep_sampling,
|
||||
compute_loss_weighting_for_sd3,
|
||||
)
|
||||
@@ -210,7 +210,9 @@ def log_validation(
|
||||
}
|
||||
)
|
||||
|
||||
clear_objs_and_retain_memory(objs=[pipeline])
|
||||
del pipeline
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
return images
|
||||
|
||||
@@ -1105,7 +1107,9 @@ def main(args):
|
||||
image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
|
||||
image.save(image_filename)
|
||||
|
||||
clear_objs_and_retain_memory(objs=[pipeline])
|
||||
del pipeline
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
# Handle the repository creation
|
||||
if accelerator.is_main_process:
|
||||
@@ -1451,10 +1455,12 @@ def main(args):
|
||||
|
||||
# Clear the memory here
|
||||
if not args.train_text_encoder and not train_dataset.custom_instance_prompts:
|
||||
del tokenizers, text_encoders
|
||||
# Explicitly delete the objects as well, otherwise only the lists are deleted and the original references remain, preventing garbage collection
|
||||
clear_objs_and_retain_memory(
|
||||
objs=[tokenizers, text_encoders, text_encoder_one, text_encoder_two, text_encoder_three]
|
||||
)
|
||||
del text_encoder_one, text_encoder_two, text_encoder_three
|
||||
gc.collect()
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
# If custom instance prompts are NOT provided (i.e. the instance prompt is used for all images),
|
||||
# pack the statically computed variables appropriately here. This is so that we don't
|
||||
@@ -1789,11 +1795,11 @@ def main(args):
|
||||
pipeline_args=pipeline_args,
|
||||
epoch=epoch,
|
||||
)
|
||||
objs = []
|
||||
if not args.train_text_encoder:
|
||||
objs.extend([text_encoder_one, text_encoder_two, text_encoder_three])
|
||||
del text_encoder_one, text_encoder_two, text_encoder_three
|
||||
|
||||
clear_objs_and_retain_memory(objs=objs)
|
||||
torch.cuda.empty_cache()
|
||||
gc.collect()
|
||||
|
||||
# Save the lora layers
|
||||
accelerator.wait_for_everyone()
|
||||
|
||||
@@ -89,7 +89,6 @@ else:
|
||||
"ControlNetXSAdapter",
|
||||
"DiTTransformer2DModel",
|
||||
"FluxControlNetModel",
|
||||
"FluxMultiControlNetModel",
|
||||
"FluxTransformer2DModel",
|
||||
"HunyuanDiT2DControlNetModel",
|
||||
"HunyuanDiT2DModel",
|
||||
@@ -245,7 +244,6 @@ else:
|
||||
"AnimateDiffPipeline",
|
||||
"AnimateDiffSDXLPipeline",
|
||||
"AnimateDiffSparseControlNetPipeline",
|
||||
"AnimateDiffVideoToVideoControlNetPipeline",
|
||||
"AnimateDiffVideoToVideoPipeline",
|
||||
"AudioLDM2Pipeline",
|
||||
"AudioLDM2ProjectionModel",
|
||||
@@ -256,11 +254,8 @@ else:
|
||||
"BlipDiffusionPipeline",
|
||||
"CLIPImageProjection",
|
||||
"CogVideoXPipeline",
|
||||
"CogVideoXVideoToVideoPipeline",
|
||||
"CycleDiffusionPipeline",
|
||||
"FluxControlNetPipeline",
|
||||
"FluxImg2ImgPipeline",
|
||||
"FluxInpaintPipeline",
|
||||
"FluxPipeline",
|
||||
"HunyuanDiTControlNetPipeline",
|
||||
"HunyuanDiTPAGPipeline",
|
||||
@@ -559,7 +554,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
ControlNetXSAdapter,
|
||||
DiTTransformer2DModel,
|
||||
FluxControlNetModel,
|
||||
FluxMultiControlNetModel,
|
||||
FluxTransformer2DModel,
|
||||
HunyuanDiT2DControlNetModel,
|
||||
HunyuanDiT2DModel,
|
||||
@@ -695,7 +689,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
AnimateDiffPipeline,
|
||||
AnimateDiffSDXLPipeline,
|
||||
AnimateDiffSparseControlNetPipeline,
|
||||
AnimateDiffVideoToVideoControlNetPipeline,
|
||||
AnimateDiffVideoToVideoPipeline,
|
||||
AudioLDM2Pipeline,
|
||||
AudioLDM2ProjectionModel,
|
||||
@@ -704,11 +697,8 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
AuraFlowPipeline,
|
||||
CLIPImageProjection,
|
||||
CogVideoXPipeline,
|
||||
CogVideoXVideoToVideoPipeline,
|
||||
CycleDiffusionPipeline,
|
||||
FluxControlNetPipeline,
|
||||
FluxImg2ImgPipeline,
|
||||
FluxInpaintPipeline,
|
||||
FluxPipeline,
|
||||
HunyuanDiTControlNetPipeline,
|
||||
HunyuanDiTPAGPipeline,
|
||||
|
||||
@@ -569,7 +569,7 @@ class VaeImageProcessor(ConfigMixin):
|
||||
|
||||
channel = image.shape[1]
|
||||
# don't need any preprocess if the image is latents
|
||||
if channel == self.config.vae_latent_channels:
|
||||
if channel == self.vae_latent_channels:
|
||||
return image
|
||||
|
||||
height, width = self.get_default_height_width(image, height, width)
|
||||
|
||||
@@ -208,8 +208,6 @@ class IPAdapterMixin:
|
||||
pretrained_model_name_or_path_or_dict,
|
||||
subfolder=image_encoder_subfolder,
|
||||
low_cpu_mem_usage=low_cpu_mem_usage,
|
||||
cache_dir=cache_dir,
|
||||
local_files_only=local_files_only,
|
||||
).to(self.device, dtype=self.dtype)
|
||||
self.register_modules(image_encoder=image_encoder)
|
||||
else:
|
||||
|
||||
@@ -14,8 +14,6 @@
|
||||
|
||||
import re
|
||||
|
||||
import torch
|
||||
|
||||
from ..utils import is_peft_version, logging
|
||||
|
||||
|
||||
@@ -328,296 +326,3 @@ def _get_alpha_name(lora_name_alpha, diffusers_name, alpha):
|
||||
prefix = "text_encoder_2."
|
||||
new_name = prefix + diffusers_name.split(".lora.")[0] + ".alpha"
|
||||
return {new_name: alpha}
|
||||
|
||||
|
||||
# The utilities under `_convert_kohya_flux_lora_to_diffusers()`
|
||||
# are taken from https://github.com/kohya-ss/sd-scripts/blob/a61cf73a5cb5209c3f4d1a3688dd276a4dfd1ecb/networks/convert_flux_lora.py
|
||||
# All credits go to `kohya-ss`.
|
||||
def _convert_kohya_flux_lora_to_diffusers(state_dict):
|
||||
def _convert_to_ai_toolkit(sds_sd, ait_sd, sds_key, ait_key):
|
||||
if sds_key + ".lora_down.weight" not in sds_sd:
|
||||
return
|
||||
down_weight = sds_sd.pop(sds_key + ".lora_down.weight")
|
||||
|
||||
# scale weight by alpha and dim
|
||||
rank = down_weight.shape[0]
|
||||
alpha = sds_sd.pop(sds_key + ".alpha").item() # alpha is scalar
|
||||
scale = alpha / rank # LoRA is scaled by 'alpha / rank' in forward pass, so we need to scale it back here
|
||||
|
||||
# calculate scale_down and scale_up to keep the same value. if scale is 4, scale_down is 2 and scale_up is 2
|
||||
scale_down = scale
|
||||
scale_up = 1.0
|
||||
while scale_down * 2 < scale_up:
|
||||
scale_down *= 2
|
||||
scale_up /= 2
|
||||
|
||||
ait_sd[ait_key + ".lora_A.weight"] = down_weight * scale_down
|
||||
ait_sd[ait_key + ".lora_B.weight"] = sds_sd.pop(sds_key + ".lora_up.weight") * scale_up
|
||||
|
||||
def _convert_to_ai_toolkit_cat(sds_sd, ait_sd, sds_key, ait_keys, dims=None):
|
||||
if sds_key + ".lora_down.weight" not in sds_sd:
|
||||
return
|
||||
down_weight = sds_sd.pop(sds_key + ".lora_down.weight")
|
||||
up_weight = sds_sd.pop(sds_key + ".lora_up.weight")
|
||||
sd_lora_rank = down_weight.shape[0]
|
||||
|
||||
# scale weight by alpha and dim
|
||||
alpha = sds_sd.pop(sds_key + ".alpha")
|
||||
scale = alpha / sd_lora_rank
|
||||
|
||||
# calculate scale_down and scale_up
|
||||
scale_down = scale
|
||||
scale_up = 1.0
|
||||
while scale_down * 2 < scale_up:
|
||||
scale_down *= 2
|
||||
scale_up /= 2
|
||||
|
||||
down_weight = down_weight * scale_down
|
||||
up_weight = up_weight * scale_up
|
||||
|
||||
# calculate dims if not provided
|
||||
num_splits = len(ait_keys)
|
||||
if dims is None:
|
||||
dims = [up_weight.shape[0] // num_splits] * num_splits
|
||||
else:
|
||||
assert sum(dims) == up_weight.shape[0]
|
||||
|
||||
# check upweight is sparse or not
|
||||
is_sparse = False
|
||||
if sd_lora_rank % num_splits == 0:
|
||||
ait_rank = sd_lora_rank // num_splits
|
||||
is_sparse = True
|
||||
i = 0
|
||||
for j in range(len(dims)):
|
||||
for k in range(len(dims)):
|
||||
if j == k:
|
||||
continue
|
||||
is_sparse = is_sparse and torch.all(
|
||||
up_weight[i : i + dims[j], k * ait_rank : (k + 1) * ait_rank] == 0
|
||||
)
|
||||
i += dims[j]
|
||||
if is_sparse:
|
||||
logger.info(f"weight is sparse: {sds_key}")
|
||||
|
||||
# make ai-toolkit weight
|
||||
ait_down_keys = [k + ".lora_A.weight" for k in ait_keys]
|
||||
ait_up_keys = [k + ".lora_B.weight" for k in ait_keys]
|
||||
if not is_sparse:
|
||||
# down_weight is copied to each split
|
||||
ait_sd.update({k: down_weight for k in ait_down_keys})
|
||||
|
||||
# up_weight is split to each split
|
||||
ait_sd.update({k: v for k, v in zip(ait_up_keys, torch.split(up_weight, dims, dim=0))}) # noqa: C416
|
||||
else:
|
||||
# down_weight is chunked to each split
|
||||
ait_sd.update({k: v for k, v in zip(ait_down_keys, torch.chunk(down_weight, num_splits, dim=0))}) # noqa: C416
|
||||
|
||||
# up_weight is sparse: only non-zero values are copied to each split
|
||||
i = 0
|
||||
for j in range(len(dims)):
|
||||
ait_sd[ait_up_keys[j]] = up_weight[i : i + dims[j], j * ait_rank : (j + 1) * ait_rank].contiguous()
|
||||
i += dims[j]
|
||||
|
||||
def _convert_sd_scripts_to_ai_toolkit(sds_sd):
|
||||
ait_sd = {}
|
||||
for i in range(19):
|
||||
_convert_to_ai_toolkit(
|
||||
sds_sd,
|
||||
ait_sd,
|
||||
f"lora_unet_double_blocks_{i}_img_attn_proj",
|
||||
f"transformer.transformer_blocks.{i}.attn.to_out.0",
|
||||
)
|
||||
_convert_to_ai_toolkit_cat(
|
||||
sds_sd,
|
||||
ait_sd,
|
||||
f"lora_unet_double_blocks_{i}_img_attn_qkv",
|
||||
[
|
||||
f"transformer.transformer_blocks.{i}.attn.to_q",
|
||||
f"transformer.transformer_blocks.{i}.attn.to_k",
|
||||
f"transformer.transformer_blocks.{i}.attn.to_v",
|
||||
],
|
||||
)
|
||||
_convert_to_ai_toolkit(
|
||||
sds_sd,
|
||||
ait_sd,
|
||||
f"lora_unet_double_blocks_{i}_img_mlp_0",
|
||||
f"transformer.transformer_blocks.{i}.ff.net.0.proj",
|
||||
)
|
||||
_convert_to_ai_toolkit(
|
||||
sds_sd,
|
||||
ait_sd,
|
||||
f"lora_unet_double_blocks_{i}_img_mlp_2",
|
||||
f"transformer.transformer_blocks.{i}.ff.net.2",
|
||||
)
|
||||
_convert_to_ai_toolkit(
|
||||
sds_sd,
|
||||
ait_sd,
|
||||
f"lora_unet_double_blocks_{i}_img_mod_lin",
|
||||
f"transformer.transformer_blocks.{i}.norm1.linear",
|
||||
)
|
||||
_convert_to_ai_toolkit(
|
||||
sds_sd,
|
||||
ait_sd,
|
||||
f"lora_unet_double_blocks_{i}_txt_attn_proj",
|
||||
f"transformer.transformer_blocks.{i}.attn.to_add_out",
|
||||
)
|
||||
_convert_to_ai_toolkit_cat(
|
||||
sds_sd,
|
||||
ait_sd,
|
||||
f"lora_unet_double_blocks_{i}_txt_attn_qkv",
|
||||
[
|
||||
f"transformer.transformer_blocks.{i}.attn.add_q_proj",
|
||||
f"transformer.transformer_blocks.{i}.attn.add_k_proj",
|
||||
f"transformer.transformer_blocks.{i}.attn.add_v_proj",
|
||||
],
|
||||
)
|
||||
_convert_to_ai_toolkit(
|
||||
sds_sd,
|
||||
ait_sd,
|
||||
f"lora_unet_double_blocks_{i}_txt_mlp_0",
|
||||
f"transformer.transformer_blocks.{i}.ff_context.net.0.proj",
|
||||
)
|
||||
_convert_to_ai_toolkit(
|
||||
sds_sd,
|
||||
ait_sd,
|
||||
f"lora_unet_double_blocks_{i}_txt_mlp_2",
|
||||
f"transformer.transformer_blocks.{i}.ff_context.net.2",
|
||||
)
|
||||
_convert_to_ai_toolkit(
|
||||
sds_sd,
|
||||
ait_sd,
|
||||
f"lora_unet_double_blocks_{i}_txt_mod_lin",
|
||||
f"transformer.transformer_blocks.{i}.norm1_context.linear",
|
||||
)
|
||||
|
||||
for i in range(38):
|
||||
_convert_to_ai_toolkit_cat(
|
||||
sds_sd,
|
||||
ait_sd,
|
||||
f"lora_unet_single_blocks_{i}_linear1",
|
||||
[
|
||||
f"transformer.single_transformer_blocks.{i}.attn.to_q",
|
||||
f"transformer.single_transformer_blocks.{i}.attn.to_k",
|
||||
f"transformer.single_transformer_blocks.{i}.attn.to_v",
|
||||
f"transformer.single_transformer_blocks.{i}.proj_mlp",
|
||||
],
|
||||
dims=[3072, 3072, 3072, 12288],
|
||||
)
|
||||
_convert_to_ai_toolkit(
|
||||
sds_sd,
|
||||
ait_sd,
|
||||
f"lora_unet_single_blocks_{i}_linear2",
|
||||
f"transformer.single_transformer_blocks.{i}.proj_out",
|
||||
)
|
||||
_convert_to_ai_toolkit(
|
||||
sds_sd,
|
||||
ait_sd,
|
||||
f"lora_unet_single_blocks_{i}_modulation_lin",
|
||||
f"transformer.single_transformer_blocks.{i}.norm.linear",
|
||||
)
|
||||
|
||||
if len(sds_sd) > 0:
|
||||
logger.warning(f"Unsuppored keys for ai-toolkit: {sds_sd.keys()}")
|
||||
|
||||
return ait_sd
|
||||
|
||||
return _convert_sd_scripts_to_ai_toolkit(state_dict)
|
||||
|
||||
|
||||
# Adapted from https://gist.github.com/Leommm-byte/6b331a1e9bd53271210b26543a7065d6
|
||||
# Some utilities were reused from
|
||||
# https://github.com/kohya-ss/sd-scripts/blob/a61cf73a5cb5209c3f4d1a3688dd276a4dfd1ecb/networks/convert_flux_lora.py
|
||||
def _convert_xlabs_flux_lora_to_diffusers(old_state_dict):
|
||||
new_state_dict = {}
|
||||
orig_keys = list(old_state_dict.keys())
|
||||
|
||||
def handle_qkv(sds_sd, ait_sd, sds_key, ait_keys, dims=None):
|
||||
down_weight = sds_sd.pop(sds_key)
|
||||
up_weight = sds_sd.pop(sds_key.replace(".down.weight", ".up.weight"))
|
||||
|
||||
# calculate dims if not provided
|
||||
num_splits = len(ait_keys)
|
||||
if dims is None:
|
||||
dims = [up_weight.shape[0] // num_splits] * num_splits
|
||||
else:
|
||||
assert sum(dims) == up_weight.shape[0]
|
||||
|
||||
# make ai-toolkit weight
|
||||
ait_down_keys = [k + ".lora_A.weight" for k in ait_keys]
|
||||
ait_up_keys = [k + ".lora_B.weight" for k in ait_keys]
|
||||
|
||||
# down_weight is copied to each split
|
||||
ait_sd.update({k: down_weight for k in ait_down_keys})
|
||||
|
||||
# up_weight is split to each split
|
||||
ait_sd.update({k: v for k, v in zip(ait_up_keys, torch.split(up_weight, dims, dim=0))}) # noqa: C416
|
||||
|
||||
for old_key in orig_keys:
|
||||
# Handle double_blocks
|
||||
if old_key.startswith(("diffusion_model.double_blocks", "double_blocks")):
|
||||
block_num = re.search(r"double_blocks\.(\d+)", old_key).group(1)
|
||||
new_key = f"transformer.transformer_blocks.{block_num}"
|
||||
|
||||
if "processor.proj_lora1" in old_key:
|
||||
new_key += ".attn.to_out.0"
|
||||
elif "processor.proj_lora2" in old_key:
|
||||
new_key += ".attn.to_add_out"
|
||||
# Handle text latents.
|
||||
elif "processor.qkv_lora2" in old_key and "up" not in old_key:
|
||||
handle_qkv(
|
||||
old_state_dict,
|
||||
new_state_dict,
|
||||
old_key,
|
||||
[
|
||||
f"transformer.transformer_blocks.{block_num}.attn.add_q_proj",
|
||||
f"transformer.transformer_blocks.{block_num}.attn.add_k_proj",
|
||||
f"transformer.transformer_blocks.{block_num}.attn.add_v_proj",
|
||||
],
|
||||
)
|
||||
# continue
|
||||
# Handle image latents.
|
||||
elif "processor.qkv_lora1" in old_key and "up" not in old_key:
|
||||
handle_qkv(
|
||||
old_state_dict,
|
||||
new_state_dict,
|
||||
old_key,
|
||||
[
|
||||
f"transformer.transformer_blocks.{block_num}.attn.to_q",
|
||||
f"transformer.transformer_blocks.{block_num}.attn.to_k",
|
||||
f"transformer.transformer_blocks.{block_num}.attn.to_v",
|
||||
],
|
||||
)
|
||||
# continue
|
||||
|
||||
if "down" in old_key:
|
||||
new_key += ".lora_A.weight"
|
||||
elif "up" in old_key:
|
||||
new_key += ".lora_B.weight"
|
||||
|
||||
# Handle single_blocks
|
||||
elif old_key.startswith("diffusion_model.single_blocks", "single_blocks"):
|
||||
block_num = re.search(r"single_blocks\.(\d+)", old_key).group(1)
|
||||
new_key = f"transformer.single_transformer_blocks.{block_num}"
|
||||
|
||||
if "proj_lora1" in old_key or "proj_lora2" in old_key:
|
||||
new_key += ".proj_out"
|
||||
elif "qkv_lora1" in old_key or "qkv_lora2" in old_key:
|
||||
new_key += ".norm.linear"
|
||||
|
||||
if "down" in old_key:
|
||||
new_key += ".lora_A.weight"
|
||||
elif "up" in old_key:
|
||||
new_key += ".lora_B.weight"
|
||||
|
||||
else:
|
||||
# Handle other potential key patterns here
|
||||
new_key = old_key
|
||||
|
||||
# Since we already handle qkv above.
|
||||
if "qkv" not in old_key:
|
||||
new_state_dict[new_key] = old_state_dict.pop(old_key)
|
||||
|
||||
if len(old_state_dict) > 0:
|
||||
raise ValueError(f"`old_state_dict` should be at this point but has: {list(old_state_dict.keys())}.")
|
||||
|
||||
return new_state_dict
|
||||
|
||||
@@ -31,12 +31,7 @@ from ..utils import (
|
||||
scale_lora_layers,
|
||||
)
|
||||
from .lora_base import LoraBaseMixin
|
||||
from .lora_conversion_utils import (
|
||||
_convert_kohya_flux_lora_to_diffusers,
|
||||
_convert_non_diffusers_lora_to_diffusers,
|
||||
_convert_xlabs_flux_lora_to_diffusers,
|
||||
_maybe_map_sgm_blocks_to_diffusers,
|
||||
)
|
||||
from .lora_conversion_utils import _convert_non_diffusers_lora_to_diffusers, _maybe_map_sgm_blocks_to_diffusers
|
||||
|
||||
|
||||
if is_transformers_available():
|
||||
@@ -1588,20 +1583,6 @@ class FluxLoraLoaderMixin(LoraBaseMixin):
|
||||
allow_pickle=allow_pickle,
|
||||
)
|
||||
|
||||
# TODO (sayakpaul): to a follow-up to clean and try to unify the conditions.
|
||||
|
||||
is_kohya = any(".lora_down.weight" in k for k in state_dict)
|
||||
if is_kohya:
|
||||
state_dict = _convert_kohya_flux_lora_to_diffusers(state_dict)
|
||||
# Kohya already takes care of scaling the LoRA parameters with alpha.
|
||||
return (state_dict, None) if return_alphas else state_dict
|
||||
|
||||
is_xlabs = any("processor" in k for k in state_dict)
|
||||
if is_xlabs:
|
||||
state_dict = _convert_xlabs_flux_lora_to_diffusers(state_dict)
|
||||
# xlabs doesn't use `alpha`.
|
||||
return (state_dict, None) if return_alphas else state_dict
|
||||
|
||||
# For state dicts like
|
||||
# https://huggingface.co/TheLastBen/Jon_Snow_Flux_LoRA
|
||||
keys = list(state_dict.keys())
|
||||
|
||||
@@ -91,11 +91,11 @@ DIFFUSERS_DEFAULT_PIPELINE_PATHS = {
|
||||
"xl_inpaint": {"pretrained_model_name_or_path": "diffusers/stable-diffusion-xl-1.0-inpainting-0.1"},
|
||||
"playground-v2-5": {"pretrained_model_name_or_path": "playgroundai/playground-v2.5-1024px-aesthetic"},
|
||||
"upscale": {"pretrained_model_name_or_path": "stabilityai/stable-diffusion-x4-upscaler"},
|
||||
"inpainting": {"pretrained_model_name_or_path": "Lykon/dreamshaper-8-inpainting"},
|
||||
"inpainting": {"pretrained_model_name_or_path": "runwayml/stable-diffusion-inpainting"},
|
||||
"inpainting_v2": {"pretrained_model_name_or_path": "stabilityai/stable-diffusion-2-inpainting"},
|
||||
"controlnet": {"pretrained_model_name_or_path": "lllyasviel/control_v11p_sd15_canny"},
|
||||
"v2": {"pretrained_model_name_or_path": "stabilityai/stable-diffusion-2-1"},
|
||||
"v1": {"pretrained_model_name_or_path": "Lykon/dreamshaper-8"},
|
||||
"v1": {"pretrained_model_name_or_path": "runwayml/stable-diffusion-v1-5"},
|
||||
"stable_cascade_stage_b": {"pretrained_model_name_or_path": "stabilityai/stable-cascade", "subfolder": "decoder"},
|
||||
"stable_cascade_stage_b_lite": {
|
||||
"pretrained_model_name_or_path": "stabilityai/stable-cascade",
|
||||
|
||||
@@ -35,7 +35,7 @@ if is_torch_available():
|
||||
_import_structure["autoencoders.consistency_decoder_vae"] = ["ConsistencyDecoderVAE"]
|
||||
_import_structure["autoencoders.vq_model"] = ["VQModel"]
|
||||
_import_structure["controlnet"] = ["ControlNetModel"]
|
||||
_import_structure["controlnet_flux"] = ["FluxControlNetModel", "FluxMultiControlNetModel"]
|
||||
_import_structure["controlnet_flux"] = ["FluxControlNetModel"]
|
||||
_import_structure["controlnet_hunyuan"] = ["HunyuanDiT2DControlNetModel", "HunyuanDiT2DMultiControlNetModel"]
|
||||
_import_structure["controlnet_sd3"] = ["SD3ControlNetModel", "SD3MultiControlNetModel"]
|
||||
_import_structure["controlnet_sparsectrl"] = ["SparseControlNetModel"]
|
||||
@@ -88,7 +88,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
VQModel,
|
||||
)
|
||||
from .controlnet import ControlNetModel
|
||||
from .controlnet_flux import FluxControlNetModel, FluxMultiControlNetModel
|
||||
from .controlnet_flux import FluxControlNetModel
|
||||
from .controlnet_hunyuan import HunyuanDiT2DControlNetModel, HunyuanDiT2DMultiControlNetModel
|
||||
from .controlnet_sd3 import SD3ControlNetModel, SD3MultiControlNetModel
|
||||
from .controlnet_sparsectrl import SparseControlNetModel
|
||||
|
||||
@@ -972,32 +972,15 @@ class FreeNoiseTransformerBlock(nn.Module):
|
||||
return frame_indices
|
||||
|
||||
def _get_frame_weights(self, num_frames: int, weighting_scheme: str = "pyramid") -> List[float]:
|
||||
if weighting_scheme == "flat":
|
||||
weights = [1.0] * num_frames
|
||||
|
||||
elif weighting_scheme == "pyramid":
|
||||
if weighting_scheme == "pyramid":
|
||||
if num_frames % 2 == 0:
|
||||
# num_frames = 4 => [1, 2, 2, 1]
|
||||
mid = num_frames // 2
|
||||
weights = list(range(1, mid + 1))
|
||||
weights = list(range(1, num_frames // 2 + 1))
|
||||
weights = weights + weights[::-1]
|
||||
else:
|
||||
# num_frames = 5 => [1, 2, 3, 2, 1]
|
||||
mid = (num_frames + 1) // 2
|
||||
weights = list(range(1, mid))
|
||||
weights = weights + [mid] + weights[::-1]
|
||||
|
||||
elif weighting_scheme == "delayed_reverse_sawtooth":
|
||||
if num_frames % 2 == 0:
|
||||
# num_frames = 4 => [0.01, 2, 2, 1]
|
||||
mid = num_frames // 2
|
||||
weights = [0.01] * (mid - 1) + [mid]
|
||||
weights = weights + list(range(mid, 0, -1))
|
||||
else:
|
||||
# num_frames = 5 => [0.01, 0.01, 3, 2, 1]
|
||||
mid = (num_frames + 1) // 2
|
||||
weights = [0.01] * mid
|
||||
weights = weights + list(range(mid, 0, -1))
|
||||
weights = list(range(1, num_frames // 2 + 1))
|
||||
weights = weights + [num_frames // 2 + 1] + weights[::-1]
|
||||
else:
|
||||
raise ValueError(f"Unsupported value for weighting_scheme={weighting_scheme}")
|
||||
|
||||
@@ -1054,8 +1037,9 @@ class FreeNoiseTransformerBlock(nn.Module):
|
||||
accumulated_values = torch.zeros_like(hidden_states)
|
||||
|
||||
for i, (frame_start, frame_end) in enumerate(frame_indices):
|
||||
# The reason for slicing here is to handle cases like frame_indices=[(0, 16), (16, 20)],
|
||||
# if the user provided a video with 19 frames, or essentially a non-multiple of `context_length`.
|
||||
# The reason for slicing here is to ensure that if (frame_end - frame_start) is to handle
|
||||
# cases like frame_indices=[(0, 16), (16, 20)], if the user provided a video with 19 frames, or
|
||||
# essentially a non-multiple of `context_length`.
|
||||
weights = torch.ones_like(num_times_accumulated[:, frame_start:frame_end])
|
||||
weights *= frame_weights
|
||||
|
||||
@@ -1103,26 +1087,8 @@ class FreeNoiseTransformerBlock(nn.Module):
|
||||
accumulated_values[:, frame_start:frame_end] += hidden_states_chunk * weights
|
||||
num_times_accumulated[:, frame_start:frame_end] += weights
|
||||
|
||||
# TODO(aryan): Maybe this could be done in a better way.
|
||||
#
|
||||
# Previously, this was:
|
||||
# hidden_states = torch.where(
|
||||
# num_times_accumulated > 0, accumulated_values / num_times_accumulated, accumulated_values
|
||||
# )
|
||||
#
|
||||
# The reasoning for the change here is `torch.where` became a bottleneck at some point when golfing memory
|
||||
# spikes. It is particularly noticeable when the number of frames is high. My understanding is that this comes
|
||||
# from tensors being copied - which is why we resort to spliting and concatenating here. I've not particularly
|
||||
# looked into this deeply because other memory optimizations led to more pronounced reductions.
|
||||
hidden_states = torch.cat(
|
||||
[
|
||||
torch.where(num_times_split > 0, accumulated_split / num_times_split, accumulated_split)
|
||||
for accumulated_split, num_times_split in zip(
|
||||
accumulated_values.split(self.context_length, dim=1),
|
||||
num_times_accumulated.split(self.context_length, dim=1),
|
||||
)
|
||||
],
|
||||
dim=1,
|
||||
hidden_states = torch.where(
|
||||
num_times_accumulated > 0, accumulated_values / num_times_accumulated, accumulated_values
|
||||
).to(dtype)
|
||||
|
||||
# 3. Feed-forward
|
||||
|
||||
@@ -999,7 +999,6 @@ class AutoencoderKLCogVideoX(ModelMixin, ConfigMixin, FromOriginalModelMixin):
|
||||
# setting it to anything other than 2 would give poor results because the VAE hasn't been trained to be adaptive with different
|
||||
# number of temporal frames.
|
||||
self.num_latent_frames_batch_size = 2
|
||||
self.num_sample_frames_batch_size = 8
|
||||
|
||||
# We make the minimum height and width of sample for tiling half that of the generally supported
|
||||
self.tile_sample_min_height = sample_height // 2
|
||||
@@ -1082,29 +1081,6 @@ class AutoencoderKLCogVideoX(ModelMixin, ConfigMixin, FromOriginalModelMixin):
|
||||
"""
|
||||
self.use_slicing = False
|
||||
|
||||
def _encode(self, x: torch.Tensor) -> torch.Tensor:
|
||||
batch_size, num_channels, num_frames, height, width = x.shape
|
||||
|
||||
if self.use_tiling and (width > self.tile_sample_min_width or height > self.tile_sample_min_height):
|
||||
return self.tiled_encode(x)
|
||||
|
||||
frame_batch_size = self.num_sample_frames_batch_size
|
||||
enc = []
|
||||
for i in range(num_frames // frame_batch_size):
|
||||
remaining_frames = num_frames % frame_batch_size
|
||||
start_frame = frame_batch_size * i + (0 if i == 0 else remaining_frames)
|
||||
end_frame = frame_batch_size * (i + 1) + remaining_frames
|
||||
x_intermediate = x[:, :, start_frame:end_frame]
|
||||
x_intermediate = self.encoder(x_intermediate)
|
||||
if self.quant_conv is not None:
|
||||
x_intermediate = self.quant_conv(x_intermediate)
|
||||
enc.append(x_intermediate)
|
||||
|
||||
self._clear_fake_context_parallel_cache()
|
||||
enc = torch.cat(enc, dim=2)
|
||||
|
||||
return enc
|
||||
|
||||
@apply_forward_hook
|
||||
def encode(
|
||||
self, x: torch.Tensor, return_dict: bool = True
|
||||
@@ -1118,17 +1094,13 @@ class AutoencoderKLCogVideoX(ModelMixin, ConfigMixin, FromOriginalModelMixin):
|
||||
Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
|
||||
|
||||
Returns:
|
||||
The latent representations of the encoded videos. If `return_dict` is True, a
|
||||
The latent representations of the encoded images. If `return_dict` is True, a
|
||||
[`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
|
||||
"""
|
||||
if self.use_slicing and x.shape[0] > 1:
|
||||
encoded_slices = [self._encode(x_slice) for x_slice in x.split(1)]
|
||||
h = torch.cat(encoded_slices)
|
||||
else:
|
||||
h = self._encode(x)
|
||||
|
||||
h = self.encoder(x)
|
||||
if self.quant_conv is not None:
|
||||
h = self.quant_conv(h)
|
||||
posterior = DiagonalGaussianDistribution(h)
|
||||
|
||||
if not return_dict:
|
||||
return (posterior,)
|
||||
return AutoencoderKLOutput(latent_dist=posterior)
|
||||
@@ -1200,75 +1172,6 @@ class AutoencoderKLCogVideoX(ModelMixin, ConfigMixin, FromOriginalModelMixin):
|
||||
)
|
||||
return b
|
||||
|
||||
def tiled_encode(self, x: torch.Tensor) -> torch.Tensor:
|
||||
r"""Encode a batch of images using a tiled encoder.
|
||||
|
||||
When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
|
||||
steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is
|
||||
different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the
|
||||
tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the
|
||||
output, but they should be much less noticeable.
|
||||
|
||||
Args:
|
||||
x (`torch.Tensor`): Input batch of videos.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The latent representation of the encoded videos.
|
||||
"""
|
||||
# For a rough memory estimate, take a look at the `tiled_decode` method.
|
||||
batch_size, num_channels, num_frames, height, width = x.shape
|
||||
|
||||
overlap_height = int(self.tile_sample_min_height * (1 - self.tile_overlap_factor_height))
|
||||
overlap_width = int(self.tile_sample_min_width * (1 - self.tile_overlap_factor_width))
|
||||
blend_extent_height = int(self.tile_latent_min_height * self.tile_overlap_factor_height)
|
||||
blend_extent_width = int(self.tile_latent_min_width * self.tile_overlap_factor_width)
|
||||
row_limit_height = self.tile_latent_min_height - blend_extent_height
|
||||
row_limit_width = self.tile_latent_min_width - blend_extent_width
|
||||
frame_batch_size = self.num_sample_frames_batch_size
|
||||
|
||||
# Split x into overlapping tiles and encode them separately.
|
||||
# The tiles have an overlap to avoid seams between tiles.
|
||||
rows = []
|
||||
for i in range(0, height, overlap_height):
|
||||
row = []
|
||||
for j in range(0, width, overlap_width):
|
||||
time = []
|
||||
for k in range(num_frames // frame_batch_size):
|
||||
remaining_frames = num_frames % frame_batch_size
|
||||
start_frame = frame_batch_size * k + (0 if k == 0 else remaining_frames)
|
||||
end_frame = frame_batch_size * (k + 1) + remaining_frames
|
||||
tile = x[
|
||||
:,
|
||||
:,
|
||||
start_frame:end_frame,
|
||||
i : i + self.tile_sample_min_height,
|
||||
j : j + self.tile_sample_min_width,
|
||||
]
|
||||
tile = self.encoder(tile)
|
||||
if self.quant_conv is not None:
|
||||
tile = self.quant_conv(tile)
|
||||
time.append(tile)
|
||||
self._clear_fake_context_parallel_cache()
|
||||
row.append(torch.cat(time, dim=2))
|
||||
rows.append(row)
|
||||
|
||||
result_rows = []
|
||||
for i, row in enumerate(rows):
|
||||
result_row = []
|
||||
for j, tile in enumerate(row):
|
||||
# blend the above tile and the left tile
|
||||
# to the current tile and add the current tile to the result row
|
||||
if i > 0:
|
||||
tile = self.blend_v(rows[i - 1][j], tile, blend_extent_height)
|
||||
if j > 0:
|
||||
tile = self.blend_h(row[j - 1], tile, blend_extent_width)
|
||||
result_row.append(tile[:, :, :, :row_limit_height, :row_limit_width])
|
||||
result_rows.append(torch.cat(result_row, dim=4))
|
||||
|
||||
enc = torch.cat(result_rows, dim=3)
|
||||
return enc
|
||||
|
||||
def tiled_decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
|
||||
r"""
|
||||
Decode a batch of images using a tiled decoder.
|
||||
|
||||
@@ -54,7 +54,6 @@ class FluxControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
||||
pooled_projection_dim: int = 768,
|
||||
guidance_embeds: bool = False,
|
||||
axes_dims_rope: List[int] = [16, 56, 56],
|
||||
num_mode: int = None,
|
||||
):
|
||||
super().__init__()
|
||||
self.out_channels = in_channels
|
||||
@@ -102,10 +101,6 @@ class FluxControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
||||
for _ in range(len(self.single_transformer_blocks)):
|
||||
self.controlnet_single_blocks.append(zero_module(nn.Linear(self.inner_dim, self.inner_dim)))
|
||||
|
||||
self.union = num_mode is not None
|
||||
if self.union:
|
||||
self.controlnet_mode_embedder = nn.Embedding(num_mode, self.inner_dim)
|
||||
|
||||
self.controlnet_x_embedder = zero_module(torch.nn.Linear(in_channels, self.inner_dim))
|
||||
|
||||
self.gradient_checkpointing = False
|
||||
@@ -178,8 +173,8 @@ class FluxControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
||||
def from_transformer(
|
||||
cls,
|
||||
transformer,
|
||||
num_layers: int = 4,
|
||||
num_single_layers: int = 10,
|
||||
num_layers=4,
|
||||
num_single_layers=10,
|
||||
attention_head_dim: int = 128,
|
||||
num_attention_heads: int = 24,
|
||||
load_weights_from_transformer=True,
|
||||
@@ -210,7 +205,6 @@ class FluxControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
controlnet_cond: torch.Tensor,
|
||||
controlnet_mode: torch.Tensor = None,
|
||||
conditioning_scale: float = 1.0,
|
||||
encoder_hidden_states: torch.Tensor = None,
|
||||
pooled_projections: torch.Tensor = None,
|
||||
@@ -227,12 +221,6 @@ class FluxControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
||||
Args:
|
||||
hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
|
||||
Input `hidden_states`.
|
||||
controlnet_cond (`torch.Tensor`):
|
||||
The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
controlnet_mode (`torch.Tensor`):
|
||||
The mode tensor of shape `(batch_size, 1)`.
|
||||
conditioning_scale (`float`, defaults to `1.0`):
|
||||
The scale factor for ControlNet outputs.
|
||||
encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`):
|
||||
Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
|
||||
pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`): Embeddings projected
|
||||
@@ -284,15 +272,6 @@ class FluxControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
||||
)
|
||||
encoder_hidden_states = self.context_embedder(encoder_hidden_states)
|
||||
|
||||
if self.union:
|
||||
# union mode
|
||||
if controlnet_mode is None:
|
||||
raise ValueError("`controlnet_mode` cannot be `None` when applying ControlNet-Union")
|
||||
# union mode emb
|
||||
controlnet_mode_emb = self.controlnet_mode_embedder(controlnet_mode)
|
||||
encoder_hidden_states = torch.cat([controlnet_mode_emb, encoder_hidden_states], dim=1)
|
||||
txt_ids = torch.cat([txt_ids[:1], txt_ids], dim=0)
|
||||
|
||||
if txt_ids.ndim == 3:
|
||||
logger.warning(
|
||||
"Passing `txt_ids` 3d torch.Tensor is deprecated."
|
||||
@@ -388,6 +367,7 @@ class FluxControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
||||
controlnet_block_samples = [sample * conditioning_scale for sample in controlnet_block_samples]
|
||||
controlnet_single_block_samples = [sample * conditioning_scale for sample in controlnet_single_block_samples]
|
||||
|
||||
#
|
||||
controlnet_block_samples = None if len(controlnet_block_samples) == 0 else controlnet_block_samples
|
||||
controlnet_single_block_samples = (
|
||||
None if len(controlnet_single_block_samples) == 0 else controlnet_single_block_samples
|
||||
@@ -404,114 +384,3 @@ class FluxControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
||||
controlnet_block_samples=controlnet_block_samples,
|
||||
controlnet_single_block_samples=controlnet_single_block_samples,
|
||||
)
|
||||
|
||||
|
||||
class FluxMultiControlNetModel(ModelMixin):
|
||||
r"""
|
||||
`FluxMultiControlNetModel` wrapper class for Multi-FluxControlNetModel
|
||||
|
||||
This module is a wrapper for multiple instances of the `FluxControlNetModel`. The `forward()` API is designed to be
|
||||
compatible with `FluxControlNetModel`.
|
||||
|
||||
Args:
|
||||
controlnets (`List[FluxControlNetModel]`):
|
||||
Provides additional conditioning to the unet during the denoising process. You must set multiple
|
||||
`FluxControlNetModel` as a list.
|
||||
"""
|
||||
|
||||
def __init__(self, controlnets):
|
||||
super().__init__()
|
||||
self.nets = nn.ModuleList(controlnets)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.FloatTensor,
|
||||
controlnet_cond: List[torch.tensor],
|
||||
controlnet_mode: List[torch.tensor],
|
||||
conditioning_scale: List[float],
|
||||
encoder_hidden_states: torch.Tensor = None,
|
||||
pooled_projections: torch.Tensor = None,
|
||||
timestep: torch.LongTensor = None,
|
||||
img_ids: torch.Tensor = None,
|
||||
txt_ids: torch.Tensor = None,
|
||||
guidance: torch.Tensor = None,
|
||||
joint_attention_kwargs: Optional[Dict[str, Any]] = None,
|
||||
return_dict: bool = True,
|
||||
) -> Union[FluxControlNetOutput, Tuple]:
|
||||
# ControlNet-Union with multiple conditions
|
||||
# only load one ControlNet for saving memories
|
||||
if len(self.nets) == 1 and self.nets[0].union:
|
||||
controlnet = self.nets[0]
|
||||
|
||||
for i, (image, mode, scale) in enumerate(zip(controlnet_cond, controlnet_mode, conditioning_scale)):
|
||||
block_samples, single_block_samples = controlnet(
|
||||
hidden_states=hidden_states,
|
||||
controlnet_cond=image,
|
||||
controlnet_mode=mode[:, None],
|
||||
conditioning_scale=scale,
|
||||
timestep=timestep,
|
||||
guidance=guidance,
|
||||
pooled_projections=pooled_projections,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
txt_ids=txt_ids,
|
||||
img_ids=img_ids,
|
||||
joint_attention_kwargs=joint_attention_kwargs,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
# merge samples
|
||||
if i == 0:
|
||||
control_block_samples = block_samples
|
||||
control_single_block_samples = single_block_samples
|
||||
else:
|
||||
control_block_samples = [
|
||||
control_block_sample + block_sample
|
||||
for control_block_sample, block_sample in zip(control_block_samples, block_samples)
|
||||
]
|
||||
|
||||
control_single_block_samples = [
|
||||
control_single_block_sample + block_sample
|
||||
for control_single_block_sample, block_sample in zip(
|
||||
control_single_block_samples, single_block_samples
|
||||
)
|
||||
]
|
||||
|
||||
# Regular Multi-ControlNets
|
||||
# load all ControlNets into memories
|
||||
else:
|
||||
for i, (image, mode, scale, controlnet) in enumerate(
|
||||
zip(controlnet_cond, controlnet_mode, conditioning_scale, self.nets)
|
||||
):
|
||||
block_samples, single_block_samples = controlnet(
|
||||
hidden_states=hidden_states,
|
||||
controlnet_cond=image,
|
||||
controlnet_mode=mode[:, None],
|
||||
conditioning_scale=scale,
|
||||
timestep=timestep,
|
||||
guidance=guidance,
|
||||
pooled_projections=pooled_projections,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
txt_ids=txt_ids,
|
||||
img_ids=img_ids,
|
||||
joint_attention_kwargs=joint_attention_kwargs,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
# merge samples
|
||||
if i == 0:
|
||||
control_block_samples = block_samples
|
||||
control_single_block_samples = single_block_samples
|
||||
else:
|
||||
control_block_samples = [
|
||||
control_block_sample + block_sample
|
||||
for control_block_sample, block_sample in zip(control_block_samples, block_samples)
|
||||
]
|
||||
|
||||
control_single_block_samples = [
|
||||
control_single_block_sample + block_sample
|
||||
for control_single_block_sample, block_sample in zip(
|
||||
control_single_block_samples, single_block_samples
|
||||
)
|
||||
]
|
||||
|
||||
return control_block_samples, control_single_block_samples
|
||||
|
||||
@@ -242,12 +242,9 @@ class SD3ControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginal
|
||||
module.gradient_checkpointing = value
|
||||
|
||||
@classmethod
|
||||
def from_transformer(
|
||||
cls, transformer, num_layers=12, num_extra_conditioning_channels=1, load_weights_from_transformer=True
|
||||
):
|
||||
def from_transformer(cls, transformer, num_layers=12, load_weights_from_transformer=True):
|
||||
config = transformer.config
|
||||
config["num_layers"] = num_layers or config.num_layers
|
||||
config["extra_conditioning_channels"] = num_extra_conditioning_channels
|
||||
controlnet = cls(**config)
|
||||
|
||||
if load_weights_from_transformer:
|
||||
|
||||
@@ -691,6 +691,7 @@ class SparseControlNetModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
|
||||
|
||||
emb = self.time_embedding(t_emb, timestep_cond)
|
||||
emb = emb.repeat_interleave(sample_num_frames, dim=0)
|
||||
encoder_hidden_states = encoder_hidden_states.repeat_interleave(sample_num_frames, dim=0)
|
||||
|
||||
# 2. pre-process
|
||||
batch_size, channels, num_frames, height, width = sample.shape
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import math
|
||||
from typing import Dict, List, Optional, Tuple, Union
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -342,55 +342,15 @@ class CogVideoXPatchEmbed(nn.Module):
|
||||
embed_dim: int = 1920,
|
||||
text_embed_dim: int = 4096,
|
||||
bias: bool = True,
|
||||
sample_width: int = 90,
|
||||
sample_height: int = 60,
|
||||
sample_frames: int = 49,
|
||||
temporal_compression_ratio: int = 4,
|
||||
max_text_seq_length: int = 226,
|
||||
spatial_interpolation_scale: float = 1.875,
|
||||
temporal_interpolation_scale: float = 1.0,
|
||||
use_positional_embeddings: bool = True,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
self.patch_size = patch_size
|
||||
self.in_channels = in_channels
|
||||
self.embed_dim = embed_dim
|
||||
self.text_embed_dim = text_embed_dim
|
||||
self.bias = bias
|
||||
self.sample_width = sample_width
|
||||
self.sample_height = sample_height
|
||||
self.sample_frames = sample_frames
|
||||
self.temporal_compression_ratio = temporal_compression_ratio
|
||||
self.max_text_seq_length = max_text_seq_length
|
||||
self.spatial_interpolation_scale = spatial_interpolation_scale
|
||||
self.temporal_interpolation_scale = temporal_interpolation_scale
|
||||
self.use_positional_embeddings = use_positional_embeddings
|
||||
|
||||
self.proj = nn.Conv2d(
|
||||
in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=patch_size, bias=bias
|
||||
)
|
||||
self.text_proj = nn.Linear(text_embed_dim, embed_dim)
|
||||
|
||||
if use_positional_embeddings:
|
||||
pos_embedding = self._get_positional_embeddings(sample_height, sample_width, sample_frames)
|
||||
self.register_buffer("pos_embedding", pos_embedding, persistent=False)
|
||||
|
||||
def _get_positional_embeddings(self, sample_height: int, sample_width: int, sample_frames: int) -> torch.Tensor:
|
||||
post_patch_height = sample_height // self.patch_size
|
||||
post_patch_width = sample_width // self.patch_size
|
||||
post_time_compression_frames = (sample_frames - 1) // self.temporal_compression_ratio + 1
|
||||
|
||||
pos_embedding = get_3d_sincos_pos_embed(
|
||||
self.embed_dim,
|
||||
(post_patch_width, post_patch_height),
|
||||
post_time_compression_frames,
|
||||
self.spatial_interpolation_scale,
|
||||
self.temporal_interpolation_scale,
|
||||
)
|
||||
pos_embedding = torch.from_numpy(pos_embedding).flatten(0, 1)
|
||||
return pos_embedding
|
||||
|
||||
def forward(self, text_embeds: torch.Tensor, image_embeds: torch.Tensor):
|
||||
r"""
|
||||
Args:
|
||||
@@ -406,124 +366,12 @@ class CogVideoXPatchEmbed(nn.Module):
|
||||
image_embeds = self.proj(image_embeds)
|
||||
image_embeds = image_embeds.view(batch, num_frames, *image_embeds.shape[1:])
|
||||
image_embeds = image_embeds.flatten(3).transpose(2, 3) # [batch, num_frames, height x width, channels]
|
||||
image_embeds = image_embeds.flatten(1, 2).contiguous() # [batch, num_frames x height x width, channels]
|
||||
image_embeds = image_embeds.flatten(1, 2) # [batch, num_frames x height x width, channels]
|
||||
|
||||
if self.use_positional_embeddings:
|
||||
pre_time_compression_frames = (num_frames - 1) * self.temporal_compression_ratio + 1
|
||||
if (
|
||||
self.sample_height != height
|
||||
or self.sample_width != width
|
||||
or self.sample_frames != pre_time_compression_frames
|
||||
):
|
||||
pos_embedding = self._get_positional_embeddings(height, width, pre_time_compression_frames)
|
||||
pos_embedding = pos_embedding.to(image_embeds.device, dtype=image_embeds.dtype)
|
||||
else:
|
||||
pos_embedding = self.pos_embedding
|
||||
|
||||
image_embeds = image_embeds + pos_embedding
|
||||
|
||||
return text_embeds, image_embeds
|
||||
|
||||
|
||||
class FreeNoiseCogVideoXPatchEmbed(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
patch_size: int = 2,
|
||||
in_channels: int = 16,
|
||||
embed_dim: int = 1920,
|
||||
text_embed_dim: int = 4096,
|
||||
bias: bool = True,
|
||||
sample_width: int = 90,
|
||||
sample_height: int = 60,
|
||||
sample_frames: int = 49,
|
||||
temporal_compression_ratio: int = 4,
|
||||
max_text_seq_length: int = 226,
|
||||
spatial_interpolation_scale: float = 1.875,
|
||||
temporal_interpolation_scale: float = 1.0,
|
||||
use_positional_embeddings: bool = True,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
self.patch_size = patch_size
|
||||
self.in_channels = in_channels
|
||||
self.embed_dim = embed_dim
|
||||
self.text_embed_dim = text_embed_dim
|
||||
self.bias = bias
|
||||
self.sample_width = sample_width
|
||||
self.sample_height = sample_height
|
||||
self.sample_frames = sample_frames
|
||||
self.temporal_compression_ratio = temporal_compression_ratio
|
||||
self.max_text_seq_length = max_text_seq_length
|
||||
self.spatial_interpolation_scale = spatial_interpolation_scale
|
||||
self.temporal_interpolation_scale = temporal_interpolation_scale
|
||||
self.use_positional_embeddings = use_positional_embeddings
|
||||
|
||||
self.proj = nn.Conv2d(
|
||||
in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=patch_size, bias=bias
|
||||
)
|
||||
self.text_proj = nn.Linear(text_embed_dim, embed_dim)
|
||||
|
||||
if use_positional_embeddings:
|
||||
pos_embedding = self._get_positional_embeddings(sample_height, sample_width, sample_frames)
|
||||
self.register_buffer("pos_embedding", pos_embedding, persistent=False)
|
||||
|
||||
# Copied from diffusers.models.embeddings.CogVideoXPatchEmbed
|
||||
def _get_positional_embeddings(self, sample_height: int, sample_width: int, sample_frames: int) -> torch.Tensor:
|
||||
post_patch_height = sample_height // self.patch_size
|
||||
post_patch_width = sample_width // self.patch_size
|
||||
post_time_compression_frames = (sample_frames - 1) // self.temporal_compression_ratio + 1
|
||||
|
||||
pos_embedding = get_3d_sincos_pos_embed(
|
||||
self.embed_dim,
|
||||
(post_patch_width, post_patch_height),
|
||||
post_time_compression_frames,
|
||||
self.spatial_interpolation_scale,
|
||||
self.temporal_interpolation_scale,
|
||||
)
|
||||
pos_embedding = torch.from_numpy(pos_embedding).flatten(0, 1)
|
||||
return pos_embedding
|
||||
|
||||
def forward(self, text_embeds: Union[torch.Tensor, Tuple[Dict[int, torch.Tensor]]], image_embeds: torch.Tensor):
|
||||
r"""
|
||||
Args:
|
||||
text_embeds (`torch.Tensor`):
|
||||
Input text embeddings. Expected shape: (batch_size, seq_length, embedding_dim).
|
||||
image_embeds (`torch.Tensor`):
|
||||
Input image embeddings. Expected shape: (batch_size, num_frames, channels, height, width).
|
||||
"""
|
||||
if isinstance(text_embeds, torch.Tensor):
|
||||
text_embeds = self.text_proj(text_embeds)
|
||||
else:
|
||||
assert isinstance(text_embeds, tuple)
|
||||
text_embeds_output = []
|
||||
for tuple_index in range(len(text_embeds)):
|
||||
text_embeds_output.append({})
|
||||
for key, text_embed in list(text_embeds[tuple_index].items()):
|
||||
text_embeds_output[tuple_index][key] = self.text_proj(text_embed)
|
||||
text_embeds = tuple(text_embeds_output)
|
||||
|
||||
batch, num_frames, channels, height, width = image_embeds.shape
|
||||
image_embeds = image_embeds.reshape(-1, channels, height, width)
|
||||
image_embeds = self.proj(image_embeds)
|
||||
image_embeds = image_embeds.view(batch, num_frames, *image_embeds.shape[1:])
|
||||
image_embeds = image_embeds.flatten(3).transpose(2, 3) # [batch, num_frames, height x width, channels]
|
||||
image_embeds = image_embeds.flatten(1, 2).contiguous() # [batch, num_frames x height x width, channels]
|
||||
|
||||
if self.use_positional_embeddings:
|
||||
pre_time_compression_frames = (num_frames - 1) * self.temporal_compression_ratio + 1
|
||||
if (
|
||||
self.sample_height != height
|
||||
or self.sample_width != width
|
||||
or self.sample_frames != pre_time_compression_frames
|
||||
):
|
||||
pos_embedding = self._get_positional_embeddings(height, width, pre_time_compression_frames)
|
||||
pos_embedding = pos_embedding.to(image_embeds.device, dtype=image_embeds.dtype)
|
||||
else:
|
||||
pos_embedding = self.pos_embedding
|
||||
|
||||
image_embeds = image_embeds + pos_embedding
|
||||
|
||||
return text_embeds, image_embeds
|
||||
embeds = torch.cat(
|
||||
[text_embeds, image_embeds], dim=1
|
||||
).contiguous() # [batch, seq_length + num_frames x height x width, channels]
|
||||
return embeds
|
||||
|
||||
|
||||
def get_3d_rotary_pos_embed(
|
||||
@@ -543,16 +391,15 @@ def get_3d_rotary_pos_embed(
|
||||
The size of the temporal dimension.
|
||||
theta (`float`):
|
||||
Scaling factor for frequency computation.
|
||||
use_real (`bool`):
|
||||
If True, return real part and imaginary part separately. Otherwise, return complex numbers.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`: positional embedding with shape `(temporal_size * grid_size[0] * grid_size[1], embed_dim/2)`.
|
||||
"""
|
||||
if use_real is not True:
|
||||
raise ValueError(" `use_real = False` is not currently supported for get_3d_rotary_pos_embed")
|
||||
start, stop = crops_coords
|
||||
grid_size_h, grid_size_w = grid_size
|
||||
grid_h = np.linspace(start[0], stop[0], grid_size_h, endpoint=False, dtype=np.float32)
|
||||
grid_w = np.linspace(start[1], stop[1], grid_size_w, endpoint=False, dtype=np.float32)
|
||||
grid_h = np.linspace(start[0], stop[0], grid_size[0], endpoint=False, dtype=np.float32)
|
||||
grid_w = np.linspace(start[1], stop[1], grid_size[1], endpoint=False, dtype=np.float32)
|
||||
grid_t = np.linspace(0, temporal_size, temporal_size, endpoint=False, dtype=np.float32)
|
||||
|
||||
# Compute dimensions for each axis
|
||||
@@ -561,37 +408,54 @@ def get_3d_rotary_pos_embed(
|
||||
dim_w = embed_dim // 8 * 3
|
||||
|
||||
# Temporal frequencies
|
||||
freqs_t = get_1d_rotary_pos_embed(dim_t, grid_t, use_real=True)
|
||||
freqs_t = 1.0 / (theta ** (torch.arange(0, dim_t, 2).float() / dim_t))
|
||||
grid_t = torch.from_numpy(grid_t).float()
|
||||
freqs_t = torch.einsum("n , f -> n f", grid_t, freqs_t)
|
||||
freqs_t = freqs_t.repeat_interleave(2, dim=-1)
|
||||
|
||||
# Spatial frequencies for height and width
|
||||
freqs_h = get_1d_rotary_pos_embed(dim_h, grid_h, use_real=True)
|
||||
freqs_w = get_1d_rotary_pos_embed(dim_w, grid_w, use_real=True)
|
||||
freqs_h = 1.0 / (theta ** (torch.arange(0, dim_h, 2).float() / dim_h))
|
||||
freqs_w = 1.0 / (theta ** (torch.arange(0, dim_w, 2).float() / dim_w))
|
||||
grid_h = torch.from_numpy(grid_h).float()
|
||||
grid_w = torch.from_numpy(grid_w).float()
|
||||
freqs_h = torch.einsum("n , f -> n f", grid_h, freqs_h)
|
||||
freqs_w = torch.einsum("n , f -> n f", grid_w, freqs_w)
|
||||
freqs_h = freqs_h.repeat_interleave(2, dim=-1)
|
||||
freqs_w = freqs_w.repeat_interleave(2, dim=-1)
|
||||
|
||||
# BroadCast and concatenate temporal and spaial frequencie (height and width) into a 3d tensor
|
||||
def combine_time_height_width(freqs_t, freqs_h, freqs_w):
|
||||
freqs_t = freqs_t[:, None, None, :].expand(
|
||||
-1, grid_size_h, grid_size_w, -1
|
||||
) # temporal_size, grid_size_h, grid_size_w, dim_t
|
||||
freqs_h = freqs_h[None, :, None, :].expand(
|
||||
temporal_size, -1, grid_size_w, -1
|
||||
) # temporal_size, grid_size_h, grid_size_2, dim_h
|
||||
freqs_w = freqs_w[None, None, :, :].expand(
|
||||
temporal_size, grid_size_h, -1, -1
|
||||
) # temporal_size, grid_size_h, grid_size_2, dim_w
|
||||
# Broadcast and concatenate tensors along specified dimension
|
||||
def broadcast(tensors, dim=-1):
|
||||
num_tensors = len(tensors)
|
||||
shape_lens = {len(t.shape) for t in tensors}
|
||||
assert len(shape_lens) == 1, "tensors must all have the same number of dimensions"
|
||||
shape_len = list(shape_lens)[0]
|
||||
dim = (dim + shape_len) if dim < 0 else dim
|
||||
dims = list(zip(*(list(t.shape) for t in tensors)))
|
||||
expandable_dims = [(i, val) for i, val in enumerate(dims) if i != dim]
|
||||
assert all(
|
||||
[*(len(set(t[1])) <= 2 for t in expandable_dims)]
|
||||
), "invalid dimensions for broadcastable concatenation"
|
||||
max_dims = [(t[0], max(t[1])) for t in expandable_dims]
|
||||
expanded_dims = [(t[0], (t[1],) * num_tensors) for t in max_dims]
|
||||
expanded_dims.insert(dim, (dim, dims[dim]))
|
||||
expandable_shapes = list(zip(*(t[1] for t in expanded_dims)))
|
||||
tensors = [t[0].expand(*t[1]) for t in zip(tensors, expandable_shapes)]
|
||||
return torch.cat(tensors, dim=dim)
|
||||
|
||||
freqs = torch.cat(
|
||||
[freqs_t, freqs_h, freqs_w], dim=-1
|
||||
) # temporal_size, grid_size_h, grid_size_w, (dim_t + dim_h + dim_w)
|
||||
freqs = freqs.view(
|
||||
temporal_size * grid_size_h * grid_size_w, -1
|
||||
) # (temporal_size * grid_size_h * grid_size_w), (dim_t + dim_h + dim_w)
|
||||
return freqs
|
||||
freqs = broadcast((freqs_t[:, None, None, :], freqs_h[None, :, None, :], freqs_w[None, None, :, :]), dim=-1)
|
||||
|
||||
t_cos, t_sin = freqs_t # both t_cos and t_sin has shape: temporal_size, dim_t
|
||||
h_cos, h_sin = freqs_h # both h_cos and h_sin has shape: grid_size_h, dim_h
|
||||
w_cos, w_sin = freqs_w # both w_cos and w_sin has shape: grid_size_w, dim_w
|
||||
cos = combine_time_height_width(t_cos, h_cos, w_cos)
|
||||
sin = combine_time_height_width(t_sin, h_sin, w_sin)
|
||||
return cos, sin
|
||||
t, h, w, d = freqs.shape
|
||||
freqs = freqs.view(t * h * w, d)
|
||||
|
||||
# Generate sine and cosine components
|
||||
sin = freqs.sin()
|
||||
cos = freqs.cos()
|
||||
|
||||
if use_real:
|
||||
return cos, sin
|
||||
else:
|
||||
freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
|
||||
return freqs_cis
|
||||
|
||||
|
||||
def get_2d_rotary_pos_embed(embed_dim, crops_coords, grid_size, use_real=True):
|
||||
@@ -666,7 +530,7 @@ def get_1d_rotary_pos_embed(
|
||||
linear_factor=1.0,
|
||||
ntk_factor=1.0,
|
||||
repeat_interleave_real=True,
|
||||
freqs_dtype=torch.float32, # torch.float32, torch.float64 (flux)
|
||||
freqs_dtype=torch.float32, # torch.float32 (hunyuan, stable audio), torch.float64 (flux)
|
||||
):
|
||||
"""
|
||||
Precompute the frequency tensor for complex exponentials (cis) with given dimensions.
|
||||
@@ -697,30 +561,21 @@ def get_1d_rotary_pos_embed(
|
||||
assert dim % 2 == 0
|
||||
|
||||
if isinstance(pos, int):
|
||||
pos = torch.arange(pos)
|
||||
if isinstance(pos, np.ndarray):
|
||||
pos = torch.from_numpy(pos) # type: ignore # [S]
|
||||
|
||||
pos = np.arange(pos)
|
||||
theta = theta * ntk_factor
|
||||
freqs = (
|
||||
1.0
|
||||
/ (theta ** (torch.arange(0, dim, 2, dtype=freqs_dtype, device=pos.device)[: (dim // 2)] / dim))
|
||||
/ linear_factor
|
||||
) # [D/2]
|
||||
freqs = torch.outer(pos, freqs) # type: ignore # [S, D/2]
|
||||
freqs = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=freqs_dtype)[: (dim // 2)] / dim)) / linear_factor # [D/2]
|
||||
t = torch.from_numpy(pos).to(freqs.device) # type: ignore # [S]
|
||||
freqs = torch.outer(t, freqs) # type: ignore # [S, D/2]
|
||||
if use_real and repeat_interleave_real:
|
||||
# flux, hunyuan-dit, cogvideox
|
||||
freqs_cos = freqs.cos().repeat_interleave(2, dim=1).float() # [S, D]
|
||||
freqs_sin = freqs.sin().repeat_interleave(2, dim=1).float() # [S, D]
|
||||
return freqs_cos, freqs_sin
|
||||
elif use_real:
|
||||
# stable audio
|
||||
freqs_cos = torch.cat([freqs.cos(), freqs.cos()], dim=-1).float() # [S, D]
|
||||
freqs_sin = torch.cat([freqs.sin(), freqs.sin()], dim=-1).float() # [S, D]
|
||||
return freqs_cos, freqs_sin
|
||||
else:
|
||||
# lumina
|
||||
freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64 # [S, D/2]
|
||||
freqs_cis = torch.polar(torch.ones_like(freqs), freqs).float() # complex64 # [S, D/2]
|
||||
return freqs_cis
|
||||
|
||||
|
||||
@@ -751,11 +606,11 @@ def apply_rotary_emb(
|
||||
cos, sin = cos.to(x.device), sin.to(x.device)
|
||||
|
||||
if use_real_unbind_dim == -1:
|
||||
# Used for flux, cogvideox, hunyuan-dit
|
||||
# Use for example in Lumina
|
||||
x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1) # [B, S, H, D//2]
|
||||
x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
|
||||
elif use_real_unbind_dim == -2:
|
||||
# Used for Stable Audio
|
||||
# Use for example in Stable Audio
|
||||
x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2) # [B, S, H, D//2]
|
||||
x_rotated = torch.cat([-x_imag, x_real], dim=-1)
|
||||
else:
|
||||
@@ -765,7 +620,6 @@ def apply_rotary_emb(
|
||||
|
||||
return out
|
||||
else:
|
||||
# used for lumina
|
||||
x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
|
||||
freqs_cis = freqs_cis.unsqueeze(2)
|
||||
x_out = torch.view_as_real(x_rotated * freqs_cis).flatten(3)
|
||||
@@ -784,7 +638,7 @@ class FluxPosEmbed(nn.Module):
|
||||
n_axes = ids.shape[-1]
|
||||
cos_out = []
|
||||
sin_out = []
|
||||
pos = ids.float()
|
||||
pos = ids.squeeze().float().cpu().numpy()
|
||||
is_mps = ids.device.type == "mps"
|
||||
freqs_dtype = torch.float32 if is_mps else torch.float64
|
||||
for i in range(n_axes):
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
||||
from typing import Any, Dict, Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@@ -23,7 +23,7 @@ from ...utils import is_torch_version, logging
|
||||
from ...utils.torch_utils import maybe_allow_in_graph
|
||||
from ..attention import Attention, FeedForward
|
||||
from ..attention_processor import AttentionProcessor, CogVideoXAttnProcessor2_0, FusedCogVideoXAttnProcessor2_0
|
||||
from ..embeddings import CogVideoXPatchEmbed, TimestepEmbedding, Timesteps
|
||||
from ..embeddings import CogVideoXPatchEmbed, TimestepEmbedding, Timesteps, get_3d_sincos_pos_embed
|
||||
from ..modeling_outputs import Transformer2DModelOutput
|
||||
from ..modeling_utils import ModelMixin
|
||||
from ..normalization import AdaLayerNorm, CogVideoXLayerNormZero
|
||||
@@ -86,20 +86,6 @@ class CogVideoXBlock(nn.Module):
|
||||
attention_out_bias: bool = True,
|
||||
):
|
||||
super().__init__()
|
||||
self.dim = dim
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.attention_head_dim = attention_head_dim
|
||||
self.time_embed_dim = time_embed_dim
|
||||
self.dropout = dropout
|
||||
self.activation_fn = activation_fn
|
||||
self.attention_bias = attention_bias
|
||||
self.qk_norm = qk_norm
|
||||
self.norm_elementwise_affine = norm_elementwise_affine
|
||||
self.norm_eps = norm_eps
|
||||
self.final_dropout = final_dropout
|
||||
self.ff_inner_dim = ff_inner_dim
|
||||
self.ff_bias = ff_bias
|
||||
self.attention_out_bias = attention_out_bias
|
||||
|
||||
# 1. Self Attention
|
||||
self.norm1 = CogVideoXLayerNormZero(time_embed_dim, dim, norm_elementwise_affine, norm_eps, bias=True)
|
||||
@@ -133,7 +119,6 @@ class CogVideoXBlock(nn.Module):
|
||||
encoder_hidden_states: torch.Tensor,
|
||||
temb: torch.Tensor,
|
||||
image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
|
||||
num_frames: int = None,
|
||||
) -> torch.Tensor:
|
||||
text_seq_length = encoder_hidden_states.size(1)
|
||||
|
||||
@@ -167,410 +152,6 @@ class CogVideoXBlock(nn.Module):
|
||||
return hidden_states, encoder_hidden_states
|
||||
|
||||
|
||||
# norm_final is just nn.LayerNorm, all ops will be on channel dimension, so we dont have to care about frame dimension
|
||||
# proj_out is also just along channel dimension
|
||||
# norm_out is just linear and nn.LayerNorm, again only on channel dimension, so we dont care about frame dims
|
||||
# same story with norm1, norm2 and ff
|
||||
# patch embed layer just applies on channel dim too and condenses to [B, FHW, C]
|
||||
# only attention layer seems to be actually doing anything with the frame dimension and so only location where FreeNoise needs to be applied
|
||||
# Since it does not matter for norm1, norm2, ff, and they might create memory bottleneck, just use FreeNoise frame split on them too
|
||||
|
||||
|
||||
@maybe_allow_in_graph
|
||||
class FreeNoiseCogVideoXBlock(nn.Module):
|
||||
r"""
|
||||
FreeNoise block used in [CogVideoX](https://github.com/THUDM/CogVideo) model.
|
||||
|
||||
Parameters:
|
||||
dim (`int`):
|
||||
The number of channels in the input and output.
|
||||
num_attention_heads (`int`):
|
||||
The number of heads to use for multi-head attention.
|
||||
attention_head_dim (`int`):
|
||||
The number of channels in each head.
|
||||
time_embed_dim (`int`):
|
||||
The number of channels in timestep embedding.
|
||||
dropout (`float`, defaults to `0.0`):
|
||||
The dropout probability to use.
|
||||
activation_fn (`str`, defaults to `"gelu-approximate"`):
|
||||
Activation function to be used in feed-forward.
|
||||
attention_bias (`bool`, defaults to `False`):
|
||||
Whether or not to use bias in attention projection layers.
|
||||
qk_norm (`bool`, defaults to `True`):
|
||||
Whether or not to use normalization after query and key projections in Attention.
|
||||
norm_elementwise_affine (`bool`, defaults to `True`):
|
||||
Whether to use learnable elementwise affine parameters for normalization.
|
||||
norm_eps (`float`, defaults to `1e-5`):
|
||||
Epsilon value for normalization layers.
|
||||
final_dropout (`bool` defaults to `False`):
|
||||
Whether to apply a final dropout after the last feed-forward layer.
|
||||
ff_inner_dim (`int`, *optional*, defaults to `None`):
|
||||
Custom hidden dimension of Feed-forward layer. If not provided, `4 * dim` is used.
|
||||
ff_bias (`bool`, defaults to `True`):
|
||||
Whether or not to use bias in Feed-forward layer.
|
||||
attention_out_bias (`bool`, defaults to `True`):
|
||||
Whether or not to use bias in Attention output projection layer.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
dim: int,
|
||||
num_attention_heads: int,
|
||||
attention_head_dim: int,
|
||||
time_embed_dim: int,
|
||||
dropout: float = 0.0,
|
||||
activation_fn: str = "gelu-approximate",
|
||||
attention_bias: bool = False,
|
||||
qk_norm: bool = True,
|
||||
norm_elementwise_affine: bool = True,
|
||||
norm_eps: float = 1e-5,
|
||||
final_dropout: bool = True,
|
||||
ff_inner_dim: Optional[int] = None,
|
||||
ff_bias: bool = True,
|
||||
attention_out_bias: bool = True,
|
||||
context_length: int = 16,
|
||||
context_stride: int = 4,
|
||||
weighting_scheme: str = "pyramid",
|
||||
prompt_interpolation_callback: Callable[[int, int, torch.Tensor, torch.Tensor], torch.Tensor] = None,
|
||||
prompt_pooling_callback: Callable[[List[torch.Tensor]], torch.Tensor] = None,
|
||||
):
|
||||
super().__init__()
|
||||
self.dim = dim
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.attention_head_dim = attention_head_dim
|
||||
self.time_embed_dim = time_embed_dim
|
||||
self.dropout = dropout
|
||||
self.activation_fn = activation_fn
|
||||
self.attention_bias = attention_bias
|
||||
self.qk_norm = qk_norm
|
||||
self.norm_elementwise_affine = norm_elementwise_affine
|
||||
self.norm_eps = norm_eps
|
||||
self.final_dropout = final_dropout
|
||||
self.ff_inner_dim = ff_inner_dim
|
||||
self.ff_bias = ff_bias
|
||||
self.attention_out_bias = attention_out_bias
|
||||
|
||||
self.set_free_noise_properties(
|
||||
context_length, context_stride, weighting_scheme, prompt_interpolation_callback, prompt_pooling_callback
|
||||
)
|
||||
|
||||
# 1. Self Attention
|
||||
self.norm1 = CogVideoXLayerNormZero(time_embed_dim, dim, norm_elementwise_affine, norm_eps, bias=True)
|
||||
|
||||
self.attn1 = Attention(
|
||||
query_dim=dim,
|
||||
dim_head=attention_head_dim,
|
||||
heads=num_attention_heads,
|
||||
qk_norm="layer_norm" if qk_norm else None,
|
||||
eps=1e-6,
|
||||
bias=attention_bias,
|
||||
out_bias=attention_out_bias,
|
||||
processor=CogVideoXAttnProcessor2_0(),
|
||||
)
|
||||
|
||||
# 2. Feed Forward
|
||||
self.norm2 = CogVideoXLayerNormZero(time_embed_dim, dim, norm_elementwise_affine, norm_eps, bias=True)
|
||||
|
||||
self.ff = FeedForward(
|
||||
dim,
|
||||
dropout=dropout,
|
||||
activation_fn=activation_fn,
|
||||
final_dropout=final_dropout,
|
||||
inner_dim=ff_inner_dim,
|
||||
bias=ff_bias,
|
||||
)
|
||||
|
||||
# Copied from diffusers.models.attention.FreeNoiseTransformerBlock.set_free_noise_properties
|
||||
def set_free_noise_properties(
|
||||
self,
|
||||
context_length: int,
|
||||
context_stride: int,
|
||||
weighting_scheme: str = "pyramid",
|
||||
prompt_interpolation_callback: Callable[[int, int, torch.Tensor, torch.Tensor], torch.Tensor] = None,
|
||||
prompt_pooling_callback: Callable[[List[torch.Tensor]], torch.Tensor] = None,
|
||||
) -> None:
|
||||
if prompt_interpolation_callback is None:
|
||||
raise ValueError("Must pass a callback to interpolate between prompt embeddings.")
|
||||
if prompt_pooling_callback is None:
|
||||
raise ValueError("Must pass a callback to pool prompt embeddings.")
|
||||
self.context_length = context_length
|
||||
self.context_stride = context_stride
|
||||
self.weighting_scheme = weighting_scheme
|
||||
self.prompt_interpolation_callback = prompt_interpolation_callback
|
||||
self.prompt_pooling_callback = prompt_pooling_callback
|
||||
|
||||
# Copied from diffusers.models.attention.FreeNoiseTransformerBlock._get_frame_indices
|
||||
def _get_frame_indices(self, num_frames: int) -> List[Tuple[int, int]]:
|
||||
frame_indices = []
|
||||
for i in range(0, num_frames - self.context_length + 1, self.context_stride):
|
||||
window_start = i
|
||||
window_end = min(num_frames, i + self.context_length)
|
||||
frame_indices.append((window_start, window_end))
|
||||
return frame_indices
|
||||
|
||||
# Copied from diffusers.models.attention.FreeNoiseTransformerBlock._get_frame_weights
|
||||
def _get_frame_weights(self, num_frames: int, weighting_scheme: str = "pyramid") -> List[float]:
|
||||
if weighting_scheme == "flat":
|
||||
weights = [1.0] * num_frames
|
||||
|
||||
elif weighting_scheme == "pyramid":
|
||||
if num_frames % 2 == 0:
|
||||
# num_frames = 4 => [1, 2, 2, 1]
|
||||
mid = num_frames // 2
|
||||
weights = list(range(1, mid + 1))
|
||||
weights = weights + weights[::-1]
|
||||
else:
|
||||
# num_frames = 5 => [1, 2, 3, 2, 1]
|
||||
mid = (num_frames + 1) // 2
|
||||
weights = list(range(1, mid))
|
||||
weights = weights + [mid] + weights[::-1]
|
||||
|
||||
elif weighting_scheme == "delayed_reverse_sawtooth":
|
||||
if num_frames % 2 == 0:
|
||||
# num_frames = 4 => [0.01, 2, 2, 1]
|
||||
mid = num_frames // 2
|
||||
weights = [0.01] * (mid - 1) + [mid]
|
||||
weights = weights + list(range(mid, 0, -1))
|
||||
else:
|
||||
# num_frames = 5 => [0.01, 0.01, 3, 2, 1]
|
||||
mid = (num_frames + 1) // 2
|
||||
weights = [0.01] * mid
|
||||
weights = weights + list(range(mid, 0, -1))
|
||||
else:
|
||||
raise ValueError(f"Unsupported value for weighting_scheme={weighting_scheme}")
|
||||
|
||||
return weights
|
||||
|
||||
def _prepare_free_noise_encoder_hidden_states(
|
||||
self,
|
||||
encoder_hidden_states: Union[
|
||||
torch.Tensor, List[torch.Tensor], Tuple[Dict[int, torch.Tensor], Optional[Dict[int, torch.Tensor]]]
|
||||
],
|
||||
frame_indices: List[int],
|
||||
) -> List[torch.Tensor]:
|
||||
if torch.is_tensor(encoder_hidden_states):
|
||||
encoder_hidden_states = [encoder_hidden_states.clone() for _ in range(len(frame_indices))]
|
||||
|
||||
elif isinstance(encoder_hidden_states, tuple):
|
||||
print("frame_indices:", frame_indices)
|
||||
pooled_prompt_embeds_list = []
|
||||
pooled_negative_prompt_embeds_list = []
|
||||
negative_prompt_embeds_dict, prompt_embeds_dict = encoder_hidden_states
|
||||
last_frame_start = 0
|
||||
|
||||
# For every batch of frames that is to be processed, pool the positive and negative prompt embeddings.
|
||||
# TODO(aryan): Since this is experimental, I didn't try many different things. I found from testing
|
||||
# that pooling with previous batch frame embeddings necessary to produce better results and help with
|
||||
# prompt transitions.
|
||||
for frame_start, frame_end in frame_indices:
|
||||
pooled_prompt_embeds = None
|
||||
pooled_negative_prompt_embeds = None
|
||||
|
||||
pooling_list = [
|
||||
prompt_embeds_dict[i] for i in range(last_frame_start, frame_end) if i in prompt_embeds_dict
|
||||
]
|
||||
if len(pooling_list) > 0:
|
||||
print("pooling", [i for i in range(last_frame_start, frame_end) if i in prompt_embeds_dict])
|
||||
pooled_prompt_embeds = self.prompt_pooling_callback(pooling_list)
|
||||
print("after pooling:", pooled_prompt_embeds.isnan().any())
|
||||
|
||||
if negative_prompt_embeds_dict is not None:
|
||||
pooling_list = [
|
||||
negative_prompt_embeds_dict[i]
|
||||
for i in range(last_frame_start, frame_end)
|
||||
if i in negative_prompt_embeds_dict
|
||||
]
|
||||
if len(pooling_list) > 0:
|
||||
print(
|
||||
"negative pooling", [i for i in range(last_frame_start, frame_end) if i in prompt_embeds_dict]
|
||||
)
|
||||
pooled_negative_prompt_embeds = self.prompt_pooling_callback(pooling_list)
|
||||
print("after negative pooling:", pooled_negative_prompt_embeds.isnan().any())
|
||||
|
||||
pooled_prompt_embeds_list.append(pooled_prompt_embeds)
|
||||
pooled_negative_prompt_embeds_list.append(pooled_negative_prompt_embeds)
|
||||
last_frame_start = frame_start
|
||||
|
||||
assert pooled_prompt_embeds_list[0] is not None
|
||||
assert pooled_prompt_embeds[-1] is not None
|
||||
if negative_prompt_embeds_dict is not None:
|
||||
assert pooled_negative_prompt_embeds_list[0] is not None
|
||||
assert pooled_negative_prompt_embeds_list[-1] is not None
|
||||
|
||||
# If there were no relevant prompts for certain frame batches, interpolate and fill in the gaps
|
||||
last_existent_embed_index = 0
|
||||
for i in range(1, len(frame_indices)):
|
||||
if pooled_prompt_embeds_list[i] is not None and i - last_existent_embed_index > 1:
|
||||
print("interpolating:", last_existent_embed_index, i)
|
||||
interpolated_embeds = self.prompt_interpolation_callback(
|
||||
last_existent_embed_index,
|
||||
i,
|
||||
pooled_prompt_embeds_list[last_existent_embed_index],
|
||||
pooled_prompt_embeds_list[i],
|
||||
)
|
||||
print("after interpolating", interpolated_embeds.isnan().any())
|
||||
pooled_prompt_embeds_list[last_existent_embed_index : i + 1] = interpolated_embeds.split(1, dim=0)
|
||||
last_existent_embed_index = i
|
||||
assert all(x is not None for x in pooled_prompt_embeds_list)
|
||||
|
||||
if negative_prompt_embeds_dict is not None:
|
||||
last_existent_embed_index = 0
|
||||
for i in range(1, len(frame_indices)):
|
||||
if pooled_negative_prompt_embeds_list[i] is not None and i - last_existent_embed_index > 1:
|
||||
print("negative interpolating:", last_existent_embed_index, i)
|
||||
interpolated_embeds = self.prompt_interpolation_callback(
|
||||
last_existent_embed_index,
|
||||
i,
|
||||
pooled_negative_prompt_embeds_list[last_existent_embed_index],
|
||||
pooled_negative_prompt_embeds_list[i],
|
||||
)
|
||||
print("after negative interpolating", interpolated_embeds.isnan().any())
|
||||
pooled_negative_prompt_embeds_list[
|
||||
last_existent_embed_index : i + 1
|
||||
] = interpolated_embeds.split(1, dim=0)
|
||||
last_existent_embed_index = i
|
||||
assert all(x is not None for x in pooled_negative_prompt_embeds_list)
|
||||
|
||||
if negative_prompt_embeds_dict is not None:
|
||||
# Classifier-Free Guidance
|
||||
pooled_prompt_embeds_list = [
|
||||
torch.cat([negative_prompt_embeds, prompt_embeds])
|
||||
for negative_prompt_embeds, prompt_embeds in zip(
|
||||
pooled_negative_prompt_embeds_list, pooled_prompt_embeds_list
|
||||
)
|
||||
]
|
||||
|
||||
encoder_hidden_states = pooled_prompt_embeds_list
|
||||
|
||||
elif not isinstance(encoder_hidden_states, list):
|
||||
raise ValueError(
|
||||
f"Expected `encoder_hidden_states` to be a tensor, list of tensor, or a tuple of dictionaries, but found {type(encoder_hidden_states)=}"
|
||||
)
|
||||
|
||||
assert isinstance(encoder_hidden_states, list) and len(encoder_hidden_states) == len(frame_indices)
|
||||
return encoder_hidden_states
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
encoder_hidden_states: torch.Tensor,
|
||||
temb: torch.Tensor,
|
||||
image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
|
||||
num_frames: int = None,
|
||||
) -> torch.Tensor:
|
||||
# hidden_states: [B, F x H x W, C]
|
||||
device = hidden_states.device
|
||||
dtype = hidden_states.dtype
|
||||
|
||||
frame_indices = self._get_frame_indices(num_frames)
|
||||
frame_weights = self._get_frame_weights(self.context_length, self.weighting_scheme)
|
||||
frame_weights = (
|
||||
torch.tensor(frame_weights, device=device, dtype=dtype).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
|
||||
)
|
||||
is_last_frame_batch_complete = frame_indices[-1][1] == num_frames
|
||||
|
||||
# Handle out-of-bounds case if num_frames isn't perfectly divisible by context_length
|
||||
# For example, num_frames=25, context_length=16, context_stride=4, then we expect the ranges:
|
||||
# [(0, 16), (4, 20), (8, 24), (10, 26)]
|
||||
if not is_last_frame_batch_complete:
|
||||
if num_frames < self.context_length:
|
||||
raise ValueError(f"Expected {num_frames=} to be greater or equal than {self.context_length=}")
|
||||
last_frame_batch_length = num_frames - frame_indices[-1][1]
|
||||
frame_indices.append((num_frames - self.context_length, num_frames))
|
||||
|
||||
# Unflatten frame dimension: [B, F, HW, C]
|
||||
batch_size, frames_height_width, channels = hidden_states.shape
|
||||
hidden_states = hidden_states.reshape(batch_size, num_frames, frames_height_width // num_frames, channels)
|
||||
encoder_hidden_states = self._prepare_free_noise_encoder_hidden_states(encoder_hidden_states, frame_indices)
|
||||
|
||||
num_times_accumulated = torch.zeros((1, num_frames, 1, 1), device=device)
|
||||
accumulated_values = torch.zeros_like(hidden_states)
|
||||
|
||||
text_seq_length = _get_text_seq_length(encoder_hidden_states)
|
||||
|
||||
for i, (frame_start, frame_end) in enumerate(frame_indices):
|
||||
# The reason for slicing here is to handle cases like frame_indices=[(0, 16), (16, 20)],
|
||||
# if the user provided a video with 19 frames, or essentially a non-multiple of `context_length`.
|
||||
weights = torch.ones_like(num_times_accumulated[:, frame_start:frame_end])
|
||||
weights *= frame_weights
|
||||
|
||||
# Flatten frame dimension: [B, F'HW, C]
|
||||
hidden_states_chunk = hidden_states[:, frame_start:frame_end].flatten(1, 2)
|
||||
print(
|
||||
"debug:",
|
||||
text_seq_length,
|
||||
torch.isnan(hidden_states_chunk).any(),
|
||||
torch.isnan(encoder_hidden_states[i]).any(),
|
||||
)
|
||||
|
||||
# norm & modulate
|
||||
norm_hidden_states, norm_encoder_hidden_states, gate_msa, enc_gate_msa = self.norm1(
|
||||
hidden_states_chunk, encoder_hidden_states[i], temb
|
||||
)
|
||||
|
||||
# attention
|
||||
attn_hidden_states, attn_encoder_hidden_states = self.attn1(
|
||||
hidden_states=norm_hidden_states,
|
||||
encoder_hidden_states=norm_encoder_hidden_states,
|
||||
image_rotary_emb=image_rotary_emb,
|
||||
)
|
||||
|
||||
hidden_states_chunk = hidden_states_chunk + gate_msa * attn_hidden_states
|
||||
encoder_hidden_states[i] = encoder_hidden_states[i] + enc_gate_msa * attn_encoder_hidden_states
|
||||
|
||||
# norm & modulate
|
||||
norm_hidden_states, norm_encoder_hidden_states, gate_ff, enc_gate_ff = self.norm2(
|
||||
hidden_states_chunk, encoder_hidden_states[i], temb
|
||||
)
|
||||
|
||||
# feed-forward
|
||||
norm_hidden_states = torch.cat([norm_encoder_hidden_states, norm_hidden_states], dim=1)
|
||||
ff_output = self.ff(norm_hidden_states)
|
||||
|
||||
hidden_states_chunk = hidden_states_chunk + gate_ff * ff_output[:, text_seq_length:]
|
||||
encoder_hidden_states[i] = encoder_hidden_states[i] + enc_gate_ff * ff_output[:, :text_seq_length]
|
||||
|
||||
# Unflatten frame dimension: [B, F', HW, C]
|
||||
_num_frames = frame_end - frame_start
|
||||
hidden_states_chunk = hidden_states_chunk.reshape(batch_size, _num_frames, -1, channels)
|
||||
|
||||
if i == len(frame_indices) - 1 and not is_last_frame_batch_complete:
|
||||
accumulated_values[:, -last_frame_batch_length:] += (
|
||||
hidden_states_chunk[:, -last_frame_batch_length:] * weights[:, -last_frame_batch_length:]
|
||||
)
|
||||
num_times_accumulated[:, -last_frame_batch_length:] += weights[:, -last_frame_batch_length]
|
||||
else:
|
||||
accumulated_values[:, frame_start:frame_end] += hidden_states_chunk * weights
|
||||
num_times_accumulated[:, frame_start:frame_end] += weights
|
||||
|
||||
# TODO(aryan): Maybe this could be done in a better way.
|
||||
#
|
||||
# Previously, this was:
|
||||
# hidden_states = torch.where(
|
||||
# num_times_accumulated > 0, accumulated_values / num_times_accumulated, accumulated_values
|
||||
# )
|
||||
#
|
||||
# The reasoning for the change here is `torch.where` became a bottleneck at some point when golfing memory
|
||||
# spikes. It is particularly noticeable when the number of frames is high. My understanding is that this comes
|
||||
# from tensors being copied - which is why we resort to spliting and concatenating here. I've not particularly
|
||||
# looked into this deeply because other memory optimizations led to more pronounced reductions.
|
||||
hidden_states = torch.cat(
|
||||
[
|
||||
torch.where(num_times_split > 0, accumulated_split / num_times_split, accumulated_split)
|
||||
for accumulated_split, num_times_split in zip(
|
||||
accumulated_values.split(self.context_length, dim=1),
|
||||
num_times_accumulated.split(self.context_length, dim=1),
|
||||
)
|
||||
],
|
||||
dim=1,
|
||||
).to(dtype)
|
||||
|
||||
# Flatten frame dimension: [B, FHW, C]
|
||||
hidden_states = hidden_states.flatten(1, 2)
|
||||
|
||||
return hidden_states, encoder_hidden_states
|
||||
|
||||
|
||||
class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin):
|
||||
"""
|
||||
A Transformer model for video-like data in [CogVideoX](https://github.com/THUDM/CogVideo).
|
||||
@@ -658,29 +239,33 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin):
|
||||
super().__init__()
|
||||
inner_dim = num_attention_heads * attention_head_dim
|
||||
|
||||
post_patch_height = sample_height // patch_size
|
||||
post_patch_width = sample_width // patch_size
|
||||
post_time_compression_frames = (sample_frames - 1) // temporal_compression_ratio + 1
|
||||
self.num_patches = post_patch_height * post_patch_width * post_time_compression_frames
|
||||
|
||||
# 1. Patch embedding
|
||||
self.patch_embed = CogVideoXPatchEmbed(
|
||||
patch_size=patch_size,
|
||||
in_channels=in_channels,
|
||||
embed_dim=inner_dim,
|
||||
text_embed_dim=text_embed_dim,
|
||||
bias=True,
|
||||
sample_width=sample_width,
|
||||
sample_height=sample_height,
|
||||
sample_frames=sample_frames,
|
||||
temporal_compression_ratio=temporal_compression_ratio,
|
||||
max_text_seq_length=max_text_seq_length,
|
||||
spatial_interpolation_scale=spatial_interpolation_scale,
|
||||
temporal_interpolation_scale=temporal_interpolation_scale,
|
||||
use_positional_embeddings=not use_rotary_positional_embeddings,
|
||||
)
|
||||
self.patch_embed = CogVideoXPatchEmbed(patch_size, in_channels, inner_dim, text_embed_dim, bias=True)
|
||||
self.embedding_dropout = nn.Dropout(dropout)
|
||||
|
||||
# 2. Time embeddings
|
||||
# 2. 3D positional embeddings
|
||||
spatial_pos_embedding = get_3d_sincos_pos_embed(
|
||||
inner_dim,
|
||||
(post_patch_width, post_patch_height),
|
||||
post_time_compression_frames,
|
||||
spatial_interpolation_scale,
|
||||
temporal_interpolation_scale,
|
||||
)
|
||||
spatial_pos_embedding = torch.from_numpy(spatial_pos_embedding).flatten(0, 1)
|
||||
pos_embedding = torch.zeros(1, max_text_seq_length + self.num_patches, inner_dim, requires_grad=False)
|
||||
pos_embedding.data[:, max_text_seq_length:].copy_(spatial_pos_embedding)
|
||||
self.register_buffer("pos_embedding", pos_embedding, persistent=False)
|
||||
|
||||
# 3. Time embeddings
|
||||
self.time_proj = Timesteps(inner_dim, flip_sin_to_cos, freq_shift)
|
||||
self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim, timestep_activation_fn)
|
||||
|
||||
# 3. Define spatio-temporal transformers blocks
|
||||
# 4. Define spatio-temporal transformers blocks
|
||||
self.transformer_blocks = nn.ModuleList(
|
||||
[
|
||||
CogVideoXBlock(
|
||||
@@ -699,7 +284,7 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin):
|
||||
)
|
||||
self.norm_final = nn.LayerNorm(inner_dim, norm_eps, norm_elementwise_affine)
|
||||
|
||||
# 4. Output blocks
|
||||
# 5. Output blocks
|
||||
self.norm_out = AdaLayerNorm(
|
||||
embedding_dim=time_embed_dim,
|
||||
output_dim=2 * inner_dim,
|
||||
@@ -836,11 +421,21 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin):
|
||||
emb = self.time_embedding(t_emb, timestep_cond)
|
||||
|
||||
# 2. Patch embedding
|
||||
text_seq_length = _get_text_seq_length(encoder_hidden_states)
|
||||
encoder_hidden_states, hidden_states = self.patch_embed(encoder_hidden_states, hidden_states)
|
||||
hidden_states = self.embedding_dropout(hidden_states)
|
||||
hidden_states = self.patch_embed(encoder_hidden_states, hidden_states)
|
||||
|
||||
# 3. Transformer blocks
|
||||
# 3. Position embedding
|
||||
text_seq_length = encoder_hidden_states.shape[1]
|
||||
if not self.config.use_rotary_positional_embeddings:
|
||||
seq_length = height * width * num_frames // (self.config.patch_size**2)
|
||||
|
||||
pos_embeds = self.pos_embedding[:, : text_seq_length + seq_length]
|
||||
hidden_states = hidden_states + pos_embeds
|
||||
hidden_states = self.embedding_dropout(hidden_states)
|
||||
|
||||
encoder_hidden_states = hidden_states[:, :text_seq_length]
|
||||
hidden_states = hidden_states[:, text_seq_length:]
|
||||
|
||||
# 4. Transformer blocks
|
||||
for i, block in enumerate(self.transformer_blocks):
|
||||
if self.training and self.gradient_checkpointing:
|
||||
|
||||
@@ -857,7 +452,6 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin):
|
||||
encoder_hidden_states,
|
||||
emb,
|
||||
image_rotary_emb,
|
||||
num_frames,
|
||||
**ckpt_kwargs,
|
||||
)
|
||||
else:
|
||||
@@ -866,7 +460,6 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin):
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
temb=emb,
|
||||
image_rotary_emb=image_rotary_emb,
|
||||
num_frames=num_frames,
|
||||
)
|
||||
|
||||
if not self.config.use_rotary_positional_embeddings:
|
||||
@@ -878,11 +471,11 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin):
|
||||
hidden_states = self.norm_final(hidden_states)
|
||||
hidden_states = hidden_states[:, text_seq_length:]
|
||||
|
||||
# 4. Final block
|
||||
# 5. Final block
|
||||
hidden_states = self.norm_out(hidden_states, temb=emb)
|
||||
hidden_states = self.proj_out(hidden_states)
|
||||
|
||||
# 5. Unpatchify
|
||||
# 6. Unpatchify
|
||||
p = self.config.patch_size
|
||||
output = hidden_states.reshape(batch_size, num_frames, height // p, width // p, channels, p, p)
|
||||
output = output.permute(0, 1, 4, 2, 5, 3, 6).flatten(5, 6).flatten(3, 4)
|
||||
@@ -890,13 +483,3 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin):
|
||||
if not return_dict:
|
||||
return (output,)
|
||||
return Transformer2DModelOutput(sample=output)
|
||||
|
||||
|
||||
def _get_text_seq_length(x) -> int:
|
||||
if isinstance(x, torch.Tensor):
|
||||
return x.shape[1]
|
||||
if isinstance(x, list):
|
||||
return _get_text_seq_length(x[0])
|
||||
if isinstance(x, dict):
|
||||
return _get_text_seq_length(next(iter(x.values())))
|
||||
return None
|
||||
|
||||
@@ -465,7 +465,6 @@ class FluxTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOrig
|
||||
"Please remove the batch dimension and pass it as a 2d torch Tensor"
|
||||
)
|
||||
img_ids = img_ids[0]
|
||||
|
||||
ids = torch.cat((txt_ids, img_ids), dim=0)
|
||||
image_rotary_emb = self.pos_embed(ids)
|
||||
|
||||
|
||||
@@ -463,6 +463,7 @@ class UNet2DConditionModel(
|
||||
dropout=dropout,
|
||||
)
|
||||
self.up_blocks.append(up_block)
|
||||
prev_output_channel = output_channel
|
||||
|
||||
# out
|
||||
if norm_num_groups is not None:
|
||||
@@ -598,7 +599,7 @@ class UNet2DConditionModel(
|
||||
)
|
||||
elif encoder_hid_dim_type is not None:
|
||||
raise ValueError(
|
||||
f"`encoder_hid_dim_type`: {encoder_hid_dim_type} must be None, 'text_proj', 'text_image_proj', or 'image_proj'."
|
||||
f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'."
|
||||
)
|
||||
else:
|
||||
self.encoder_hid_proj = None
|
||||
@@ -678,9 +679,7 @@ class UNet2DConditionModel(
|
||||
# Kandinsky 2.2 ControlNet
|
||||
self.add_embedding = ImageHintTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
|
||||
elif addition_embed_type is not None:
|
||||
raise ValueError(
|
||||
f"`addition_embed_type`: {addition_embed_type} must be None, 'text', 'text_image', 'text_time', 'image', or 'image_hint'."
|
||||
)
|
||||
raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.")
|
||||
|
||||
def _set_pos_net_if_use_gligen(self, attention_type: str, cross_attention_dim: int):
|
||||
if attention_type in ["gated", "gated-text-image"]:
|
||||
@@ -991,7 +990,7 @@ class UNet2DConditionModel(
|
||||
image_embs = added_cond_kwargs.get("image_embeds")
|
||||
aug_emb = self.add_embedding(image_embs)
|
||||
elif self.config.addition_embed_type == "image_hint":
|
||||
# Kandinsky 2.2 ControlNet - style
|
||||
# Kandinsky 2.2 - style
|
||||
if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs:
|
||||
raise ValueError(
|
||||
f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`"
|
||||
@@ -1010,7 +1009,7 @@ class UNet2DConditionModel(
|
||||
# Kandinsky 2.1 - style
|
||||
if "image_embeds" not in added_cond_kwargs:
|
||||
raise ValueError(
|
||||
f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
|
||||
f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
|
||||
)
|
||||
|
||||
image_embeds = added_cond_kwargs.get("image_embeds")
|
||||
@@ -1019,14 +1018,14 @@ class UNet2DConditionModel(
|
||||
# Kandinsky 2.2 - style
|
||||
if "image_embeds" not in added_cond_kwargs:
|
||||
raise ValueError(
|
||||
f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
|
||||
f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
|
||||
)
|
||||
image_embeds = added_cond_kwargs.get("image_embeds")
|
||||
encoder_hidden_states = self.encoder_hid_proj(image_embeds)
|
||||
elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "ip_image_proj":
|
||||
if "image_embeds" not in added_cond_kwargs:
|
||||
raise ValueError(
|
||||
f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
|
||||
f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
|
||||
)
|
||||
|
||||
if hasattr(self, "text_encoder_hid_proj") and self.text_encoder_hid_proj is not None:
|
||||
@@ -1141,6 +1140,7 @@ class UNet2DConditionModel(
|
||||
# 1. time
|
||||
t_emb = self.get_time_embed(sample=sample, timestep=timestep)
|
||||
emb = self.time_embedding(t_emb, timestep_cond)
|
||||
aug_emb = None
|
||||
|
||||
class_emb = self.get_class_embed(sample=sample, class_labels=class_labels)
|
||||
if class_emb is not None:
|
||||
|
||||
@@ -116,7 +116,7 @@ class AnimateDiffTransformer3D(nn.Module):
|
||||
|
||||
self.in_channels = in_channels
|
||||
|
||||
self.norm = nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
|
||||
self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
|
||||
self.proj_in = nn.Linear(in_channels, inner_dim)
|
||||
|
||||
# 3. Define transformers blocks
|
||||
@@ -187,12 +187,12 @@ class AnimateDiffTransformer3D(nn.Module):
|
||||
hidden_states = self.norm(hidden_states)
|
||||
hidden_states = hidden_states.permute(0, 3, 4, 2, 1).reshape(batch_size * height * width, num_frames, channel)
|
||||
|
||||
hidden_states = self.proj_in(input=hidden_states)
|
||||
hidden_states = self.proj_in(hidden_states)
|
||||
|
||||
# 2. Blocks
|
||||
for block in self.transformer_blocks:
|
||||
hidden_states = block(
|
||||
hidden_states=hidden_states,
|
||||
hidden_states,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
timestep=timestep,
|
||||
cross_attention_kwargs=cross_attention_kwargs,
|
||||
@@ -200,7 +200,7 @@ class AnimateDiffTransformer3D(nn.Module):
|
||||
)
|
||||
|
||||
# 3. Output
|
||||
hidden_states = self.proj_out(input=hidden_states)
|
||||
hidden_states = self.proj_out(hidden_states)
|
||||
hidden_states = (
|
||||
hidden_states[None, None, :]
|
||||
.reshape(batch_size, height, width, num_frames, channel)
|
||||
@@ -344,7 +344,7 @@ class DownBlockMotion(nn.Module):
|
||||
)
|
||||
|
||||
else:
|
||||
hidden_states = resnet(input_tensor=hidden_states, temb=temb)
|
||||
hidden_states = resnet(hidden_states, temb)
|
||||
|
||||
hidden_states = motion_module(hidden_states, num_frames=num_frames)
|
||||
|
||||
@@ -352,7 +352,7 @@ class DownBlockMotion(nn.Module):
|
||||
|
||||
if self.downsamplers is not None:
|
||||
for downsampler in self.downsamplers:
|
||||
hidden_states = downsampler(hidden_states=hidden_states)
|
||||
hidden_states = downsampler(hidden_states)
|
||||
|
||||
output_states = output_states + (hidden_states,)
|
||||
|
||||
@@ -531,18 +531,25 @@ class CrossAttnDownBlockMotion(nn.Module):
|
||||
temb,
|
||||
**ckpt_kwargs,
|
||||
)
|
||||
hidden_states = attn(
|
||||
hidden_states,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
cross_attention_kwargs=cross_attention_kwargs,
|
||||
attention_mask=attention_mask,
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
return_dict=False,
|
||||
)[0]
|
||||
else:
|
||||
hidden_states = resnet(input_tensor=hidden_states, temb=temb)
|
||||
|
||||
hidden_states = attn(
|
||||
hidden_states=hidden_states,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
cross_attention_kwargs=cross_attention_kwargs,
|
||||
attention_mask=attention_mask,
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
return_dict=False,
|
||||
)[0]
|
||||
hidden_states = resnet(hidden_states, temb)
|
||||
|
||||
hidden_states = attn(
|
||||
hidden_states,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
cross_attention_kwargs=cross_attention_kwargs,
|
||||
attention_mask=attention_mask,
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
return_dict=False,
|
||||
)[0]
|
||||
hidden_states = motion_module(
|
||||
hidden_states,
|
||||
num_frames=num_frames,
|
||||
@@ -556,7 +563,7 @@ class CrossAttnDownBlockMotion(nn.Module):
|
||||
|
||||
if self.downsamplers is not None:
|
||||
for downsampler in self.downsamplers:
|
||||
hidden_states = downsampler(hidden_states=hidden_states)
|
||||
hidden_states = downsampler(hidden_states)
|
||||
|
||||
output_states = output_states + (hidden_states,)
|
||||
|
||||
@@ -750,18 +757,25 @@ class CrossAttnUpBlockMotion(nn.Module):
|
||||
temb,
|
||||
**ckpt_kwargs,
|
||||
)
|
||||
hidden_states = attn(
|
||||
hidden_states,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
cross_attention_kwargs=cross_attention_kwargs,
|
||||
attention_mask=attention_mask,
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
return_dict=False,
|
||||
)[0]
|
||||
else:
|
||||
hidden_states = resnet(input_tensor=hidden_states, temb=temb)
|
||||
|
||||
hidden_states = attn(
|
||||
hidden_states=hidden_states,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
cross_attention_kwargs=cross_attention_kwargs,
|
||||
attention_mask=attention_mask,
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
return_dict=False,
|
||||
)[0]
|
||||
hidden_states = resnet(hidden_states, temb)
|
||||
|
||||
hidden_states = attn(
|
||||
hidden_states,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
cross_attention_kwargs=cross_attention_kwargs,
|
||||
attention_mask=attention_mask,
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
return_dict=False,
|
||||
)[0]
|
||||
hidden_states = motion_module(
|
||||
hidden_states,
|
||||
num_frames=num_frames,
|
||||
@@ -769,7 +783,7 @@ class CrossAttnUpBlockMotion(nn.Module):
|
||||
|
||||
if self.upsamplers is not None:
|
||||
for upsampler in self.upsamplers:
|
||||
hidden_states = upsampler(hidden_states=hidden_states, output_size=upsample_size)
|
||||
hidden_states = upsampler(hidden_states, upsample_size)
|
||||
|
||||
return hidden_states
|
||||
|
||||
@@ -915,13 +929,13 @@ class UpBlockMotion(nn.Module):
|
||||
create_custom_forward(resnet), hidden_states, temb
|
||||
)
|
||||
else:
|
||||
hidden_states = resnet(input_tensor=hidden_states, temb=temb)
|
||||
hidden_states = resnet(hidden_states, temb)
|
||||
|
||||
hidden_states = motion_module(hidden_states, num_frames=num_frames)
|
||||
|
||||
if self.upsamplers is not None:
|
||||
for upsampler in self.upsamplers:
|
||||
hidden_states = upsampler(hidden_states=hidden_states, output_size=upsample_size)
|
||||
hidden_states = upsampler(hidden_states, upsample_size)
|
||||
|
||||
return hidden_states
|
||||
|
||||
@@ -1066,19 +1080,10 @@ class UNetMidBlockCrossAttnMotion(nn.Module):
|
||||
if cross_attention_kwargs.get("scale", None) is not None:
|
||||
logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
|
||||
|
||||
hidden_states = self.resnets[0](input_tensor=hidden_states, temb=temb)
|
||||
hidden_states = self.resnets[0](hidden_states, temb)
|
||||
|
||||
blocks = zip(self.attentions, self.resnets[1:], self.motion_modules)
|
||||
for attn, resnet, motion_module in blocks:
|
||||
hidden_states = attn(
|
||||
hidden_states=hidden_states,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
cross_attention_kwargs=cross_attention_kwargs,
|
||||
attention_mask=attention_mask,
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
return_dict=False,
|
||||
)[0]
|
||||
|
||||
if self.training and self.gradient_checkpointing:
|
||||
|
||||
def create_custom_forward(module, return_dict=None):
|
||||
@@ -1091,6 +1096,14 @@ class UNetMidBlockCrossAttnMotion(nn.Module):
|
||||
return custom_forward
|
||||
|
||||
ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
|
||||
hidden_states = attn(
|
||||
hidden_states,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
cross_attention_kwargs=cross_attention_kwargs,
|
||||
attention_mask=attention_mask,
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
return_dict=False,
|
||||
)[0]
|
||||
hidden_states = torch.utils.checkpoint.checkpoint(
|
||||
create_custom_forward(motion_module),
|
||||
hidden_states,
|
||||
@@ -1104,11 +1117,19 @@ class UNetMidBlockCrossAttnMotion(nn.Module):
|
||||
**ckpt_kwargs,
|
||||
)
|
||||
else:
|
||||
hidden_states = attn(
|
||||
hidden_states,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
cross_attention_kwargs=cross_attention_kwargs,
|
||||
attention_mask=attention_mask,
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
return_dict=False,
|
||||
)[0]
|
||||
hidden_states = motion_module(
|
||||
hidden_states,
|
||||
num_frames=num_frames,
|
||||
)
|
||||
hidden_states = resnet(input_tensor=hidden_states, temb=temb)
|
||||
hidden_states = resnet(hidden_states, temb)
|
||||
|
||||
return hidden_states
|
||||
|
||||
@@ -2157,6 +2178,7 @@ class UNetMotionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin, Peft
|
||||
|
||||
emb = emb if aug_emb is None else emb + aug_emb
|
||||
emb = emb.repeat_interleave(repeats=num_frames, dim=0)
|
||||
encoder_hidden_states = encoder_hidden_states.repeat_interleave(repeats=num_frames, dim=0)
|
||||
|
||||
if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "ip_image_proj":
|
||||
if "image_embeds" not in added_cond_kwargs:
|
||||
|
||||
@@ -123,14 +123,8 @@ else:
|
||||
"AnimateDiffSDXLPipeline",
|
||||
"AnimateDiffSparseControlNetPipeline",
|
||||
"AnimateDiffVideoToVideoPipeline",
|
||||
"AnimateDiffVideoToVideoControlNetPipeline",
|
||||
]
|
||||
_import_structure["flux"] = [
|
||||
"FluxControlNetPipeline",
|
||||
"FluxImg2ImgPipeline",
|
||||
"FluxInpaintPipeline",
|
||||
"FluxPipeline",
|
||||
]
|
||||
_import_structure["flux"] = ["FluxPipeline", "FluxControlNetPipeline"]
|
||||
_import_structure["audioldm"] = ["AudioLDMPipeline"]
|
||||
_import_structure["audioldm2"] = [
|
||||
"AudioLDM2Pipeline",
|
||||
@@ -138,7 +132,7 @@ else:
|
||||
"AudioLDM2UNet2DConditionModel",
|
||||
]
|
||||
_import_structure["blip_diffusion"] = ["BlipDiffusionPipeline"]
|
||||
_import_structure["cogvideo"] = ["CogVideoXPipeline", "CogVideoXVideoToVideoPipeline"]
|
||||
_import_structure["cogvideo"] = ["CogVideoXPipeline"]
|
||||
_import_structure["controlnet"].extend(
|
||||
[
|
||||
"BlipDiffusionControlNetPipeline",
|
||||
@@ -450,7 +444,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
AnimateDiffPipeline,
|
||||
AnimateDiffSDXLPipeline,
|
||||
AnimateDiffSparseControlNetPipeline,
|
||||
AnimateDiffVideoToVideoControlNetPipeline,
|
||||
AnimateDiffVideoToVideoPipeline,
|
||||
)
|
||||
from .audioldm import AudioLDMPipeline
|
||||
@@ -461,7 +454,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
)
|
||||
from .aura_flow import AuraFlowPipeline
|
||||
from .blip_diffusion import BlipDiffusionPipeline
|
||||
from .cogvideo import CogVideoXPipeline, CogVideoXVideoToVideoPipeline
|
||||
from .cogvideo import CogVideoXPipeline
|
||||
from .controlnet import (
|
||||
BlipDiffusionControlNetPipeline,
|
||||
StableDiffusionControlNetImg2ImgPipeline,
|
||||
@@ -501,7 +494,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
VersatileDiffusionTextToImagePipeline,
|
||||
VQDiffusionPipeline,
|
||||
)
|
||||
from .flux import FluxControlNetPipeline, FluxImg2ImgPipeline, FluxInpaintPipeline, FluxPipeline
|
||||
from .flux import FluxControlNetPipeline, FluxPipeline
|
||||
from .hunyuandit import HunyuanDiTPipeline
|
||||
from .i2vgen_xl import I2VGenXLPipeline
|
||||
from .kandinsky import (
|
||||
|
||||
@@ -26,7 +26,6 @@ else:
|
||||
_import_structure["pipeline_animatediff_sdxl"] = ["AnimateDiffSDXLPipeline"]
|
||||
_import_structure["pipeline_animatediff_sparsectrl"] = ["AnimateDiffSparseControlNetPipeline"]
|
||||
_import_structure["pipeline_animatediff_video2video"] = ["AnimateDiffVideoToVideoPipeline"]
|
||||
_import_structure["pipeline_animatediff_video2video_controlnet"] = ["AnimateDiffVideoToVideoControlNetPipeline"]
|
||||
|
||||
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
try:
|
||||
@@ -41,7 +40,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
from .pipeline_animatediff_sdxl import AnimateDiffSDXLPipeline
|
||||
from .pipeline_animatediff_sparsectrl import AnimateDiffSparseControlNetPipeline
|
||||
from .pipeline_animatediff_video2video import AnimateDiffVideoToVideoPipeline
|
||||
from .pipeline_animatediff_video2video_controlnet import AnimateDiffVideoToVideoControlNetPipeline
|
||||
from .pipeline_output import AnimateDiffPipelineOutput
|
||||
|
||||
else:
|
||||
|
||||
@@ -432,6 +432,7 @@ class AnimateDiffPipeline(
|
||||
extra_step_kwargs["generator"] = generator
|
||||
return extra_step_kwargs
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs
|
||||
def check_inputs(
|
||||
self,
|
||||
prompt,
|
||||
@@ -469,8 +470,8 @@ class AnimateDiffPipeline(
|
||||
raise ValueError(
|
||||
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
|
||||
)
|
||||
elif prompt is not None and not isinstance(prompt, (str, list, dict)):
|
||||
raise ValueError(f"`prompt` has to be of type `str`, `list` or `dict` but is {type(prompt)=}")
|
||||
elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
|
||||
raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
|
||||
|
||||
if negative_prompt is not None and negative_prompt_embeds is not None:
|
||||
raise ValueError(
|
||||
@@ -556,15 +557,11 @@ class AnimateDiffPipeline(
|
||||
def num_timesteps(self):
|
||||
return self._num_timesteps
|
||||
|
||||
@property
|
||||
def interrupt(self):
|
||||
return self._interrupt
|
||||
|
||||
@torch.no_grad()
|
||||
@replace_example_docstring(EXAMPLE_DOC_STRING)
|
||||
def __call__(
|
||||
self,
|
||||
prompt: Optional[Union[str, List[str]]] = None,
|
||||
prompt: Union[str, List[str]] = None,
|
||||
num_frames: Optional[int] = 16,
|
||||
height: Optional[int] = None,
|
||||
width: Optional[int] = None,
|
||||
@@ -704,10 +701,9 @@ class AnimateDiffPipeline(
|
||||
self._guidance_scale = guidance_scale
|
||||
self._clip_skip = clip_skip
|
||||
self._cross_attention_kwargs = cross_attention_kwargs
|
||||
self._interrupt = False
|
||||
|
||||
# 2. Define call parameters
|
||||
if prompt is not None and isinstance(prompt, (str, dict)):
|
||||
if prompt is not None and isinstance(prompt, str):
|
||||
batch_size = 1
|
||||
elif prompt is not None and isinstance(prompt, list):
|
||||
batch_size = len(prompt)
|
||||
@@ -720,39 +716,22 @@ class AnimateDiffPipeline(
|
||||
text_encoder_lora_scale = (
|
||||
self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
|
||||
)
|
||||
if self.free_noise_enabled:
|
||||
prompt_embeds, negative_prompt_embeds = self._encode_prompt_free_noise(
|
||||
prompt=prompt,
|
||||
num_frames=num_frames,
|
||||
device=device,
|
||||
num_videos_per_prompt=num_videos_per_prompt,
|
||||
do_classifier_free_guidance=self.do_classifier_free_guidance,
|
||||
negative_prompt=negative_prompt,
|
||||
prompt_embeds=prompt_embeds,
|
||||
negative_prompt_embeds=negative_prompt_embeds,
|
||||
lora_scale=text_encoder_lora_scale,
|
||||
clip_skip=self.clip_skip,
|
||||
)
|
||||
else:
|
||||
prompt_embeds, negative_prompt_embeds = self.encode_prompt(
|
||||
prompt,
|
||||
device,
|
||||
num_videos_per_prompt,
|
||||
self.do_classifier_free_guidance,
|
||||
negative_prompt,
|
||||
prompt_embeds=prompt_embeds,
|
||||
negative_prompt_embeds=negative_prompt_embeds,
|
||||
lora_scale=text_encoder_lora_scale,
|
||||
clip_skip=self.clip_skip,
|
||||
)
|
||||
|
||||
# For classifier free guidance, we need to do two forward passes.
|
||||
# Here we concatenate the unconditional and text embeddings into a single batch
|
||||
# to avoid doing two forward passes
|
||||
if self.do_classifier_free_guidance:
|
||||
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
|
||||
|
||||
prompt_embeds = prompt_embeds.repeat_interleave(repeats=num_frames, dim=0)
|
||||
prompt_embeds, negative_prompt_embeds = self.encode_prompt(
|
||||
prompt,
|
||||
device,
|
||||
num_videos_per_prompt,
|
||||
self.do_classifier_free_guidance,
|
||||
negative_prompt,
|
||||
prompt_embeds=prompt_embeds,
|
||||
negative_prompt_embeds=negative_prompt_embeds,
|
||||
lora_scale=text_encoder_lora_scale,
|
||||
clip_skip=self.clip_skip,
|
||||
)
|
||||
# For classifier free guidance, we need to do two forward passes.
|
||||
# Here we concatenate the unconditional and text embeddings into a single batch
|
||||
# to avoid doing two forward passes
|
||||
if self.do_classifier_free_guidance:
|
||||
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
|
||||
|
||||
if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
|
||||
image_embeds = self.prepare_ip_adapter_image_embeds(
|
||||
@@ -804,9 +783,6 @@ class AnimateDiffPipeline(
|
||||
# 8. Denoising loop
|
||||
with self.progress_bar(total=self._num_timesteps) as progress_bar:
|
||||
for i, t in enumerate(timesteps):
|
||||
if self.interrupt:
|
||||
continue
|
||||
|
||||
# expand the latents if we are doing classifier free guidance
|
||||
latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
|
||||
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
|
||||
|
||||
@@ -505,8 +505,8 @@ class AnimateDiffControlNetPipeline(
|
||||
raise ValueError(
|
||||
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
|
||||
)
|
||||
elif prompt is not None and not isinstance(prompt, (str, list, dict)):
|
||||
raise ValueError(f"`prompt` has to be of type `str`, `list` or `dict` but is {type(prompt)}")
|
||||
elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
|
||||
raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
|
||||
|
||||
if negative_prompt is not None and negative_prompt_embeds is not None:
|
||||
raise ValueError(
|
||||
@@ -699,10 +699,6 @@ class AnimateDiffControlNetPipeline(
|
||||
def num_timesteps(self):
|
||||
return self._num_timesteps
|
||||
|
||||
@property
|
||||
def interrupt(self):
|
||||
return self._interrupt
|
||||
|
||||
@torch.no_grad()
|
||||
def __call__(
|
||||
self,
|
||||
@@ -862,10 +858,9 @@ class AnimateDiffControlNetPipeline(
|
||||
self._guidance_scale = guidance_scale
|
||||
self._clip_skip = clip_skip
|
||||
self._cross_attention_kwargs = cross_attention_kwargs
|
||||
self._interrupt = False
|
||||
|
||||
# 2. Define call parameters
|
||||
if prompt is not None and isinstance(prompt, (str, dict)):
|
||||
if prompt is not None and isinstance(prompt, str):
|
||||
batch_size = 1
|
||||
elif prompt is not None and isinstance(prompt, list):
|
||||
batch_size = len(prompt)
|
||||
@@ -888,39 +883,22 @@ class AnimateDiffControlNetPipeline(
|
||||
text_encoder_lora_scale = (
|
||||
cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
|
||||
)
|
||||
if self.free_noise_enabled:
|
||||
prompt_embeds, negative_prompt_embeds = self._encode_prompt_free_noise(
|
||||
prompt=prompt,
|
||||
num_frames=num_frames,
|
||||
device=device,
|
||||
num_videos_per_prompt=num_videos_per_prompt,
|
||||
do_classifier_free_guidance=self.do_classifier_free_guidance,
|
||||
negative_prompt=negative_prompt,
|
||||
prompt_embeds=prompt_embeds,
|
||||
negative_prompt_embeds=negative_prompt_embeds,
|
||||
lora_scale=text_encoder_lora_scale,
|
||||
clip_skip=self.clip_skip,
|
||||
)
|
||||
else:
|
||||
prompt_embeds, negative_prompt_embeds = self.encode_prompt(
|
||||
prompt,
|
||||
device,
|
||||
num_videos_per_prompt,
|
||||
self.do_classifier_free_guidance,
|
||||
negative_prompt,
|
||||
prompt_embeds=prompt_embeds,
|
||||
negative_prompt_embeds=negative_prompt_embeds,
|
||||
lora_scale=text_encoder_lora_scale,
|
||||
clip_skip=self.clip_skip,
|
||||
)
|
||||
|
||||
# For classifier free guidance, we need to do two forward passes.
|
||||
# Here we concatenate the unconditional and text embeddings into a single batch
|
||||
# to avoid doing two forward passes
|
||||
if self.do_classifier_free_guidance:
|
||||
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
|
||||
|
||||
prompt_embeds = prompt_embeds.repeat_interleave(repeats=num_frames, dim=0)
|
||||
prompt_embeds, negative_prompt_embeds = self.encode_prompt(
|
||||
prompt,
|
||||
device,
|
||||
num_videos_per_prompt,
|
||||
self.do_classifier_free_guidance,
|
||||
negative_prompt,
|
||||
prompt_embeds=prompt_embeds,
|
||||
negative_prompt_embeds=negative_prompt_embeds,
|
||||
lora_scale=text_encoder_lora_scale,
|
||||
clip_skip=self.clip_skip,
|
||||
)
|
||||
# For classifier free guidance, we need to do two forward passes.
|
||||
# Here we concatenate the unconditional and text embeddings into a single batch
|
||||
# to avoid doing two forward passes
|
||||
if self.do_classifier_free_guidance:
|
||||
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
|
||||
|
||||
if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
|
||||
image_embeds = self.prepare_ip_adapter_image_embeds(
|
||||
@@ -1012,9 +990,6 @@ class AnimateDiffControlNetPipeline(
|
||||
# 8. Denoising loop
|
||||
with self.progress_bar(total=self._num_timesteps) as progress_bar:
|
||||
for i, t in enumerate(timesteps):
|
||||
if self.interrupt:
|
||||
continue
|
||||
|
||||
# expand the latents if we are doing classifier free guidance
|
||||
latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
|
||||
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
|
||||
@@ -1027,6 +1002,7 @@ class AnimateDiffControlNetPipeline(
|
||||
else:
|
||||
control_model_input = latent_model_input
|
||||
controlnet_prompt_embeds = prompt_embeds
|
||||
controlnet_prompt_embeds = controlnet_prompt_embeds.repeat_interleave(num_frames, dim=0)
|
||||
|
||||
if isinstance(controlnet_keep[i], list):
|
||||
cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
|
||||
|
||||
@@ -1143,8 +1143,6 @@ class AnimateDiffSDXLPipeline(
|
||||
add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
|
||||
add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
|
||||
|
||||
prompt_embeds = prompt_embeds.repeat_interleave(repeats=num_frames, dim=0)
|
||||
|
||||
prompt_embeds = prompt_embeds.to(device)
|
||||
add_text_embeds = add_text_embeds.to(device)
|
||||
add_time_ids = add_time_ids.to(device).repeat(batch_size * num_videos_per_prompt, 1)
|
||||
|
||||
@@ -878,8 +878,6 @@ class AnimateDiffSparseControlNetPipeline(
|
||||
if self.do_classifier_free_guidance:
|
||||
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
|
||||
|
||||
prompt_embeds = prompt_embeds.repeat_interleave(repeats=num_frames, dim=0)
|
||||
|
||||
# 4. Prepare IP-Adapter embeddings
|
||||
if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
|
||||
image_embeds = self.prepare_ip_adapter_image_embeds(
|
||||
|
||||
@@ -246,6 +246,7 @@ class AnimateDiffVideoToVideoPipeline(
|
||||
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
|
||||
self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor)
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt
|
||||
def encode_prompt(
|
||||
self,
|
||||
prompt,
|
||||
@@ -298,7 +299,7 @@ class AnimateDiffVideoToVideoPipeline(
|
||||
else:
|
||||
scale_lora_layers(self.text_encoder, lora_scale)
|
||||
|
||||
if prompt is not None and isinstance(prompt, (str, dict)):
|
||||
if prompt is not None and isinstance(prompt, str):
|
||||
batch_size = 1
|
||||
elif prompt is not None and isinstance(prompt, list):
|
||||
batch_size = len(prompt)
|
||||
@@ -581,8 +582,8 @@ class AnimateDiffVideoToVideoPipeline(
|
||||
raise ValueError(
|
||||
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
|
||||
)
|
||||
elif prompt is not None and not isinstance(prompt, (str, list, dict)):
|
||||
raise ValueError(f"`prompt` has to be of type `str`, `list` or `dict` but is {type(prompt)}")
|
||||
elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
|
||||
raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
|
||||
|
||||
if negative_prompt is not None and negative_prompt_embeds is not None:
|
||||
raise ValueError(
|
||||
@@ -627,20 +628,23 @@ class AnimateDiffVideoToVideoPipeline(
|
||||
|
||||
def prepare_latents(
|
||||
self,
|
||||
video: Optional[torch.Tensor] = None,
|
||||
height: int = 64,
|
||||
width: int = 64,
|
||||
num_channels_latents: int = 4,
|
||||
batch_size: int = 1,
|
||||
timestep: Optional[int] = None,
|
||||
dtype: Optional[torch.dtype] = None,
|
||||
device: Optional[torch.device] = None,
|
||||
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
|
||||
latents: Optional[torch.Tensor] = None,
|
||||
video,
|
||||
height,
|
||||
width,
|
||||
num_channels_latents,
|
||||
batch_size,
|
||||
timestep,
|
||||
dtype,
|
||||
device,
|
||||
generator,
|
||||
latents=None,
|
||||
decode_chunk_size: int = 16,
|
||||
add_noise: bool = False,
|
||||
) -> torch.Tensor:
|
||||
num_frames = video.shape[1] if latents is None else latents.shape[2]
|
||||
):
|
||||
if latents is None:
|
||||
num_frames = video.shape[1]
|
||||
else:
|
||||
num_frames = latents.shape[2]
|
||||
|
||||
shape = (
|
||||
batch_size,
|
||||
num_channels_latents,
|
||||
@@ -704,13 +708,8 @@ class AnimateDiffVideoToVideoPipeline(
|
||||
if shape != latents.shape:
|
||||
# [B, C, F, H, W]
|
||||
raise ValueError(f"`latents` expected to have {shape=}, but found {latents.shape=}")
|
||||
|
||||
latents = latents.to(device, dtype=dtype)
|
||||
|
||||
if add_noise:
|
||||
noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
|
||||
latents = self.scheduler.add_noise(latents, noise, timestep)
|
||||
|
||||
return latents
|
||||
|
||||
@property
|
||||
@@ -736,10 +735,6 @@ class AnimateDiffVideoToVideoPipeline(
|
||||
def num_timesteps(self):
|
||||
return self._num_timesteps
|
||||
|
||||
@property
|
||||
def interrupt(self):
|
||||
return self._interrupt
|
||||
|
||||
@torch.no_grad()
|
||||
def __call__(
|
||||
self,
|
||||
@@ -748,7 +743,6 @@ class AnimateDiffVideoToVideoPipeline(
|
||||
height: Optional[int] = None,
|
||||
width: Optional[int] = None,
|
||||
num_inference_steps: int = 50,
|
||||
enforce_inference_steps: bool = False,
|
||||
timesteps: Optional[List[int]] = None,
|
||||
sigmas: Optional[List[float]] = None,
|
||||
guidance_scale: float = 7.5,
|
||||
@@ -880,10 +874,9 @@ class AnimateDiffVideoToVideoPipeline(
|
||||
self._guidance_scale = guidance_scale
|
||||
self._clip_skip = clip_skip
|
||||
self._cross_attention_kwargs = cross_attention_kwargs
|
||||
self._interrupt = False
|
||||
|
||||
# 2. Define call parameters
|
||||
if prompt is not None and isinstance(prompt, (str, dict)):
|
||||
if prompt is not None and isinstance(prompt, str):
|
||||
batch_size = 1
|
||||
elif prompt is not None and isinstance(prompt, list):
|
||||
batch_size = len(prompt)
|
||||
@@ -891,85 +884,29 @@ class AnimateDiffVideoToVideoPipeline(
|
||||
batch_size = prompt_embeds.shape[0]
|
||||
|
||||
device = self._execution_device
|
||||
dtype = self.dtype
|
||||
|
||||
# 3. Prepare timesteps
|
||||
if not enforce_inference_steps:
|
||||
timesteps, num_inference_steps = retrieve_timesteps(
|
||||
self.scheduler, num_inference_steps, device, timesteps, sigmas
|
||||
)
|
||||
timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, timesteps, strength, device)
|
||||
latent_timestep = timesteps[:1].repeat(batch_size * num_videos_per_prompt)
|
||||
else:
|
||||
denoising_inference_steps = int(num_inference_steps / strength)
|
||||
timesteps, denoising_inference_steps = retrieve_timesteps(
|
||||
self.scheduler, denoising_inference_steps, device, timesteps, sigmas
|
||||
)
|
||||
timesteps = timesteps[-num_inference_steps:]
|
||||
latent_timestep = timesteps[:1].repeat(batch_size * num_videos_per_prompt)
|
||||
|
||||
# 4. Prepare latent variables
|
||||
if latents is None:
|
||||
video = self.video_processor.preprocess_video(video, height=height, width=width)
|
||||
# Move the number of frames before the number of channels.
|
||||
video = video.permute(0, 2, 1, 3, 4)
|
||||
video = video.to(device=device, dtype=dtype)
|
||||
num_channels_latents = self.unet.config.in_channels
|
||||
latents = self.prepare_latents(
|
||||
video=video,
|
||||
height=height,
|
||||
width=width,
|
||||
num_channels_latents=num_channels_latents,
|
||||
batch_size=batch_size * num_videos_per_prompt,
|
||||
timestep=latent_timestep,
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
generator=generator,
|
||||
latents=latents,
|
||||
decode_chunk_size=decode_chunk_size,
|
||||
add_noise=enforce_inference_steps,
|
||||
)
|
||||
|
||||
# 5. Encode input prompt
|
||||
# 3. Encode input prompt
|
||||
text_encoder_lora_scale = (
|
||||
self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
|
||||
)
|
||||
num_frames = latents.shape[2]
|
||||
if self.free_noise_enabled:
|
||||
prompt_embeds, negative_prompt_embeds = self._encode_prompt_free_noise(
|
||||
prompt=prompt,
|
||||
num_frames=num_frames,
|
||||
device=device,
|
||||
num_videos_per_prompt=num_videos_per_prompt,
|
||||
do_classifier_free_guidance=self.do_classifier_free_guidance,
|
||||
negative_prompt=negative_prompt,
|
||||
prompt_embeds=prompt_embeds,
|
||||
negative_prompt_embeds=negative_prompt_embeds,
|
||||
lora_scale=text_encoder_lora_scale,
|
||||
clip_skip=self.clip_skip,
|
||||
)
|
||||
else:
|
||||
prompt_embeds, negative_prompt_embeds = self.encode_prompt(
|
||||
prompt,
|
||||
device,
|
||||
num_videos_per_prompt,
|
||||
self.do_classifier_free_guidance,
|
||||
negative_prompt,
|
||||
prompt_embeds=prompt_embeds,
|
||||
negative_prompt_embeds=negative_prompt_embeds,
|
||||
lora_scale=text_encoder_lora_scale,
|
||||
clip_skip=self.clip_skip,
|
||||
)
|
||||
prompt_embeds, negative_prompt_embeds = self.encode_prompt(
|
||||
prompt,
|
||||
device,
|
||||
num_videos_per_prompt,
|
||||
self.do_classifier_free_guidance,
|
||||
negative_prompt,
|
||||
prompt_embeds=prompt_embeds,
|
||||
negative_prompt_embeds=negative_prompt_embeds,
|
||||
lora_scale=text_encoder_lora_scale,
|
||||
clip_skip=self.clip_skip,
|
||||
)
|
||||
|
||||
# For classifier free guidance, we need to do two forward passes.
|
||||
# Here we concatenate the unconditional and text embeddings into a single batch
|
||||
# to avoid doing two forward passes
|
||||
if self.do_classifier_free_guidance:
|
||||
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
|
||||
# For classifier free guidance, we need to do two forward passes.
|
||||
# Here we concatenate the unconditional and text embeddings into a single batch
|
||||
# to avoid doing two forward passes
|
||||
if self.do_classifier_free_guidance:
|
||||
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
|
||||
|
||||
prompt_embeds = prompt_embeds.repeat_interleave(repeats=num_frames, dim=0)
|
||||
|
||||
# 6. Prepare IP-Adapter embeddings
|
||||
if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
|
||||
image_embeds = self.prepare_ip_adapter_image_embeds(
|
||||
ip_adapter_image,
|
||||
@@ -979,10 +916,38 @@ class AnimateDiffVideoToVideoPipeline(
|
||||
self.do_classifier_free_guidance,
|
||||
)
|
||||
|
||||
# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
|
||||
# 4. Prepare timesteps
|
||||
timesteps, num_inference_steps = retrieve_timesteps(
|
||||
self.scheduler, num_inference_steps, device, timesteps, sigmas
|
||||
)
|
||||
timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, timesteps, strength, device)
|
||||
latent_timestep = timesteps[:1].repeat(batch_size * num_videos_per_prompt)
|
||||
|
||||
# 5. Prepare latent variables
|
||||
if latents is None:
|
||||
video = self.video_processor.preprocess_video(video, height=height, width=width)
|
||||
# Move the number of frames before the number of channels.
|
||||
video = video.permute(0, 2, 1, 3, 4)
|
||||
video = video.to(device=device, dtype=prompt_embeds.dtype)
|
||||
num_channels_latents = self.unet.config.in_channels
|
||||
latents = self.prepare_latents(
|
||||
video=video,
|
||||
height=height,
|
||||
width=width,
|
||||
num_channels_latents=num_channels_latents,
|
||||
batch_size=batch_size * num_videos_per_prompt,
|
||||
timestep=latent_timestep,
|
||||
dtype=prompt_embeds.dtype,
|
||||
device=device,
|
||||
generator=generator,
|
||||
latents=latents,
|
||||
decode_chunk_size=decode_chunk_size,
|
||||
)
|
||||
|
||||
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
|
||||
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
|
||||
|
||||
# 8. Add image embeds for IP-Adapter
|
||||
# 7. Add image embeds for IP-Adapter
|
||||
added_cond_kwargs = (
|
||||
{"image_embeds": image_embeds}
|
||||
if ip_adapter_image is not None or ip_adapter_image_embeds is not None
|
||||
@@ -1002,12 +967,9 @@ class AnimateDiffVideoToVideoPipeline(
|
||||
self._num_timesteps = len(timesteps)
|
||||
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
|
||||
|
||||
# 9. Denoising loop
|
||||
# 8. Denoising loop
|
||||
with self.progress_bar(total=self._num_timesteps) as progress_bar:
|
||||
for i, t in enumerate(timesteps):
|
||||
if self.interrupt:
|
||||
continue
|
||||
|
||||
# expand the latents if we are doing classifier free guidance
|
||||
latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
|
||||
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
|
||||
@@ -1043,14 +1005,14 @@ class AnimateDiffVideoToVideoPipeline(
|
||||
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
|
||||
progress_bar.update()
|
||||
|
||||
# 10. Post-processing
|
||||
# 9. Post-processing
|
||||
if output_type == "latent":
|
||||
video = latents
|
||||
else:
|
||||
video_tensor = self.decode_latents(latents, decode_chunk_size)
|
||||
video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)
|
||||
|
||||
# 11. Offload all models
|
||||
# 10. Offload all models
|
||||
self.maybe_free_model_hooks()
|
||||
|
||||
if not return_dict:
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -29,7 +29,7 @@ from .controlnet import (
|
||||
StableDiffusionXLControlNetPipeline,
|
||||
)
|
||||
from .deepfloyd_if import IFImg2ImgPipeline, IFInpaintingPipeline, IFPipeline
|
||||
from .flux import FluxControlNetPipeline, FluxImg2ImgPipeline, FluxInpaintPipeline, FluxPipeline
|
||||
from .flux import FluxPipeline
|
||||
from .hunyuandit import HunyuanDiTPipeline
|
||||
from .kandinsky import (
|
||||
KandinskyCombinedPipeline,
|
||||
@@ -108,7 +108,6 @@ AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
|
||||
("pixart-sigma-pag", PixArtSigmaPAGPipeline),
|
||||
("auraflow", AuraFlowPipeline),
|
||||
("flux", FluxPipeline),
|
||||
("flux-controlnet", FluxControlNetPipeline),
|
||||
("lumina", LuminaText2ImgPipeline),
|
||||
]
|
||||
)
|
||||
@@ -127,7 +126,6 @@ AUTO_IMAGE2IMAGE_PIPELINES_MAPPING = OrderedDict(
|
||||
("stable-diffusion-xl-pag", StableDiffusionXLPAGImg2ImgPipeline),
|
||||
("stable-diffusion-xl-controlnet-pag", StableDiffusionXLControlNetPAGImg2ImgPipeline),
|
||||
("lcm", LatentConsistencyModelImg2ImgPipeline),
|
||||
("flux", FluxImg2ImgPipeline),
|
||||
]
|
||||
)
|
||||
|
||||
@@ -142,7 +140,6 @@ AUTO_INPAINT_PIPELINES_MAPPING = OrderedDict(
|
||||
("stable-diffusion-controlnet", StableDiffusionControlNetInpaintPipeline),
|
||||
("stable-diffusion-xl-controlnet", StableDiffusionXLControlNetInpaintPipeline),
|
||||
("stable-diffusion-xl-pag", StableDiffusionXLPAGInpaintPipeline),
|
||||
("flux", FluxInpaintPipeline),
|
||||
]
|
||||
)
|
||||
|
||||
@@ -663,17 +660,12 @@ class AutoPipelineForImage2Image(ConfigMixin):
|
||||
config = cls.load_config(pretrained_model_or_path, **load_config_kwargs)
|
||||
orig_class_name = config["_class_name"]
|
||||
|
||||
# the `orig_class_name` can be:
|
||||
# `- *Pipeline` (for regular text-to-image checkpoint)
|
||||
# `- *Img2ImgPipeline` (for refiner checkpoint)
|
||||
to_replace = "Img2ImgPipeline" if "Img2Img" in config["_class_name"] else "Pipeline"
|
||||
|
||||
if "controlnet" in kwargs:
|
||||
orig_class_name = orig_class_name.replace(to_replace, "ControlNet" + to_replace)
|
||||
orig_class_name = config["_class_name"].replace("Pipeline", "ControlNetPipeline")
|
||||
if "enable_pag" in kwargs:
|
||||
enable_pag = kwargs.pop("enable_pag")
|
||||
if enable_pag:
|
||||
orig_class_name = orig_class_name.replace(to_replace, "PAG" + to_replace)
|
||||
orig_class_name = orig_class_name.replace("Pipeline", "PAGPipeline")
|
||||
|
||||
image_2_image_cls = _get_task_class(AUTO_IMAGE2IMAGE_PIPELINES_MAPPING, orig_class_name)
|
||||
|
||||
@@ -960,17 +952,14 @@ class AutoPipelineForInpainting(ConfigMixin):
|
||||
config = cls.load_config(pretrained_model_or_path, **load_config_kwargs)
|
||||
orig_class_name = config["_class_name"]
|
||||
|
||||
# The `orig_class_name`` can be:
|
||||
# `- *InpaintPipeline` (for inpaint-specific checkpoint)
|
||||
# - or *Pipeline (for regular text-to-image checkpoint)
|
||||
to_replace = "InpaintPipeline" if "Inpaint" in config["_class_name"] else "Pipeline"
|
||||
|
||||
if "controlnet" in kwargs:
|
||||
orig_class_name = orig_class_name.replace(to_replace, "ControlNet" + to_replace)
|
||||
orig_class_name = config["_class_name"].replace("Pipeline", "ControlNetPipeline")
|
||||
if "enable_pag" in kwargs:
|
||||
enable_pag = kwargs.pop("enable_pag")
|
||||
if enable_pag:
|
||||
orig_class_name = orig_class_name.replace(to_replace, "PAG" + to_replace)
|
||||
to_replace = "InpaintPipeline" if "Inpaint" in config["_class_name"] else "Pipeline"
|
||||
orig_class_name = config["_class_name"].replace(to_replace, "PAG" + to_replace)
|
||||
|
||||
inpainting_cls = _get_task_class(AUTO_INPAINT_PIPELINES_MAPPING, orig_class_name)
|
||||
|
||||
kwargs = {**load_config_kwargs, **kwargs}
|
||||
|
||||
@@ -23,7 +23,6 @@ except OptionalDependencyNotAvailable:
|
||||
_dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
|
||||
else:
|
||||
_import_structure["pipeline_cogvideox"] = ["CogVideoXPipeline"]
|
||||
_import_structure["pipeline_cogvideox_video2video"] = ["CogVideoXVideoToVideoPipeline"]
|
||||
|
||||
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
try:
|
||||
@@ -34,7 +33,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
from ...utils.dummy_torch_and_transformers_objects import *
|
||||
else:
|
||||
from .pipeline_cogvideox import CogVideoXPipeline
|
||||
from .pipeline_cogvideox_video2video import CogVideoXVideoToVideoPipeline
|
||||
|
||||
else:
|
||||
import sys
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
|
||||
import inspect
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import Callable, Dict, List, Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
@@ -25,11 +26,9 @@ from ...models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
|
||||
from ...models.embeddings import get_3d_rotary_pos_embed
|
||||
from ...pipelines.pipeline_utils import DiffusionPipeline
|
||||
from ...schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
|
||||
from ...utils import logging, replace_example_docstring
|
||||
from ...utils import BaseOutput, logging, replace_example_docstring
|
||||
from ...utils.torch_utils import randn_tensor
|
||||
from ...video_processor import VideoProcessor
|
||||
from ..free_noise_utils import CogVideoXFreeNoiseMixin
|
||||
from .pipeline_output import CogVideoXPipelineOutput
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
||||
@@ -137,7 +136,22 @@ def retrieve_timesteps(
|
||||
return timesteps, num_inference_steps
|
||||
|
||||
|
||||
class CogVideoXPipeline(DiffusionPipeline, CogVideoXFreeNoiseMixin):
|
||||
@dataclass
|
||||
class CogVideoXPipelineOutput(BaseOutput):
|
||||
r"""
|
||||
Output class for CogVideo pipelines.
|
||||
|
||||
Args:
|
||||
frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
|
||||
List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
|
||||
denoised PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
|
||||
`(batch_size, num_frames, channels, height, width)`.
|
||||
"""
|
||||
|
||||
frames: torch.Tensor
|
||||
|
||||
|
||||
class CogVideoXPipeline(DiffusionPipeline):
|
||||
r"""
|
||||
Pipeline for text-to-video generation using CogVideoX.
|
||||
|
||||
@@ -317,17 +331,9 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXFreeNoiseMixin):
|
||||
def prepare_latents(
|
||||
self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
|
||||
):
|
||||
num_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
|
||||
|
||||
# If FreeNoise is enabled, generate latents as described in Equation (7) of [FreeNoise](https://arxiv.org/abs/2310.15169)
|
||||
if self.free_noise_enabled:
|
||||
latents = self._prepare_latents_free_noise(
|
||||
batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents
|
||||
)
|
||||
|
||||
shape = (
|
||||
batch_size,
|
||||
num_frames,
|
||||
(num_frames - 1) // self.vae_scale_factor_temporal + 1,
|
||||
num_channels_latents,
|
||||
height // self.vae_scale_factor_spatial,
|
||||
width // self.vae_scale_factor_spatial,
|
||||
@@ -401,8 +407,8 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXFreeNoiseMixin):
|
||||
raise ValueError(
|
||||
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
|
||||
)
|
||||
elif prompt is not None and not isinstance(prompt, (str, list, dict)):
|
||||
raise ValueError(f"`prompt` has to be of type `str`, `list` or `dict` but is {type(prompt)=}")
|
||||
elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
|
||||
raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
|
||||
|
||||
if prompt is not None and negative_prompt_embeds is not None:
|
||||
raise ValueError(
|
||||
@@ -457,6 +463,7 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXFreeNoiseMixin):
|
||||
crops_coords=grid_crops_coords,
|
||||
grid_size=(grid_height, grid_width),
|
||||
temporal_size=num_frames,
|
||||
use_real=True,
|
||||
)
|
||||
|
||||
freqs_cos = freqs_cos.to(device=device)
|
||||
@@ -479,7 +486,7 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXFreeNoiseMixin):
|
||||
@replace_example_docstring(EXAMPLE_DOC_STRING)
|
||||
def __call__(
|
||||
self,
|
||||
prompt: Optional[Union[str, List[str], Dict[int, str]]] = None,
|
||||
prompt: Optional[Union[str, List[str]]] = None,
|
||||
negative_prompt: Optional[Union[str, List[str]]] = None,
|
||||
height: int = 480,
|
||||
width: int = 720,
|
||||
@@ -506,7 +513,7 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXFreeNoiseMixin):
|
||||
Function invoked when calling the pipeline for generation.
|
||||
|
||||
Args:
|
||||
prompt (`str` or `List[str]` or `Dict[str]`, *optional*):
|
||||
prompt (`str` or `List[str]`, *optional*):
|
||||
The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
|
||||
instead.
|
||||
negative_prompt (`str` or `List[str]`, *optional*):
|
||||
@@ -578,6 +585,11 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXFreeNoiseMixin):
|
||||
`tuple`. When returning a tuple, the first element is a list with the generated images.
|
||||
"""
|
||||
|
||||
if num_frames > 49:
|
||||
raise ValueError(
|
||||
"The number of frames must be less than 49 for now due to static positional embeddings. This will be updated in the future to remove this limitation."
|
||||
)
|
||||
|
||||
if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
|
||||
callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
|
||||
|
||||
@@ -599,7 +611,7 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXFreeNoiseMixin):
|
||||
self._interrupt = False
|
||||
|
||||
# 2. Default call parameters
|
||||
if prompt is not None and isinstance(prompt, (str, dict)):
|
||||
if prompt is not None and isinstance(prompt, str):
|
||||
batch_size = 1
|
||||
elif prompt is not None and isinstance(prompt, list):
|
||||
batch_size = len(prompt)
|
||||
@@ -614,33 +626,18 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXFreeNoiseMixin):
|
||||
do_classifier_free_guidance = guidance_scale > 1.0
|
||||
|
||||
# 3. Encode input prompt
|
||||
if self.free_noise_enabled:
|
||||
prompt_embeds, negative_prompt_embeds = self._encode_prompt_free_noise(
|
||||
prompt=prompt,
|
||||
num_frames=num_frames,
|
||||
device=device,
|
||||
num_videos_per_prompt=num_videos_per_prompt,
|
||||
do_classifier_free_guidance=do_classifier_free_guidance,
|
||||
negative_prompt=negative_prompt,
|
||||
prompt_embeds=prompt_embeds,
|
||||
negative_prompt_embeds=negative_prompt_embeds,
|
||||
)
|
||||
prompt_embeds_dtype = next(iter(prompt_embeds[0].values())).dtype
|
||||
else:
|
||||
prompt_embeds, negative_prompt_embeds = self.encode_prompt(
|
||||
prompt,
|
||||
negative_prompt,
|
||||
do_classifier_free_guidance,
|
||||
num_videos_per_prompt=num_videos_per_prompt,
|
||||
prompt_embeds=prompt_embeds,
|
||||
negative_prompt_embeds=negative_prompt_embeds,
|
||||
max_sequence_length=max_sequence_length,
|
||||
device=device,
|
||||
)
|
||||
prompt_embeds_dtype = prompt_embeds.dtype
|
||||
|
||||
if do_classifier_free_guidance:
|
||||
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
|
||||
prompt_embeds, negative_prompt_embeds = self.encode_prompt(
|
||||
prompt,
|
||||
negative_prompt,
|
||||
do_classifier_free_guidance,
|
||||
num_videos_per_prompt=num_videos_per_prompt,
|
||||
prompt_embeds=prompt_embeds,
|
||||
negative_prompt_embeds=negative_prompt_embeds,
|
||||
max_sequence_length=max_sequence_length,
|
||||
device=device,
|
||||
)
|
||||
if do_classifier_free_guidance:
|
||||
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
|
||||
|
||||
# 4. Prepare timesteps
|
||||
timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
|
||||
@@ -654,7 +651,7 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXFreeNoiseMixin):
|
||||
num_frames,
|
||||
height,
|
||||
width,
|
||||
prompt_embeds_dtype,
|
||||
prompt_embeds.dtype,
|
||||
device,
|
||||
generator,
|
||||
latents,
|
||||
@@ -718,7 +715,7 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXFreeNoiseMixin):
|
||||
**extra_step_kwargs,
|
||||
return_dict=False,
|
||||
)
|
||||
latents = latents.to(prompt_embeds_dtype)
|
||||
latents = latents.to(prompt_embeds.dtype)
|
||||
|
||||
# call the callback, if provided
|
||||
if callback_on_step_end is not None:
|
||||
|
||||
@@ -1,812 +0,0 @@
|
||||
# Copyright 2024 The CogVideoX team, Tsinghua University & ZhipuAI and The HuggingFace Team.
|
||||
# All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import inspect
|
||||
import math
|
||||
from typing import Callable, Dict, List, Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
from PIL import Image
|
||||
from transformers import T5EncoderModel, T5Tokenizer
|
||||
|
||||
from ...callbacks import MultiPipelineCallbacks, PipelineCallback
|
||||
from ...models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
|
||||
from ...models.embeddings import get_3d_rotary_pos_embed
|
||||
from ...pipelines.pipeline_utils import DiffusionPipeline
|
||||
from ...schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
|
||||
from ...utils import (
|
||||
logging,
|
||||
replace_example_docstring,
|
||||
)
|
||||
from ...utils.torch_utils import randn_tensor
|
||||
from ...video_processor import VideoProcessor
|
||||
from .pipeline_output import CogVideoXPipelineOutput
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
|
||||
EXAMPLE_DOC_STRING = """
|
||||
Examples:
|
||||
```python
|
||||
>>> import torch
|
||||
>>> from diffusers import CogVideoXDPMScheduler, CogVideoXVideoToVideoPipeline
|
||||
>>> from diffusers.utils import export_to_video, load_video
|
||||
|
||||
>>> # Models: "THUDM/CogVideoX-2b" or "THUDM/CogVideoX-5b"
|
||||
>>> pipe = CogVideoXVideoToVideoPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)
|
||||
>>> pipe.to("cuda")
|
||||
>>> pipe.scheduler = CogVideoXDPMScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
>>> input_video = load_video(
|
||||
... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/hiker.mp4"
|
||||
... )
|
||||
>>> prompt = (
|
||||
... "An astronaut stands triumphantly at the peak of a towering mountain. Panorama of rugged peaks and "
|
||||
... "valleys. Very futuristic vibe and animated aesthetic. Highlights of purple and golden colors in "
|
||||
... "the scene. The sky is looks like an animated/cartoonish dream of galaxies, nebulae, stars, planets, "
|
||||
... "moons, but the remainder of the scene is mostly realistic."
|
||||
... )
|
||||
|
||||
>>> video = pipe(
|
||||
... video=input_video, prompt=prompt, strength=0.8, guidance_scale=6, num_inference_steps=50
|
||||
... ).frames[0]
|
||||
>>> export_to_video(video, "output.mp4", fps=8)
|
||||
```
|
||||
"""
|
||||
|
||||
|
||||
# Similar to diffusers.pipelines.hunyuandit.pipeline_hunyuandit.get_resize_crop_region_for_grid
|
||||
def get_resize_crop_region_for_grid(src, tgt_width, tgt_height):
|
||||
tw = tgt_width
|
||||
th = tgt_height
|
||||
h, w = src
|
||||
r = h / w
|
||||
if r > (th / tw):
|
||||
resize_height = th
|
||||
resize_width = int(round(th / h * w))
|
||||
else:
|
||||
resize_width = tw
|
||||
resize_height = int(round(tw / w * h))
|
||||
|
||||
crop_top = int(round((th - resize_height) / 2.0))
|
||||
crop_left = int(round((tw - resize_width) / 2.0))
|
||||
|
||||
return (crop_top, crop_left), (crop_top + resize_height, crop_left + resize_width)
|
||||
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
|
||||
def retrieve_timesteps(
|
||||
scheduler,
|
||||
num_inference_steps: Optional[int] = None,
|
||||
device: Optional[Union[str, torch.device]] = None,
|
||||
timesteps: Optional[List[int]] = None,
|
||||
sigmas: Optional[List[float]] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
|
||||
custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
|
||||
|
||||
Args:
|
||||
scheduler (`SchedulerMixin`):
|
||||
The scheduler to get timesteps from.
|
||||
num_inference_steps (`int`):
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
|
||||
must be `None`.
|
||||
device (`str` or `torch.device`, *optional*):
|
||||
The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
|
||||
timesteps (`List[int]`, *optional*):
|
||||
Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
|
||||
`num_inference_steps` and `sigmas` must be `None`.
|
||||
sigmas (`List[float]`, *optional*):
|
||||
Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
|
||||
`num_inference_steps` and `timesteps` must be `None`.
|
||||
|
||||
Returns:
|
||||
`Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
|
||||
second element is the number of inference steps.
|
||||
"""
|
||||
if timesteps is not None and sigmas is not None:
|
||||
raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
|
||||
if timesteps is not None:
|
||||
accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
|
||||
if not accepts_timesteps:
|
||||
raise ValueError(
|
||||
f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
|
||||
f" timestep schedules. Please check whether you are using the correct scheduler."
|
||||
)
|
||||
scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
|
||||
timesteps = scheduler.timesteps
|
||||
num_inference_steps = len(timesteps)
|
||||
elif sigmas is not None:
|
||||
accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
|
||||
if not accept_sigmas:
|
||||
raise ValueError(
|
||||
f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
|
||||
f" sigmas schedules. Please check whether you are using the correct scheduler."
|
||||
)
|
||||
scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
|
||||
timesteps = scheduler.timesteps
|
||||
num_inference_steps = len(timesteps)
|
||||
else:
|
||||
scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
|
||||
timesteps = scheduler.timesteps
|
||||
return timesteps, num_inference_steps
|
||||
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
|
||||
def retrieve_latents(
|
||||
encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
|
||||
):
|
||||
if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
|
||||
return encoder_output.latent_dist.sample(generator)
|
||||
elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
|
||||
return encoder_output.latent_dist.mode()
|
||||
elif hasattr(encoder_output, "latents"):
|
||||
return encoder_output.latents
|
||||
else:
|
||||
raise AttributeError("Could not access latents of provided encoder_output")
|
||||
|
||||
|
||||
class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
|
||||
r"""
|
||||
Pipeline for video-to-video generation using CogVideoX.
|
||||
|
||||
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
|
||||
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
|
||||
|
||||
Args:
|
||||
vae ([`AutoencoderKL`]):
|
||||
Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
|
||||
text_encoder ([`T5EncoderModel`]):
|
||||
Frozen text-encoder. CogVideoX uses
|
||||
[T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel); specifically the
|
||||
[t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant.
|
||||
tokenizer (`T5Tokenizer`):
|
||||
Tokenizer of class
|
||||
[T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
|
||||
transformer ([`CogVideoXTransformer3DModel`]):
|
||||
A text conditioned `CogVideoXTransformer3DModel` to denoise the encoded video latents.
|
||||
scheduler ([`SchedulerMixin`]):
|
||||
A scheduler to be used in combination with `transformer` to denoise the encoded video latents.
|
||||
"""
|
||||
|
||||
_optional_components = []
|
||||
model_cpu_offload_seq = "text_encoder->transformer->vae"
|
||||
|
||||
_callback_tensor_inputs = [
|
||||
"latents",
|
||||
"prompt_embeds",
|
||||
"negative_prompt_embeds",
|
||||
]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
tokenizer: T5Tokenizer,
|
||||
text_encoder: T5EncoderModel,
|
||||
vae: AutoencoderKLCogVideoX,
|
||||
transformer: CogVideoXTransformer3DModel,
|
||||
scheduler: Union[CogVideoXDDIMScheduler, CogVideoXDPMScheduler],
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
self.register_modules(
|
||||
tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler
|
||||
)
|
||||
self.vae_scale_factor_spatial = (
|
||||
2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
|
||||
)
|
||||
self.vae_scale_factor_temporal = (
|
||||
self.vae.config.temporal_compression_ratio if hasattr(self, "vae") and self.vae is not None else 4
|
||||
)
|
||||
|
||||
self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
|
||||
|
||||
# Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline._get_t5_prompt_embeds
|
||||
def _get_t5_prompt_embeds(
|
||||
self,
|
||||
prompt: Union[str, List[str]] = None,
|
||||
num_videos_per_prompt: int = 1,
|
||||
max_sequence_length: int = 226,
|
||||
device: Optional[torch.device] = None,
|
||||
dtype: Optional[torch.dtype] = None,
|
||||
):
|
||||
device = device or self._execution_device
|
||||
dtype = dtype or self.text_encoder.dtype
|
||||
|
||||
prompt = [prompt] if isinstance(prompt, str) else prompt
|
||||
batch_size = len(prompt)
|
||||
|
||||
text_inputs = self.tokenizer(
|
||||
prompt,
|
||||
padding="max_length",
|
||||
max_length=max_sequence_length,
|
||||
truncation=True,
|
||||
add_special_tokens=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
text_input_ids = text_inputs.input_ids
|
||||
untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
|
||||
|
||||
if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
|
||||
removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_sequence_length - 1 : -1])
|
||||
logger.warning(
|
||||
"The following part of your input was truncated because `max_sequence_length` is set to "
|
||||
f" {max_sequence_length} tokens: {removed_text}"
|
||||
)
|
||||
|
||||
prompt_embeds = self.text_encoder(text_input_ids.to(device))[0]
|
||||
prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
|
||||
|
||||
# duplicate text embeddings for each generation per prompt, using mps friendly method
|
||||
_, seq_len, _ = prompt_embeds.shape
|
||||
prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
|
||||
prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
|
||||
|
||||
return prompt_embeds
|
||||
|
||||
# Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.encode_prompt
|
||||
def encode_prompt(
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
negative_prompt: Optional[Union[str, List[str]]] = None,
|
||||
do_classifier_free_guidance: bool = True,
|
||||
num_videos_per_prompt: int = 1,
|
||||
prompt_embeds: Optional[torch.Tensor] = None,
|
||||
negative_prompt_embeds: Optional[torch.Tensor] = None,
|
||||
max_sequence_length: int = 226,
|
||||
device: Optional[torch.device] = None,
|
||||
dtype: Optional[torch.dtype] = None,
|
||||
):
|
||||
r"""
|
||||
Encodes the prompt into text encoder hidden states.
|
||||
|
||||
Args:
|
||||
prompt (`str` or `List[str]`, *optional*):
|
||||
prompt to be encoded
|
||||
negative_prompt (`str` or `List[str]`, *optional*):
|
||||
The prompt or prompts not to guide the image generation. If not defined, one has to pass
|
||||
`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
|
||||
less than `1`).
|
||||
do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
|
||||
Whether to use classifier free guidance or not.
|
||||
num_videos_per_prompt (`int`, *optional*, defaults to 1):
|
||||
Number of videos that should be generated per prompt. torch device to place the resulting embeddings on
|
||||
prompt_embeds (`torch.Tensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
negative_prompt_embeds (`torch.Tensor`, *optional*):
|
||||
Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
|
||||
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
|
||||
argument.
|
||||
device: (`torch.device`, *optional*):
|
||||
torch device
|
||||
dtype: (`torch.dtype`, *optional*):
|
||||
torch dtype
|
||||
"""
|
||||
device = device or self._execution_device
|
||||
|
||||
prompt = [prompt] if isinstance(prompt, str) else prompt
|
||||
if prompt is not None:
|
||||
batch_size = len(prompt)
|
||||
else:
|
||||
batch_size = prompt_embeds.shape[0]
|
||||
|
||||
if prompt_embeds is None:
|
||||
prompt_embeds = self._get_t5_prompt_embeds(
|
||||
prompt=prompt,
|
||||
num_videos_per_prompt=num_videos_per_prompt,
|
||||
max_sequence_length=max_sequence_length,
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
)
|
||||
|
||||
if do_classifier_free_guidance and negative_prompt_embeds is None:
|
||||
negative_prompt = negative_prompt or ""
|
||||
negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
|
||||
|
||||
if prompt is not None and type(prompt) is not type(negative_prompt):
|
||||
raise TypeError(
|
||||
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
|
||||
f" {type(prompt)}."
|
||||
)
|
||||
elif batch_size != len(negative_prompt):
|
||||
raise ValueError(
|
||||
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
|
||||
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
|
||||
" the batch size of `prompt`."
|
||||
)
|
||||
|
||||
negative_prompt_embeds = self._get_t5_prompt_embeds(
|
||||
prompt=negative_prompt,
|
||||
num_videos_per_prompt=num_videos_per_prompt,
|
||||
max_sequence_length=max_sequence_length,
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
)
|
||||
|
||||
return prompt_embeds, negative_prompt_embeds
|
||||
|
||||
def prepare_latents(
|
||||
self,
|
||||
video: Optional[torch.Tensor] = None,
|
||||
batch_size: int = 1,
|
||||
num_channels_latents: int = 16,
|
||||
height: int = 60,
|
||||
width: int = 90,
|
||||
dtype: Optional[torch.dtype] = None,
|
||||
device: Optional[torch.device] = None,
|
||||
generator: Optional[torch.Generator] = None,
|
||||
latents: Optional[torch.Tensor] = None,
|
||||
timestep: Optional[torch.Tensor] = None,
|
||||
):
|
||||
num_frames = (video.size(2) - 1) // self.vae_scale_factor_temporal + 1 if latents is None else latents.size(1)
|
||||
|
||||
shape = (
|
||||
batch_size,
|
||||
num_frames,
|
||||
num_channels_latents,
|
||||
height // self.vae_scale_factor_spatial,
|
||||
width // self.vae_scale_factor_spatial,
|
||||
)
|
||||
|
||||
if isinstance(generator, list) and len(generator) != batch_size:
|
||||
raise ValueError(
|
||||
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
|
||||
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
|
||||
)
|
||||
|
||||
if latents is None:
|
||||
if isinstance(generator, list):
|
||||
if len(generator) != batch_size:
|
||||
raise ValueError(
|
||||
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
|
||||
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
|
||||
)
|
||||
|
||||
init_latents = [
|
||||
retrieve_latents(self.vae.encode(video[i].unsqueeze(0)), generator[i]) for i in range(batch_size)
|
||||
]
|
||||
else:
|
||||
init_latents = [retrieve_latents(self.vae.encode(vid.unsqueeze(0)), generator) for vid in video]
|
||||
|
||||
init_latents = torch.cat(init_latents, dim=0).to(dtype).permute(0, 2, 1, 3, 4) # [B, F, C, H, W]
|
||||
init_latents = self.vae.config.scaling_factor * init_latents
|
||||
|
||||
noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
|
||||
latents = self.scheduler.add_noise(init_latents, noise, timestep)
|
||||
else:
|
||||
latents = latents.to(device)
|
||||
|
||||
# scale the initial noise by the standard deviation required by the scheduler
|
||||
latents = latents * self.scheduler.init_noise_sigma
|
||||
return latents
|
||||
|
||||
# Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.decode_latents
|
||||
def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
|
||||
latents = latents.permute(0, 2, 1, 3, 4) # [batch_size, num_channels, num_frames, height, width]
|
||||
latents = 1 / self.vae.config.scaling_factor * latents
|
||||
|
||||
frames = self.vae.decode(latents).sample
|
||||
return frames
|
||||
|
||||
# Copied from diffusers.pipelines.animatediff.pipeline_animatediff_video2video.AnimateDiffVideoToVideoPipeline.get_timesteps
|
||||
def get_timesteps(self, num_inference_steps, timesteps, strength, device):
|
||||
# get the original timestep using init_timestep
|
||||
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
|
||||
|
||||
t_start = max(num_inference_steps - init_timestep, 0)
|
||||
timesteps = timesteps[t_start * self.scheduler.order :]
|
||||
|
||||
return timesteps, num_inference_steps - t_start
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
|
||||
def prepare_extra_step_kwargs(self, generator, eta):
|
||||
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
|
||||
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
|
||||
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
|
||||
# and should be between [0, 1]
|
||||
|
||||
accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
|
||||
extra_step_kwargs = {}
|
||||
if accepts_eta:
|
||||
extra_step_kwargs["eta"] = eta
|
||||
|
||||
# check if the scheduler accepts generator
|
||||
accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
|
||||
if accepts_generator:
|
||||
extra_step_kwargs["generator"] = generator
|
||||
return extra_step_kwargs
|
||||
|
||||
def check_inputs(
|
||||
self,
|
||||
prompt,
|
||||
height,
|
||||
width,
|
||||
strength,
|
||||
negative_prompt,
|
||||
callback_on_step_end_tensor_inputs,
|
||||
video=None,
|
||||
latents=None,
|
||||
prompt_embeds=None,
|
||||
negative_prompt_embeds=None,
|
||||
):
|
||||
if height % 8 != 0 or width % 8 != 0:
|
||||
raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
|
||||
|
||||
if strength < 0 or strength > 1:
|
||||
raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
|
||||
|
||||
if callback_on_step_end_tensor_inputs is not None and not all(
|
||||
k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
|
||||
):
|
||||
raise ValueError(
|
||||
f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
|
||||
)
|
||||
if prompt is not None and prompt_embeds is not None:
|
||||
raise ValueError(
|
||||
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
|
||||
" only forward one of the two."
|
||||
)
|
||||
elif prompt is None and prompt_embeds is None:
|
||||
raise ValueError(
|
||||
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
|
||||
)
|
||||
elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
|
||||
raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
|
||||
|
||||
if prompt is not None and negative_prompt_embeds is not None:
|
||||
raise ValueError(
|
||||
f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
|
||||
f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
|
||||
)
|
||||
|
||||
if negative_prompt is not None and negative_prompt_embeds is not None:
|
||||
raise ValueError(
|
||||
f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
|
||||
f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
|
||||
)
|
||||
|
||||
if prompt_embeds is not None and negative_prompt_embeds is not None:
|
||||
if prompt_embeds.shape != negative_prompt_embeds.shape:
|
||||
raise ValueError(
|
||||
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
|
||||
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
|
||||
f" {negative_prompt_embeds.shape}."
|
||||
)
|
||||
|
||||
if video is not None and latents is not None:
|
||||
raise ValueError("Only one of `video` or `latents` should be provided")
|
||||
|
||||
# Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.fuse_qkv_projections
|
||||
def fuse_qkv_projections(self) -> None:
|
||||
r"""Enables fused QKV projections."""
|
||||
self.fusing_transformer = True
|
||||
self.transformer.fuse_qkv_projections()
|
||||
|
||||
# Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.unfuse_qkv_projections
|
||||
def unfuse_qkv_projections(self) -> None:
|
||||
r"""Disable QKV projection fusion if enabled."""
|
||||
if not self.fusing_transformer:
|
||||
logger.warning("The Transformer was not initially fused for QKV projections. Doing nothing.")
|
||||
else:
|
||||
self.transformer.unfuse_qkv_projections()
|
||||
self.fusing_transformer = False
|
||||
|
||||
# Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline._prepare_rotary_positional_embeddings
|
||||
def _prepare_rotary_positional_embeddings(
|
||||
self,
|
||||
height: int,
|
||||
width: int,
|
||||
num_frames: int,
|
||||
device: torch.device,
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
grid_height = height // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
|
||||
grid_width = width // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
|
||||
base_size_width = 720 // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
|
||||
base_size_height = 480 // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
|
||||
|
||||
grid_crops_coords = get_resize_crop_region_for_grid(
|
||||
(grid_height, grid_width), base_size_width, base_size_height
|
||||
)
|
||||
freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
|
||||
embed_dim=self.transformer.config.attention_head_dim,
|
||||
crops_coords=grid_crops_coords,
|
||||
grid_size=(grid_height, grid_width),
|
||||
temporal_size=num_frames,
|
||||
)
|
||||
|
||||
freqs_cos = freqs_cos.to(device=device)
|
||||
freqs_sin = freqs_sin.to(device=device)
|
||||
return freqs_cos, freqs_sin
|
||||
|
||||
@property
|
||||
def guidance_scale(self):
|
||||
return self._guidance_scale
|
||||
|
||||
@property
|
||||
def num_timesteps(self):
|
||||
return self._num_timesteps
|
||||
|
||||
@property
|
||||
def interrupt(self):
|
||||
return self._interrupt
|
||||
|
||||
@torch.no_grad()
|
||||
@replace_example_docstring(EXAMPLE_DOC_STRING)
|
||||
def __call__(
|
||||
self,
|
||||
video: List[Image.Image] = None,
|
||||
prompt: Optional[Union[str, List[str]]] = None,
|
||||
negative_prompt: Optional[Union[str, List[str]]] = None,
|
||||
height: int = 480,
|
||||
width: int = 720,
|
||||
num_inference_steps: int = 50,
|
||||
timesteps: Optional[List[int]] = None,
|
||||
strength: float = 0.8,
|
||||
guidance_scale: float = 6,
|
||||
use_dynamic_cfg: bool = False,
|
||||
num_videos_per_prompt: int = 1,
|
||||
eta: float = 0.0,
|
||||
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
|
||||
latents: Optional[torch.FloatTensor] = None,
|
||||
prompt_embeds: Optional[torch.FloatTensor] = None,
|
||||
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
|
||||
output_type: str = "pil",
|
||||
return_dict: bool = True,
|
||||
callback_on_step_end: Optional[
|
||||
Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
|
||||
] = None,
|
||||
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
|
||||
max_sequence_length: int = 226,
|
||||
) -> Union[CogVideoXPipelineOutput, Tuple]:
|
||||
"""
|
||||
Function invoked when calling the pipeline for generation.
|
||||
|
||||
Args:
|
||||
video (`List[PIL.Image.Image]`):
|
||||
The input video to condition the generation on. Must be a list of images/frames of the video.
|
||||
prompt (`str` or `List[str]`, *optional*):
|
||||
The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
|
||||
instead.
|
||||
negative_prompt (`str` or `List[str]`, *optional*):
|
||||
The prompt or prompts not to guide the image generation. If not defined, one has to pass
|
||||
`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
|
||||
less than `1`).
|
||||
height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
|
||||
The height in pixels of the generated image. This is set to 1024 by default for the best results.
|
||||
width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
|
||||
The width in pixels of the generated image. This is set to 1024 by default for the best results.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
expense of slower inference.
|
||||
timesteps (`List[int]`, *optional*):
|
||||
Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
|
||||
in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
|
||||
passed will be used. Must be in descending order.
|
||||
strength (`float`, *optional*, defaults to 0.8):
|
||||
Higher strength leads to more differences between original video and generated video.
|
||||
guidance_scale (`float`, *optional*, defaults to 7.0):
|
||||
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
||||
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
||||
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
||||
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
||||
usually at the expense of lower image quality.
|
||||
num_videos_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of videos to generate per prompt.
|
||||
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
||||
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
|
||||
to make generation deterministic.
|
||||
latents (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
negative_prompt_embeds (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
|
||||
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
|
||||
argument.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generate image. Choose between
|
||||
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
|
||||
of a plain tuple.
|
||||
callback_on_step_end (`Callable`, *optional*):
|
||||
A function that calls at the end of each denoising steps during the inference. The function is called
|
||||
with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
|
||||
callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
|
||||
`callback_on_step_end_tensor_inputs`.
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
`._callback_tensor_inputs` attribute of your pipeline class.
|
||||
max_sequence_length (`int`, defaults to `226`):
|
||||
Maximum sequence length in encoded prompt. Must be consistent with
|
||||
`self.transformer.config.max_text_seq_length` otherwise may lead to poor results.
|
||||
|
||||
Examples:
|
||||
|
||||
Returns:
|
||||
[`~pipelines.cogvideo.pipeline_output.CogVideoXPipelineOutput`] or `tuple`:
|
||||
[`~pipelines.cogvideo.pipeline_output.CogVideoXPipelineOutput`] if `return_dict` is True, otherwise a
|
||||
`tuple`. When returning a tuple, the first element is a list with the generated images.
|
||||
"""
|
||||
|
||||
if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
|
||||
callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
|
||||
|
||||
height = height or self.transformer.config.sample_size * self.vae_scale_factor_spatial
|
||||
width = width or self.transformer.config.sample_size * self.vae_scale_factor_spatial
|
||||
num_videos_per_prompt = 1
|
||||
|
||||
# 1. Check inputs. Raise error if not correct
|
||||
self.check_inputs(
|
||||
prompt,
|
||||
height,
|
||||
width,
|
||||
strength,
|
||||
negative_prompt,
|
||||
callback_on_step_end_tensor_inputs,
|
||||
prompt_embeds,
|
||||
negative_prompt_embeds,
|
||||
)
|
||||
self._guidance_scale = guidance_scale
|
||||
self._interrupt = False
|
||||
|
||||
# 2. Default call parameters
|
||||
if prompt is not None and isinstance(prompt, str):
|
||||
batch_size = 1
|
||||
elif prompt is not None and isinstance(prompt, list):
|
||||
batch_size = len(prompt)
|
||||
else:
|
||||
batch_size = prompt_embeds.shape[0]
|
||||
|
||||
device = self._execution_device
|
||||
|
||||
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
|
||||
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
|
||||
# corresponds to doing no classifier free guidance.
|
||||
do_classifier_free_guidance = guidance_scale > 1.0
|
||||
|
||||
# 3. Encode input prompt
|
||||
prompt_embeds, negative_prompt_embeds = self.encode_prompt(
|
||||
prompt,
|
||||
negative_prompt,
|
||||
do_classifier_free_guidance,
|
||||
num_videos_per_prompt=num_videos_per_prompt,
|
||||
prompt_embeds=prompt_embeds,
|
||||
negative_prompt_embeds=negative_prompt_embeds,
|
||||
max_sequence_length=max_sequence_length,
|
||||
device=device,
|
||||
)
|
||||
if do_classifier_free_guidance:
|
||||
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
|
||||
|
||||
# 4. Prepare timesteps
|
||||
timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
|
||||
timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, timesteps, strength, device)
|
||||
latent_timestep = timesteps[:1].repeat(batch_size * num_videos_per_prompt)
|
||||
self._num_timesteps = len(timesteps)
|
||||
|
||||
# 5. Prepare latents
|
||||
if latents is None:
|
||||
video = self.video_processor.preprocess_video(video, height=height, width=width)
|
||||
video = video.to(device=device, dtype=prompt_embeds.dtype)
|
||||
|
||||
latent_channels = self.transformer.config.in_channels
|
||||
latents = self.prepare_latents(
|
||||
video,
|
||||
batch_size * num_videos_per_prompt,
|
||||
latent_channels,
|
||||
height,
|
||||
width,
|
||||
prompt_embeds.dtype,
|
||||
device,
|
||||
generator,
|
||||
latents,
|
||||
latent_timestep,
|
||||
)
|
||||
|
||||
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
|
||||
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
|
||||
|
||||
# 7. Create rotary embeds if required
|
||||
image_rotary_emb = (
|
||||
self._prepare_rotary_positional_embeddings(height, width, latents.size(1), device)
|
||||
if self.transformer.config.use_rotary_positional_embeddings
|
||||
else None
|
||||
)
|
||||
|
||||
# 8. Denoising loop
|
||||
num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
|
||||
|
||||
with self.progress_bar(total=num_inference_steps) as progress_bar:
|
||||
# for DPM-solver++
|
||||
old_pred_original_sample = None
|
||||
for i, t in enumerate(timesteps):
|
||||
if self.interrupt:
|
||||
continue
|
||||
|
||||
latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
|
||||
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
|
||||
|
||||
# broadcast to batch dimension in a way that's compatible with ONNX/Core ML
|
||||
timestep = t.expand(latent_model_input.shape[0])
|
||||
|
||||
# predict noise model_output
|
||||
noise_pred = self.transformer(
|
||||
hidden_states=latent_model_input,
|
||||
encoder_hidden_states=prompt_embeds,
|
||||
timestep=timestep,
|
||||
image_rotary_emb=image_rotary_emb,
|
||||
return_dict=False,
|
||||
)[0]
|
||||
noise_pred = noise_pred.float()
|
||||
|
||||
# perform guidance
|
||||
if use_dynamic_cfg:
|
||||
self._guidance_scale = 1 + guidance_scale * (
|
||||
(1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2
|
||||
)
|
||||
if do_classifier_free_guidance:
|
||||
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
|
||||
noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
|
||||
|
||||
# compute the previous noisy sample x_t -> x_t-1
|
||||
if not isinstance(self.scheduler, CogVideoXDPMScheduler):
|
||||
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
|
||||
else:
|
||||
latents, old_pred_original_sample = self.scheduler.step(
|
||||
noise_pred,
|
||||
old_pred_original_sample,
|
||||
t,
|
||||
timesteps[i - 1] if i > 0 else None,
|
||||
latents,
|
||||
**extra_step_kwargs,
|
||||
return_dict=False,
|
||||
)
|
||||
latents = latents.to(prompt_embeds.dtype)
|
||||
|
||||
# call the callback, if provided
|
||||
if callback_on_step_end is not None:
|
||||
callback_kwargs = {}
|
||||
for k in callback_on_step_end_tensor_inputs:
|
||||
callback_kwargs[k] = locals()[k]
|
||||
callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
|
||||
|
||||
latents = callback_outputs.pop("latents", latents)
|
||||
prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
|
||||
negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
|
||||
|
||||
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
|
||||
progress_bar.update()
|
||||
|
||||
if not output_type == "latent":
|
||||
video = self.decode_latents(latents)
|
||||
video = self.video_processor.postprocess_video(video=video, output_type=output_type)
|
||||
else:
|
||||
video = latents
|
||||
|
||||
# Offload all models
|
||||
self.maybe_free_model_hooks()
|
||||
|
||||
if not return_dict:
|
||||
return (video,)
|
||||
|
||||
return CogVideoXPipelineOutput(frames=video)
|
||||
@@ -1,20 +0,0 @@
|
||||
from dataclasses import dataclass
|
||||
|
||||
import torch
|
||||
|
||||
from diffusers.utils import BaseOutput
|
||||
|
||||
|
||||
@dataclass
|
||||
class CogVideoXPipelineOutput(BaseOutput):
|
||||
r"""
|
||||
Output class for CogVideo pipelines.
|
||||
|
||||
Args:
|
||||
frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
|
||||
List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
|
||||
denoised PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
|
||||
`(batch_size, num_frames, channels, height, width)`.
|
||||
"""
|
||||
|
||||
frames: torch.Tensor
|
||||
@@ -1024,16 +1024,14 @@ class StableDiffusionXLControlNetInpaintPipeline(
|
||||
if denoising_start is None:
|
||||
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
|
||||
t_start = max(num_inference_steps - init_timestep, 0)
|
||||
|
||||
timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
|
||||
if hasattr(self.scheduler, "set_begin_index"):
|
||||
self.scheduler.set_begin_index(t_start * self.scheduler.order)
|
||||
|
||||
return timesteps, num_inference_steps - t_start
|
||||
|
||||
else:
|
||||
# Strength is irrelevant if we directly request a timestep to start at;
|
||||
# that is, strength is determined by the denoising_start instead.
|
||||
t_start = 0
|
||||
|
||||
timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
|
||||
|
||||
# Strength is irrelevant if we directly request a timestep to start at;
|
||||
# that is, strength is determined by the denoising_start instead.
|
||||
if denoising_start is not None:
|
||||
discrete_timestep_cutoff = int(
|
||||
round(
|
||||
self.scheduler.config.num_train_timesteps
|
||||
@@ -1041,7 +1039,7 @@ class StableDiffusionXLControlNetInpaintPipeline(
|
||||
)
|
||||
)
|
||||
|
||||
num_inference_steps = (self.scheduler.timesteps < discrete_timestep_cutoff).sum().item()
|
||||
num_inference_steps = (timesteps < discrete_timestep_cutoff).sum().item()
|
||||
if self.scheduler.order == 2 and num_inference_steps % 2 == 0:
|
||||
# if the scheduler is a 2nd order scheduler we might have to do +1
|
||||
# because `num_inference_steps` might be even given that every timestep
|
||||
@@ -1052,12 +1050,11 @@ class StableDiffusionXLControlNetInpaintPipeline(
|
||||
num_inference_steps = num_inference_steps + 1
|
||||
|
||||
# because t_n+1 >= t_n, we slice the timesteps starting from the end
|
||||
t_start = len(self.scheduler.timesteps) - num_inference_steps
|
||||
timesteps = self.scheduler.timesteps[t_start:]
|
||||
if hasattr(self.scheduler, "set_begin_index"):
|
||||
self.scheduler.set_begin_index(t_start)
|
||||
timesteps = timesteps[-num_inference_steps:]
|
||||
return timesteps, num_inference_steps
|
||||
|
||||
return timesteps, num_inference_steps - t_start
|
||||
|
||||
def _get_add_time_ids(
|
||||
self,
|
||||
original_size,
|
||||
|
||||
@@ -546,7 +546,7 @@ class UNetFlatConditionModel(ModelMixin, ConfigMixin):
|
||||
)
|
||||
elif encoder_hid_dim_type is not None:
|
||||
raise ValueError(
|
||||
f"`encoder_hid_dim_type`: {encoder_hid_dim_type} must be None, 'text_proj', 'text_image_proj' or 'image_proj'."
|
||||
f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'."
|
||||
)
|
||||
else:
|
||||
self.encoder_hid_proj = None
|
||||
|
||||
@@ -24,8 +24,6 @@ except OptionalDependencyNotAvailable:
|
||||
else:
|
||||
_import_structure["pipeline_flux"] = ["FluxPipeline"]
|
||||
_import_structure["pipeline_flux_controlnet"] = ["FluxControlNetPipeline"]
|
||||
_import_structure["pipeline_flux_img2img"] = ["FluxImg2ImgPipeline"]
|
||||
_import_structure["pipeline_flux_inpaint"] = ["FluxInpaintPipeline"]
|
||||
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
try:
|
||||
if not (is_transformers_available() and is_torch_available()):
|
||||
@@ -35,8 +33,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
else:
|
||||
from .pipeline_flux import FluxPipeline
|
||||
from .pipeline_flux_controlnet import FluxControlNetPipeline
|
||||
from .pipeline_flux_img2img import FluxImg2ImgPipeline
|
||||
from .pipeline_flux_inpaint import FluxInpaintPipeline
|
||||
else:
|
||||
import sys
|
||||
|
||||
|
||||
@@ -280,7 +280,7 @@ class FluxPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
|
||||
prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
|
||||
|
||||
# duplicate text embeddings for each generation per prompt, using mps friendly method
|
||||
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt)
|
||||
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
|
||||
prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
|
||||
|
||||
return prompt_embeds
|
||||
@@ -536,7 +536,7 @@ class FluxPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
|
||||
width: Optional[int] = None,
|
||||
num_inference_steps: int = 28,
|
||||
timesteps: List[int] = None,
|
||||
guidance_scale: float = 3.5,
|
||||
guidance_scale: float = 7.0,
|
||||
num_images_per_prompt: Optional[int] = 1,
|
||||
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
|
||||
latents: Optional[torch.FloatTensor] = None,
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
# limitations under the License.
|
||||
|
||||
import inspect
|
||||
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
||||
from typing import Any, Callable, Dict, List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -27,7 +27,7 @@ from transformers import (
|
||||
from ...image_processor import PipelineImageInput, VaeImageProcessor
|
||||
from ...loaders import FluxLoraLoaderMixin, FromSingleFileMixin
|
||||
from ...models.autoencoders import AutoencoderKL
|
||||
from ...models.controlnet_flux import FluxControlNetModel, FluxMultiControlNetModel
|
||||
from ...models.controlnet_flux import FluxControlNetModel
|
||||
from ...models.transformers import FluxTransformer2DModel
|
||||
from ...schedulers import FlowMatchEulerDiscreteScheduler
|
||||
from ...utils import (
|
||||
@@ -61,7 +61,7 @@ EXAMPLE_DOC_STRING = """
|
||||
>>> from diffusers import FluxControlNetPipeline
|
||||
>>> from diffusers import FluxControlNetModel
|
||||
|
||||
>>> controlnet_model = "InstantX/FLUX.1-dev-controlnet-canny"
|
||||
>>> controlnet_model = "InstantX/FLUX.1-dev-controlnet-canny-alpha"
|
||||
>>> controlnet = FluxControlNetModel.from_pretrained(controlnet_model, torch_dtype=torch.bfloat16)
|
||||
>>> pipe = FluxControlNetPipeline.from_pretrained(
|
||||
... base_model, controlnet=controlnet, torch_dtype=torch.bfloat16
|
||||
@@ -195,9 +195,7 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleF
|
||||
text_encoder_2: T5EncoderModel,
|
||||
tokenizer_2: T5TokenizerFast,
|
||||
transformer: FluxTransformer2DModel,
|
||||
controlnet: Union[
|
||||
FluxControlNetModel, List[FluxControlNetModel], Tuple[FluxControlNetModel], FluxMultiControlNetModel
|
||||
],
|
||||
controlnet: FluxControlNetModel,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
@@ -302,7 +300,7 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleF
|
||||
prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
|
||||
|
||||
# duplicate text embeddings for each generation per prompt, using mps friendly method
|
||||
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt)
|
||||
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
|
||||
prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
|
||||
|
||||
return prompt_embeds
|
||||
@@ -573,7 +571,6 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleF
|
||||
timesteps: List[int] = None,
|
||||
guidance_scale: float = 7.0,
|
||||
control_image: PipelineImageInput = None,
|
||||
control_mode: Optional[Union[int, List[int]]] = None,
|
||||
controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
|
||||
num_images_per_prompt: Optional[int] = 1,
|
||||
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
|
||||
@@ -614,20 +611,6 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleF
|
||||
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
||||
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
||||
usually at the expense of lower image quality.
|
||||
control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
|
||||
`List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
|
||||
The ControlNet input condition to provide guidance to the `unet` for generation. If the type is
|
||||
specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted
|
||||
as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or
|
||||
width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`,
|
||||
images must be passed as a list such that each element of the list can be correctly batched for input
|
||||
to a single ControlNet.
|
||||
controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
|
||||
The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
|
||||
to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
|
||||
the corresponding scale as a list.
|
||||
control_mode (`int` or `List[int]`,, *optional*, defaults to None):
|
||||
The control mode when applying ControlNet-Union.
|
||||
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of images to generate per prompt.
|
||||
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
||||
@@ -747,55 +730,6 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleF
|
||||
width_control_image,
|
||||
)
|
||||
|
||||
# set control mode
|
||||
if control_mode is not None:
|
||||
control_mode = torch.tensor(control_mode).to(device, dtype=torch.long)
|
||||
control_mode = control_mode.reshape([-1, 1])
|
||||
|
||||
elif isinstance(self.controlnet, FluxMultiControlNetModel):
|
||||
control_images = []
|
||||
|
||||
for control_image_ in control_image:
|
||||
control_image_ = self.prepare_image(
|
||||
image=control_image_,
|
||||
width=width,
|
||||
height=height,
|
||||
batch_size=batch_size * num_images_per_prompt,
|
||||
num_images_per_prompt=num_images_per_prompt,
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
)
|
||||
height, width = control_image_.shape[-2:]
|
||||
|
||||
# vae encode
|
||||
control_image_ = self.vae.encode(control_image_).latent_dist.sample()
|
||||
control_image_ = (control_image_ - self.vae.config.shift_factor) * self.vae.config.scaling_factor
|
||||
|
||||
# pack
|
||||
height_control_image, width_control_image = control_image_.shape[2:]
|
||||
control_image_ = self._pack_latents(
|
||||
control_image_,
|
||||
batch_size * num_images_per_prompt,
|
||||
num_channels_latents,
|
||||
height_control_image,
|
||||
width_control_image,
|
||||
)
|
||||
|
||||
control_images.append(control_image_)
|
||||
|
||||
control_image = control_images
|
||||
|
||||
# set control mode
|
||||
control_mode_ = []
|
||||
if isinstance(control_mode, list):
|
||||
for cmode in control_mode:
|
||||
if cmode is None:
|
||||
control_mode_.append(-1)
|
||||
else:
|
||||
control_mode_.append(cmode)
|
||||
control_mode = torch.tensor(control_mode_).to(device, dtype=torch.long)
|
||||
control_mode = control_mode.reshape([-1, 1])
|
||||
|
||||
# 4. Prepare latent variables
|
||||
num_channels_latents = self.transformer.config.in_channels // 4
|
||||
latents, latent_image_ids = self.prepare_latents(
|
||||
@@ -851,7 +785,6 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleF
|
||||
controlnet_block_samples, controlnet_single_block_samples = self.controlnet(
|
||||
hidden_states=latents,
|
||||
controlnet_cond=control_image,
|
||||
controlnet_mode=control_mode,
|
||||
conditioning_scale=controlnet_conditioning_scale,
|
||||
timestep=timestep / 1000,
|
||||
guidance=guidance,
|
||||
|
||||
@@ -1,844 +0,0 @@
|
||||
# Copyright 2024 Black Forest Labs and The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import inspect
|
||||
from typing import Any, Callable, Dict, List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast
|
||||
|
||||
from ...image_processor import PipelineImageInput, VaeImageProcessor
|
||||
from ...loaders import FluxLoraLoaderMixin
|
||||
from ...models.autoencoders import AutoencoderKL
|
||||
from ...models.transformers import FluxTransformer2DModel
|
||||
from ...schedulers import FlowMatchEulerDiscreteScheduler
|
||||
from ...utils import (
|
||||
USE_PEFT_BACKEND,
|
||||
is_torch_xla_available,
|
||||
logging,
|
||||
replace_example_docstring,
|
||||
scale_lora_layers,
|
||||
unscale_lora_layers,
|
||||
)
|
||||
from ...utils.torch_utils import randn_tensor
|
||||
from ..pipeline_utils import DiffusionPipeline
|
||||
from .pipeline_output import FluxPipelineOutput
|
||||
|
||||
|
||||
if is_torch_xla_available():
|
||||
import torch_xla.core.xla_model as xm
|
||||
|
||||
XLA_AVAILABLE = True
|
||||
else:
|
||||
XLA_AVAILABLE = False
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
EXAMPLE_DOC_STRING = """
|
||||
Examples:
|
||||
```py
|
||||
>>> import torch
|
||||
|
||||
>>> from diffusers import FluxImg2ImgPipeline
|
||||
>>> from diffusers.utils import load_image
|
||||
|
||||
>>> device = "cuda"
|
||||
>>> pipe = FluxImg2ImgPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
|
||||
>>> pipe = pipe.to(device)
|
||||
|
||||
>>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
|
||||
>>> init_image = load_image(url).resize((1024, 1024))
|
||||
|
||||
>>> prompt = "cat wizard, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney, 8k"
|
||||
|
||||
>>> images = pipe(
|
||||
... prompt=prompt, image=init_image, num_inference_steps=4, strength=0.95, guidance_scale=0.0
|
||||
... ).images[0]
|
||||
```
|
||||
"""
|
||||
|
||||
|
||||
# Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift
|
||||
def calculate_shift(
|
||||
image_seq_len,
|
||||
base_seq_len: int = 256,
|
||||
max_seq_len: int = 4096,
|
||||
base_shift: float = 0.5,
|
||||
max_shift: float = 1.16,
|
||||
):
|
||||
m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
|
||||
b = base_shift - m * base_seq_len
|
||||
mu = image_seq_len * m + b
|
||||
return mu
|
||||
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
|
||||
def retrieve_latents(
|
||||
encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
|
||||
):
|
||||
if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
|
||||
return encoder_output.latent_dist.sample(generator)
|
||||
elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
|
||||
return encoder_output.latent_dist.mode()
|
||||
elif hasattr(encoder_output, "latents"):
|
||||
return encoder_output.latents
|
||||
else:
|
||||
raise AttributeError("Could not access latents of provided encoder_output")
|
||||
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
|
||||
def retrieve_timesteps(
|
||||
scheduler,
|
||||
num_inference_steps: Optional[int] = None,
|
||||
device: Optional[Union[str, torch.device]] = None,
|
||||
timesteps: Optional[List[int]] = None,
|
||||
sigmas: Optional[List[float]] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
|
||||
custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
|
||||
|
||||
Args:
|
||||
scheduler (`SchedulerMixin`):
|
||||
The scheduler to get timesteps from.
|
||||
num_inference_steps (`int`):
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
|
||||
must be `None`.
|
||||
device (`str` or `torch.device`, *optional*):
|
||||
The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
|
||||
timesteps (`List[int]`, *optional*):
|
||||
Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
|
||||
`num_inference_steps` and `sigmas` must be `None`.
|
||||
sigmas (`List[float]`, *optional*):
|
||||
Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
|
||||
`num_inference_steps` and `timesteps` must be `None`.
|
||||
|
||||
Returns:
|
||||
`Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
|
||||
second element is the number of inference steps.
|
||||
"""
|
||||
if timesteps is not None and sigmas is not None:
|
||||
raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
|
||||
if timesteps is not None:
|
||||
accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
|
||||
if not accepts_timesteps:
|
||||
raise ValueError(
|
||||
f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
|
||||
f" timestep schedules. Please check whether you are using the correct scheduler."
|
||||
)
|
||||
scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
|
||||
timesteps = scheduler.timesteps
|
||||
num_inference_steps = len(timesteps)
|
||||
elif sigmas is not None:
|
||||
accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
|
||||
if not accept_sigmas:
|
||||
raise ValueError(
|
||||
f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
|
||||
f" sigmas schedules. Please check whether you are using the correct scheduler."
|
||||
)
|
||||
scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
|
||||
timesteps = scheduler.timesteps
|
||||
num_inference_steps = len(timesteps)
|
||||
else:
|
||||
scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
|
||||
timesteps = scheduler.timesteps
|
||||
return timesteps, num_inference_steps
|
||||
|
||||
|
||||
class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
|
||||
r"""
|
||||
The Flux pipeline for image inpainting.
|
||||
|
||||
Reference: https://blackforestlabs.ai/announcing-black-forest-labs/
|
||||
|
||||
Args:
|
||||
transformer ([`FluxTransformer2DModel`]):
|
||||
Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
|
||||
scheduler ([`FlowMatchEulerDiscreteScheduler`]):
|
||||
A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
|
||||
vae ([`AutoencoderKL`]):
|
||||
Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
|
||||
text_encoder ([`CLIPTextModel`]):
|
||||
[CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
|
||||
the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
|
||||
text_encoder_2 ([`T5EncoderModel`]):
|
||||
[T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
|
||||
the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
|
||||
tokenizer (`CLIPTokenizer`):
|
||||
Tokenizer of class
|
||||
[CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer).
|
||||
tokenizer_2 (`T5TokenizerFast`):
|
||||
Second Tokenizer of class
|
||||
[T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast).
|
||||
"""
|
||||
|
||||
model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
|
||||
_optional_components = []
|
||||
_callback_tensor_inputs = ["latents", "prompt_embeds"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
scheduler: FlowMatchEulerDiscreteScheduler,
|
||||
vae: AutoencoderKL,
|
||||
text_encoder: CLIPTextModel,
|
||||
tokenizer: CLIPTokenizer,
|
||||
text_encoder_2: T5EncoderModel,
|
||||
tokenizer_2: T5TokenizerFast,
|
||||
transformer: FluxTransformer2DModel,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
self.register_modules(
|
||||
vae=vae,
|
||||
text_encoder=text_encoder,
|
||||
text_encoder_2=text_encoder_2,
|
||||
tokenizer=tokenizer,
|
||||
tokenizer_2=tokenizer_2,
|
||||
transformer=transformer,
|
||||
scheduler=scheduler,
|
||||
)
|
||||
self.vae_scale_factor = (
|
||||
2 ** (len(self.vae.config.block_out_channels)) if hasattr(self, "vae") and self.vae is not None else 16
|
||||
)
|
||||
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
|
||||
self.tokenizer_max_length = (
|
||||
self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
|
||||
)
|
||||
self.default_sample_size = 64
|
||||
|
||||
# Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._get_t5_prompt_embeds
|
||||
def _get_t5_prompt_embeds(
|
||||
self,
|
||||
prompt: Union[str, List[str]] = None,
|
||||
num_images_per_prompt: int = 1,
|
||||
max_sequence_length: int = 512,
|
||||
device: Optional[torch.device] = None,
|
||||
dtype: Optional[torch.dtype] = None,
|
||||
):
|
||||
device = device or self._execution_device
|
||||
dtype = dtype or self.text_encoder.dtype
|
||||
|
||||
prompt = [prompt] if isinstance(prompt, str) else prompt
|
||||
batch_size = len(prompt)
|
||||
|
||||
text_inputs = self.tokenizer_2(
|
||||
prompt,
|
||||
padding="max_length",
|
||||
max_length=max_sequence_length,
|
||||
truncation=True,
|
||||
return_length=False,
|
||||
return_overflowing_tokens=False,
|
||||
return_tensors="pt",
|
||||
)
|
||||
text_input_ids = text_inputs.input_ids
|
||||
untruncated_ids = self.tokenizer_2(prompt, padding="longest", return_tensors="pt").input_ids
|
||||
|
||||
if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
|
||||
removed_text = self.tokenizer_2.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
|
||||
logger.warning(
|
||||
"The following part of your input was truncated because `max_sequence_length` is set to "
|
||||
f" {max_sequence_length} tokens: {removed_text}"
|
||||
)
|
||||
|
||||
prompt_embeds = self.text_encoder_2(text_input_ids.to(device), output_hidden_states=False)[0]
|
||||
|
||||
dtype = self.text_encoder_2.dtype
|
||||
prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
|
||||
|
||||
_, seq_len, _ = prompt_embeds.shape
|
||||
|
||||
# duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
|
||||
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
|
||||
prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
|
||||
|
||||
return prompt_embeds
|
||||
|
||||
# Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._get_clip_prompt_embeds
|
||||
def _get_clip_prompt_embeds(
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
num_images_per_prompt: int = 1,
|
||||
device: Optional[torch.device] = None,
|
||||
):
|
||||
device = device or self._execution_device
|
||||
|
||||
prompt = [prompt] if isinstance(prompt, str) else prompt
|
||||
batch_size = len(prompt)
|
||||
|
||||
text_inputs = self.tokenizer(
|
||||
prompt,
|
||||
padding="max_length",
|
||||
max_length=self.tokenizer_max_length,
|
||||
truncation=True,
|
||||
return_overflowing_tokens=False,
|
||||
return_length=False,
|
||||
return_tensors="pt",
|
||||
)
|
||||
|
||||
text_input_ids = text_inputs.input_ids
|
||||
untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
|
||||
if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
|
||||
removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
|
||||
logger.warning(
|
||||
"The following part of your input was truncated because CLIP can only handle sequences up to"
|
||||
f" {self.tokenizer_max_length} tokens: {removed_text}"
|
||||
)
|
||||
prompt_embeds = self.text_encoder(text_input_ids.to(device), output_hidden_states=False)
|
||||
|
||||
# Use pooled output of CLIPTextModel
|
||||
prompt_embeds = prompt_embeds.pooler_output
|
||||
prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
|
||||
|
||||
# duplicate text embeddings for each generation per prompt, using mps friendly method
|
||||
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt)
|
||||
prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
|
||||
|
||||
return prompt_embeds
|
||||
|
||||
# Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.encode_prompt
|
||||
def encode_prompt(
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
prompt_2: Union[str, List[str]],
|
||||
device: Optional[torch.device] = None,
|
||||
num_images_per_prompt: int = 1,
|
||||
prompt_embeds: Optional[torch.FloatTensor] = None,
|
||||
pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
|
||||
max_sequence_length: int = 512,
|
||||
lora_scale: Optional[float] = None,
|
||||
):
|
||||
r"""
|
||||
|
||||
Args:
|
||||
prompt (`str` or `List[str]`, *optional*):
|
||||
prompt to be encoded
|
||||
prompt_2 (`str` or `List[str]`, *optional*):
|
||||
The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
|
||||
used in all text-encoders
|
||||
device: (`torch.device`):
|
||||
torch device
|
||||
num_images_per_prompt (`int`):
|
||||
number of images that should be generated per prompt
|
||||
prompt_embeds (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
|
||||
If not provided, pooled text embeddings will be generated from `prompt` input argument.
|
||||
lora_scale (`float`, *optional*):
|
||||
A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
|
||||
"""
|
||||
device = device or self._execution_device
|
||||
|
||||
# set lora scale so that monkey patched LoRA
|
||||
# function of text encoder can correctly access it
|
||||
if lora_scale is not None and isinstance(self, FluxLoraLoaderMixin):
|
||||
self._lora_scale = lora_scale
|
||||
|
||||
# dynamically adjust the LoRA scale
|
||||
if self.text_encoder is not None and USE_PEFT_BACKEND:
|
||||
scale_lora_layers(self.text_encoder, lora_scale)
|
||||
if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
|
||||
scale_lora_layers(self.text_encoder_2, lora_scale)
|
||||
|
||||
prompt = [prompt] if isinstance(prompt, str) else prompt
|
||||
|
||||
if prompt_embeds is None:
|
||||
prompt_2 = prompt_2 or prompt
|
||||
prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
|
||||
|
||||
# We only use the pooled prompt output from the CLIPTextModel
|
||||
pooled_prompt_embeds = self._get_clip_prompt_embeds(
|
||||
prompt=prompt,
|
||||
device=device,
|
||||
num_images_per_prompt=num_images_per_prompt,
|
||||
)
|
||||
prompt_embeds = self._get_t5_prompt_embeds(
|
||||
prompt=prompt_2,
|
||||
num_images_per_prompt=num_images_per_prompt,
|
||||
max_sequence_length=max_sequence_length,
|
||||
device=device,
|
||||
)
|
||||
|
||||
if self.text_encoder is not None:
|
||||
if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
|
||||
# Retrieve the original scale by scaling back the LoRA layers
|
||||
unscale_lora_layers(self.text_encoder, lora_scale)
|
||||
|
||||
if self.text_encoder_2 is not None:
|
||||
if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
|
||||
# Retrieve the original scale by scaling back the LoRA layers
|
||||
unscale_lora_layers(self.text_encoder_2, lora_scale)
|
||||
|
||||
dtype = self.text_encoder.dtype if self.text_encoder is not None else self.transformer.dtype
|
||||
text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
|
||||
|
||||
return prompt_embeds, pooled_prompt_embeds, text_ids
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_inpaint.StableDiffusion3InpaintPipeline._encode_vae_image
|
||||
def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
|
||||
if isinstance(generator, list):
|
||||
image_latents = [
|
||||
retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
|
||||
for i in range(image.shape[0])
|
||||
]
|
||||
image_latents = torch.cat(image_latents, dim=0)
|
||||
else:
|
||||
image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
|
||||
|
||||
image_latents = (image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
|
||||
|
||||
return image_latents
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_img2img.StableDiffusion3Img2ImgPipeline.get_timesteps
|
||||
def get_timesteps(self, num_inference_steps, strength, device):
|
||||
# get the original timestep using init_timestep
|
||||
init_timestep = min(num_inference_steps * strength, num_inference_steps)
|
||||
|
||||
t_start = int(max(num_inference_steps - init_timestep, 0))
|
||||
timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
|
||||
if hasattr(self.scheduler, "set_begin_index"):
|
||||
self.scheduler.set_begin_index(t_start * self.scheduler.order)
|
||||
|
||||
return timesteps, num_inference_steps - t_start
|
||||
|
||||
def check_inputs(
|
||||
self,
|
||||
prompt,
|
||||
prompt_2,
|
||||
strength,
|
||||
height,
|
||||
width,
|
||||
prompt_embeds=None,
|
||||
pooled_prompt_embeds=None,
|
||||
callback_on_step_end_tensor_inputs=None,
|
||||
max_sequence_length=None,
|
||||
):
|
||||
if strength < 0 or strength > 1:
|
||||
raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
|
||||
|
||||
if height % 8 != 0 or width % 8 != 0:
|
||||
raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
|
||||
|
||||
if callback_on_step_end_tensor_inputs is not None and not all(
|
||||
k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
|
||||
):
|
||||
raise ValueError(
|
||||
f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
|
||||
)
|
||||
|
||||
if prompt is not None and prompt_embeds is not None:
|
||||
raise ValueError(
|
||||
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
|
||||
" only forward one of the two."
|
||||
)
|
||||
elif prompt_2 is not None and prompt_embeds is not None:
|
||||
raise ValueError(
|
||||
f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
|
||||
" only forward one of the two."
|
||||
)
|
||||
elif prompt is None and prompt_embeds is None:
|
||||
raise ValueError(
|
||||
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
|
||||
)
|
||||
elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
|
||||
raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
|
||||
elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
|
||||
raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
|
||||
|
||||
if prompt_embeds is not None and pooled_prompt_embeds is None:
|
||||
raise ValueError(
|
||||
"If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
|
||||
)
|
||||
|
||||
if max_sequence_length is not None and max_sequence_length > 512:
|
||||
raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
|
||||
|
||||
@staticmethod
|
||||
# Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._prepare_latent_image_ids
|
||||
def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
|
||||
latent_image_ids = torch.zeros(height // 2, width // 2, 3)
|
||||
latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None]
|
||||
latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :]
|
||||
|
||||
latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
|
||||
|
||||
latent_image_ids = latent_image_ids.reshape(
|
||||
latent_image_id_height * latent_image_id_width, latent_image_id_channels
|
||||
)
|
||||
|
||||
return latent_image_ids.to(device=device, dtype=dtype)
|
||||
|
||||
@staticmethod
|
||||
# Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._pack_latents
|
||||
def _pack_latents(latents, batch_size, num_channels_latents, height, width):
|
||||
latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
|
||||
latents = latents.permute(0, 2, 4, 1, 3, 5)
|
||||
latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)
|
||||
|
||||
return latents
|
||||
|
||||
@staticmethod
|
||||
# Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._unpack_latents
|
||||
def _unpack_latents(latents, height, width, vae_scale_factor):
|
||||
batch_size, num_patches, channels = latents.shape
|
||||
|
||||
height = height // vae_scale_factor
|
||||
width = width // vae_scale_factor
|
||||
|
||||
latents = latents.view(batch_size, height, width, channels // 4, 2, 2)
|
||||
latents = latents.permute(0, 3, 1, 4, 2, 5)
|
||||
|
||||
latents = latents.reshape(batch_size, channels // (2 * 2), height * 2, width * 2)
|
||||
|
||||
return latents
|
||||
|
||||
def prepare_latents(
|
||||
self,
|
||||
image,
|
||||
timestep,
|
||||
batch_size,
|
||||
num_channels_latents,
|
||||
height,
|
||||
width,
|
||||
dtype,
|
||||
device,
|
||||
generator,
|
||||
latents=None,
|
||||
):
|
||||
if isinstance(generator, list) and len(generator) != batch_size:
|
||||
raise ValueError(
|
||||
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
|
||||
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
|
||||
)
|
||||
|
||||
height = 2 * (int(height) // self.vae_scale_factor)
|
||||
width = 2 * (int(width) // self.vae_scale_factor)
|
||||
|
||||
shape = (batch_size, num_channels_latents, height, width)
|
||||
latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype)
|
||||
|
||||
if latents is not None:
|
||||
return latents.to(device=device, dtype=dtype), latent_image_ids
|
||||
|
||||
image = image.to(device=device, dtype=dtype)
|
||||
image_latents = self._encode_vae_image(image=image, generator=generator)
|
||||
if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
|
||||
# expand init_latents for batch_size
|
||||
additional_image_per_prompt = batch_size // image_latents.shape[0]
|
||||
image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0)
|
||||
elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
|
||||
raise ValueError(
|
||||
f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
|
||||
)
|
||||
else:
|
||||
image_latents = torch.cat([image_latents], dim=0)
|
||||
|
||||
noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
|
||||
latents = self.scheduler.scale_noise(image_latents, timestep, noise)
|
||||
latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
|
||||
return latents, latent_image_ids
|
||||
|
||||
@property
|
||||
def guidance_scale(self):
|
||||
return self._guidance_scale
|
||||
|
||||
@property
|
||||
def joint_attention_kwargs(self):
|
||||
return self._joint_attention_kwargs
|
||||
|
||||
@property
|
||||
def num_timesteps(self):
|
||||
return self._num_timesteps
|
||||
|
||||
@property
|
||||
def interrupt(self):
|
||||
return self._interrupt
|
||||
|
||||
@torch.no_grad()
|
||||
@replace_example_docstring(EXAMPLE_DOC_STRING)
|
||||
def __call__(
|
||||
self,
|
||||
prompt: Union[str, List[str]] = None,
|
||||
prompt_2: Optional[Union[str, List[str]]] = None,
|
||||
image: PipelineImageInput = None,
|
||||
height: Optional[int] = None,
|
||||
width: Optional[int] = None,
|
||||
strength: float = 0.6,
|
||||
num_inference_steps: int = 28,
|
||||
timesteps: List[int] = None,
|
||||
guidance_scale: float = 7.0,
|
||||
num_images_per_prompt: Optional[int] = 1,
|
||||
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
|
||||
latents: Optional[torch.FloatTensor] = None,
|
||||
prompt_embeds: Optional[torch.FloatTensor] = None,
|
||||
pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
|
||||
output_type: Optional[str] = "pil",
|
||||
return_dict: bool = True,
|
||||
joint_attention_kwargs: Optional[Dict[str, Any]] = None,
|
||||
callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
|
||||
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
|
||||
max_sequence_length: int = 512,
|
||||
):
|
||||
r"""
|
||||
Function invoked when calling the pipeline for generation.
|
||||
|
||||
Args:
|
||||
prompt (`str` or `List[str]`, *optional*):
|
||||
The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
|
||||
instead.
|
||||
prompt_2 (`str` or `List[str]`, *optional*):
|
||||
The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
|
||||
will be used instead
|
||||
image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
|
||||
`Image`, numpy array or tensor representing an image batch to be used as the starting point. For both
|
||||
numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list
|
||||
or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a
|
||||
list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image
|
||||
latents as `image`, but if passing latents directly it is not encoded again.
|
||||
height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
|
||||
The height in pixels of the generated image. This is set to 1024 by default for the best results.
|
||||
width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
|
||||
The width in pixels of the generated image. This is set to 1024 by default for the best results.
|
||||
strength (`float`, *optional*, defaults to 1.0):
|
||||
Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
|
||||
starting point and more noise is added the higher the `strength`. The number of denoising steps depends
|
||||
on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
|
||||
process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
|
||||
essentially ignores `image`.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
expense of slower inference.
|
||||
timesteps (`List[int]`, *optional*):
|
||||
Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
|
||||
in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
|
||||
passed will be used. Must be in descending order.
|
||||
guidance_scale (`float`, *optional*, defaults to 7.0):
|
||||
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
||||
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
||||
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
||||
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
||||
usually at the expense of lower image quality.
|
||||
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of images to generate per prompt.
|
||||
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
||||
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
|
||||
to make generation deterministic.
|
||||
latents (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
|
||||
If not provided, pooled text embeddings will be generated from `prompt` input argument.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generate image. Choose between
|
||||
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to return a [`~pipelines.flux.FluxPipelineOutput`] instead of a plain tuple.
|
||||
joint_attention_kwargs (`dict`, *optional*):
|
||||
A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
|
||||
`self.processor` in
|
||||
[diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
|
||||
callback_on_step_end (`Callable`, *optional*):
|
||||
A function that calls at the end of each denoising steps during the inference. The function is called
|
||||
with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
|
||||
callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
|
||||
`callback_on_step_end_tensor_inputs`.
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
`._callback_tensor_inputs` attribute of your pipeline class.
|
||||
max_sequence_length (`int` defaults to 512): Maximum sequence length to use with the `prompt`.
|
||||
|
||||
Examples:
|
||||
|
||||
Returns:
|
||||
[`~pipelines.flux.FluxPipelineOutput`] or `tuple`: [`~pipelines.flux.FluxPipelineOutput`] if `return_dict`
|
||||
is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated
|
||||
images.
|
||||
"""
|
||||
|
||||
height = height or self.default_sample_size * self.vae_scale_factor
|
||||
width = width or self.default_sample_size * self.vae_scale_factor
|
||||
|
||||
# 1. Check inputs. Raise error if not correct
|
||||
self.check_inputs(
|
||||
prompt,
|
||||
prompt_2,
|
||||
strength,
|
||||
height,
|
||||
width,
|
||||
prompt_embeds=prompt_embeds,
|
||||
pooled_prompt_embeds=pooled_prompt_embeds,
|
||||
callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
|
||||
max_sequence_length=max_sequence_length,
|
||||
)
|
||||
|
||||
self._guidance_scale = guidance_scale
|
||||
self._joint_attention_kwargs = joint_attention_kwargs
|
||||
self._interrupt = False
|
||||
|
||||
# 2. Preprocess image
|
||||
init_image = self.image_processor.preprocess(image, height=height, width=width)
|
||||
init_image = init_image.to(dtype=torch.float32)
|
||||
|
||||
# 3. Define call parameters
|
||||
if prompt is not None and isinstance(prompt, str):
|
||||
batch_size = 1
|
||||
elif prompt is not None and isinstance(prompt, list):
|
||||
batch_size = len(prompt)
|
||||
else:
|
||||
batch_size = prompt_embeds.shape[0]
|
||||
|
||||
device = self._execution_device
|
||||
|
||||
lora_scale = (
|
||||
self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
|
||||
)
|
||||
(
|
||||
prompt_embeds,
|
||||
pooled_prompt_embeds,
|
||||
text_ids,
|
||||
) = self.encode_prompt(
|
||||
prompt=prompt,
|
||||
prompt_2=prompt_2,
|
||||
prompt_embeds=prompt_embeds,
|
||||
pooled_prompt_embeds=pooled_prompt_embeds,
|
||||
device=device,
|
||||
num_images_per_prompt=num_images_per_prompt,
|
||||
max_sequence_length=max_sequence_length,
|
||||
lora_scale=lora_scale,
|
||||
)
|
||||
|
||||
# 4.Prepare timesteps
|
||||
sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
|
||||
image_seq_len = (int(height) // self.vae_scale_factor) * (int(width) // self.vae_scale_factor)
|
||||
mu = calculate_shift(
|
||||
image_seq_len,
|
||||
self.scheduler.config.base_image_seq_len,
|
||||
self.scheduler.config.max_image_seq_len,
|
||||
self.scheduler.config.base_shift,
|
||||
self.scheduler.config.max_shift,
|
||||
)
|
||||
timesteps, num_inference_steps = retrieve_timesteps(
|
||||
self.scheduler,
|
||||
num_inference_steps,
|
||||
device,
|
||||
timesteps,
|
||||
sigmas,
|
||||
mu=mu,
|
||||
)
|
||||
timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
|
||||
|
||||
if num_inference_steps < 1:
|
||||
raise ValueError(
|
||||
f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline"
|
||||
f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
|
||||
)
|
||||
latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
|
||||
|
||||
# 5. Prepare latent variables
|
||||
num_channels_latents = self.transformer.config.in_channels // 4
|
||||
|
||||
latents, latent_image_ids = self.prepare_latents(
|
||||
init_image,
|
||||
latent_timestep,
|
||||
batch_size * num_images_per_prompt,
|
||||
num_channels_latents,
|
||||
height,
|
||||
width,
|
||||
prompt_embeds.dtype,
|
||||
device,
|
||||
generator,
|
||||
latents,
|
||||
)
|
||||
|
||||
num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
|
||||
self._num_timesteps = len(timesteps)
|
||||
|
||||
# handle guidance
|
||||
if self.transformer.config.guidance_embeds:
|
||||
guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
|
||||
guidance = guidance.expand(latents.shape[0])
|
||||
else:
|
||||
guidance = None
|
||||
|
||||
# 6. Denoising loop
|
||||
with self.progress_bar(total=num_inference_steps) as progress_bar:
|
||||
for i, t in enumerate(timesteps):
|
||||
if self.interrupt:
|
||||
continue
|
||||
|
||||
# broadcast to batch dimension in a way that's compatible with ONNX/Core ML
|
||||
timestep = t.expand(latents.shape[0]).to(latents.dtype)
|
||||
noise_pred = self.transformer(
|
||||
hidden_states=latents,
|
||||
timestep=timestep / 1000,
|
||||
guidance=guidance,
|
||||
pooled_projections=pooled_prompt_embeds,
|
||||
encoder_hidden_states=prompt_embeds,
|
||||
txt_ids=text_ids,
|
||||
img_ids=latent_image_ids,
|
||||
joint_attention_kwargs=self.joint_attention_kwargs,
|
||||
return_dict=False,
|
||||
)[0]
|
||||
|
||||
# compute the previous noisy sample x_t -> x_t-1
|
||||
latents_dtype = latents.dtype
|
||||
latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
|
||||
|
||||
if latents.dtype != latents_dtype:
|
||||
if torch.backends.mps.is_available():
|
||||
# some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
|
||||
latents = latents.to(latents_dtype)
|
||||
|
||||
if callback_on_step_end is not None:
|
||||
callback_kwargs = {}
|
||||
for k in callback_on_step_end_tensor_inputs:
|
||||
callback_kwargs[k] = locals()[k]
|
||||
callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
|
||||
|
||||
latents = callback_outputs.pop("latents", latents)
|
||||
prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
|
||||
|
||||
# call the callback, if provided
|
||||
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
|
||||
progress_bar.update()
|
||||
|
||||
if XLA_AVAILABLE:
|
||||
xm.mark_step()
|
||||
|
||||
if output_type == "latent":
|
||||
image = latents
|
||||
|
||||
else:
|
||||
latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
|
||||
latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
|
||||
image = self.vae.decode(latents, return_dict=False)[0]
|
||||
image = self.image_processor.postprocess(image, output_type=output_type)
|
||||
|
||||
# Offload all models
|
||||
self.maybe_free_model_hooks()
|
||||
|
||||
if not return_dict:
|
||||
return (image,)
|
||||
|
||||
return FluxPipelineOutput(images=image)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -12,24 +12,12 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import collections
|
||||
import functools
|
||||
from typing import Callable, Dict, List, Optional, Tuple, Union
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from ..models.attention import BasicTransformerBlock, FreeNoiseTransformerBlock
|
||||
from ..models.embeddings import CogVideoXPatchEmbed, FreeNoiseCogVideoXPatchEmbed
|
||||
from ..models.resnet import Downsample2D, ResnetBlock2D, Upsample2D
|
||||
from ..models.transformers.cogvideox_transformer_3d import (
|
||||
CogVideoXBlock,
|
||||
CogVideoXTransformer3DModel,
|
||||
FreeNoiseCogVideoXBlock,
|
||||
)
|
||||
from ..models.transformers.transformer_2d import Transformer2DModel
|
||||
from ..models.unets.unet_motion_model import (
|
||||
AnimateDiffTransformer3D,
|
||||
CrossAttnDownBlockMotion,
|
||||
DownBlockMotion,
|
||||
UpBlockMotion,
|
||||
@@ -41,166 +29,8 @@ from ..utils.torch_utils import randn_tensor
|
||||
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
|
||||
def _lerp(start_index: int, end_index: int, start_tensor: torch.Tensor, end_tensor: torch.Tensor) -> torch.Tensor:
|
||||
num_indices = end_index - start_index + 1
|
||||
interpolated_tensors = []
|
||||
|
||||
for i in range(num_indices):
|
||||
alpha = i / (num_indices - 1)
|
||||
interpolated_tensor = (1 - alpha) * start_tensor + alpha * end_tensor
|
||||
interpolated_tensors.append(interpolated_tensor)
|
||||
|
||||
interpolated_tensors = torch.cat(interpolated_tensors)
|
||||
return interpolated_tensors
|
||||
|
||||
|
||||
def _weighted_pooling(
|
||||
hidden_states: List[torch.Tensor], pooling_type: str = "inverse_distance", decay_rate: float = 0.9
|
||||
) -> torch.Tensor:
|
||||
length = len(hidden_states)
|
||||
|
||||
if length == 0:
|
||||
raise ValueError("This method cannot be called with an empty list.")
|
||||
if length == 1:
|
||||
return hidden_states[0]
|
||||
|
||||
if pooling_type == "average":
|
||||
weights = [1] * length
|
||||
elif pooling_type == "exponential_decay":
|
||||
weights = [decay_rate**i for i in range(length)]
|
||||
elif pooling_type == "reverse_exponential_decay":
|
||||
weights = [decay_rate**i for i in range(length)][::-1]
|
||||
elif pooling_type == "distance":
|
||||
weights = list(range(1, length + 1))
|
||||
elif pooling_type == "reverse_distance":
|
||||
weights = list(range(1, length + 1))[::-1]
|
||||
elif pooling_type == "pyramid":
|
||||
if length % 2 == 0:
|
||||
weights = list(range(1, length // 2 + 1))
|
||||
weights = weights + weights[::-1]
|
||||
else:
|
||||
weights = list(range(1, length // 2 + 1))
|
||||
weights = weights + [length // 2] + weights[::-1]
|
||||
else:
|
||||
raise ValueError("Invalid weighted pooling type.")
|
||||
|
||||
weights = torch.tensor(weights, device=hidden_states[0].device, dtype=hidden_states[0].dtype)
|
||||
weights = weights / weights.sum()
|
||||
|
||||
pooled_embeds = sum(weight * hidden_state for weight, hidden_state in zip(weights, hidden_states))
|
||||
return pooled_embeds
|
||||
|
||||
|
||||
class SplitInferenceModule(nn.Module):
|
||||
r"""
|
||||
A wrapper module class that splits inputs along a specified dimension before performing a forward pass.
|
||||
|
||||
This module is useful when you need to perform inference on large tensors in a memory-efficient way by breaking
|
||||
them into smaller chunks, processing each chunk separately, and then reassembling the results.
|
||||
|
||||
Args:
|
||||
module (`nn.Module`):
|
||||
The underlying PyTorch module that will be applied to each chunk of split inputs.
|
||||
split_size (`int`, defaults to `1`):
|
||||
The size of each chunk after splitting the input tensor.
|
||||
split_dim (`int`, defaults to `0`):
|
||||
The dimension along which the input tensors are split.
|
||||
input_kwargs_to_split (`List[str]`, defaults to `["hidden_states"]`):
|
||||
A list of keyword arguments (strings) that represent the input tensors to be split.
|
||||
|
||||
Workflow:
|
||||
1. The keyword arguments specified in `input_kwargs_to_split` are split into smaller chunks using
|
||||
`torch.split()` along the dimension `split_dim` and with a chunk size of `split_size`.
|
||||
2. The `module` is invoked once for each split with both the split inputs and any unchanged arguments
|
||||
that were passed.
|
||||
3. The output tensors from each split are concatenated back together along `split_dim` before returning.
|
||||
|
||||
Example:
|
||||
```python
|
||||
>>> import torch
|
||||
>>> import torch.nn as nn
|
||||
|
||||
>>> model = nn.Linear(1000, 1000)
|
||||
>>> split_module = SplitInferenceModule(model, split_size=2, split_dim=0, input_kwargs_to_split=["input"])
|
||||
|
||||
>>> input_tensor = torch.randn(42, 1000)
|
||||
>>> # Will split the tensor into 21 slices of shape [2, 1000].
|
||||
>>> output = split_module(input=input_tensor)
|
||||
```
|
||||
|
||||
It is also possible to nest `SplitInferenceModule` across different split dimensions for more complex
|
||||
multi-dimensional splitting.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
module: nn.Module,
|
||||
split_size: int = 1,
|
||||
split_dim: int = 0,
|
||||
input_kwargs_to_split: List[str] = ["hidden_states"],
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
self.module = module
|
||||
self.split_size = split_size
|
||||
self.split_dim = split_dim
|
||||
self.input_kwargs_to_split = set(input_kwargs_to_split)
|
||||
|
||||
def forward(self, *args, **kwargs) -> Union[torch.Tensor, Tuple[torch.Tensor]]:
|
||||
r"""Forward method for the `SplitInferenceModule`.
|
||||
|
||||
This method processes the input by splitting specified keyword arguments along a given dimension, running the
|
||||
underlying module on each split, and then concatenating the results. The splitting is controlled by the
|
||||
`split_size` and `split_dim` parameters specified during initialization.
|
||||
|
||||
Args:
|
||||
*args (`Any`):
|
||||
Positional arguments that are passed directly to the `module` without modification.
|
||||
**kwargs (`Dict[str, torch.Tensor]`):
|
||||
Keyword arguments passed to the underlying `module`. Only keyword arguments whose names match the
|
||||
entries in `input_kwargs_to_split` and are of type `torch.Tensor` will be split. The remaining keyword
|
||||
arguments are passed unchanged.
|
||||
|
||||
Returns:
|
||||
`Union[torch.Tensor, Tuple[torch.Tensor]]`:
|
||||
The outputs obtained from `SplitInferenceModule` are the same as if the underlying module was inferred
|
||||
without it.
|
||||
- If the underlying module returns a single tensor, the result will be a single concatenated tensor
|
||||
along the same `split_dim` after processing all splits.
|
||||
- If the underlying module returns a tuple of tensors, each element of the tuple will be concatenated
|
||||
along the `split_dim` across all splits, and the final result will be a tuple of concatenated tensors.
|
||||
"""
|
||||
split_inputs = {}
|
||||
|
||||
# 1. Split inputs that were specified during initialization and also present in passed kwargs
|
||||
for key in list(kwargs.keys()):
|
||||
if key not in self.input_kwargs_to_split or not torch.is_tensor(kwargs[key]):
|
||||
continue
|
||||
split_inputs[key] = torch.split(kwargs[key], self.split_size, self.split_dim)
|
||||
kwargs.pop(key)
|
||||
|
||||
# 2. Invoke forward pass across each split
|
||||
results = []
|
||||
for split_input in zip(*split_inputs.values()):
|
||||
inputs = dict(zip(split_inputs.keys(), split_input))
|
||||
inputs.update(kwargs)
|
||||
|
||||
intermediate_tensor_or_tensor_tuple = self.module(*args, **inputs)
|
||||
results.append(intermediate_tensor_or_tensor_tuple)
|
||||
|
||||
# 3. Concatenate split restuls to obtain final outputs
|
||||
if isinstance(results[0], torch.Tensor):
|
||||
return torch.cat(results, dim=self.split_dim)
|
||||
elif isinstance(results[0], tuple):
|
||||
return tuple([torch.cat(x, dim=self.split_dim) for x in zip(*results)])
|
||||
else:
|
||||
raise ValueError(
|
||||
"In order to use the SplitInferenceModule, it is necessary for the underlying `module` to either return a torch.Tensor or a tuple of torch.Tensor's."
|
||||
)
|
||||
|
||||
|
||||
class AnimateDiffFreeNoiseMixin:
|
||||
r"""Mixin class for [FreeNoise](https://arxiv.org/abs/2310.15169) as used in AnimateDiff."""
|
||||
r"""Mixin class for [FreeNoise](https://arxiv.org/abs/2310.15169)."""
|
||||
|
||||
def _enable_free_noise_in_block(self, block: Union[CrossAttnDownBlockMotion, DownBlockMotion, UpBlockMotion]):
|
||||
r"""Helper function to enable FreeNoise in transformer blocks."""
|
||||
@@ -239,9 +69,6 @@ class AnimateDiffFreeNoiseMixin:
|
||||
motion_module.transformer_blocks[i].load_state_dict(
|
||||
basic_transfomer_block.state_dict(), strict=True
|
||||
)
|
||||
motion_module.transformer_blocks[i].set_chunk_feed_forward(
|
||||
basic_transfomer_block._chunk_size, basic_transfomer_block._chunk_dim
|
||||
)
|
||||
|
||||
def _disable_free_noise_in_block(self, block: Union[CrossAttnDownBlockMotion, DownBlockMotion, UpBlockMotion]):
|
||||
r"""Helper function to disable FreeNoise in transformer blocks."""
|
||||
@@ -270,145 +97,6 @@ class AnimateDiffFreeNoiseMixin:
|
||||
motion_module.transformer_blocks[i].load_state_dict(
|
||||
free_noise_transfomer_block.state_dict(), strict=True
|
||||
)
|
||||
motion_module.transformer_blocks[i].set_chunk_feed_forward(
|
||||
free_noise_transfomer_block._chunk_size, free_noise_transfomer_block._chunk_dim
|
||||
)
|
||||
|
||||
def _check_inputs_free_noise(
|
||||
self,
|
||||
prompt,
|
||||
negative_prompt,
|
||||
prompt_embeds,
|
||||
negative_prompt_embeds,
|
||||
num_frames,
|
||||
) -> None:
|
||||
if not isinstance(prompt, (str, dict)):
|
||||
raise ValueError(f"Expected `prompt` to have type `str` or `dict` but found {type(prompt)=}")
|
||||
|
||||
if negative_prompt is not None:
|
||||
if not isinstance(negative_prompt, (str, dict)):
|
||||
raise ValueError(
|
||||
f"Expected `negative_prompt` to have type `str` or `dict` but found {type(negative_prompt)=}"
|
||||
)
|
||||
|
||||
if prompt_embeds is not None or negative_prompt_embeds is not None:
|
||||
raise ValueError("`prompt_embeds` and `negative_prompt_embeds` is not supported in FreeNoise yet.")
|
||||
|
||||
frame_indices = [isinstance(x, int) for x in prompt.keys()]
|
||||
frame_prompts = [isinstance(x, str) for x in prompt.values()]
|
||||
min_frame = min(list(prompt.keys()))
|
||||
max_frame = max(list(prompt.keys()))
|
||||
|
||||
if not all(frame_indices):
|
||||
raise ValueError("Expected integer keys in `prompt` dict for FreeNoise.")
|
||||
if not all(frame_prompts):
|
||||
raise ValueError("Expected str values in `prompt` dict for FreeNoise.")
|
||||
if min_frame != 0:
|
||||
raise ValueError("The minimum frame index in `prompt` dict must be 0 as a starting prompt is necessary.")
|
||||
if max_frame >= num_frames:
|
||||
raise ValueError(
|
||||
f"The maximum frame index in `prompt` dict must be lesser than {num_frames=} and follow 0-based indexing."
|
||||
)
|
||||
|
||||
def _encode_prompt_free_noise(
|
||||
self,
|
||||
prompt: Union[str, Dict[int, str]],
|
||||
num_frames: int,
|
||||
device: torch.device,
|
||||
num_videos_per_prompt: int,
|
||||
do_classifier_free_guidance: bool,
|
||||
negative_prompt: Optional[Union[str, Dict[int, str]]] = None,
|
||||
prompt_embeds: Optional[torch.Tensor] = None,
|
||||
negative_prompt_embeds: Optional[torch.Tensor] = None,
|
||||
lora_scale: Optional[float] = None,
|
||||
clip_skip: Optional[int] = None,
|
||||
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
|
||||
if negative_prompt is None:
|
||||
negative_prompt = ""
|
||||
|
||||
# Ensure that we have a dictionary of prompts
|
||||
if isinstance(prompt, str):
|
||||
prompt = {0: prompt}
|
||||
if isinstance(negative_prompt, str):
|
||||
negative_prompt = {0: negative_prompt}
|
||||
|
||||
self._check_inputs_free_noise(prompt, negative_prompt, prompt_embeds, negative_prompt_embeds, num_frames)
|
||||
|
||||
# Sort the prompts based on frame indices
|
||||
prompt = dict(sorted(prompt.items()))
|
||||
negative_prompt = dict(sorted(negative_prompt.items()))
|
||||
|
||||
# Ensure that we have a prompt for the last frame index
|
||||
prompt[num_frames - 1] = prompt[list(prompt.keys())[-1]]
|
||||
negative_prompt[num_frames - 1] = negative_prompt[list(negative_prompt.keys())[-1]]
|
||||
|
||||
frame_indices = list(prompt.keys())
|
||||
frame_prompts = list(prompt.values())
|
||||
frame_negative_indices = list(negative_prompt.keys())
|
||||
frame_negative_prompts = list(negative_prompt.values())
|
||||
|
||||
# Generate and interpolate positive prompts
|
||||
prompt_embeds, _ = self.encode_prompt(
|
||||
prompt=frame_prompts,
|
||||
device=device,
|
||||
num_images_per_prompt=num_videos_per_prompt,
|
||||
do_classifier_free_guidance=False,
|
||||
negative_prompt=None,
|
||||
prompt_embeds=None,
|
||||
negative_prompt_embeds=None,
|
||||
lora_scale=lora_scale,
|
||||
clip_skip=clip_skip,
|
||||
)
|
||||
|
||||
shape = (num_frames, *prompt_embeds.shape[1:])
|
||||
prompt_interpolation_embeds = prompt_embeds.new_zeros(shape)
|
||||
|
||||
for i in range(len(frame_indices) - 1):
|
||||
start_frame = frame_indices[i]
|
||||
end_frame = frame_indices[i + 1]
|
||||
start_tensor = prompt_embeds[i].unsqueeze(0)
|
||||
end_tensor = prompt_embeds[i + 1].unsqueeze(0)
|
||||
|
||||
prompt_interpolation_embeds[start_frame : end_frame + 1] = self._free_noise_prompt_interpolation_callback(
|
||||
start_frame, end_frame, start_tensor, end_tensor
|
||||
)
|
||||
|
||||
# Generate and interpolate negative prompts
|
||||
negative_prompt_embeds = None
|
||||
negative_prompt_interpolation_embeds = None
|
||||
|
||||
if do_classifier_free_guidance:
|
||||
_, negative_prompt_embeds = self.encode_prompt(
|
||||
prompt=[""] * len(frame_negative_prompts),
|
||||
device=device,
|
||||
num_images_per_prompt=num_videos_per_prompt,
|
||||
do_classifier_free_guidance=True,
|
||||
negative_prompt=frame_negative_prompts,
|
||||
prompt_embeds=None,
|
||||
negative_prompt_embeds=None,
|
||||
lora_scale=lora_scale,
|
||||
clip_skip=clip_skip,
|
||||
)
|
||||
|
||||
negative_prompt_interpolation_embeds = negative_prompt_embeds.new_zeros(shape)
|
||||
|
||||
for i in range(len(frame_negative_indices) - 1):
|
||||
start_frame = frame_negative_indices[i]
|
||||
end_frame = frame_negative_indices[i + 1]
|
||||
start_tensor = negative_prompt_embeds[i].unsqueeze(0)
|
||||
end_tensor = negative_prompt_embeds[i + 1].unsqueeze(0)
|
||||
|
||||
negative_prompt_interpolation_embeds[
|
||||
start_frame : end_frame + 1
|
||||
] = self._free_noise_prompt_interpolation_callback(start_frame, end_frame, start_tensor, end_tensor)
|
||||
|
||||
prompt_embeds = prompt_interpolation_embeds
|
||||
negative_prompt_embeds = negative_prompt_interpolation_embeds
|
||||
|
||||
if do_classifier_free_guidance:
|
||||
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
|
||||
|
||||
return prompt_embeds, negative_prompt_embeds
|
||||
|
||||
def _prepare_latents_free_noise(
|
||||
self,
|
||||
@@ -490,7 +178,6 @@ class AnimateDiffFreeNoiseMixin:
|
||||
context_stride: int = 4,
|
||||
weighting_scheme: str = "pyramid",
|
||||
noise_type: str = "shuffle_context",
|
||||
prompt_interpolation_callback: Optional[Callable[[int, int, torch.Tensor, torch.Tensor], torch.Tensor]] = None,
|
||||
) -> None:
|
||||
r"""
|
||||
Enable long video generation using FreeNoise.
|
||||
@@ -508,27 +195,13 @@ class AnimateDiffFreeNoiseMixin:
|
||||
weighting_scheme (`str`, defaults to `pyramid`):
|
||||
Weighting scheme for averaging latents after accumulation in FreeNoise blocks. The following weighting
|
||||
schemes are supported currently:
|
||||
- "flat"
|
||||
Performs weighting averaging with a flat weight pattern: [1, 1, 1, 1, 1].
|
||||
- "pyramid"
|
||||
Performs weighted averaging with a pyramid like weight pattern: [1, 2, 3, 2, 1].
|
||||
- "delayed_reverse_sawtooth"
|
||||
Performs weighted averaging with low weights for earlier frames and high-to-low weights for
|
||||
later frames: [0.01, 0.01, 3, 2, 1].
|
||||
Peforms weighted averaging with a pyramid like weight pattern: [1, 2, 3, 2, 1].
|
||||
noise_type (`str`, defaults to "shuffle_context"):
|
||||
Must be one of ["shuffle_context", "repeat_context", "random"].
|
||||
- "shuffle_context"
|
||||
Shuffles a fixed batch of `context_length` latents to create a final latent of size
|
||||
`num_frames`. This is usually the best setting for most generation scenarious. However, there
|
||||
might be visible repetition noticeable in the kinds of motion/animation generated.
|
||||
- "repeated_context"
|
||||
Repeats a fixed batch of `context_length` latents to create a final latent of size
|
||||
`num_frames`.
|
||||
- "random"
|
||||
The final latents are random without any repetition.
|
||||
TODO
|
||||
"""
|
||||
|
||||
allowed_weighting_scheme = ["flat", "pyramid", "delayed_reverse_sawtooth"]
|
||||
allowed_weighting_scheme = ["pyramid"]
|
||||
allowed_noise_type = ["shuffle_context", "repeat_context", "random"]
|
||||
|
||||
if context_length > self.motion_adapter.config.motion_max_seq_length:
|
||||
@@ -546,491 +219,18 @@ class AnimateDiffFreeNoiseMixin:
|
||||
self._free_noise_context_stride = context_stride
|
||||
self._free_noise_weighting_scheme = weighting_scheme
|
||||
self._free_noise_noise_type = noise_type
|
||||
self._free_noise_prompt_interpolation_callback = prompt_interpolation_callback or _lerp
|
||||
|
||||
if hasattr(self.unet.mid_block, "motion_modules"):
|
||||
blocks = [*self.unet.down_blocks, self.unet.mid_block, *self.unet.up_blocks]
|
||||
else:
|
||||
blocks = [*self.unet.down_blocks, *self.unet.up_blocks]
|
||||
|
||||
blocks = [*self.unet.down_blocks, self.unet.mid_block, *self.unet.up_blocks]
|
||||
for block in blocks:
|
||||
self._enable_free_noise_in_block(block)
|
||||
|
||||
def disable_free_noise(self) -> None:
|
||||
r"""Disable the FreeNoise sampling mechanism."""
|
||||
self._free_noise_context_length = None
|
||||
|
||||
if hasattr(self.unet.mid_block, "motion_modules"):
|
||||
blocks = [*self.unet.down_blocks, self.unet.mid_block, *self.unet.up_blocks]
|
||||
else:
|
||||
blocks = [*self.unet.down_blocks, *self.unet.up_blocks]
|
||||
|
||||
blocks = [*self.unet.down_blocks, self.unet.mid_block, *self.unet.up_blocks]
|
||||
for block in blocks:
|
||||
self._disable_free_noise_in_block(block)
|
||||
|
||||
def _enable_split_inference_motion_modules_(
|
||||
self, motion_modules: List[AnimateDiffTransformer3D], spatial_split_size: int
|
||||
) -> None:
|
||||
for motion_module in motion_modules:
|
||||
motion_module.proj_in = SplitInferenceModule(motion_module.proj_in, spatial_split_size, 0, ["input"])
|
||||
|
||||
for i in range(len(motion_module.transformer_blocks)):
|
||||
motion_module.transformer_blocks[i] = SplitInferenceModule(
|
||||
motion_module.transformer_blocks[i],
|
||||
spatial_split_size,
|
||||
0,
|
||||
["hidden_states", "encoder_hidden_states"],
|
||||
)
|
||||
|
||||
motion_module.proj_out = SplitInferenceModule(motion_module.proj_out, spatial_split_size, 0, ["input"])
|
||||
|
||||
def _enable_split_inference_attentions_(
|
||||
self, attentions: List[Transformer2DModel], temporal_split_size: int
|
||||
) -> None:
|
||||
for i in range(len(attentions)):
|
||||
attentions[i] = SplitInferenceModule(
|
||||
attentions[i], temporal_split_size, 0, ["hidden_states", "encoder_hidden_states"]
|
||||
)
|
||||
|
||||
def _enable_split_inference_resnets_(self, resnets: List[ResnetBlock2D], temporal_split_size: int) -> None:
|
||||
for i in range(len(resnets)):
|
||||
resnets[i] = SplitInferenceModule(resnets[i], temporal_split_size, 0, ["input_tensor", "temb"])
|
||||
|
||||
def _enable_split_inference_samplers_(
|
||||
self, samplers: Union[List[Downsample2D], List[Upsample2D]], temporal_split_size: int
|
||||
) -> None:
|
||||
for i in range(len(samplers)):
|
||||
samplers[i] = SplitInferenceModule(samplers[i], temporal_split_size, 0, ["hidden_states"])
|
||||
|
||||
def enable_free_noise_split_inference(self, spatial_split_size: int = 256, temporal_split_size: int = 16) -> None:
|
||||
r"""
|
||||
Enable FreeNoise memory optimizations by utilizing
|
||||
[`~diffusers.pipelines.free_noise_utils.SplitInferenceModule`] across different intermediate modeling blocks.
|
||||
|
||||
Args:
|
||||
spatial_split_size (`int`, defaults to `256`):
|
||||
The split size across spatial dimensions for internal blocks. This is used in facilitating split
|
||||
inference across the effective batch dimension (`[B x H x W, F, C]`) of intermediate tensors in motion
|
||||
modeling blocks.
|
||||
temporal_split_size (`int`, defaults to `16`):
|
||||
The split size across temporal dimensions for internal blocks. This is used in facilitating split
|
||||
inference across the effective batch dimension (`[B x F, H x W, C]`) of intermediate tensors in spatial
|
||||
attention, resnets, downsampling and upsampling blocks.
|
||||
"""
|
||||
# TODO(aryan): Discuss on what's the best way to provide more control to users
|
||||
blocks = [*self.unet.down_blocks, self.unet.mid_block, *self.unet.up_blocks]
|
||||
for block in blocks:
|
||||
if getattr(block, "motion_modules", None) is not None:
|
||||
self._enable_split_inference_motion_modules_(block.motion_modules, spatial_split_size)
|
||||
if getattr(block, "attentions", None) is not None:
|
||||
self._enable_split_inference_attentions_(block.attentions, temporal_split_size)
|
||||
if getattr(block, "resnets", None) is not None:
|
||||
self._enable_split_inference_resnets_(block.resnets, temporal_split_size)
|
||||
if getattr(block, "downsamplers", None) is not None:
|
||||
self._enable_split_inference_samplers_(block.downsamplers, temporal_split_size)
|
||||
if getattr(block, "upsamplers", None) is not None:
|
||||
self._enable_split_inference_samplers_(block.upsamplers, temporal_split_size)
|
||||
|
||||
@property
|
||||
def free_noise_enabled(self):
|
||||
return hasattr(self, "_free_noise_context_length") and self._free_noise_context_length is not None
|
||||
|
||||
|
||||
class CogVideoXFreeNoiseMixin:
|
||||
r"""Mixin class for [FreeNoise](https://arxiv.org/abs/2310.15169) as used in CogVideoX."""
|
||||
|
||||
def _enable_free_noise_in_block(self, transformer: CogVideoXTransformer3DModel):
|
||||
r"""Helper function to enable FreeNoise in transformer blocks."""
|
||||
patch_embed = transformer.patch_embed
|
||||
transformer.patch_embed = FreeNoiseCogVideoXPatchEmbed(
|
||||
patch_size=patch_embed.patch_size,
|
||||
in_channels=patch_embed.in_channels,
|
||||
embed_dim=patch_embed.embed_dim,
|
||||
text_embed_dim=patch_embed.text_embed_dim,
|
||||
bias=patch_embed.bias,
|
||||
sample_width=patch_embed.sample_width,
|
||||
sample_height=patch_embed.sample_height,
|
||||
sample_frames=patch_embed.sample_frames,
|
||||
temporal_compression_ratio=patch_embed.temporal_compression_ratio,
|
||||
max_text_seq_length=patch_embed.max_text_seq_length,
|
||||
spatial_interpolation_scale=patch_embed.spatial_interpolation_scale,
|
||||
temporal_interpolation_scale=patch_embed.temporal_interpolation_scale,
|
||||
use_positional_embeddings=patch_embed.use_positional_embeddings,
|
||||
).to(device=self.device, dtype=self.dtype)
|
||||
|
||||
transformer.patch_embed.load_state_dict(patch_embed.state_dict(), strict=True)
|
||||
|
||||
for i in range(len(transformer.transformer_blocks)):
|
||||
block = transformer.transformer_blocks[i]
|
||||
|
||||
if isinstance(block, FreeNoiseCogVideoXBlock):
|
||||
block.set_free_noise_properties(
|
||||
self._free_noise_context_length,
|
||||
self._free_noise_context_stride,
|
||||
self._free_noise_weighting_scheme,
|
||||
self._free_noise_prompt_interpolation_callback,
|
||||
self._free_noise_prompt_pooling_callback,
|
||||
)
|
||||
else:
|
||||
transformer.transformer_blocks[i] = FreeNoiseCogVideoXBlock(
|
||||
dim=block.dim,
|
||||
num_attention_heads=block.num_attention_heads,
|
||||
attention_head_dim=block.attention_head_dim,
|
||||
time_embed_dim=block.time_embed_dim,
|
||||
dropout=block.dropout,
|
||||
activation_fn=block.activation_fn,
|
||||
attention_bias=block.attention_bias,
|
||||
qk_norm=block.qk_norm,
|
||||
norm_elementwise_affine=block.norm_elementwise_affine,
|
||||
norm_eps=block.norm_eps,
|
||||
final_dropout=block.final_dropout,
|
||||
ff_inner_dim=block.ff_inner_dim,
|
||||
ff_bias=block.ff_bias,
|
||||
attention_out_bias=block.attention_out_bias,
|
||||
context_length=self._free_noise_context_length,
|
||||
context_stride=self._free_noise_context_stride,
|
||||
weighting_scheme=self._free_noise_weighting_scheme,
|
||||
prompt_interpolation_callback=self._free_noise_prompt_interpolation_callback,
|
||||
prompt_pooling_callback=self._free_noise_prompt_pooling_callback,
|
||||
).to(device=self.device, dtype=self.dtype)
|
||||
|
||||
transformer.transformer_blocks[i].load_state_dict(block.state_dict(), strict=True)
|
||||
|
||||
def _disable_free_noise_in_block(self, transformer: CogVideoXTransformer3DModel):
|
||||
r"""Helper function to disable FreeNoise in transformer blocks."""
|
||||
patch_embed = transformer.patch_embed
|
||||
transformer.patch_embed = CogVideoXPatchEmbed(
|
||||
patch_size=patch_embed.patch_size,
|
||||
in_channels=patch_embed.in_channels,
|
||||
embed_dim=patch_embed.embed_dim,
|
||||
text_embed_dim=patch_embed.text_embed_dim,
|
||||
bias=patch_embed.bias,
|
||||
sample_width=patch_embed.sample_width,
|
||||
sample_height=patch_embed.sample_height,
|
||||
sample_frames=patch_embed.sample_frames,
|
||||
temporal_compression_ratio=patch_embed.temporal_compression_ratio,
|
||||
max_text_seq_length=patch_embed.max_text_seq_length,
|
||||
spatial_interpolation_scale=patch_embed.spatial_interpolation_scale,
|
||||
temporal_interpolation_scale=patch_embed.temporal_interpolation_scale,
|
||||
use_positional_embeddings=patch_embed.use_positional_embeddings,
|
||||
).to(device=self.device, dtype=self.dtype)
|
||||
|
||||
transformer.patch_embed.load_state_dict(patch_embed.state_dict(), strict=True)
|
||||
|
||||
for i in range(len(transformer.transformer_blocks)):
|
||||
block = transformer.transformer_blocks[i]
|
||||
|
||||
if isinstance(block, FreeNoiseCogVideoXBlock):
|
||||
transformer.transformer_blocks[i] = CogVideoXBlock(
|
||||
dim=block.dim,
|
||||
num_attention_heads=block.num_attention_heads,
|
||||
attention_head_dim=block.attention_head_dim,
|
||||
time_embed_dim=block.time_embed_dim,
|
||||
dropout=block.dropout,
|
||||
activation_fn=block.activation_fn,
|
||||
attention_bias=block.attention_bias,
|
||||
qk_norm=block.qk_norm,
|
||||
norm_elementwise_affine=block.norm_elementwise_affine,
|
||||
norm_eps=block.norm_eps,
|
||||
final_dropout=block.final_dropout,
|
||||
ff_inner_dim=block.ff_inner_dim,
|
||||
ff_bias=block.ff_bias,
|
||||
attention_out_bias=block.attention_out_bias,
|
||||
).to(device=self.device, dtype=self.dtype)
|
||||
|
||||
transformer.transformer_blocks[i].load_state_dict(block.state_dict(), strict=True)
|
||||
|
||||
def _check_inputs_free_noise(
|
||||
self,
|
||||
prompt,
|
||||
negative_prompt,
|
||||
prompt_embeds,
|
||||
negative_prompt_embeds,
|
||||
num_frames,
|
||||
) -> None:
|
||||
if not isinstance(prompt, (str, dict)):
|
||||
raise ValueError(f"Expected `prompt` to have type `str` or `dict` but found {type(prompt)=}")
|
||||
|
||||
if negative_prompt is not None:
|
||||
if not isinstance(negative_prompt, (str, dict)):
|
||||
raise ValueError(
|
||||
f"Expected `negative_prompt` to have type `str` or `dict` but found {type(negative_prompt)=}"
|
||||
)
|
||||
|
||||
if prompt_embeds is not None or negative_prompt_embeds is not None:
|
||||
raise ValueError("`prompt_embeds` and `negative_prompt_embeds` is not supported in FreeNoise yet.")
|
||||
|
||||
frame_indices = [isinstance(x, int) for x in prompt.keys()]
|
||||
frame_prompts = [isinstance(x, str) for x in prompt.values()]
|
||||
min_frame = min(list(prompt.keys()))
|
||||
max_frame = max(list(prompt.keys()))
|
||||
|
||||
if not all(frame_indices):
|
||||
raise ValueError("Expected integer keys in `prompt` dict for FreeNoise.")
|
||||
if not all(frame_prompts):
|
||||
raise ValueError("Expected str values in `prompt` dict for FreeNoise.")
|
||||
if min_frame != 0:
|
||||
raise ValueError("The minimum frame index in `prompt` dict must be 0 as a starting prompt is necessary.")
|
||||
if max_frame >= num_frames:
|
||||
raise ValueError(
|
||||
f"The maximum frame index in `prompt` dict must be lesser than {num_frames=} and follow 0-based indexing."
|
||||
)
|
||||
|
||||
def _encode_prompt_free_noise(
|
||||
self,
|
||||
prompt: Union[str, Dict[int, str]],
|
||||
num_frames: int,
|
||||
device: torch.device,
|
||||
num_videos_per_prompt: int,
|
||||
do_classifier_free_guidance: bool,
|
||||
negative_prompt: Optional[Union[str, Dict[int, str]]] = None,
|
||||
prompt_embeds: Optional[torch.Tensor] = None,
|
||||
negative_prompt_embeds: Optional[torch.Tensor] = None,
|
||||
max_sequence_length: Optional[int] = None,
|
||||
) -> Tuple[Dict[int, torch.Tensor], Optional[Dict[int, torch.Tensor]]]:
|
||||
if negative_prompt is None:
|
||||
negative_prompt = ""
|
||||
|
||||
# Ensure that we have a dictionary of prompts
|
||||
if isinstance(prompt, str):
|
||||
prompt = {0: prompt}
|
||||
if isinstance(negative_prompt, str):
|
||||
negative_prompt = {0: negative_prompt}
|
||||
|
||||
self._check_inputs_free_noise(prompt, negative_prompt, prompt_embeds, negative_prompt_embeds, num_frames)
|
||||
|
||||
# Sort the prompts based on frame indices
|
||||
prompt = dict(sorted(prompt.items()))
|
||||
negative_prompt = dict(sorted(negative_prompt.items()))
|
||||
|
||||
# Ensure that we have a prompt for the last frame index
|
||||
prompt[num_frames - 1] = prompt[list(prompt.keys())[-1]]
|
||||
negative_prompt[num_frames - 1] = negative_prompt[list(negative_prompt.keys())[-1]]
|
||||
|
||||
frame_indices = list(prompt.keys())
|
||||
frame_prompts = list(prompt.values())
|
||||
frame_negative_indices = list(negative_prompt.keys())
|
||||
frame_negative_prompts = list(negative_prompt.values())
|
||||
|
||||
# Generate and bucketify positive prompts
|
||||
prompt_embeds, _ = self.encode_prompt(
|
||||
prompt=frame_prompts,
|
||||
device=device,
|
||||
num_videos_per_prompt=num_videos_per_prompt,
|
||||
do_classifier_free_guidance=False,
|
||||
negative_prompt=None,
|
||||
prompt_embeds=None,
|
||||
negative_prompt_embeds=None,
|
||||
max_sequence_length=max_sequence_length,
|
||||
)
|
||||
|
||||
# If many prompts fall into the same latent frame bucket, pool them
|
||||
prompt_embeds_frame_buckets = collections.defaultdict(lambda: [])
|
||||
for i in range(len(frame_indices)):
|
||||
latent_frame_index = frame_indices[i] // self.transformer.config.temporal_compression_ratio
|
||||
prompt_embeds_frame_buckets[latent_frame_index].append(prompt_embeds[i].unsqueeze(0))
|
||||
|
||||
for frame_index, embeds in list(prompt_embeds_frame_buckets.items()):
|
||||
prompt_embeds_frame_buckets[frame_index] = self._free_noise_prompt_pooling_callback(embeds)
|
||||
|
||||
# Generate and bucketify negative prompts
|
||||
negative_prompt_embeds_frame_buckets = None
|
||||
|
||||
if do_classifier_free_guidance:
|
||||
_, negative_prompt_embeds = self.encode_prompt(
|
||||
prompt=[""] * len(frame_negative_prompts),
|
||||
device=device,
|
||||
num_videos_per_prompt=num_videos_per_prompt,
|
||||
do_classifier_free_guidance=True,
|
||||
negative_prompt=frame_negative_prompts,
|
||||
prompt_embeds=None,
|
||||
negative_prompt_embeds=None,
|
||||
max_sequence_length=max_sequence_length,
|
||||
)
|
||||
|
||||
# If many prompts fall into the same latent frame bucket, pool them
|
||||
negative_prompt_embeds_frame_buckets = collections.defaultdict(lambda: [])
|
||||
for i in range(len(frame_negative_indices)):
|
||||
latent_frame_index = frame_negative_indices[i] // self.transformer.config.temporal_compression_ratio
|
||||
negative_prompt_embeds_frame_buckets[latent_frame_index].append(negative_prompt_embeds[i].unsqueeze(0))
|
||||
|
||||
for frame_index, embeds in list(negative_prompt_embeds_frame_buckets.items()):
|
||||
negative_prompt_embeds_frame_buckets[frame_index] = self._free_noise_prompt_pooling_callback(embeds)
|
||||
|
||||
prompt_embeds_frame_buckets = (negative_prompt_embeds_frame_buckets, prompt_embeds_frame_buckets)
|
||||
else:
|
||||
prompt_embeds_frame_buckets = (prompt_embeds_frame_buckets,)
|
||||
|
||||
return prompt_embeds_frame_buckets, negative_prompt_embeds_frame_buckets
|
||||
|
||||
def _prepare_latents_free_noise(
|
||||
self,
|
||||
batch_size: int,
|
||||
num_channels_latents: int,
|
||||
num_frames: int,
|
||||
height: int,
|
||||
width: int,
|
||||
dtype: torch.dtype,
|
||||
device: torch.device,
|
||||
generator: Optional[torch.Generator] = None,
|
||||
latents: Optional[torch.Tensor] = None,
|
||||
):
|
||||
if isinstance(generator, list) and len(generator) != batch_size:
|
||||
raise ValueError(
|
||||
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
|
||||
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
|
||||
)
|
||||
|
||||
context_num_frames = (
|
||||
self._free_noise_context_length if self._free_noise_context_length == "repeat_context" else num_frames
|
||||
)
|
||||
|
||||
shape = (
|
||||
batch_size,
|
||||
context_num_frames,
|
||||
num_channels_latents,
|
||||
height // self.vae_scale_factor_spatial,
|
||||
width // self.vae_scale_factor_spatial,
|
||||
)
|
||||
|
||||
if latents is None:
|
||||
latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
|
||||
if self._free_noise_noise_type == "random":
|
||||
return latents
|
||||
else:
|
||||
if latents.size(2) == num_frames:
|
||||
return latents
|
||||
elif latents.size(2) != self._free_noise_context_length:
|
||||
raise ValueError(
|
||||
f"You have passed `latents` as a parameter to FreeNoise. The expected number of frames is either {num_frames} or {self._free_noise_context_length}, but found {latents.size(2)}"
|
||||
)
|
||||
latents = latents.to(device)
|
||||
|
||||
if self._free_noise_noise_type == "shuffle_context":
|
||||
for i in range(self._free_noise_context_length, num_frames, self._free_noise_context_stride):
|
||||
# ensure window is within bounds
|
||||
window_start = max(0, i - self._free_noise_context_length)
|
||||
window_end = min(num_frames, window_start + self._free_noise_context_stride)
|
||||
window_length = window_end - window_start
|
||||
|
||||
if window_length == 0:
|
||||
break
|
||||
|
||||
indices = torch.LongTensor(list(range(window_start, window_end)))
|
||||
shuffled_indices = indices[torch.randperm(window_length, generator=generator)]
|
||||
|
||||
current_start = i
|
||||
current_end = min(num_frames, current_start + window_length)
|
||||
if current_end == current_start + window_length:
|
||||
# batch of frames perfectly fits the window
|
||||
latents[:, current_start:current_end] = latents[:, shuffled_indices]
|
||||
else:
|
||||
# handle the case where the last batch of frames does not fit perfectly with the window
|
||||
prefix_length = current_end - current_start
|
||||
shuffled_indices = shuffled_indices[:prefix_length]
|
||||
latents[:, current_start:current_end] = latents[:, shuffled_indices]
|
||||
|
||||
elif self._free_noise_noise_type == "repeat_context":
|
||||
num_repeats = (num_frames + self._free_noise_context_length - 1) // self._free_noise_context_length
|
||||
latents = torch.cat([latents] * num_repeats, dim=1)
|
||||
|
||||
latents = latents[:, :num_frames]
|
||||
return latents
|
||||
|
||||
def enable_free_noise(
|
||||
self,
|
||||
context_length: Optional[int] = 13, # 49 pixel-space frames
|
||||
context_stride: int = 4, # 16 pixel-space frames
|
||||
weighting_scheme: str = "pyramid",
|
||||
noise_type: str = "shuffle_context",
|
||||
prompt_interpolation_callback: Optional[Callable[[int, int, torch.Tensor, torch.Tensor], torch.Tensor]] = None,
|
||||
prompt_pooling_type_or_callback: Optional[Union[str, Callable[[List[torch.Tensor]], torch.Tensor]]] = None,
|
||||
) -> None:
|
||||
r"""
|
||||
Enable long video generation using FreeNoise.
|
||||
|
||||
Args:
|
||||
context_length (`int`, defaults to `16`, *optional*):
|
||||
The number of video frames to process at once. It's recommended to set this to the maximum frames the
|
||||
Motion Adapter was trained with (usually 16/24/32). If `None`, the default value from the motion
|
||||
adapter config is used.
|
||||
context_stride (`int`, *optional*):
|
||||
Long videos are generated by processing many frames. FreeNoise processes these frames in sliding
|
||||
windows of size `context_length`. Context stride allows you to specify how many frames to skip between
|
||||
each window. For example, a context length of 16 and context stride of 4 would process 24 frames as:
|
||||
[0, 15], [4, 19], [8, 23] (0-based indexing)
|
||||
weighting_scheme (`str`, defaults to `pyramid`):
|
||||
Weighting scheme for averaging latents after accumulation in FreeNoise blocks. The following weighting
|
||||
schemes are supported currently:
|
||||
- "flat"
|
||||
Performs weighting averaging with a flat weight pattern: [1, 1, 1, 1, 1].
|
||||
- "pyramid"
|
||||
Performs weighted averaging with a pyramid like weight pattern: [1, 2, 3, 2, 1].
|
||||
- "delayed_reverse_sawtooth"
|
||||
Performs weighted averaging with low weights for earlier frames and high-to-low weights for
|
||||
later frames: [0.01, 0.01, 3, 2, 1].
|
||||
noise_type (`str`, defaults to "shuffle_context"):
|
||||
Must be one of ["shuffle_context", "repeat_context", "random"].
|
||||
- "shuffle_context"
|
||||
Shuffles a fixed batch of `context_length` latents to create a final latent of size
|
||||
`num_frames`. This is usually the best setting for most generation scenarious. However, there
|
||||
might be visible repetition noticeable in the kinds of motion/animation generated.
|
||||
- "repeated_context"
|
||||
Repeats a fixed batch of `context_length` latents to create a final latent of size
|
||||
`num_frames`.
|
||||
- "random"
|
||||
The final latents are random without any repetition.
|
||||
"""
|
||||
|
||||
allowed_weighting_scheme = ["flat", "pyramid", "delayed_reverse_sawtooth"]
|
||||
allowed_noise_type = ["shuffle_context", "repeat_context", "random"]
|
||||
|
||||
# Note: we need to do this because when CogVideoX was originally introduced, it used the wrong value
|
||||
# for `sample_frames`. It should have been 13 and not 49 because the expected number of latent frames
|
||||
# in the transformer is 13.
|
||||
# Maybe try and look into what can be done before 1.0.0 release.
|
||||
if context_length % 2 == 0:
|
||||
sample_frames_context_length = context_length * self.transformer.config.temporal_compression_ratio
|
||||
else:
|
||||
sample_frames_context_length = (
|
||||
context_length - 1
|
||||
) * self.transformer.config.temporal_compression_ratio + 1
|
||||
|
||||
if sample_frames_context_length > self.transformer.config.sample_frames:
|
||||
logger.warning(
|
||||
f"You have set {context_length=} which is greater than {self.motion_adapter.config.motion_max_seq_length=}. This can lead to bad generation results."
|
||||
)
|
||||
if weighting_scheme not in allowed_weighting_scheme:
|
||||
raise ValueError(
|
||||
f"The parameter `weighting_scheme` must be one of {allowed_weighting_scheme}, but got {weighting_scheme=}"
|
||||
)
|
||||
if noise_type not in allowed_noise_type:
|
||||
raise ValueError(f"The parameter `noise_type` must be one of {allowed_noise_type}, but got {noise_type=}")
|
||||
|
||||
self._free_noise_context_length = context_length or self.motion_adapter.config.motion_max_seq_length
|
||||
self._free_noise_context_stride = context_stride
|
||||
self._free_noise_weighting_scheme = weighting_scheme
|
||||
self._free_noise_noise_type = noise_type
|
||||
self._free_noise_prompt_interpolation_callback = prompt_interpolation_callback or _lerp
|
||||
|
||||
if prompt_pooling_type_or_callback is None:
|
||||
prompt_pooling_type_or_callback = "pyramid"
|
||||
if isinstance(prompt_pooling_type_or_callback, str):
|
||||
self._free_noise_prompt_pooling_callback = functools.partial(
|
||||
_weighted_pooling, pooling_type=prompt_pooling_type_or_callback
|
||||
)
|
||||
else:
|
||||
self._free_noise_prompt_pooling_callback = prompt_pooling_type_or_callback
|
||||
|
||||
self._enable_free_noise_in_block(self.transformer)
|
||||
|
||||
def disable_free_noise(self) -> None:
|
||||
r"""Disable the FreeNoise sampling mechanism."""
|
||||
self._free_noise_context_length = None
|
||||
|
||||
self._disable_free_noise_in_block(self.transformer)
|
||||
|
||||
@property
|
||||
def free_noise_enabled(self):
|
||||
return hasattr(self, "_free_noise_context_length") and self._free_noise_context_length is not None
|
||||
|
||||
@@ -547,7 +547,7 @@ class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline):
|
||||
negative_image_embeds = prior_outputs[1]
|
||||
|
||||
prompt = [prompt] if not isinstance(prompt, (list, tuple)) else prompt
|
||||
image = [image] if isinstance(image, PIL.Image.Image) else image
|
||||
image = [image] if isinstance(prompt, PIL.Image.Image) else image
|
||||
|
||||
if len(prompt) < image_embeds.shape[0] and image_embeds.shape[0] % len(prompt) == 0:
|
||||
prompt = (image_embeds.shape[0] // len(prompt)) * prompt
|
||||
@@ -813,7 +813,7 @@ class KandinskyV22InpaintCombinedPipeline(DiffusionPipeline):
|
||||
negative_image_embeds = prior_outputs[1]
|
||||
|
||||
prompt = [prompt] if not isinstance(prompt, (list, tuple)) else prompt
|
||||
image = [image] if isinstance(image, PIL.Image.Image) else image
|
||||
image = [image] if isinstance(prompt, PIL.Image.Image) else image
|
||||
mask_image = [mask_image] if isinstance(mask_image, PIL.Image.Image) else mask_image
|
||||
|
||||
if len(prompt) < image_embeds.shape[0] and image_embeds.shape[0] % len(prompt) == 0:
|
||||
|
||||
@@ -564,16 +564,14 @@ class KolorsImg2ImgPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffu
|
||||
if denoising_start is None:
|
||||
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
|
||||
t_start = max(num_inference_steps - init_timestep, 0)
|
||||
|
||||
timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
|
||||
if hasattr(self.scheduler, "set_begin_index"):
|
||||
self.scheduler.set_begin_index(t_start * self.scheduler.order)
|
||||
|
||||
return timesteps, num_inference_steps - t_start
|
||||
|
||||
else:
|
||||
# Strength is irrelevant if we directly request a timestep to start at;
|
||||
# that is, strength is determined by the denoising_start instead.
|
||||
t_start = 0
|
||||
|
||||
timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
|
||||
|
||||
# Strength is irrelevant if we directly request a timestep to start at;
|
||||
# that is, strength is determined by the denoising_start instead.
|
||||
if denoising_start is not None:
|
||||
discrete_timestep_cutoff = int(
|
||||
round(
|
||||
self.scheduler.config.num_train_timesteps
|
||||
@@ -581,7 +579,7 @@ class KolorsImg2ImgPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffu
|
||||
)
|
||||
)
|
||||
|
||||
num_inference_steps = (self.scheduler.timesteps < discrete_timestep_cutoff).sum().item()
|
||||
num_inference_steps = (timesteps < discrete_timestep_cutoff).sum().item()
|
||||
if self.scheduler.order == 2 and num_inference_steps % 2 == 0:
|
||||
# if the scheduler is a 2nd order scheduler we might have to do +1
|
||||
# because `num_inference_steps` might be even given that every timestep
|
||||
@@ -592,12 +590,11 @@ class KolorsImg2ImgPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffu
|
||||
num_inference_steps = num_inference_steps + 1
|
||||
|
||||
# because t_n+1 >= t_n, we slice the timesteps starting from the end
|
||||
t_start = len(self.scheduler.timesteps) - num_inference_steps
|
||||
timesteps = self.scheduler.timesteps[t_start:]
|
||||
if hasattr(self.scheduler, "set_begin_index"):
|
||||
self.scheduler.set_begin_index(t_start)
|
||||
timesteps = timesteps[-num_inference_steps:]
|
||||
return timesteps, num_inference_steps
|
||||
|
||||
return timesteps, num_inference_steps - t_start
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline.prepare_latents
|
||||
def prepare_latents(
|
||||
self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None, add_noise=True
|
||||
|
||||
@@ -734,8 +734,6 @@ class AnimateDiffPAGPipeline(
|
||||
elif self.do_classifier_free_guidance:
|
||||
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
|
||||
|
||||
prompt_embeds = prompt_embeds.repeat_interleave(repeats=num_frames, dim=0)
|
||||
|
||||
if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
|
||||
ip_adapter_image_embeds = self.prepare_ip_adapter_image_embeds(
|
||||
ip_adapter_image,
|
||||
@@ -807,9 +805,7 @@ class AnimateDiffPAGPipeline(
|
||||
with self.progress_bar(total=self._num_timesteps) as progress_bar:
|
||||
for i, t in enumerate(timesteps):
|
||||
# expand the latents if we are doing classifier free guidance
|
||||
latent_model_input = torch.cat(
|
||||
[latents] * (prompt_embeds.shape[0] // num_frames // latents.shape[0])
|
||||
)
|
||||
latent_model_input = torch.cat([latents] * (prompt_embeds.shape[0] // latents.shape[0]))
|
||||
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
|
||||
|
||||
# predict the noise residual
|
||||
|
||||
@@ -648,16 +648,14 @@ class StableDiffusionXLPAGImg2ImgPipeline(
|
||||
if denoising_start is None:
|
||||
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
|
||||
t_start = max(num_inference_steps - init_timestep, 0)
|
||||
|
||||
timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
|
||||
if hasattr(self.scheduler, "set_begin_index"):
|
||||
self.scheduler.set_begin_index(t_start * self.scheduler.order)
|
||||
|
||||
return timesteps, num_inference_steps - t_start
|
||||
|
||||
else:
|
||||
# Strength is irrelevant if we directly request a timestep to start at;
|
||||
# that is, strength is determined by the denoising_start instead.
|
||||
t_start = 0
|
||||
|
||||
timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
|
||||
|
||||
# Strength is irrelevant if we directly request a timestep to start at;
|
||||
# that is, strength is determined by the denoising_start instead.
|
||||
if denoising_start is not None:
|
||||
discrete_timestep_cutoff = int(
|
||||
round(
|
||||
self.scheduler.config.num_train_timesteps
|
||||
@@ -665,7 +663,7 @@ class StableDiffusionXLPAGImg2ImgPipeline(
|
||||
)
|
||||
)
|
||||
|
||||
num_inference_steps = (self.scheduler.timesteps < discrete_timestep_cutoff).sum().item()
|
||||
num_inference_steps = (timesteps < discrete_timestep_cutoff).sum().item()
|
||||
if self.scheduler.order == 2 and num_inference_steps % 2 == 0:
|
||||
# if the scheduler is a 2nd order scheduler we might have to do +1
|
||||
# because `num_inference_steps` might be even given that every timestep
|
||||
@@ -676,12 +674,11 @@ class StableDiffusionXLPAGImg2ImgPipeline(
|
||||
num_inference_steps = num_inference_steps + 1
|
||||
|
||||
# because t_n+1 >= t_n, we slice the timesteps starting from the end
|
||||
t_start = len(self.scheduler.timesteps) - num_inference_steps
|
||||
timesteps = self.scheduler.timesteps[t_start:]
|
||||
if hasattr(self.scheduler, "set_begin_index"):
|
||||
self.scheduler.set_begin_index(t_start)
|
||||
timesteps = timesteps[-num_inference_steps:]
|
||||
return timesteps, num_inference_steps
|
||||
|
||||
return timesteps, num_inference_steps - t_start
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline.prepare_latents
|
||||
def prepare_latents(
|
||||
self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None, add_noise=True
|
||||
|
||||
@@ -897,16 +897,14 @@ class StableDiffusionXLPAGInpaintPipeline(
|
||||
if denoising_start is None:
|
||||
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
|
||||
t_start = max(num_inference_steps - init_timestep, 0)
|
||||
|
||||
timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
|
||||
if hasattr(self.scheduler, "set_begin_index"):
|
||||
self.scheduler.set_begin_index(t_start * self.scheduler.order)
|
||||
|
||||
return timesteps, num_inference_steps - t_start
|
||||
|
||||
else:
|
||||
# Strength is irrelevant if we directly request a timestep to start at;
|
||||
# that is, strength is determined by the denoising_start instead.
|
||||
t_start = 0
|
||||
|
||||
timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
|
||||
|
||||
# Strength is irrelevant if we directly request a timestep to start at;
|
||||
# that is, strength is determined by the denoising_start instead.
|
||||
if denoising_start is not None:
|
||||
discrete_timestep_cutoff = int(
|
||||
round(
|
||||
self.scheduler.config.num_train_timesteps
|
||||
@@ -914,7 +912,7 @@ class StableDiffusionXLPAGInpaintPipeline(
|
||||
)
|
||||
)
|
||||
|
||||
num_inference_steps = (self.scheduler.timesteps < discrete_timestep_cutoff).sum().item()
|
||||
num_inference_steps = (timesteps < discrete_timestep_cutoff).sum().item()
|
||||
if self.scheduler.order == 2 and num_inference_steps % 2 == 0:
|
||||
# if the scheduler is a 2nd order scheduler we might have to do +1
|
||||
# because `num_inference_steps` might be even given that every timestep
|
||||
@@ -925,12 +923,11 @@ class StableDiffusionXLPAGInpaintPipeline(
|
||||
num_inference_steps = num_inference_steps + 1
|
||||
|
||||
# because t_n+1 >= t_n, we slice the timesteps starting from the end
|
||||
t_start = len(self.scheduler.timesteps) - num_inference_steps
|
||||
timesteps = self.scheduler.timesteps[t_start:]
|
||||
if hasattr(self.scheduler, "set_begin_index"):
|
||||
self.scheduler.set_begin_index(t_start)
|
||||
timesteps = timesteps[-num_inference_steps:]
|
||||
return timesteps, num_inference_steps
|
||||
|
||||
return timesteps, num_inference_steps - t_start
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline._get_add_time_ids
|
||||
def _get_add_time_ids(
|
||||
self,
|
||||
|
||||
@@ -824,8 +824,6 @@ class PIAPipeline(
|
||||
if self.do_classifier_free_guidance:
|
||||
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
|
||||
|
||||
prompt_embeds = prompt_embeds.repeat_interleave(repeats=num_frames, dim=0)
|
||||
|
||||
if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
|
||||
image_embeds = self.prepare_ip_adapter_image_embeds(
|
||||
ip_adapter_image,
|
||||
|
||||
@@ -25,7 +25,7 @@ from transformers import (
|
||||
|
||||
from ...callbacks import MultiPipelineCallbacks, PipelineCallback
|
||||
from ...image_processor import PipelineImageInput, VaeImageProcessor
|
||||
from ...loaders import FromSingleFileMixin, SD3LoraLoaderMixin
|
||||
from ...loaders import SD3LoraLoaderMixin
|
||||
from ...models.autoencoders import AutoencoderKL
|
||||
from ...models.transformers import SD3Transformer2DModel
|
||||
from ...schedulers import FlowMatchEulerDiscreteScheduler
|
||||
@@ -148,7 +148,7 @@ def retrieve_timesteps(
|
||||
return timesteps, num_inference_steps
|
||||
|
||||
|
||||
class StableDiffusion3InpaintPipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin):
|
||||
class StableDiffusion3InpaintPipeline(DiffusionPipeline):
|
||||
r"""
|
||||
Args:
|
||||
transformer ([`SD3Transformer2DModel`]):
|
||||
|
||||
+11
-14
@@ -640,16 +640,14 @@ class StableDiffusionXLImg2ImgPipeline(
|
||||
if denoising_start is None:
|
||||
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
|
||||
t_start = max(num_inference_steps - init_timestep, 0)
|
||||
|
||||
timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
|
||||
if hasattr(self.scheduler, "set_begin_index"):
|
||||
self.scheduler.set_begin_index(t_start * self.scheduler.order)
|
||||
|
||||
return timesteps, num_inference_steps - t_start
|
||||
|
||||
else:
|
||||
# Strength is irrelevant if we directly request a timestep to start at;
|
||||
# that is, strength is determined by the denoising_start instead.
|
||||
t_start = 0
|
||||
|
||||
timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
|
||||
|
||||
# Strength is irrelevant if we directly request a timestep to start at;
|
||||
# that is, strength is determined by the denoising_start instead.
|
||||
if denoising_start is not None:
|
||||
discrete_timestep_cutoff = int(
|
||||
round(
|
||||
self.scheduler.config.num_train_timesteps
|
||||
@@ -657,7 +655,7 @@ class StableDiffusionXLImg2ImgPipeline(
|
||||
)
|
||||
)
|
||||
|
||||
num_inference_steps = (self.scheduler.timesteps < discrete_timestep_cutoff).sum().item()
|
||||
num_inference_steps = (timesteps < discrete_timestep_cutoff).sum().item()
|
||||
if self.scheduler.order == 2 and num_inference_steps % 2 == 0:
|
||||
# if the scheduler is a 2nd order scheduler we might have to do +1
|
||||
# because `num_inference_steps` might be even given that every timestep
|
||||
@@ -668,12 +666,11 @@ class StableDiffusionXLImg2ImgPipeline(
|
||||
num_inference_steps = num_inference_steps + 1
|
||||
|
||||
# because t_n+1 >= t_n, we slice the timesteps starting from the end
|
||||
t_start = len(self.scheduler.timesteps) - num_inference_steps
|
||||
timesteps = self.scheduler.timesteps[t_start:]
|
||||
if hasattr(self.scheduler, "set_begin_index"):
|
||||
self.scheduler.set_begin_index(t_start)
|
||||
timesteps = timesteps[-num_inference_steps:]
|
||||
return timesteps, num_inference_steps
|
||||
|
||||
return timesteps, num_inference_steps - t_start
|
||||
|
||||
def prepare_latents(
|
||||
self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None, add_noise=True
|
||||
):
|
||||
|
||||
+11
-14
@@ -901,16 +901,14 @@ class StableDiffusionXLInpaintPipeline(
|
||||
if denoising_start is None:
|
||||
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
|
||||
t_start = max(num_inference_steps - init_timestep, 0)
|
||||
|
||||
timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
|
||||
if hasattr(self.scheduler, "set_begin_index"):
|
||||
self.scheduler.set_begin_index(t_start * self.scheduler.order)
|
||||
|
||||
return timesteps, num_inference_steps - t_start
|
||||
|
||||
else:
|
||||
# Strength is irrelevant if we directly request a timestep to start at;
|
||||
# that is, strength is determined by the denoising_start instead.
|
||||
t_start = 0
|
||||
|
||||
timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
|
||||
|
||||
# Strength is irrelevant if we directly request a timestep to start at;
|
||||
# that is, strength is determined by the denoising_start instead.
|
||||
if denoising_start is not None:
|
||||
discrete_timestep_cutoff = int(
|
||||
round(
|
||||
self.scheduler.config.num_train_timesteps
|
||||
@@ -918,7 +916,7 @@ class StableDiffusionXLInpaintPipeline(
|
||||
)
|
||||
)
|
||||
|
||||
num_inference_steps = (self.scheduler.timesteps < discrete_timestep_cutoff).sum().item()
|
||||
num_inference_steps = (timesteps < discrete_timestep_cutoff).sum().item()
|
||||
if self.scheduler.order == 2 and num_inference_steps % 2 == 0:
|
||||
# if the scheduler is a 2nd order scheduler we might have to do +1
|
||||
# because `num_inference_steps` might be even given that every timestep
|
||||
@@ -929,12 +927,11 @@ class StableDiffusionXLInpaintPipeline(
|
||||
num_inference_steps = num_inference_steps + 1
|
||||
|
||||
# because t_n+1 >= t_n, we slice the timesteps starting from the end
|
||||
t_start = len(self.scheduler.timesteps) - num_inference_steps
|
||||
timesteps = self.scheduler.timesteps[t_start:]
|
||||
if hasattr(self.scheduler, "set_begin_index"):
|
||||
self.scheduler.set_begin_index(t_start)
|
||||
timesteps = timesteps[-num_inference_steps:]
|
||||
return timesteps, num_inference_steps
|
||||
|
||||
return timesteps, num_inference_steps - t_start
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline._get_add_time_ids
|
||||
def _get_add_time_ids(
|
||||
self,
|
||||
|
||||
@@ -38,20 +38,7 @@ class BatchedBrownianTree:
|
||||
except TypeError:
|
||||
seed = [seed]
|
||||
self.batched = False
|
||||
self.trees = [
|
||||
torchsde.BrownianInterval(
|
||||
t0=t0,
|
||||
t1=t1,
|
||||
size=w0.shape,
|
||||
dtype=w0.dtype,
|
||||
device=w0.device,
|
||||
entropy=s,
|
||||
tol=1e-6,
|
||||
pool_size=24,
|
||||
halfway_tree=True,
|
||||
)
|
||||
for s in seed
|
||||
]
|
||||
self.trees = [torchsde.BrownianTree(t0, w0, t1, entropy=s, **kwargs) for s in seed]
|
||||
|
||||
@staticmethod
|
||||
def sort(a, b):
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
import contextlib
|
||||
import copy
|
||||
import gc
|
||||
import math
|
||||
import random
|
||||
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
|
||||
@@ -260,22 +259,6 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None):
|
||||
return weighting
|
||||
|
||||
|
||||
def clear_objs_and_retain_memory(objs: List[Any]):
|
||||
"""Deletes `objs` and runs garbage collection. Then clears the cache of the available accelerator."""
|
||||
if len(objs) >= 1:
|
||||
for obj in objs:
|
||||
del obj
|
||||
|
||||
gc.collect()
|
||||
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.empty_cache()
|
||||
elif torch.backends.mps.is_available():
|
||||
torch.mps.empty_cache()
|
||||
elif is_torch_npu_available():
|
||||
torch_npu.empty_cache()
|
||||
|
||||
|
||||
# Adapted from torch-ema https://github.com/fadel/pytorch_ema/blob/master/torch_ema/ema.py#L14
|
||||
class EMAModel:
|
||||
"""
|
||||
@@ -435,11 +418,11 @@ class EMAModel:
|
||||
one_minus_decay = 1 - decay
|
||||
|
||||
context_manager = contextlib.nullcontext
|
||||
if is_transformers_available() and transformers.integrations.deepspeed.is_deepspeed_zero3_enabled():
|
||||
if is_transformers_available() and transformers.deepspeed.is_deepspeed_zero3_enabled():
|
||||
import deepspeed
|
||||
|
||||
if self.foreach:
|
||||
if is_transformers_available() and transformers.integrations.deepspeed.is_deepspeed_zero3_enabled():
|
||||
if is_transformers_available() and transformers.deepspeed.is_deepspeed_zero3_enabled():
|
||||
context_manager = deepspeed.zero.GatheredParameters(parameters, modifier_rank=None)
|
||||
|
||||
with context_manager():
|
||||
@@ -461,7 +444,7 @@ class EMAModel:
|
||||
|
||||
else:
|
||||
for s_param, param in zip(self.shadow_params, parameters):
|
||||
if is_transformers_available() and transformers.integrations.deepspeed.is_deepspeed_zero3_enabled():
|
||||
if is_transformers_available() and transformers.deepspeed.is_deepspeed_zero3_enabled():
|
||||
context_manager = deepspeed.zero.GatheredParameters(param, modifier_rank=None)
|
||||
|
||||
with context_manager():
|
||||
|
||||
@@ -197,21 +197,6 @@ class FluxControlNetModel(metaclass=DummyObject):
|
||||
requires_backends(cls, ["torch"])
|
||||
|
||||
|
||||
class FluxMultiControlNetModel(metaclass=DummyObject):
|
||||
_backends = ["torch"]
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch"])
|
||||
|
||||
@classmethod
|
||||
def from_config(cls, *args, **kwargs):
|
||||
requires_backends(cls, ["torch"])
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, *args, **kwargs):
|
||||
requires_backends(cls, ["torch"])
|
||||
|
||||
|
||||
class FluxTransformer2DModel(metaclass=DummyObject):
|
||||
_backends = ["torch"]
|
||||
|
||||
|
||||
@@ -152,21 +152,6 @@ class AnimateDiffSparseControlNetPipeline(metaclass=DummyObject):
|
||||
requires_backends(cls, ["torch", "transformers"])
|
||||
|
||||
|
||||
class AnimateDiffVideoToVideoControlNetPipeline(metaclass=DummyObject):
|
||||
_backends = ["torch", "transformers"]
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch", "transformers"])
|
||||
|
||||
@classmethod
|
||||
def from_config(cls, *args, **kwargs):
|
||||
requires_backends(cls, ["torch", "transformers"])
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, *args, **kwargs):
|
||||
requires_backends(cls, ["torch", "transformers"])
|
||||
|
||||
|
||||
class AnimateDiffVideoToVideoPipeline(metaclass=DummyObject):
|
||||
_backends = ["torch", "transformers"]
|
||||
|
||||
@@ -287,21 +272,6 @@ class CogVideoXPipeline(metaclass=DummyObject):
|
||||
requires_backends(cls, ["torch", "transformers"])
|
||||
|
||||
|
||||
class CogVideoXVideoToVideoPipeline(metaclass=DummyObject):
|
||||
_backends = ["torch", "transformers"]
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch", "transformers"])
|
||||
|
||||
@classmethod
|
||||
def from_config(cls, *args, **kwargs):
|
||||
requires_backends(cls, ["torch", "transformers"])
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, *args, **kwargs):
|
||||
requires_backends(cls, ["torch", "transformers"])
|
||||
|
||||
|
||||
class CycleDiffusionPipeline(metaclass=DummyObject):
|
||||
_backends = ["torch", "transformers"]
|
||||
|
||||
@@ -332,36 +302,6 @@ class FluxControlNetPipeline(metaclass=DummyObject):
|
||||
requires_backends(cls, ["torch", "transformers"])
|
||||
|
||||
|
||||
class FluxImg2ImgPipeline(metaclass=DummyObject):
|
||||
_backends = ["torch", "transformers"]
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch", "transformers"])
|
||||
|
||||
@classmethod
|
||||
def from_config(cls, *args, **kwargs):
|
||||
requires_backends(cls, ["torch", "transformers"])
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, *args, **kwargs):
|
||||
requires_backends(cls, ["torch", "transformers"])
|
||||
|
||||
|
||||
class FluxInpaintPipeline(metaclass=DummyObject):
|
||||
_backends = ["torch", "transformers"]
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch", "transformers"])
|
||||
|
||||
@classmethod
|
||||
def from_config(cls, *args, **kwargs):
|
||||
requires_backends(cls, ["torch", "transformers"])
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, *args, **kwargs):
|
||||
requires_backends(cls, ["torch", "transformers"])
|
||||
|
||||
|
||||
class FluxPipeline(metaclass=DummyObject):
|
||||
_backends = ["torch", "transformers"]
|
||||
|
||||
|
||||
@@ -157,12 +157,11 @@ class StableDiffusionLoRATests(PeftLoraLoaderMixinTests, unittest.TestCase):
|
||||
if ("adapter-1" in n or "adapter-2" in n) and not isinstance(m, (nn.Dropout, nn.Identity)):
|
||||
self.assertTrue(m.weight.device != torch.device("cpu"))
|
||||
|
||||
@slow
|
||||
@require_torch_gpu
|
||||
def test_integration_move_lora_dora_cpu(self):
|
||||
from peft import LoraConfig
|
||||
|
||||
path = "Lykon/dreamshaper-8"
|
||||
path = "runwayml/stable-diffusion-v1-5"
|
||||
unet_lora_config = LoraConfig(
|
||||
init_lora_weights="gaussian",
|
||||
target_modules=["to_k", "to_q", "to_v", "to_out.0"],
|
||||
|
||||
@@ -528,10 +528,6 @@ class AutoencoderOobleckTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCa
|
||||
def test_forward_with_norm_groups(self):
|
||||
pass
|
||||
|
||||
@unittest.skip("No attention module used in this model")
|
||||
def test_set_attn_processor_for_determinism(self):
|
||||
return
|
||||
|
||||
|
||||
@slow
|
||||
class AutoencoderTinyIntegrationTests(unittest.TestCase):
|
||||
@@ -1036,9 +1032,9 @@ class ConsistencyDecoderVAEIntegrationTests(unittest.TestCase):
|
||||
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
|
||||
"/img2img/sketch-mountains-input.jpg"
|
||||
).resize((256, 256))
|
||||
image = torch.from_numpy(np.array(image).transpose(2, 0, 1).astype(np.float32) / 127.5 - 1)[None, :, :, :].to(
|
||||
torch_device
|
||||
)
|
||||
image = torch.from_numpy(np.array(image).transpose(2, 0, 1).astype(np.float32) / 127.5 - 1)[
|
||||
None, :, :, :
|
||||
].cuda()
|
||||
|
||||
latent = vae.encode(image).latent_dist.mean
|
||||
|
||||
@@ -1079,7 +1075,7 @@ class ConsistencyDecoderVAEIntegrationTests(unittest.TestCase):
|
||||
image = (
|
||||
torch.from_numpy(np.array(image).transpose(2, 0, 1).astype(np.float32) / 127.5 - 1)[None, :, :, :]
|
||||
.half()
|
||||
.to(torch_device)
|
||||
.cuda()
|
||||
)
|
||||
|
||||
latent = vae.encode(image).latent_dist.mean
|
||||
|
||||
@@ -55,6 +55,17 @@ class EmbeddingsTests(unittest.TestCase):
|
||||
assert grad > prev_grad
|
||||
prev_grad = grad
|
||||
|
||||
def test_timestep_defaults(self):
|
||||
embedding_dim = 16
|
||||
timesteps = torch.arange(10)
|
||||
|
||||
t1 = get_timestep_embedding(timesteps, embedding_dim)
|
||||
t2 = get_timestep_embedding(
|
||||
timesteps, embedding_dim, flip_sin_to_cos=False, downscale_freq_shift=1, max_period=10_000
|
||||
)
|
||||
|
||||
assert torch.allclose(t1.cpu(), t2.cpu(), 1e-3)
|
||||
|
||||
def test_timestep_flip_sin_cos(self):
|
||||
embedding_dim = 16
|
||||
timesteps = torch.arange(10)
|
||||
|
||||
@@ -183,6 +183,17 @@ class ModelUtilsTest(unittest.TestCase):
|
||||
|
||||
|
||||
class UNetTesterMixin:
|
||||
def test_forward_signature(self):
|
||||
init_dict, _ = self.prepare_init_args_and_inputs_for_common()
|
||||
|
||||
model = self.model_class(**init_dict)
|
||||
signature = inspect.signature(model.forward)
|
||||
# signature.parameters is an OrderedDict => so arg_names order is deterministic
|
||||
arg_names = [*signature.parameters.keys()]
|
||||
|
||||
expected_arg_names = ["sample", "timestep"]
|
||||
self.assertListEqual(arg_names[:2], expected_arg_names)
|
||||
|
||||
def test_forward_with_norm_groups(self):
|
||||
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
|
||||
|
||||
@@ -209,7 +220,6 @@ class ModelTesterMixin:
|
||||
base_precision = 1e-3
|
||||
forward_requires_fresh_args = False
|
||||
model_split_percents = [0.5, 0.7, 0.9]
|
||||
uses_custom_attn_processor = False
|
||||
|
||||
def check_device_map_is_respected(self, model, device_map):
|
||||
for param_name, param in model.named_parameters():
|
||||
@@ -407,9 +417,6 @@ class ModelTesterMixin:
|
||||
|
||||
@require_torch_gpu
|
||||
def test_set_attn_processor_for_determinism(self):
|
||||
if self.uses_custom_attn_processor:
|
||||
return
|
||||
|
||||
torch.use_deterministic_algorithms(False)
|
||||
if self.forward_requires_fresh_args:
|
||||
model = self.model_class(**self.init_dict)
|
||||
|
||||
@@ -32,7 +32,6 @@ enable_full_determinism()
|
||||
class CogVideoXTransformerTests(ModelTesterMixin, unittest.TestCase):
|
||||
model_class = CogVideoXTransformer3DModel
|
||||
main_input_name = "hidden_states"
|
||||
uses_custom_attn_processor = True
|
||||
|
||||
@property
|
||||
def dummy_input(self):
|
||||
|
||||
@@ -32,9 +32,6 @@ class FluxTransformerTests(ModelTesterMixin, unittest.TestCase):
|
||||
# We override the items here because the transformer under consideration is small.
|
||||
model_split_percents = [0.7, 0.6, 0.6]
|
||||
|
||||
# Skip setting testing with default: AttnProcessor
|
||||
uses_custom_attn_processor = True
|
||||
|
||||
@property
|
||||
def dummy_input(self):
|
||||
batch_size = 1
|
||||
|
||||
@@ -32,7 +32,6 @@ enable_full_determinism()
|
||||
class LuminaNextDiT2DModelTransformerTests(ModelTesterMixin, unittest.TestCase):
|
||||
model_class = LuminaNextDiT2DModel
|
||||
main_input_name = "hidden_states"
|
||||
uses_custom_attn_processor = True
|
||||
|
||||
@property
|
||||
def dummy_input(self):
|
||||
|
||||
@@ -51,7 +51,7 @@ class UNetMotionModelTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase)
|
||||
|
||||
noise = floats_tensor((batch_size, num_channels, num_frames) + sizes).to(torch_device)
|
||||
time_step = torch.tensor([10]).to(torch_device)
|
||||
encoder_hidden_states = floats_tensor((batch_size * num_frames, 4, 16)).to(torch_device)
|
||||
encoder_hidden_states = floats_tensor((batch_size, 4, 16)).to(torch_device)
|
||||
|
||||
return {"sample": noise, "timestep": time_step, "encoder_hidden_states": encoder_hidden_states}
|
||||
|
||||
|
||||
@@ -175,7 +175,7 @@ class AnimateDiffPipelineFastTests(
|
||||
def test_attention_slicing_forward_pass(self):
|
||||
pass
|
||||
|
||||
def test_ip_adapter(self):
|
||||
def test_ip_adapter_single(self):
|
||||
expected_pipe_slice = None
|
||||
if torch_device == "cpu":
|
||||
expected_pipe_slice = np.array(
|
||||
@@ -209,7 +209,7 @@ class AnimateDiffPipelineFastTests(
|
||||
0.5620,
|
||||
]
|
||||
)
|
||||
return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)
|
||||
return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice)
|
||||
|
||||
def test_dict_tuple_outputs_equivalent(self):
|
||||
expected_slice = None
|
||||
@@ -460,53 +460,6 @@ class AnimateDiffPipelineFastTests(
|
||||
"Disabling of FreeNoise should lead to results similar to the default pipeline results",
|
||||
)
|
||||
|
||||
def test_free_noise_split_inference(self):
|
||||
components = self.get_dummy_components()
|
||||
pipe: AnimateDiffPipeline = self.pipeline_class(**components)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
pipe.to(torch_device)
|
||||
|
||||
pipe.enable_free_noise(8, 4)
|
||||
|
||||
inputs_normal = self.get_dummy_inputs(torch_device)
|
||||
frames_normal = pipe(**inputs_normal).frames[0]
|
||||
|
||||
# Test FreeNoise with split inference memory-optimization
|
||||
pipe.enable_free_noise_split_inference(spatial_split_size=16, temporal_split_size=4)
|
||||
|
||||
inputs_enable_split_inference = self.get_dummy_inputs(torch_device)
|
||||
frames_enable_split_inference = pipe(**inputs_enable_split_inference).frames[0]
|
||||
|
||||
sum_split_inference = np.abs(to_np(frames_normal) - to_np(frames_enable_split_inference)).sum()
|
||||
self.assertLess(
|
||||
sum_split_inference,
|
||||
1e-4,
|
||||
"Enabling FreeNoise Split Inference memory-optimizations should lead to results similar to the default pipeline results",
|
||||
)
|
||||
|
||||
def test_free_noise_multi_prompt(self):
|
||||
components = self.get_dummy_components()
|
||||
pipe: AnimateDiffPipeline = self.pipeline_class(**components)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
pipe.to(torch_device)
|
||||
|
||||
context_length = 8
|
||||
context_stride = 4
|
||||
pipe.enable_free_noise(context_length, context_stride)
|
||||
|
||||
# Make sure that pipeline works when prompt indices are within num_frames bounds
|
||||
inputs = self.get_dummy_inputs(torch_device)
|
||||
inputs["prompt"] = {0: "Caterpillar on a leaf", 10: "Butterfly on a leaf"}
|
||||
inputs["num_frames"] = 16
|
||||
pipe(**inputs).frames[0]
|
||||
|
||||
with self.assertRaises(ValueError):
|
||||
# Ensure that prompt indices are within bounds
|
||||
inputs = self.get_dummy_inputs(torch_device)
|
||||
inputs["num_frames"] = 16
|
||||
inputs["prompt"] = {0: "Caterpillar on a leaf", 10: "Butterfly on a leaf", 42: "Error on a leaf"}
|
||||
pipe(**inputs).frames[0]
|
||||
|
||||
@unittest.skipIf(
|
||||
torch_device != "cuda" or not is_xformers_available(),
|
||||
reason="XFormers attention is only available with CUDA and `xformers` installed",
|
||||
|
||||
@@ -193,7 +193,7 @@ class AnimateDiffControlNetPipelineFastTests(
|
||||
def test_attention_slicing_forward_pass(self):
|
||||
pass
|
||||
|
||||
def test_ip_adapter(self):
|
||||
def test_ip_adapter_single(self):
|
||||
expected_pipe_slice = None
|
||||
if torch_device == "cpu":
|
||||
expected_pipe_slice = np.array(
|
||||
@@ -218,7 +218,7 @@ class AnimateDiffControlNetPipelineFastTests(
|
||||
0.5155,
|
||||
]
|
||||
)
|
||||
return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)
|
||||
return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice)
|
||||
|
||||
def test_dict_tuple_outputs_equivalent(self):
|
||||
expected_slice = None
|
||||
@@ -476,27 +476,6 @@ class AnimateDiffControlNetPipelineFastTests(
|
||||
"Disabling of FreeNoise should lead to results similar to the default pipeline results",
|
||||
)
|
||||
|
||||
def test_free_noise_multi_prompt(self):
|
||||
components = self.get_dummy_components()
|
||||
pipe: AnimateDiffControlNetPipeline = self.pipeline_class(**components)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
pipe.to(torch_device)
|
||||
|
||||
context_length = 8
|
||||
context_stride = 4
|
||||
pipe.enable_free_noise(context_length, context_stride)
|
||||
|
||||
# Make sure that pipeline works when prompt indices are within num_frames bounds
|
||||
inputs = self.get_dummy_inputs(torch_device, num_frames=16)
|
||||
inputs["prompt"] = {0: "Caterpillar on a leaf", 10: "Butterfly on a leaf"}
|
||||
pipe(**inputs).frames[0]
|
||||
|
||||
with self.assertRaises(ValueError):
|
||||
# Ensure that prompt indices are within bounds
|
||||
inputs = self.get_dummy_inputs(torch_device, num_frames=16)
|
||||
inputs["prompt"] = {0: "Caterpillar on a leaf", 10: "Butterfly on a leaf", 42: "Error on a leaf"}
|
||||
pipe(**inputs).frames[0]
|
||||
|
||||
def test_vae_slicing(self, video_count=2):
|
||||
device = "cpu" # ensure determinism for the device-dependent torch.Generator
|
||||
components = self.get_dummy_components()
|
||||
|
||||
@@ -195,7 +195,7 @@ class AnimateDiffSparseControlNetPipelineFastTests(
|
||||
def test_attention_slicing_forward_pass(self):
|
||||
pass
|
||||
|
||||
def test_ip_adapter(self):
|
||||
def test_ip_adapter_single(self):
|
||||
expected_pipe_slice = None
|
||||
if torch_device == "cpu":
|
||||
expected_pipe_slice = np.array(
|
||||
@@ -220,7 +220,7 @@ class AnimateDiffSparseControlNetPipelineFastTests(
|
||||
0.5155,
|
||||
]
|
||||
)
|
||||
return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)
|
||||
return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice)
|
||||
|
||||
def test_dict_tuple_outputs_equivalent(self):
|
||||
expected_slice = None
|
||||
|
||||
@@ -175,7 +175,7 @@ class AnimateDiffVideoToVideoPipelineFastTests(
|
||||
def test_attention_slicing_forward_pass(self):
|
||||
pass
|
||||
|
||||
def test_ip_adapter(self):
|
||||
def test_ip_adapter_single(self):
|
||||
expected_pipe_slice = None
|
||||
|
||||
if torch_device == "cpu":
|
||||
@@ -201,7 +201,7 @@ class AnimateDiffVideoToVideoPipelineFastTests(
|
||||
0.5378,
|
||||
]
|
||||
)
|
||||
return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)
|
||||
return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice)
|
||||
|
||||
def test_inference_batch_single_identical(
|
||||
self,
|
||||
@@ -491,56 +491,3 @@ class AnimateDiffVideoToVideoPipelineFastTests(
|
||||
1e-4,
|
||||
"Disabling of FreeNoise should lead to results similar to the default pipeline results",
|
||||
)
|
||||
|
||||
def test_free_noise_split_inference(self):
|
||||
components = self.get_dummy_components()
|
||||
pipe: AnimateDiffVideoToVideoPipeline = self.pipeline_class(**components)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
pipe.to(torch_device)
|
||||
|
||||
pipe.enable_free_noise(8, 4)
|
||||
|
||||
inputs_normal = self.get_dummy_inputs(torch_device, num_frames=16)
|
||||
inputs_normal["num_inference_steps"] = 2
|
||||
inputs_normal["strength"] = 0.5
|
||||
frames_normal = pipe(**inputs_normal).frames[0]
|
||||
|
||||
# Test FreeNoise with split inference memory-optimization
|
||||
pipe.enable_free_noise_split_inference(spatial_split_size=16, temporal_split_size=4)
|
||||
|
||||
inputs_enable_split_inference = self.get_dummy_inputs(torch_device, num_frames=16)
|
||||
inputs_enable_split_inference["num_inference_steps"] = 2
|
||||
inputs_enable_split_inference["strength"] = 0.5
|
||||
frames_enable_split_inference = pipe(**inputs_enable_split_inference).frames[0]
|
||||
|
||||
sum_split_inference = np.abs(to_np(frames_normal) - to_np(frames_enable_split_inference)).sum()
|
||||
self.assertLess(
|
||||
sum_split_inference,
|
||||
1e-4,
|
||||
"Enabling FreeNoise Split Inference memory-optimizations should lead to results similar to the default pipeline results",
|
||||
)
|
||||
|
||||
def test_free_noise_multi_prompt(self):
|
||||
components = self.get_dummy_components()
|
||||
pipe: AnimateDiffVideoToVideoPipeline = self.pipeline_class(**components)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
pipe.to(torch_device)
|
||||
|
||||
context_length = 8
|
||||
context_stride = 4
|
||||
pipe.enable_free_noise(context_length, context_stride)
|
||||
|
||||
# Make sure that pipeline works when prompt indices are within num_frames bounds
|
||||
inputs = self.get_dummy_inputs(torch_device, num_frames=16)
|
||||
inputs["prompt"] = {0: "Caterpillar on a leaf", 10: "Butterfly on a leaf"}
|
||||
inputs["num_inference_steps"] = 2
|
||||
inputs["strength"] = 0.5
|
||||
pipe(**inputs).frames[0]
|
||||
|
||||
with self.assertRaises(ValueError):
|
||||
# Ensure that prompt indices are within bounds
|
||||
inputs = self.get_dummy_inputs(torch_device, num_frames=16)
|
||||
inputs["num_inference_steps"] = 2
|
||||
inputs["strength"] = 0.5
|
||||
inputs["prompt"] = {0: "Caterpillar on a leaf", 10: "Butterfly on a leaf", 42: "Error on a leaf"}
|
||||
pipe(**inputs).frames[0]
|
||||
|
||||
@@ -1,535 +0,0 @@
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from PIL import Image
|
||||
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
|
||||
|
||||
import diffusers
|
||||
from diffusers import (
|
||||
AnimateDiffVideoToVideoControlNetPipeline,
|
||||
AutoencoderKL,
|
||||
ControlNetModel,
|
||||
DDIMScheduler,
|
||||
DPMSolverMultistepScheduler,
|
||||
LCMScheduler,
|
||||
MotionAdapter,
|
||||
StableDiffusionPipeline,
|
||||
UNet2DConditionModel,
|
||||
UNetMotionModel,
|
||||
)
|
||||
from diffusers.models.attention import FreeNoiseTransformerBlock
|
||||
from diffusers.utils import is_xformers_available, logging
|
||||
from diffusers.utils.testing_utils import torch_device
|
||||
|
||||
from ..pipeline_params import TEXT_TO_IMAGE_PARAMS, VIDEO_TO_VIDEO_BATCH_PARAMS
|
||||
from ..test_pipelines_common import IPAdapterTesterMixin, PipelineFromPipeTesterMixin, PipelineTesterMixin
|
||||
|
||||
|
||||
def to_np(tensor):
|
||||
if isinstance(tensor, torch.Tensor):
|
||||
tensor = tensor.detach().cpu().numpy()
|
||||
|
||||
return tensor
|
||||
|
||||
|
||||
class AnimateDiffVideoToVideoControlNetPipelineFastTests(
|
||||
IPAdapterTesterMixin, PipelineTesterMixin, PipelineFromPipeTesterMixin, unittest.TestCase
|
||||
):
|
||||
pipeline_class = AnimateDiffVideoToVideoControlNetPipeline
|
||||
params = TEXT_TO_IMAGE_PARAMS
|
||||
batch_params = VIDEO_TO_VIDEO_BATCH_PARAMS.union({"conditioning_frames"})
|
||||
required_optional_params = frozenset(
|
||||
[
|
||||
"num_inference_steps",
|
||||
"generator",
|
||||
"latents",
|
||||
"return_dict",
|
||||
"callback_on_step_end",
|
||||
"callback_on_step_end_tensor_inputs",
|
||||
]
|
||||
)
|
||||
|
||||
def get_dummy_components(self):
|
||||
cross_attention_dim = 8
|
||||
block_out_channels = (8, 8)
|
||||
|
||||
torch.manual_seed(0)
|
||||
unet = UNet2DConditionModel(
|
||||
block_out_channels=block_out_channels,
|
||||
layers_per_block=2,
|
||||
sample_size=8,
|
||||
in_channels=4,
|
||||
out_channels=4,
|
||||
down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
|
||||
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
|
||||
cross_attention_dim=cross_attention_dim,
|
||||
norm_num_groups=2,
|
||||
)
|
||||
scheduler = DDIMScheduler(
|
||||
beta_start=0.00085,
|
||||
beta_end=0.012,
|
||||
beta_schedule="linear",
|
||||
clip_sample=False,
|
||||
)
|
||||
torch.manual_seed(0)
|
||||
controlnet = ControlNetModel(
|
||||
block_out_channels=block_out_channels,
|
||||
layers_per_block=2,
|
||||
in_channels=4,
|
||||
down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
|
||||
cross_attention_dim=cross_attention_dim,
|
||||
conditioning_embedding_out_channels=(8, 8),
|
||||
norm_num_groups=1,
|
||||
)
|
||||
torch.manual_seed(0)
|
||||
vae = AutoencoderKL(
|
||||
block_out_channels=block_out_channels,
|
||||
in_channels=3,
|
||||
out_channels=3,
|
||||
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
|
||||
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
|
||||
latent_channels=4,
|
||||
norm_num_groups=2,
|
||||
)
|
||||
torch.manual_seed(0)
|
||||
text_encoder_config = CLIPTextConfig(
|
||||
bos_token_id=0,
|
||||
eos_token_id=2,
|
||||
hidden_size=cross_attention_dim,
|
||||
intermediate_size=37,
|
||||
layer_norm_eps=1e-05,
|
||||
num_attention_heads=4,
|
||||
num_hidden_layers=5,
|
||||
pad_token_id=1,
|
||||
vocab_size=1000,
|
||||
)
|
||||
text_encoder = CLIPTextModel(text_encoder_config)
|
||||
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
torch.manual_seed(0)
|
||||
motion_adapter = MotionAdapter(
|
||||
block_out_channels=block_out_channels,
|
||||
motion_layers_per_block=2,
|
||||
motion_norm_num_groups=2,
|
||||
motion_num_attention_heads=4,
|
||||
)
|
||||
|
||||
components = {
|
||||
"unet": unet,
|
||||
"controlnet": controlnet,
|
||||
"scheduler": scheduler,
|
||||
"vae": vae,
|
||||
"motion_adapter": motion_adapter,
|
||||
"text_encoder": text_encoder,
|
||||
"tokenizer": tokenizer,
|
||||
"feature_extractor": None,
|
||||
"image_encoder": None,
|
||||
}
|
||||
return components
|
||||
|
||||
def get_dummy_inputs(self, device, seed=0, num_frames: int = 2):
|
||||
if str(device).startswith("mps"):
|
||||
generator = torch.manual_seed(seed)
|
||||
else:
|
||||
generator = torch.Generator(device=device).manual_seed(seed)
|
||||
|
||||
video_height = 32
|
||||
video_width = 32
|
||||
video = [Image.new("RGB", (video_width, video_height))] * num_frames
|
||||
|
||||
video_height = 32
|
||||
video_width = 32
|
||||
conditioning_frames = [Image.new("RGB", (video_width, video_height))] * num_frames
|
||||
|
||||
inputs = {
|
||||
"video": video,
|
||||
"conditioning_frames": conditioning_frames,
|
||||
"prompt": "A painting of a squirrel eating a burger",
|
||||
"generator": generator,
|
||||
"num_inference_steps": 2,
|
||||
"guidance_scale": 7.5,
|
||||
"output_type": "pt",
|
||||
}
|
||||
return inputs
|
||||
|
||||
def test_from_pipe_consistent_config(self):
|
||||
assert self.original_pipeline_class == StableDiffusionPipeline
|
||||
original_repo = "hf-internal-testing/tinier-stable-diffusion-pipe"
|
||||
original_kwargs = {"requires_safety_checker": False}
|
||||
|
||||
# create original_pipeline_class(sd)
|
||||
pipe_original = self.original_pipeline_class.from_pretrained(original_repo, **original_kwargs)
|
||||
|
||||
# original_pipeline_class(sd) -> pipeline_class
|
||||
pipe_components = self.get_dummy_components()
|
||||
pipe_additional_components = {}
|
||||
for name, component in pipe_components.items():
|
||||
if name not in pipe_original.components:
|
||||
pipe_additional_components[name] = component
|
||||
|
||||
pipe = self.pipeline_class.from_pipe(pipe_original, **pipe_additional_components)
|
||||
|
||||
# pipeline_class -> original_pipeline_class(sd)
|
||||
original_pipe_additional_components = {}
|
||||
for name, component in pipe_original.components.items():
|
||||
if name not in pipe.components or not isinstance(component, pipe.components[name].__class__):
|
||||
original_pipe_additional_components[name] = component
|
||||
|
||||
pipe_original_2 = self.original_pipeline_class.from_pipe(pipe, **original_pipe_additional_components)
|
||||
|
||||
# compare the config
|
||||
original_config = {k: v for k, v in pipe_original.config.items() if not k.startswith("_")}
|
||||
original_config_2 = {k: v for k, v in pipe_original_2.config.items() if not k.startswith("_")}
|
||||
assert original_config_2 == original_config
|
||||
|
||||
def test_motion_unet_loading(self):
|
||||
components = self.get_dummy_components()
|
||||
pipe = AnimateDiffVideoToVideoControlNetPipeline(**components)
|
||||
|
||||
assert isinstance(pipe.unet, UNetMotionModel)
|
||||
|
||||
@unittest.skip("Attention slicing is not enabled in this pipeline")
|
||||
def test_attention_slicing_forward_pass(self):
|
||||
pass
|
||||
|
||||
def test_ip_adapter(self):
|
||||
expected_pipe_slice = None
|
||||
if torch_device == "cpu":
|
||||
expected_pipe_slice = np.array(
|
||||
[
|
||||
0.5569,
|
||||
0.6250,
|
||||
0.4144,
|
||||
0.5613,
|
||||
0.5563,
|
||||
0.5213,
|
||||
0.5091,
|
||||
0.4950,
|
||||
0.4950,
|
||||
0.5684,
|
||||
0.3858,
|
||||
0.4863,
|
||||
0.6457,
|
||||
0.4311,
|
||||
0.5517,
|
||||
0.5608,
|
||||
0.4417,
|
||||
0.5377,
|
||||
]
|
||||
)
|
||||
return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)
|
||||
|
||||
def test_inference_batch_single_identical(
|
||||
self,
|
||||
batch_size=2,
|
||||
expected_max_diff=1e-4,
|
||||
additional_params_copy_to_batched_inputs=["num_inference_steps"],
|
||||
):
|
||||
components = self.get_dummy_components()
|
||||
pipe = self.pipeline_class(**components)
|
||||
for components in pipe.components.values():
|
||||
if hasattr(components, "set_default_attn_processor"):
|
||||
components.set_default_attn_processor()
|
||||
|
||||
pipe.to(torch_device)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
inputs = self.get_dummy_inputs(torch_device)
|
||||
# Reset generator in case it is has been used in self.get_dummy_inputs
|
||||
inputs["generator"] = self.get_generator(0)
|
||||
|
||||
logger = logging.get_logger(pipe.__module__)
|
||||
logger.setLevel(level=diffusers.logging.FATAL)
|
||||
|
||||
# batchify inputs
|
||||
batched_inputs = {}
|
||||
batched_inputs.update(inputs)
|
||||
|
||||
for name in self.batch_params:
|
||||
if name not in inputs:
|
||||
continue
|
||||
|
||||
value = inputs[name]
|
||||
if name == "prompt":
|
||||
len_prompt = len(value)
|
||||
batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
|
||||
batched_inputs[name][-1] = 100 * "very long"
|
||||
|
||||
else:
|
||||
batched_inputs[name] = batch_size * [value]
|
||||
|
||||
if "generator" in inputs:
|
||||
batched_inputs["generator"] = [self.get_generator(i) for i in range(batch_size)]
|
||||
|
||||
if "batch_size" in inputs:
|
||||
batched_inputs["batch_size"] = batch_size
|
||||
|
||||
for arg in additional_params_copy_to_batched_inputs:
|
||||
batched_inputs[arg] = inputs[arg]
|
||||
|
||||
output = pipe(**inputs)
|
||||
output_batch = pipe(**batched_inputs)
|
||||
|
||||
assert output_batch[0].shape[0] == batch_size
|
||||
|
||||
max_diff = np.abs(to_np(output_batch[0][0]) - to_np(output[0][0])).max()
|
||||
assert max_diff < expected_max_diff
|
||||
|
||||
@unittest.skipIf(torch_device != "cuda", reason="CUDA and CPU are required to switch devices")
|
||||
def test_to_device(self):
|
||||
components = self.get_dummy_components()
|
||||
pipe = self.pipeline_class(**components)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
pipe.to("cpu")
|
||||
# pipeline creates a new motion UNet under the hood. So we need to check the device from pipe.components
|
||||
model_devices = [
|
||||
component.device.type for component in pipe.components.values() if hasattr(component, "device")
|
||||
]
|
||||
self.assertTrue(all(device == "cpu" for device in model_devices))
|
||||
|
||||
output_cpu = pipe(**self.get_dummy_inputs("cpu"))[0]
|
||||
self.assertTrue(np.isnan(output_cpu).sum() == 0)
|
||||
|
||||
pipe.to("cuda")
|
||||
model_devices = [
|
||||
component.device.type for component in pipe.components.values() if hasattr(component, "device")
|
||||
]
|
||||
self.assertTrue(all(device == "cuda" for device in model_devices))
|
||||
|
||||
output_cuda = pipe(**self.get_dummy_inputs("cuda"))[0]
|
||||
self.assertTrue(np.isnan(to_np(output_cuda)).sum() == 0)
|
||||
|
||||
def test_to_dtype(self):
|
||||
components = self.get_dummy_components()
|
||||
pipe = self.pipeline_class(**components)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
# pipeline creates a new motion UNet under the hood. So we need to check the dtype from pipe.components
|
||||
model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
|
||||
self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes))
|
||||
|
||||
pipe.to(dtype=torch.float16)
|
||||
model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
|
||||
self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes))
|
||||
|
||||
def test_prompt_embeds(self):
|
||||
components = self.get_dummy_components()
|
||||
pipe = self.pipeline_class(**components)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
pipe.to(torch_device)
|
||||
|
||||
inputs = self.get_dummy_inputs(torch_device)
|
||||
inputs.pop("prompt")
|
||||
inputs["prompt_embeds"] = torch.randn((1, 4, pipe.text_encoder.config.hidden_size), device=torch_device)
|
||||
pipe(**inputs)
|
||||
|
||||
def test_latent_inputs(self):
|
||||
components = self.get_dummy_components()
|
||||
pipe = self.pipeline_class(**components)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
pipe.to(torch_device)
|
||||
|
||||
inputs = self.get_dummy_inputs(torch_device)
|
||||
sample_size = pipe.unet.config.sample_size
|
||||
num_frames = len(inputs["conditioning_frames"])
|
||||
inputs["latents"] = torch.randn((1, 4, num_frames, sample_size, sample_size), device=torch_device)
|
||||
inputs.pop("video")
|
||||
pipe(**inputs)
|
||||
|
||||
@unittest.skipIf(
|
||||
torch_device != "cuda" or not is_xformers_available(),
|
||||
reason="XFormers attention is only available with CUDA and `xformers` installed",
|
||||
)
|
||||
def test_xformers_attention_forwardGenerator_pass(self):
|
||||
components = self.get_dummy_components()
|
||||
pipe = self.pipeline_class(**components)
|
||||
for component in pipe.components.values():
|
||||
if hasattr(component, "set_default_attn_processor"):
|
||||
component.set_default_attn_processor()
|
||||
pipe.to(torch_device)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
inputs = self.get_dummy_inputs(torch_device)
|
||||
output_without_offload = pipe(**inputs).frames[0]
|
||||
output_without_offload = (
|
||||
output_without_offload.cpu() if torch.is_tensor(output_without_offload) else output_without_offload
|
||||
)
|
||||
|
||||
pipe.enable_xformers_memory_efficient_attention()
|
||||
inputs = self.get_dummy_inputs(torch_device)
|
||||
output_with_offload = pipe(**inputs).frames[0]
|
||||
output_with_offload = (
|
||||
output_with_offload.cpu() if torch.is_tensor(output_with_offload) else output_without_offload
|
||||
)
|
||||
|
||||
max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
|
||||
self.assertLess(max_diff, 1e-4, "XFormers attention should not affect the inference results")
|
||||
|
||||
def test_free_init(self):
|
||||
components = self.get_dummy_components()
|
||||
pipe: AnimateDiffVideoToVideoControlNetPipeline = self.pipeline_class(**components)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
pipe.to(torch_device)
|
||||
|
||||
inputs_normal = self.get_dummy_inputs(torch_device)
|
||||
frames_normal = pipe(**inputs_normal).frames[0]
|
||||
|
||||
pipe.enable_free_init(
|
||||
num_iters=2,
|
||||
use_fast_sampling=True,
|
||||
method="butterworth",
|
||||
order=4,
|
||||
spatial_stop_frequency=0.25,
|
||||
temporal_stop_frequency=0.25,
|
||||
)
|
||||
inputs_enable_free_init = self.get_dummy_inputs(torch_device)
|
||||
frames_enable_free_init = pipe(**inputs_enable_free_init).frames[0]
|
||||
|
||||
pipe.disable_free_init()
|
||||
inputs_disable_free_init = self.get_dummy_inputs(torch_device)
|
||||
frames_disable_free_init = pipe(**inputs_disable_free_init).frames[0]
|
||||
|
||||
sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_init)).sum()
|
||||
max_diff_disabled = np.abs(to_np(frames_normal) - to_np(frames_disable_free_init)).max()
|
||||
self.assertGreater(
|
||||
sum_enabled, 1e1, "Enabling of FreeInit should lead to results different from the default pipeline results"
|
||||
)
|
||||
self.assertLess(
|
||||
max_diff_disabled,
|
||||
1e-4,
|
||||
"Disabling of FreeInit should lead to results similar to the default pipeline results",
|
||||
)
|
||||
|
||||
def test_free_init_with_schedulers(self):
|
||||
components = self.get_dummy_components()
|
||||
pipe: AnimateDiffVideoToVideoControlNetPipeline = self.pipeline_class(**components)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
pipe.to(torch_device)
|
||||
|
||||
inputs_normal = self.get_dummy_inputs(torch_device)
|
||||
frames_normal = pipe(**inputs_normal).frames[0]
|
||||
|
||||
schedulers_to_test = [
|
||||
DPMSolverMultistepScheduler.from_config(
|
||||
components["scheduler"].config,
|
||||
timestep_spacing="linspace",
|
||||
beta_schedule="linear",
|
||||
algorithm_type="dpmsolver++",
|
||||
steps_offset=1,
|
||||
clip_sample=False,
|
||||
),
|
||||
LCMScheduler.from_config(
|
||||
components["scheduler"].config,
|
||||
timestep_spacing="linspace",
|
||||
beta_schedule="linear",
|
||||
steps_offset=1,
|
||||
clip_sample=False,
|
||||
),
|
||||
]
|
||||
components.pop("scheduler")
|
||||
|
||||
for scheduler in schedulers_to_test:
|
||||
components["scheduler"] = scheduler
|
||||
pipe: AnimateDiffVideoToVideoControlNetPipeline = self.pipeline_class(**components)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
pipe.to(torch_device)
|
||||
|
||||
pipe.enable_free_init(num_iters=2, use_fast_sampling=False)
|
||||
|
||||
inputs = self.get_dummy_inputs(torch_device)
|
||||
frames_enable_free_init = pipe(**inputs).frames[0]
|
||||
sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_init)).sum()
|
||||
|
||||
self.assertGreater(
|
||||
sum_enabled,
|
||||
1e1,
|
||||
"Enabling of FreeInit should lead to results different from the default pipeline results",
|
||||
)
|
||||
|
||||
def test_free_noise_blocks(self):
|
||||
components = self.get_dummy_components()
|
||||
pipe: AnimateDiffVideoToVideoControlNetPipeline = self.pipeline_class(**components)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
pipe.to(torch_device)
|
||||
|
||||
pipe.enable_free_noise()
|
||||
for block in pipe.unet.down_blocks:
|
||||
for motion_module in block.motion_modules:
|
||||
for transformer_block in motion_module.transformer_blocks:
|
||||
self.assertTrue(
|
||||
isinstance(transformer_block, FreeNoiseTransformerBlock),
|
||||
"Motion module transformer blocks must be an instance of `FreeNoiseTransformerBlock` after enabling FreeNoise.",
|
||||
)
|
||||
|
||||
pipe.disable_free_noise()
|
||||
for block in pipe.unet.down_blocks:
|
||||
for motion_module in block.motion_modules:
|
||||
for transformer_block in motion_module.transformer_blocks:
|
||||
self.assertFalse(
|
||||
isinstance(transformer_block, FreeNoiseTransformerBlock),
|
||||
"Motion module transformer blocks must not be an instance of `FreeNoiseTransformerBlock` after disabling FreeNoise.",
|
||||
)
|
||||
|
||||
def test_free_noise(self):
|
||||
components = self.get_dummy_components()
|
||||
pipe: AnimateDiffVideoToVideoControlNetPipeline = self.pipeline_class(**components)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
pipe.to(torch_device)
|
||||
|
||||
inputs_normal = self.get_dummy_inputs(torch_device, num_frames=16)
|
||||
inputs_normal["num_inference_steps"] = 2
|
||||
inputs_normal["strength"] = 0.5
|
||||
frames_normal = pipe(**inputs_normal).frames[0]
|
||||
|
||||
for context_length in [8, 9]:
|
||||
for context_stride in [4, 6]:
|
||||
pipe.enable_free_noise(context_length, context_stride)
|
||||
|
||||
inputs_enable_free_noise = self.get_dummy_inputs(torch_device, num_frames=16)
|
||||
inputs_enable_free_noise["num_inference_steps"] = 2
|
||||
inputs_enable_free_noise["strength"] = 0.5
|
||||
frames_enable_free_noise = pipe(**inputs_enable_free_noise).frames[0]
|
||||
|
||||
pipe.disable_free_noise()
|
||||
inputs_disable_free_noise = self.get_dummy_inputs(torch_device, num_frames=16)
|
||||
inputs_disable_free_noise["num_inference_steps"] = 2
|
||||
inputs_disable_free_noise["strength"] = 0.5
|
||||
frames_disable_free_noise = pipe(**inputs_disable_free_noise).frames[0]
|
||||
|
||||
sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_noise)).sum()
|
||||
max_diff_disabled = np.abs(to_np(frames_normal) - to_np(frames_disable_free_noise)).max()
|
||||
self.assertGreater(
|
||||
sum_enabled,
|
||||
1e1,
|
||||
"Enabling of FreeNoise should lead to results different from the default pipeline results",
|
||||
)
|
||||
self.assertLess(
|
||||
max_diff_disabled,
|
||||
1e-4,
|
||||
"Disabling of FreeNoise should lead to results similar to the default pipeline results",
|
||||
)
|
||||
|
||||
def test_free_noise_multi_prompt(self):
|
||||
components = self.get_dummy_components()
|
||||
pipe: AnimateDiffVideoToVideoControlNetPipeline = self.pipeline_class(**components)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
pipe.to(torch_device)
|
||||
|
||||
context_length = 8
|
||||
context_stride = 4
|
||||
pipe.enable_free_noise(context_length, context_stride)
|
||||
|
||||
# Make sure that pipeline works when prompt indices are within num_frames bounds
|
||||
inputs = self.get_dummy_inputs(torch_device, num_frames=16)
|
||||
inputs["prompt"] = {0: "Caterpillar on a leaf", 10: "Butterfly on a leaf"}
|
||||
inputs["num_inference_steps"] = 2
|
||||
inputs["strength"] = 0.5
|
||||
pipe(**inputs).frames[0]
|
||||
|
||||
with self.assertRaises(ValueError):
|
||||
# Ensure that prompt indices are within bounds
|
||||
inputs = self.get_dummy_inputs(torch_device, num_frames=16)
|
||||
inputs["num_inference_steps"] = 2
|
||||
inputs["strength"] = 0.5
|
||||
inputs["prompt"] = {0: "Caterpillar on a leaf", 10: "Butterfly on a leaf", 42: "Error on a leaf"}
|
||||
pipe(**inputs).frames[0]
|
||||
@@ -1,328 +0,0 @@
|
||||
# Copyright 2024 The HuggingFace Team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import inspect
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from PIL import Image
|
||||
from transformers import AutoTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel, CogVideoXVideoToVideoPipeline, DDIMScheduler
|
||||
from diffusers.utils.testing_utils import enable_full_determinism, torch_device
|
||||
|
||||
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
|
||||
from ..test_pipelines_common import (
|
||||
PipelineTesterMixin,
|
||||
check_qkv_fusion_matches_attn_procs_length,
|
||||
check_qkv_fusion_processors_exist,
|
||||
to_np,
|
||||
)
|
||||
|
||||
|
||||
enable_full_determinism()
|
||||
|
||||
|
||||
class CogVideoXVideoToVideoPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
pipeline_class = CogVideoXVideoToVideoPipeline
|
||||
params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs"}
|
||||
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS.union({"video"})
|
||||
image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
|
||||
image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
|
||||
required_optional_params = frozenset(
|
||||
[
|
||||
"num_inference_steps",
|
||||
"generator",
|
||||
"latents",
|
||||
"return_dict",
|
||||
"callback_on_step_end",
|
||||
"callback_on_step_end_tensor_inputs",
|
||||
]
|
||||
)
|
||||
|
||||
def get_dummy_components(self):
|
||||
torch.manual_seed(0)
|
||||
transformer = CogVideoXTransformer3DModel(
|
||||
# Product of num_attention_heads * attention_head_dim must be divisible by 16 for 3D positional embeddings
|
||||
# But, since we are using tiny-random-t5 here, we need the internal dim of CogVideoXTransformer3DModel
|
||||
# to be 32. The internal dim is product of num_attention_heads and attention_head_dim
|
||||
num_attention_heads=4,
|
||||
attention_head_dim=8,
|
||||
in_channels=4,
|
||||
out_channels=4,
|
||||
time_embed_dim=2,
|
||||
text_embed_dim=32, # Must match with tiny-random-t5
|
||||
num_layers=1,
|
||||
sample_width=16, # latent width: 2 -> final width: 16
|
||||
sample_height=16, # latent height: 2 -> final height: 16
|
||||
sample_frames=9, # latent frames: (9 - 1) / 4 + 1 = 3 -> final frames: 9
|
||||
patch_size=2,
|
||||
temporal_compression_ratio=4,
|
||||
max_text_seq_length=16,
|
||||
)
|
||||
|
||||
torch.manual_seed(0)
|
||||
vae = AutoencoderKLCogVideoX(
|
||||
in_channels=3,
|
||||
out_channels=3,
|
||||
down_block_types=(
|
||||
"CogVideoXDownBlock3D",
|
||||
"CogVideoXDownBlock3D",
|
||||
"CogVideoXDownBlock3D",
|
||||
"CogVideoXDownBlock3D",
|
||||
),
|
||||
up_block_types=(
|
||||
"CogVideoXUpBlock3D",
|
||||
"CogVideoXUpBlock3D",
|
||||
"CogVideoXUpBlock3D",
|
||||
"CogVideoXUpBlock3D",
|
||||
),
|
||||
block_out_channels=(8, 8, 8, 8),
|
||||
latent_channels=4,
|
||||
layers_per_block=1,
|
||||
norm_num_groups=2,
|
||||
temporal_compression_ratio=4,
|
||||
)
|
||||
|
||||
torch.manual_seed(0)
|
||||
scheduler = DDIMScheduler()
|
||||
text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
components = {
|
||||
"transformer": transformer,
|
||||
"vae": vae,
|
||||
"scheduler": scheduler,
|
||||
"text_encoder": text_encoder,
|
||||
"tokenizer": tokenizer,
|
||||
}
|
||||
return components
|
||||
|
||||
def get_dummy_inputs(self, device, seed: int = 0, num_frames: int = 8):
|
||||
if str(device).startswith("mps"):
|
||||
generator = torch.manual_seed(seed)
|
||||
else:
|
||||
generator = torch.Generator(device=device).manual_seed(seed)
|
||||
|
||||
video_height = 16
|
||||
video_width = 16
|
||||
video = [Image.new("RGB", (video_width, video_height))] * num_frames
|
||||
|
||||
inputs = {
|
||||
"video": video,
|
||||
"prompt": "dance monkey",
|
||||
"negative_prompt": "",
|
||||
"generator": generator,
|
||||
"num_inference_steps": 2,
|
||||
"strength": 0.5,
|
||||
"guidance_scale": 6.0,
|
||||
# Cannot reduce because convolution kernel becomes bigger than sample
|
||||
"height": video_height,
|
||||
"width": video_width,
|
||||
"max_sequence_length": 16,
|
||||
"output_type": "pt",
|
||||
}
|
||||
return inputs
|
||||
|
||||
def test_inference(self):
|
||||
device = "cpu"
|
||||
|
||||
components = self.get_dummy_components()
|
||||
pipe = self.pipeline_class(**components)
|
||||
pipe.to(device)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
inputs = self.get_dummy_inputs(device)
|
||||
video = pipe(**inputs).frames
|
||||
generated_video = video[0]
|
||||
|
||||
self.assertEqual(generated_video.shape, (8, 3, 16, 16))
|
||||
expected_video = torch.randn(8, 3, 16, 16)
|
||||
max_diff = np.abs(generated_video - expected_video).max()
|
||||
self.assertLessEqual(max_diff, 1e10)
|
||||
|
||||
def test_callback_inputs(self):
|
||||
sig = inspect.signature(self.pipeline_class.__call__)
|
||||
has_callback_tensor_inputs = "callback_on_step_end_tensor_inputs" in sig.parameters
|
||||
has_callback_step_end = "callback_on_step_end" in sig.parameters
|
||||
|
||||
if not (has_callback_tensor_inputs and has_callback_step_end):
|
||||
return
|
||||
|
||||
components = self.get_dummy_components()
|
||||
pipe = self.pipeline_class(**components)
|
||||
pipe = pipe.to(torch_device)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
self.assertTrue(
|
||||
hasattr(pipe, "_callback_tensor_inputs"),
|
||||
f" {self.pipeline_class} should have `_callback_tensor_inputs` that defines a list of tensor variables its callback function can use as inputs",
|
||||
)
|
||||
|
||||
def callback_inputs_subset(pipe, i, t, callback_kwargs):
|
||||
# iterate over callback args
|
||||
for tensor_name, tensor_value in callback_kwargs.items():
|
||||
# check that we're only passing in allowed tensor inputs
|
||||
assert tensor_name in pipe._callback_tensor_inputs
|
||||
|
||||
return callback_kwargs
|
||||
|
||||
def callback_inputs_all(pipe, i, t, callback_kwargs):
|
||||
for tensor_name in pipe._callback_tensor_inputs:
|
||||
assert tensor_name in callback_kwargs
|
||||
|
||||
# iterate over callback args
|
||||
for tensor_name, tensor_value in callback_kwargs.items():
|
||||
# check that we're only passing in allowed tensor inputs
|
||||
assert tensor_name in pipe._callback_tensor_inputs
|
||||
|
||||
return callback_kwargs
|
||||
|
||||
inputs = self.get_dummy_inputs(torch_device)
|
||||
|
||||
# Test passing in a subset
|
||||
inputs["callback_on_step_end"] = callback_inputs_subset
|
||||
inputs["callback_on_step_end_tensor_inputs"] = ["latents"]
|
||||
output = pipe(**inputs)[0]
|
||||
|
||||
# Test passing in a everything
|
||||
inputs["callback_on_step_end"] = callback_inputs_all
|
||||
inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
|
||||
output = pipe(**inputs)[0]
|
||||
|
||||
def callback_inputs_change_tensor(pipe, i, t, callback_kwargs):
|
||||
is_last = i == (pipe.num_timesteps - 1)
|
||||
if is_last:
|
||||
callback_kwargs["latents"] = torch.zeros_like(callback_kwargs["latents"])
|
||||
return callback_kwargs
|
||||
|
||||
inputs["callback_on_step_end"] = callback_inputs_change_tensor
|
||||
inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
|
||||
output = pipe(**inputs)[0]
|
||||
assert output.abs().sum() < 1e10
|
||||
|
||||
def test_inference_batch_single_identical(self):
|
||||
self._test_inference_batch_single_identical(batch_size=3, expected_max_diff=1e-3)
|
||||
|
||||
def test_attention_slicing_forward_pass(
|
||||
self, test_max_difference=True, test_mean_pixel_difference=True, expected_max_diff=1e-3
|
||||
):
|
||||
if not self.test_attention_slicing:
|
||||
return
|
||||
|
||||
components = self.get_dummy_components()
|
||||
pipe = self.pipeline_class(**components)
|
||||
for component in pipe.components.values():
|
||||
if hasattr(component, "set_default_attn_processor"):
|
||||
component.set_default_attn_processor()
|
||||
pipe.to(torch_device)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
generator_device = "cpu"
|
||||
inputs = self.get_dummy_inputs(generator_device)
|
||||
output_without_slicing = pipe(**inputs)[0]
|
||||
|
||||
pipe.enable_attention_slicing(slice_size=1)
|
||||
inputs = self.get_dummy_inputs(generator_device)
|
||||
output_with_slicing1 = pipe(**inputs)[0]
|
||||
|
||||
pipe.enable_attention_slicing(slice_size=2)
|
||||
inputs = self.get_dummy_inputs(generator_device)
|
||||
output_with_slicing2 = pipe(**inputs)[0]
|
||||
|
||||
if test_max_difference:
|
||||
max_diff1 = np.abs(to_np(output_with_slicing1) - to_np(output_without_slicing)).max()
|
||||
max_diff2 = np.abs(to_np(output_with_slicing2) - to_np(output_without_slicing)).max()
|
||||
self.assertLess(
|
||||
max(max_diff1, max_diff2),
|
||||
expected_max_diff,
|
||||
"Attention slicing should not affect the inference results",
|
||||
)
|
||||
|
||||
def test_vae_tiling(self, expected_diff_max: float = 0.2):
|
||||
# Since VideoToVideo uses both encoder and decoder tiling, there seems to be much more numerical
|
||||
# difference. We seem to need a higher tolerance here...
|
||||
# TODO(aryan): Look into this more deeply
|
||||
expected_diff_max = 0.4
|
||||
|
||||
generator_device = "cpu"
|
||||
components = self.get_dummy_components()
|
||||
|
||||
pipe = self.pipeline_class(**components)
|
||||
pipe.to("cpu")
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
# Without tiling
|
||||
inputs = self.get_dummy_inputs(generator_device)
|
||||
inputs["height"] = inputs["width"] = 128
|
||||
output_without_tiling = pipe(**inputs)[0]
|
||||
|
||||
# With tiling
|
||||
pipe.vae.enable_tiling(
|
||||
tile_sample_min_height=96,
|
||||
tile_sample_min_width=96,
|
||||
tile_overlap_factor_height=1 / 12,
|
||||
tile_overlap_factor_width=1 / 12,
|
||||
)
|
||||
inputs = self.get_dummy_inputs(generator_device)
|
||||
inputs["height"] = inputs["width"] = 128
|
||||
output_with_tiling = pipe(**inputs)[0]
|
||||
|
||||
self.assertLess(
|
||||
(to_np(output_without_tiling) - to_np(output_with_tiling)).max(),
|
||||
expected_diff_max,
|
||||
"VAE tiling should not affect the inference results",
|
||||
)
|
||||
|
||||
@unittest.skip("xformers attention processor does not exist for CogVideoX")
|
||||
def test_xformers_attention_forwardGenerator_pass(self):
|
||||
pass
|
||||
|
||||
def test_fused_qkv_projections(self):
|
||||
device = "cpu" # ensure determinism for the device-dependent torch.Generator
|
||||
components = self.get_dummy_components()
|
||||
pipe = self.pipeline_class(**components)
|
||||
pipe = pipe.to(device)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
inputs = self.get_dummy_inputs(device)
|
||||
frames = pipe(**inputs).frames # [B, F, C, H, W]
|
||||
original_image_slice = frames[0, -2:, -1, -3:, -3:]
|
||||
|
||||
pipe.fuse_qkv_projections()
|
||||
assert check_qkv_fusion_processors_exist(
|
||||
pipe.transformer
|
||||
), "Something wrong with the fused attention processors. Expected all the attention processors to be fused."
|
||||
assert check_qkv_fusion_matches_attn_procs_length(
|
||||
pipe.transformer, pipe.transformer.original_attn_processors
|
||||
), "Something wrong with the attention processors concerning the fused QKV projections."
|
||||
|
||||
inputs = self.get_dummy_inputs(device)
|
||||
frames = pipe(**inputs).frames
|
||||
image_slice_fused = frames[0, -2:, -1, -3:, -3:]
|
||||
|
||||
pipe.transformer.unfuse_qkv_projections()
|
||||
inputs = self.get_dummy_inputs(device)
|
||||
frames = pipe(**inputs).frames
|
||||
image_slice_disabled = frames[0, -2:, -1, -3:, -3:]
|
||||
|
||||
assert np.allclose(
|
||||
original_image_slice, image_slice_fused, atol=1e-3, rtol=1e-3
|
||||
), "Fusion of QKV projections shouldn't affect the outputs."
|
||||
assert np.allclose(
|
||||
image_slice_fused, image_slice_disabled, atol=1e-3, rtol=1e-3
|
||||
), "Outputs, with QKV projection fusion enabled, shouldn't change when fused QKV projections are disabled."
|
||||
assert np.allclose(
|
||||
original_image_slice, image_slice_disabled, atol=1e-2, rtol=1e-2
|
||||
), "Original outputs should match when fused QKV projections are disabled."
|
||||
@@ -220,11 +220,11 @@ class ControlNetPipelineFastTests(
|
||||
def test_attention_slicing_forward_pass(self):
|
||||
return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3)
|
||||
|
||||
def test_ip_adapter(self):
|
||||
def test_ip_adapter_single(self):
|
||||
expected_pipe_slice = None
|
||||
if torch_device == "cpu":
|
||||
expected_pipe_slice = np.array([0.5234, 0.3333, 0.1745, 0.7605, 0.6224, 0.4637, 0.6989, 0.7526, 0.4665])
|
||||
return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)
|
||||
return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice)
|
||||
|
||||
@unittest.skipIf(
|
||||
torch_device != "cuda" or not is_xformers_available(),
|
||||
@@ -460,11 +460,11 @@ class StableDiffusionMultiControlNetPipelineFastTests(
|
||||
def test_inference_batch_single_identical(self):
|
||||
self._test_inference_batch_single_identical(expected_max_diff=2e-3)
|
||||
|
||||
def test_ip_adapter(self):
|
||||
def test_ip_adapter_single(self):
|
||||
expected_pipe_slice = None
|
||||
if torch_device == "cpu":
|
||||
expected_pipe_slice = np.array([0.2422, 0.3425, 0.4048, 0.5351, 0.3503, 0.2419, 0.4645, 0.4570, 0.3804])
|
||||
return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)
|
||||
return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice)
|
||||
|
||||
def test_save_pretrained_raise_not_implemented_exception(self):
|
||||
components = self.get_dummy_components()
|
||||
@@ -679,11 +679,11 @@ class StableDiffusionMultiControlNetOneModelPipelineFastTests(
|
||||
def test_inference_batch_single_identical(self):
|
||||
self._test_inference_batch_single_identical(expected_max_diff=2e-3)
|
||||
|
||||
def test_ip_adapter(self):
|
||||
def test_ip_adapter_single(self):
|
||||
expected_pipe_slice = None
|
||||
if torch_device == "cpu":
|
||||
expected_pipe_slice = np.array([0.5264, 0.3203, 0.1602, 0.8235, 0.6332, 0.4593, 0.7226, 0.7777, 0.4780])
|
||||
return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)
|
||||
return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice)
|
||||
|
||||
def test_save_pretrained_raise_not_implemented_exception(self):
|
||||
components = self.get_dummy_components()
|
||||
|
||||
@@ -173,11 +173,11 @@ class ControlNetImg2ImgPipelineFastTests(
|
||||
def test_attention_slicing_forward_pass(self):
|
||||
return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3)
|
||||
|
||||
def test_ip_adapter(self):
|
||||
def test_ip_adapter_single(self):
|
||||
expected_pipe_slice = None
|
||||
if torch_device == "cpu":
|
||||
expected_pipe_slice = np.array([0.7096, 0.5149, 0.3571, 0.5897, 0.4715, 0.4052, 0.6098, 0.6886, 0.4213])
|
||||
return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)
|
||||
return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice)
|
||||
|
||||
@unittest.skipIf(
|
||||
torch_device != "cuda" or not is_xformers_available(),
|
||||
@@ -371,11 +371,11 @@ class StableDiffusionMultiControlNetPipelineFastTests(
|
||||
def test_inference_batch_single_identical(self):
|
||||
self._test_inference_batch_single_identical(expected_max_diff=2e-3)
|
||||
|
||||
def test_ip_adapter(self):
|
||||
def test_ip_adapter_single(self):
|
||||
expected_pipe_slice = None
|
||||
if torch_device == "cpu":
|
||||
expected_pipe_slice = np.array([0.5293, 0.7339, 0.6642, 0.3950, 0.5212, 0.5175, 0.7002, 0.5907, 0.5182])
|
||||
return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)
|
||||
return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice)
|
||||
|
||||
def test_save_pretrained_raise_not_implemented_exception(self):
|
||||
components = self.get_dummy_components()
|
||||
|
||||
@@ -190,14 +190,14 @@ class StableDiffusionXLControlNetPipelineFastTests(
|
||||
def test_attention_slicing_forward_pass(self):
|
||||
return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3)
|
||||
|
||||
def test_ip_adapter(self, from_ssd1b=False, expected_pipe_slice=None):
|
||||
def test_ip_adapter_single(self, from_ssd1b=False, expected_pipe_slice=None):
|
||||
if not from_ssd1b:
|
||||
expected_pipe_slice = None
|
||||
if torch_device == "cpu":
|
||||
expected_pipe_slice = np.array(
|
||||
[0.7335, 0.5866, 0.5623, 0.6242, 0.5751, 0.5999, 0.4091, 0.4590, 0.5054]
|
||||
)
|
||||
return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)
|
||||
return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice)
|
||||
|
||||
@unittest.skipIf(
|
||||
torch_device != "cuda" or not is_xformers_available(),
|
||||
@@ -970,12 +970,12 @@ class StableDiffusionSSD1BControlNetPipelineFastTests(StableDiffusionXLControlNe
|
||||
# make sure that it's equal
|
||||
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-4
|
||||
|
||||
def test_ip_adapter(self):
|
||||
def test_ip_adapter_single(self):
|
||||
expected_pipe_slice = None
|
||||
if torch_device == "cpu":
|
||||
expected_pipe_slice = np.array([0.7212, 0.5890, 0.5491, 0.6425, 0.5970, 0.6091, 0.4418, 0.4556, 0.5032])
|
||||
|
||||
return super().test_ip_adapter(from_ssd1b=True, expected_pipe_slice=expected_pipe_slice)
|
||||
return super().test_ip_adapter_single(from_ssd1b=True, expected_pipe_slice=expected_pipe_slice)
|
||||
|
||||
def test_controlnet_sdxl_lcm(self):
|
||||
device = "cpu" # ensure determinism for the device-dependent torch.Generator
|
||||
|
||||
@@ -175,12 +175,12 @@ class ControlNetPipelineSDXLImg2ImgFastTests(
|
||||
|
||||
return inputs
|
||||
|
||||
def test_ip_adapter(self):
|
||||
def test_ip_adapter_single(self):
|
||||
expected_pipe_slice = None
|
||||
if torch_device == "cpu":
|
||||
expected_pipe_slice = np.array([0.6276, 0.5271, 0.5205, 0.5393, 0.5774, 0.5872, 0.5456, 0.5415, 0.5354])
|
||||
# TODO: update after slices.p
|
||||
return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)
|
||||
return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice)
|
||||
|
||||
def test_stable_diffusion_xl_controlnet_img2img(self):
|
||||
device = "cpu" # ensure determinism for the device-dependent torch.Generator
|
||||
|
||||
@@ -25,9 +25,6 @@ class FluxPipelineFastTests(unittest.TestCase, PipelineTesterMixin):
|
||||
params = frozenset(["prompt", "height", "width", "guidance_scale", "prompt_embeds", "pooled_prompt_embeds"])
|
||||
batch_params = frozenset(["prompt"])
|
||||
|
||||
# there is no xformers processor for Flux
|
||||
test_xformers_attention = False
|
||||
|
||||
def get_dummy_components(self):
|
||||
torch.manual_seed(0)
|
||||
transformer = FluxTransformer2DModel(
|
||||
|
||||
@@ -1,149 +0,0 @@
|
||||
import random
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, FluxImg2ImgPipeline, FluxTransformer2DModel
|
||||
from diffusers.utils.testing_utils import (
|
||||
enable_full_determinism,
|
||||
floats_tensor,
|
||||
torch_device,
|
||||
)
|
||||
|
||||
from ..test_pipelines_common import PipelineTesterMixin
|
||||
|
||||
|
||||
enable_full_determinism()
|
||||
|
||||
|
||||
class FluxImg2ImgPipelineFastTests(unittest.TestCase, PipelineTesterMixin):
|
||||
pipeline_class = FluxImg2ImgPipeline
|
||||
params = frozenset(["prompt", "height", "width", "guidance_scale", "prompt_embeds", "pooled_prompt_embeds"])
|
||||
batch_params = frozenset(["prompt"])
|
||||
test_xformers_attention = False
|
||||
|
||||
def get_dummy_components(self):
|
||||
torch.manual_seed(0)
|
||||
transformer = FluxTransformer2DModel(
|
||||
patch_size=1,
|
||||
in_channels=4,
|
||||
num_layers=1,
|
||||
num_single_layers=1,
|
||||
attention_head_dim=16,
|
||||
num_attention_heads=2,
|
||||
joint_attention_dim=32,
|
||||
pooled_projection_dim=32,
|
||||
axes_dims_rope=[4, 4, 8],
|
||||
)
|
||||
clip_text_encoder_config = CLIPTextConfig(
|
||||
bos_token_id=0,
|
||||
eos_token_id=2,
|
||||
hidden_size=32,
|
||||
intermediate_size=37,
|
||||
layer_norm_eps=1e-05,
|
||||
num_attention_heads=4,
|
||||
num_hidden_layers=5,
|
||||
pad_token_id=1,
|
||||
vocab_size=1000,
|
||||
hidden_act="gelu",
|
||||
projection_dim=32,
|
||||
)
|
||||
|
||||
torch.manual_seed(0)
|
||||
text_encoder = CLIPTextModel(clip_text_encoder_config)
|
||||
|
||||
torch.manual_seed(0)
|
||||
text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
torch.manual_seed(0)
|
||||
vae = AutoencoderKL(
|
||||
sample_size=32,
|
||||
in_channels=3,
|
||||
out_channels=3,
|
||||
block_out_channels=(4,),
|
||||
layers_per_block=1,
|
||||
latent_channels=1,
|
||||
norm_num_groups=1,
|
||||
use_quant_conv=False,
|
||||
use_post_quant_conv=False,
|
||||
shift_factor=0.0609,
|
||||
scaling_factor=1.5035,
|
||||
)
|
||||
|
||||
scheduler = FlowMatchEulerDiscreteScheduler()
|
||||
|
||||
return {
|
||||
"scheduler": scheduler,
|
||||
"text_encoder": text_encoder,
|
||||
"text_encoder_2": text_encoder_2,
|
||||
"tokenizer": tokenizer,
|
||||
"tokenizer_2": tokenizer_2,
|
||||
"transformer": transformer,
|
||||
"vae": vae,
|
||||
}
|
||||
|
||||
def get_dummy_inputs(self, device, seed=0):
|
||||
image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
|
||||
if str(device).startswith("mps"):
|
||||
generator = torch.manual_seed(seed)
|
||||
else:
|
||||
generator = torch.Generator(device="cpu").manual_seed(seed)
|
||||
|
||||
inputs = {
|
||||
"prompt": "A painting of a squirrel eating a burger",
|
||||
"image": image,
|
||||
"generator": generator,
|
||||
"num_inference_steps": 2,
|
||||
"guidance_scale": 5.0,
|
||||
"height": 8,
|
||||
"width": 8,
|
||||
"max_sequence_length": 48,
|
||||
"strength": 0.8,
|
||||
"output_type": "np",
|
||||
}
|
||||
return inputs
|
||||
|
||||
def test_flux_different_prompts(self):
|
||||
pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device)
|
||||
|
||||
inputs = self.get_dummy_inputs(torch_device)
|
||||
output_same_prompt = pipe(**inputs).images[0]
|
||||
|
||||
inputs = self.get_dummy_inputs(torch_device)
|
||||
inputs["prompt_2"] = "a different prompt"
|
||||
output_different_prompts = pipe(**inputs).images[0]
|
||||
|
||||
max_diff = np.abs(output_same_prompt - output_different_prompts).max()
|
||||
|
||||
# Outputs should be different here
|
||||
# For some reasons, they don't show large differences
|
||||
assert max_diff > 1e-6
|
||||
|
||||
def test_flux_prompt_embeds(self):
|
||||
pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device)
|
||||
inputs = self.get_dummy_inputs(torch_device)
|
||||
|
||||
output_with_prompt = pipe(**inputs).images[0]
|
||||
|
||||
inputs = self.get_dummy_inputs(torch_device)
|
||||
prompt = inputs.pop("prompt")
|
||||
|
||||
(prompt_embeds, pooled_prompt_embeds, text_ids) = pipe.encode_prompt(
|
||||
prompt,
|
||||
prompt_2=None,
|
||||
device=torch_device,
|
||||
max_sequence_length=inputs["max_sequence_length"],
|
||||
)
|
||||
output_with_embeds = pipe(
|
||||
prompt_embeds=prompt_embeds,
|
||||
pooled_prompt_embeds=pooled_prompt_embeds,
|
||||
**inputs,
|
||||
).images[0]
|
||||
|
||||
max_diff = np.abs(output_with_prompt - output_with_embeds).max()
|
||||
assert max_diff < 1e-4
|
||||
@@ -1,151 +0,0 @@
|
||||
import random
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, FluxInpaintPipeline, FluxTransformer2DModel
|
||||
from diffusers.utils.testing_utils import (
|
||||
enable_full_determinism,
|
||||
floats_tensor,
|
||||
torch_device,
|
||||
)
|
||||
|
||||
from ..test_pipelines_common import PipelineTesterMixin
|
||||
|
||||
|
||||
enable_full_determinism()
|
||||
|
||||
|
||||
class FluxInpaintPipelineFastTests(unittest.TestCase, PipelineTesterMixin):
|
||||
pipeline_class = FluxInpaintPipeline
|
||||
params = frozenset(["prompt", "height", "width", "guidance_scale", "prompt_embeds", "pooled_prompt_embeds"])
|
||||
batch_params = frozenset(["prompt"])
|
||||
test_xformers_attention = False
|
||||
|
||||
def get_dummy_components(self):
|
||||
torch.manual_seed(0)
|
||||
transformer = FluxTransformer2DModel(
|
||||
patch_size=1,
|
||||
in_channels=8,
|
||||
num_layers=1,
|
||||
num_single_layers=1,
|
||||
attention_head_dim=16,
|
||||
num_attention_heads=2,
|
||||
joint_attention_dim=32,
|
||||
pooled_projection_dim=32,
|
||||
axes_dims_rope=[4, 4, 8],
|
||||
)
|
||||
clip_text_encoder_config = CLIPTextConfig(
|
||||
bos_token_id=0,
|
||||
eos_token_id=2,
|
||||
hidden_size=32,
|
||||
intermediate_size=37,
|
||||
layer_norm_eps=1e-05,
|
||||
num_attention_heads=4,
|
||||
num_hidden_layers=5,
|
||||
pad_token_id=1,
|
||||
vocab_size=1000,
|
||||
hidden_act="gelu",
|
||||
projection_dim=32,
|
||||
)
|
||||
|
||||
torch.manual_seed(0)
|
||||
text_encoder = CLIPTextModel(clip_text_encoder_config)
|
||||
|
||||
torch.manual_seed(0)
|
||||
text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
torch.manual_seed(0)
|
||||
vae = AutoencoderKL(
|
||||
sample_size=32,
|
||||
in_channels=3,
|
||||
out_channels=3,
|
||||
block_out_channels=(4,),
|
||||
layers_per_block=1,
|
||||
latent_channels=2,
|
||||
norm_num_groups=1,
|
||||
use_quant_conv=False,
|
||||
use_post_quant_conv=False,
|
||||
shift_factor=0.0609,
|
||||
scaling_factor=1.5035,
|
||||
)
|
||||
|
||||
scheduler = FlowMatchEulerDiscreteScheduler()
|
||||
|
||||
return {
|
||||
"scheduler": scheduler,
|
||||
"text_encoder": text_encoder,
|
||||
"text_encoder_2": text_encoder_2,
|
||||
"tokenizer": tokenizer,
|
||||
"tokenizer_2": tokenizer_2,
|
||||
"transformer": transformer,
|
||||
"vae": vae,
|
||||
}
|
||||
|
||||
def get_dummy_inputs(self, device, seed=0):
|
||||
image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
|
||||
mask_image = torch.ones((1, 1, 32, 32)).to(device)
|
||||
if str(device).startswith("mps"):
|
||||
generator = torch.manual_seed(seed)
|
||||
else:
|
||||
generator = torch.Generator(device="cpu").manual_seed(seed)
|
||||
|
||||
inputs = {
|
||||
"prompt": "A painting of a squirrel eating a burger",
|
||||
"image": image,
|
||||
"mask_image": mask_image,
|
||||
"generator": generator,
|
||||
"num_inference_steps": 2,
|
||||
"guidance_scale": 5.0,
|
||||
"height": 32,
|
||||
"width": 32,
|
||||
"max_sequence_length": 48,
|
||||
"strength": 0.8,
|
||||
"output_type": "np",
|
||||
}
|
||||
return inputs
|
||||
|
||||
def test_flux_inpaint_different_prompts(self):
|
||||
pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device)
|
||||
|
||||
inputs = self.get_dummy_inputs(torch_device)
|
||||
output_same_prompt = pipe(**inputs).images[0]
|
||||
|
||||
inputs = self.get_dummy_inputs(torch_device)
|
||||
inputs["prompt_2"] = "a different prompt"
|
||||
output_different_prompts = pipe(**inputs).images[0]
|
||||
|
||||
max_diff = np.abs(output_same_prompt - output_different_prompts).max()
|
||||
|
||||
# Outputs should be different here
|
||||
# For some reasons, they don't show large differences
|
||||
assert max_diff > 1e-6
|
||||
|
||||
def test_flux_inpaint_prompt_embeds(self):
|
||||
pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device)
|
||||
inputs = self.get_dummy_inputs(torch_device)
|
||||
|
||||
output_with_prompt = pipe(**inputs).images[0]
|
||||
|
||||
inputs = self.get_dummy_inputs(torch_device)
|
||||
prompt = inputs.pop("prompt")
|
||||
|
||||
(prompt_embeds, pooled_prompt_embeds, text_ids) = pipe.encode_prompt(
|
||||
prompt,
|
||||
prompt_2=None,
|
||||
device=torch_device,
|
||||
max_sequence_length=inputs["max_sequence_length"],
|
||||
)
|
||||
output_with_embeds = pipe(
|
||||
prompt_embeds=prompt_embeds,
|
||||
pooled_prompt_embeds=pooled_prompt_embeds,
|
||||
**inputs,
|
||||
).images[0]
|
||||
|
||||
max_diff = np.abs(output_with_prompt - output_with_embeds).max()
|
||||
assert max_diff < 1e-4
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user