Compare commits

..

15 Commits

Author SHA1 Message Date
Aryan 5a60a62c47 add experimental support for num_frames not perfectly fitting context length, ocntext stride 2024-07-27 16:24:36 +02:00
Aryan 691facfc2e copy animatediff controlnet implementation from #8972 2024-07-27 15:34:39 +02:00
Aryan dc96a8d5cd make style 2024-07-27 15:31:12 +02:00
Aryan 1b7bc007d8 make fix-copies 2024-07-27 15:30:42 +02:00
Aryan 1bb09845bf fix copied from comments 2024-07-27 15:29:30 +02:00
Aryan 024e2da864 make style 2024-07-27 15:27:28 +02:00
Aryan f6897ae46a add decode batch size param to all pipelines 2024-07-27 15:26:14 +02:00
Aryan a41f843dba remove old helper functions 2024-07-27 15:16:45 +02:00
Aryan 10b65b310c add freenoise 2024-07-27 15:16:02 +02:00
Aryan 610f433d1c revert attention changes 2024-07-27 15:15:34 +02:00
Aryan 690dad693f Merge branch 'main' into freenoise 2024-07-27 13:05:34 +02:00
Aryan 2e97ba7ccb Merge branch 'main' into freenoise 2024-07-25 03:55:41 +05:30
Aryan 5d0f4c3407 add animatediff controlnet implementation 2024-07-24 23:54:42 +02:00
Aryan 441d321152 fix freeinit bug 2024-07-24 23:54:29 +02:00
Aryan 80e530fbfa initial work draft for freenoise; needs massive cleanup 2024-07-24 01:38:18 +02:00
26 changed files with 1665 additions and 275 deletions
+3 -5
View File
@@ -20,8 +20,7 @@ env:
jobs:
test-build-docker-images:
runs-on:
group: aws-general-8-plus
runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
if: github.event_name == 'pull_request'
steps:
- name: Set up Docker Buildx
@@ -51,8 +50,7 @@ jobs:
if: steps.file_changes.outputs.all != ''
build-and-push-docker-images:
runs-on:
group: aws-general-8-plus
runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
if: github.event_name != 'pull_request'
permissions:
@@ -100,4 +98,4 @@ jobs:
slack_channel: ${{ env.CI_SLACK_CHANNEL }}
title: "🤗 Results of the ${{ matrix.image-name }} Docker Image build"
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
+4 -8
View File
@@ -19,8 +19,7 @@ env:
jobs:
setup_torch_cuda_pipeline_matrix:
name: Setup Torch Pipelines CUDA Slow Tests Matrix
runs-on:
group: aws-general-8-plus
runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
container:
image: diffusers/diffusers-pytorch-cpu
outputs:
@@ -56,8 +55,7 @@ jobs:
max-parallel: 8
matrix:
module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
runs-on:
group: aws-g4dn-2xlarge
runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-pytorch-cuda
options: --shm-size "16gb" --ipc host --gpus 0
@@ -107,8 +105,7 @@ jobs:
run_nightly_tests_for_other_torch_modules:
name: Nightly Torch CUDA Tests
runs-on:
group: aws-g4dn-2xlarge
runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-pytorch-cuda
options: --shm-size "16gb" --ipc host --gpus 0
@@ -237,8 +234,7 @@ jobs:
run_nightly_onnx_tests:
name: Nightly ONNXRuntime CUDA tests on Ubuntu
runs-on:
group: aws-g4dn-2xlarge
runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-onnxruntime-cuda
options: --gpus 0 --shm-size "16gb" --ipc host
+4 -7
View File
@@ -15,8 +15,7 @@ concurrency:
jobs:
setup_pr_tests:
name: Setup PR Tests
runs-on:
group: aws-general-8-plus
runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
container:
image: diffusers/diffusers-pytorch-cpu
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
@@ -74,8 +73,7 @@ jobs:
max-parallel: 2
matrix:
modules: ${{ fromJson(needs.setup_pr_tests.outputs.matrix) }}
runs-on:
group: aws-general-8-plus
runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
container:
image: diffusers/diffusers-pytorch-cpu
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
@@ -125,13 +123,12 @@ jobs:
config:
- name: Hub tests for models, schedulers, and pipelines
framework: hub_tests_pytorch
runner: aws-general-8-plus
runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
image: diffusers/diffusers-pytorch-cpu
report: torch_hub
name: ${{ matrix.config.name }}
runs-on:
group: ${{ matrix.config.runner }}
runs-on: ${{ matrix.config.runner }}
container:
image: ${{ matrix.config.image }}
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
+2 -3
View File
@@ -71,8 +71,7 @@ jobs:
name: LoRA - ${{ matrix.lib-versions }}
runs-on:
group: aws-general-8-plus
runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
container:
image: diffusers/diffusers-pytorch-cpu
@@ -129,4 +128,4 @@ jobs:
uses: actions/upload-artifact@v2
with:
name: pr_${{ matrix.config.report }}_test_reports
path: reports
path: reports
+6 -8
View File
@@ -77,29 +77,28 @@ jobs:
config:
- name: Fast PyTorch Pipeline CPU tests
framework: pytorch_pipelines
runner: aws-highmemory-32-plus
runner: [ self-hosted, intel-cpu, 32-cpu, 256-ram, ci ]
image: diffusers/diffusers-pytorch-cpu
report: torch_cpu_pipelines
- name: Fast PyTorch Models & Schedulers CPU tests
framework: pytorch_models
runner: aws-general-8-plus
runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
image: diffusers/diffusers-pytorch-cpu
report: torch_cpu_models_schedulers
- name: Fast Flax CPU tests
framework: flax
runner: aws-general-8-plus
runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
image: diffusers/diffusers-flax-cpu
report: flax_cpu
- name: PyTorch Example CPU tests
framework: pytorch_examples
runner: aws-general-8-plus
runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
image: diffusers/diffusers-pytorch-cpu
report: torch_example_cpu
name: ${{ matrix.config.name }}
runs-on:
group: ${{ matrix.config.runner }}
runs-on: ${{ matrix.config.runner }}
container:
image: ${{ matrix.config.image }}
@@ -181,8 +180,7 @@ jobs:
config:
- name: Hub tests for models, schedulers, and pipelines
framework: hub_tests_pytorch
runner:
group: aws-general-8-plus
runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
image: diffusers/diffusers-pytorch-cpu
report: torch_hub
+7 -14
View File
@@ -19,8 +19,7 @@ env:
jobs:
setup_torch_cuda_pipeline_matrix:
name: Setup Torch Pipelines CUDA Slow Tests Matrix
runs-on:
group: aws-general-8-plus
runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
container:
image: diffusers/diffusers-pytorch-cpu
outputs:
@@ -58,8 +57,7 @@ jobs:
max-parallel: 8
matrix:
module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
runs-on:
group: aws-g4dn-2xlarge
runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-pytorch-cuda
options: --shm-size "16gb" --ipc host --gpus 0
@@ -103,8 +101,7 @@ jobs:
torch_cuda_tests:
name: Torch CUDA Tests
runs-on:
group: aws-g4dn-2xlarge
runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-pytorch-cuda
options: --shm-size "16gb" --ipc host --gpus 0
@@ -204,8 +201,7 @@ jobs:
onnx_cuda_tests:
name: ONNX CUDA Tests
runs-on:
group: aws-g4dn-2xlarge
runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-onnxruntime-cuda
options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --gpus 0
@@ -253,8 +249,7 @@ jobs:
run_torch_compile_tests:
name: PyTorch Compile CUDA tests
runs-on:
group: aws-g4dn-2xlarge
runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-pytorch-compile-cuda
@@ -296,8 +291,7 @@ jobs:
run_xformers_tests:
name: PyTorch xformers CUDA tests
runs-on:
group: aws-g4dn-2xlarge
runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-pytorch-xformers-cuda
@@ -338,8 +332,7 @@ jobs:
run_examples_tests:
name: Examples PyTorch CUDA tests on Ubuntu
runs-on:
group: aws-g4dn-2xlarge
runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-pytorch-cuda
+5 -6
View File
@@ -29,29 +29,28 @@ jobs:
config:
- name: Fast PyTorch CPU tests on Ubuntu
framework: pytorch
runner: aws-general-8-plus
runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
image: diffusers/diffusers-pytorch-cpu
report: torch_cpu
- name: Fast Flax CPU tests on Ubuntu
framework: flax
runner: aws-general-8-plus
runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
image: diffusers/diffusers-flax-cpu
report: flax_cpu
- name: Fast ONNXRuntime CPU tests on Ubuntu
framework: onnxruntime
runner: aws-general-8-plus
runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
image: diffusers/diffusers-onnxruntime-cpu
report: onnx_cpu
- name: PyTorch Example CPU tests on Ubuntu
framework: pytorch_examples
runner: aws-general-8-plus
runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
image: diffusers/diffusers-pytorch-cpu
report: torch_example_cpu
name: ${{ matrix.config.name }}
runs-on:
group: ${{ matrix.config.runner }}
runs-on: ${{ matrix.config.runner }}
container:
image: ${{ matrix.config.image }}
+2 -3
View File
@@ -26,8 +26,7 @@ env:
jobs:
run_tests:
name: "Run a test on our runner from a PR"
runs-on:
group: aws-g4dn-2xlarge
runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: ${{ github.event.inputs.docker_image }}
options: --gpus 0 --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -71,4 +70,4 @@ jobs:
env:
PY_TEST: ${{ github.event.inputs.test }}
run: |
pytest "$PY_TEST"
pytest "$PY_TEST"
+1 -2
View File
@@ -19,8 +19,7 @@ env:
jobs:
ssh_runner:
name: "SSH"
runs-on:
group: aws-highmemory-32-plus
runs-on: [self-hosted, intel-cpu, 32-cpu, 256-ram, ci]
container:
image: ${{ github.event.inputs.docker_image }}
options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --privileged
+1 -2
View File
@@ -22,8 +22,7 @@ env:
jobs:
ssh_runner:
name: "SSH"
runs-on:
group: "${{ github.event.inputs.runner_type }}"
runs-on: [single-gpu, nvidia-gpu, "${{ github.event.inputs.runner_type }}", ci]
container:
image: ${{ github.event.inputs.docker_image }}
options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0 --privileged
-4
View File
@@ -190,10 +190,6 @@
- local: conceptual/evaluation
title: Evaluating Diffusion Models
title: Conceptual Guides
- sections:
- local: community_projects
title: Projects built with Diffusers
title: Community Projects
- sections:
- isExpanded: false
sections:
-2
View File
@@ -24,8 +24,6 @@ The abstract from the paper is:
**Highlights**: Latte is a latent diffusion transformer proposed as a backbone for modeling different modalities (trained for text-to-video generation here). It achieves state-of-the-art performance across four standard video benchmarks - [FaceForensics](https://arxiv.org/abs/1803.09179), [SkyTimelapse](https://arxiv.org/abs/1709.07592), [UCF101](https://arxiv.org/abs/1212.0402) and [Taichi-HD](https://arxiv.org/abs/2003.00196). To prepare and download the datasets for evaluation, please refer to [this https URL](https://github.com/Vchitect/Latte/blob/main/docs/datasets_evaluation.md).
This pipeline was contributed by [maxin-cn](https://github.com/maxin-cn). The original codebase can be found [here](https://github.com/Vchitect/Latte). The original weights can be found under [hf.co/maxin-cn](https://huggingface.co/maxin-cn).
<Tip>
Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
-2
View File
@@ -43,8 +43,6 @@ Lumina-T2X has the following components:
* It uses a Flow-based Large Diffusion Transformer as the backbone
* It supports different any modalities with one backbone and corresponding encoder, decoder.
This pipeline was contributed by [PommesPeter](https://github.com/PommesPeter). The original codebase can be found [here](https://github.com/Alpha-VLLM/Lumina-T2X). The original weights can be found under [hf.co/Alpha-VLLM](https://huggingface.co/Alpha-VLLM).
<Tip>
Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
-78
View File
@@ -1,78 +0,0 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# Community Projects
Welcome to Community Projects. This space is dedicated to showcasing the incredible work and innovative applications created by our vibrant community using the `diffusers` library.
This section aims to:
- Highlight diverse and inspiring projects built with `diffusers`
- Foster knowledge sharing within our community
- Provide real-world examples of how `diffusers` can be leveraged
Happy exploring, and thank you for being part of the Diffusers community!
<table>
<tr>
<th>Project Name</th>
<th>Description</th>
</tr>
<tr style="border-top: 2px solid black">
<td><a href="https://github.com/carson-katri/dream-textures"> dream-textures </a></td>
<td>Stable Diffusion built-in to Blender</td>
</tr>
<tr style="border-top: 2px solid black">
<td><a href="https://github.com/megvii-research/HiDiffusion"> HiDiffusion </a></td>
<td>Increases the resolution and speed of your diffusion model by only adding a single line of code</td>
</tr>
<tr style="border-top: 2px solid black">
<td><a href="https://github.com/lllyasviel/IC-Light"> IC-Light </a></td>
<td>IC-Light is a project to manipulate the illumination of images</td>
</tr>
<tr style="border-top: 2px solid black">
<td><a href="https://github.com/InstantID/InstantID"> InstantID </a></td>
<td>InstantID : Zero-shot Identity-Preserving Generation in Seconds</td>
</tr>
<tr style="border-top: 2px solid black">
<td><a href="https://github.com/Sanster/IOPaint"> IOPaint </a></td>
<td>Image inpainting tool powered by SOTA AI Model. Remove any unwanted object, defect, people from your pictures or erase and replace(powered by stable diffusion) any thing on your pictures.</td>
</tr>
<tr style="border-top: 2px solid black">
<td><a href="https://github.com/bmaltais/kohya_ss"> Kohya </a></td>
<td>Gradio GUI for Kohya's Stable Diffusion trainers</td>
</tr>
<tr style="border-top: 2px solid black">
<td><a href="https://github.com/magic-research/magic-animate"> MagicAnimate </a></td>
<td>MagicAnimate: Temporally Consistent Human Image Animation using Diffusion Model</td>
</tr>
<tr style="border-top: 2px solid black">
<td><a href="https://github.com/levihsu/OOTDiffusion"> OOTDiffusion </a></td>
<td>Outfitting Fusion based Latent Diffusion for Controllable Virtual Try-on</td>
</tr>
<tr style="border-top: 2px solid black">
<td><a href="https://github.com/vladmandic/automatic"> SD.Next </a></td>
<td>SD.Next: Advanced Implementation of Stable Diffusion and other Diffusion-based generative image models</td>
</tr>
<tr style="border-top: 2px solid black">
<td><a href="https://github.com/ashawkey/stable-dreamfusion"> stable-dreamfusion </a></td>
<td>Text-to-3D & Image-to-3D & Mesh Exportation with NeRF + Diffusion</td>
</tr>
<tr style="border-top: 2px solid black">
<td><a href="https://github.com/HVision-NKU/StoryDiffusion"> StoryDiffusion </a></td>
<td>StoryDiffusion can create a magic story by generating consistent images and videos.</td>
</tr>
<tr style="border-top: 2px solid black">
<td><a href="https://github.com/cumulo-autumn/StreamDiffusion"> StreamDiffusion </a></td>
<td>A Pipeline-Level Solution for Real-Time Interactive Generation</td>
</tr>
</table>
+6 -58
View File
@@ -13,17 +13,13 @@ from diffusers.configuration_utils import FrozenDict
from diffusers.image_processor import VaeImageProcessor
from diffusers.loaders import FromSingleFileMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
from diffusers.models import AutoencoderKL, UNet2DConditionModel
from diffusers.models.lora import adjust_lora_scale_text_encoder
from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
from diffusers.schedulers import KarrasDiffusionSchedulers
from diffusers.utils import (
PIL_INTERPOLATION,
USE_PEFT_BACKEND,
deprecate,
logging,
scale_lora_layers,
unscale_lora_layers,
)
from diffusers.utils.torch_utils import randn_tensor
@@ -203,7 +199,6 @@ def get_unweighted_text_embeddings(
text_input: torch.Tensor,
chunk_length: int,
no_boseos_middle: Optional[bool] = True,
clip_skip: Optional[int] = None,
):
"""
When the length of tokens is a multiple of the capacity of the text encoder,
@@ -219,20 +214,7 @@ def get_unweighted_text_embeddings(
# cover the head and the tail by the starting and the ending tokens
text_input_chunk[:, 0] = text_input[0, 0]
text_input_chunk[:, -1] = text_input[0, -1]
if clip_skip is None:
prompt_embeds = pipe.text_encoder(text_input_chunk.to(pipe.device))
text_embedding = prompt_embeds[0]
else:
prompt_embeds = pipe.text_encoder(text_input_chunk.to(pipe.device), output_hidden_states=True)
# Access the `hidden_states` first, that contains a tuple of
# all the hidden states from the encoder layers. Then index into
# the tuple to access the hidden states from the desired layer.
prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
# We also need to apply the final LayerNorm here to not mess with the
# representations. The `last_hidden_states` that we typically use for
# obtaining the final prompt representations passes through the LayerNorm
# layer.
text_embedding = pipe.text_encoder.text_model.final_layer_norm(prompt_embeds)
text_embedding = pipe.text_encoder(text_input_chunk)[0]
if no_boseos_middle:
if i == 0:
@@ -248,10 +230,7 @@ def get_unweighted_text_embeddings(
text_embeddings.append(text_embedding)
text_embeddings = torch.concat(text_embeddings, axis=1)
else:
if clip_skip is None:
clip_skip = 0
prompt_embeds = pipe.text_encoder(text_input, output_hidden_states=True)[-1][-(clip_skip + 1)]
text_embeddings = pipe.text_encoder.text_model.final_layer_norm(prompt_embeds)
text_embeddings = pipe.text_encoder(text_input)[0]
return text_embeddings
@@ -263,8 +242,6 @@ def get_weighted_text_embeddings(
no_boseos_middle: Optional[bool] = False,
skip_parsing: Optional[bool] = False,
skip_weighting: Optional[bool] = False,
clip_skip=None,
lora_scale=None,
):
r"""
Prompts can be assigned with local weights using brackets. For example,
@@ -291,16 +268,6 @@ def get_weighted_text_embeddings(
skip_weighting (`bool`, *optional*, defaults to `False`):
Skip the weighting. When the parsing is skipped, it is forced True.
"""
# set lora scale so that monkey patched LoRA
# function of text encoder can correctly access it
if lora_scale is not None and isinstance(pipe, StableDiffusionLoraLoaderMixin):
pipe._lora_scale = lora_scale
# dynamically adjust the LoRA scale
if not USE_PEFT_BACKEND:
adjust_lora_scale_text_encoder(pipe.text_encoder, lora_scale)
else:
scale_lora_layers(pipe.text_encoder, lora_scale)
max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
if isinstance(prompt, str):
prompt = [prompt]
@@ -367,7 +334,10 @@ def get_weighted_text_embeddings(
# get the embeddings
text_embeddings = get_unweighted_text_embeddings(
pipe, prompt_tokens, pipe.tokenizer.model_max_length, no_boseos_middle=no_boseos_middle, clip_skip=clip_skip
pipe,
prompt_tokens,
pipe.tokenizer.model_max_length,
no_boseos_middle=no_boseos_middle,
)
prompt_weights = torch.tensor(prompt_weights, dtype=text_embeddings.dtype, device=text_embeddings.device)
if uncond_prompt is not None:
@@ -376,7 +346,6 @@ def get_weighted_text_embeddings(
uncond_tokens,
pipe.tokenizer.model_max_length,
no_boseos_middle=no_boseos_middle,
clip_skip=clip_skip,
)
uncond_weights = torch.tensor(uncond_weights, dtype=uncond_embeddings.dtype, device=uncond_embeddings.device)
@@ -393,11 +362,6 @@ def get_weighted_text_embeddings(
current_mean = uncond_embeddings.float().mean(axis=[-2, -1]).to(uncond_embeddings.dtype)
uncond_embeddings *= (previous_mean / current_mean).unsqueeze(-1).unsqueeze(-1)
if pipe.text_encoder is not None:
if isinstance(pipe, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
# Retrieve the original scale by scaling back the LoRA layers
unscale_lora_layers(pipe.text_encoder, lora_scale)
if uncond_prompt is not None:
return text_embeddings, uncond_embeddings
return text_embeddings, None
@@ -585,8 +549,6 @@ class StableDiffusionLongPromptWeightingPipeline(
max_embeddings_multiples=3,
prompt_embeds: Optional[torch.Tensor] = None,
negative_prompt_embeds: Optional[torch.Tensor] = None,
clip_skip: Optional[int] = None,
lora_scale: Optional[float] = None,
):
r"""
Encodes the prompt into text encoder hidden states.
@@ -635,8 +597,6 @@ class StableDiffusionLongPromptWeightingPipeline(
prompt=prompt,
uncond_prompt=negative_prompt if do_classifier_free_guidance else None,
max_embeddings_multiples=max_embeddings_multiples,
clip_skip=clip_skip,
lora_scale=lora_scale,
)
if prompt_embeds is None:
prompt_embeds = prompt_embeds1
@@ -830,7 +790,6 @@ class StableDiffusionLongPromptWeightingPipeline(
return_dict: bool = True,
callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
is_cancelled_callback: Optional[Callable[[], bool]] = None,
clip_skip: Optional[int] = None,
callback_steps: int = 1,
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
):
@@ -906,9 +865,6 @@ class StableDiffusionLongPromptWeightingPipeline(
is_cancelled_callback (`Callable`, *optional*):
A function that will be called every `callback_steps` steps during inference. If the function returns
`True`, the inference will be cancelled.
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
callback_steps (`int`, *optional*, defaults to 1):
The frequency at which the `callback` function will be called. If not specified, the callback will be
called at every step.
@@ -947,7 +903,6 @@ class StableDiffusionLongPromptWeightingPipeline(
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
# corresponds to doing no classifier free guidance.
do_classifier_free_guidance = guidance_scale > 1.0
lora_scale = cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
# 3. Encode input prompt
prompt_embeds = self._encode_prompt(
@@ -959,8 +914,6 @@ class StableDiffusionLongPromptWeightingPipeline(
max_embeddings_multiples,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
clip_skip=clip_skip,
lora_scale=lora_scale,
)
dtype = prompt_embeds.dtype
@@ -1091,7 +1044,6 @@ class StableDiffusionLongPromptWeightingPipeline(
return_dict: bool = True,
callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
is_cancelled_callback: Optional[Callable[[], bool]] = None,
clip_skip=None,
callback_steps: int = 1,
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
):
@@ -1149,9 +1101,6 @@ class StableDiffusionLongPromptWeightingPipeline(
is_cancelled_callback (`Callable`, *optional*):
A function that will be called every `callback_steps` steps during inference. If the function returns
`True`, the inference will be cancelled.
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
callback_steps (`int`, *optional*, defaults to 1):
The frequency at which the `callback` function will be called. If not specified, the callback will be
called at every step.
@@ -1186,7 +1135,6 @@ class StableDiffusionLongPromptWeightingPipeline(
return_dict=return_dict,
callback=callback,
is_cancelled_callback=is_cancelled_callback,
clip_skip=clip_skip,
callback_steps=callback_steps,
cross_attention_kwargs=cross_attention_kwargs,
)
+6 -42
View File
@@ -25,25 +25,21 @@ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
from diffusers.loaders import (
FromSingleFileMixin,
IPAdapterMixin,
StableDiffusionXLLoraLoaderMixin,
StableDiffusionLoraLoaderMixin,
TextualInversionLoaderMixin,
)
from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel
from diffusers.models.attention_processor import AttnProcessor2_0, XFormersAttnProcessor
from diffusers.models.lora import adjust_lora_scale_text_encoder
from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
from diffusers.schedulers import KarrasDiffusionSchedulers
from diffusers.utils import (
USE_PEFT_BACKEND,
deprecate,
is_accelerate_available,
is_accelerate_version,
is_invisible_watermark_available,
logging,
replace_example_docstring,
scale_lora_layers,
unscale_lora_layers,
)
from diffusers.utils.torch_utils import randn_tensor
@@ -265,7 +261,6 @@ def get_weighted_text_embeddings_sdxl(
num_images_per_prompt: int = 1,
device: Optional[torch.device] = None,
clip_skip: Optional[int] = None,
lora_scale: Optional[int] = None,
):
"""
This function can process long prompt with weights, no length limitation
@@ -286,24 +281,6 @@ def get_weighted_text_embeddings_sdxl(
"""
device = device or pipe._execution_device
# set lora scale so that monkey patched LoRA
# function of text encoder can correctly access it
if lora_scale is not None and isinstance(pipe, StableDiffusionXLLoraLoaderMixin):
pipe._lora_scale = lora_scale
# dynamically adjust the LoRA scale
if pipe.text_encoder is not None:
if not USE_PEFT_BACKEND:
adjust_lora_scale_text_encoder(pipe.text_encoder, lora_scale)
else:
scale_lora_layers(pipe.text_encoder, lora_scale)
if pipe.text_encoder_2 is not None:
if not USE_PEFT_BACKEND:
adjust_lora_scale_text_encoder(pipe.text_encoder_2, lora_scale)
else:
scale_lora_layers(pipe.text_encoder_2, lora_scale)
if prompt_2:
prompt = f"{prompt} {prompt_2}"
@@ -452,16 +429,6 @@ def get_weighted_text_embeddings_sdxl(
bs_embed * num_images_per_prompt, -1
)
if pipe.text_encoder is not None:
if isinstance(pipe, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
# Retrieve the original scale by scaling back the LoRA layers
unscale_lora_layers(pipe.text_encoder, lora_scale)
if pipe.text_encoder_2 is not None:
if isinstance(pipe, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
# Retrieve the original scale by scaling back the LoRA layers
unscale_lora_layers(pipe.text_encoder_2, lora_scale)
return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
@@ -582,7 +549,7 @@ class SDXLLongPromptWeightingPipeline(
StableDiffusionMixin,
FromSingleFileMixin,
IPAdapterMixin,
StableDiffusionXLLoraLoaderMixin,
StableDiffusionLoraLoaderMixin,
TextualInversionLoaderMixin,
):
r"""
@@ -594,8 +561,8 @@ class SDXLLongPromptWeightingPipeline(
The pipeline also inherits the following loading methods:
- [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
- [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
- [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
- [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
- [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
- [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
- [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
Args:
@@ -776,7 +743,7 @@ class SDXLLongPromptWeightingPipeline(
# set lora scale so that monkey patched LoRA
# function of text encoder can correctly access it
if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
self._lora_scale = lora_scale
if prompt is not None and isinstance(prompt, str):
@@ -1645,9 +1612,7 @@ class SDXLLongPromptWeightingPipeline(
image_embeds = torch.cat([negative_image_embeds, image_embeds])
# 3. Encode input prompt
lora_scale = (
self._cross_attention_kwargs.get("scale", None) if self._cross_attention_kwargs is not None else None
)
(self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None)
negative_prompt = negative_prompt if negative_prompt is not None else ""
@@ -1662,7 +1627,6 @@ class SDXLLongPromptWeightingPipeline(
neg_prompt=negative_prompt,
num_images_per_prompt=num_images_per_prompt,
clip_skip=clip_skip,
lora_scale=lora_scale,
)
dtype = prompt_embeds.dtype
-1
View File
@@ -30,7 +30,6 @@ from .unet_loader_utils import _maybe_expand_lora_scales
_SET_ADAPTER_SCALE_FN_MAPPING = {
"UNet2DConditionModel": _maybe_expand_lora_scales,
"UNetMotionModel": _maybe_expand_lora_scales,
"SD3Transformer2DModel": lambda model_cls, weights: weights,
}
+11
View File
@@ -272,6 +272,17 @@ class BasicTransformerBlock(nn.Module):
attention_out_bias: bool = True,
):
super().__init__()
self.dim = dim
self.num_attention_heads = num_attention_heads
self.attention_head_dim = attention_head_dim
self.dropout = dropout
self.cross_attention_dim = cross_attention_dim
self.activation_fn = activation_fn
self.attention_bias = attention_bias
self.double_self_attention = double_self_attention
self.norm_elementwise_affine = norm_elementwise_affine
self.positional_embeddings = positional_embeddings
self.num_positional_embeddings = num_positional_embeddings
self.only_cross_attention = only_cross_attention
# We keep these boolean flags for backward-compatibility.
+302 -4
View File
@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Dict, Optional, Tuple, Union
from typing import Any, Dict, List, Optional, Tuple, Union
import torch
import torch.nn as nn
@@ -19,8 +19,10 @@ import torch.nn.functional as F
import torch.utils.checkpoint
from ...configuration_utils import ConfigMixin, FrozenDict, register_to_config
from ...loaders import FromOriginalModelMixin, PeftAdapterMixin, UNet2DConditionLoadersMixin
from ...loaders import FromOriginalModelMixin, UNet2DConditionLoadersMixin
from ...utils import logging
from ...utils.torch_utils import maybe_allow_in_graph
from ..attention import FeedForward, _chunked_feed_forward
from ..attention_processor import (
ADDED_KV_ATTENTION_PROCESSORS,
CROSS_ATTENTION_PROCESSORS,
@@ -33,7 +35,7 @@ from ..attention_processor import (
IPAdapterAttnProcessor,
IPAdapterAttnProcessor2_0,
)
from ..embeddings import TimestepEmbedding, Timesteps
from ..embeddings import SinusoidalPositionalEmbedding, TimestepEmbedding, Timesteps
from ..modeling_utils import ModelMixin
from ..transformers.transformer_temporal import TransformerTemporalModel
from .unet_2d_blocks import UNetMidBlock2DCrossAttn
@@ -53,6 +55,302 @@ from .unet_3d_condition import UNet3DConditionOutput
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
@maybe_allow_in_graph
class FreeNoiseTransformerBlock(nn.Module):
r"""
A FreeNoise Transformer block.
Parameters:
dim (`int`): The number of channels in the input and output.
num_attention_heads (`int`): The number of heads to use for multi-head attention.
attention_head_dim (`int`): The number of channels in each head.
dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
num_embeds_ada_norm (:
obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`.
attention_bias (:
obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
only_cross_attention (`bool`, *optional*):
Whether to use only cross-attention layers. In this case two cross attention layers are used.
double_self_attention (`bool`, *optional*):
Whether to use two self-attention layers. In this case no cross attention layers are used.
upcast_attention (`bool`, *optional*):
Whether to upcast the attention computation to float32. This is useful for mixed precision training.
norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
Whether to use learnable elementwise affine parameters for normalization.
norm_type (`str`, *optional*, defaults to `"layer_norm"`):
The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`.
final_dropout (`bool` *optional*, defaults to False):
Whether to apply a final dropout after the last feed-forward layer.
attention_type (`str`, *optional*, defaults to `"default"`):
The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
positional_embeddings (`str`, *optional*, defaults to `None`):
The type of positional embeddings to apply to.
num_positional_embeddings (`int`, *optional*, defaults to `None`):
The maximum number of positional embeddings to apply.
"""
def __init__(
self,
dim: int,
num_attention_heads: int,
attention_head_dim: int,
dropout: float = 0.0,
cross_attention_dim: Optional[int] = None,
activation_fn: str = "geglu",
num_embeds_ada_norm: Optional[int] = None,
attention_bias: bool = False,
only_cross_attention: bool = False,
double_self_attention: bool = False,
upcast_attention: bool = False,
norm_elementwise_affine: bool = True,
norm_type: str = "layer_norm",
norm_eps: float = 1e-5,
final_dropout: bool = False,
positional_embeddings: Optional[str] = None,
num_positional_embeddings: Optional[int] = None,
ff_inner_dim: Optional[int] = None,
ff_bias: bool = True,
attention_out_bias: bool = True,
context_length: int = 16,
context_stride: int = 4,
weighting_scheme: str = "pyramid",
):
super().__init__()
self.dim = dim
self.num_attention_heads = num_attention_heads
self.attention_head_dim = attention_head_dim
self.dropout = dropout
self.cross_attention_dim = cross_attention_dim
self.activation_fn = activation_fn
self.attention_bias = attention_bias
self.double_self_attention = double_self_attention
self.norm_elementwise_affine = norm_elementwise_affine
self.positional_embeddings = positional_embeddings
self.num_positional_embeddings = num_positional_embeddings
self.only_cross_attention = only_cross_attention
self.set_free_noise_properties(context_length, context_stride, weighting_scheme)
# We keep these boolean flags for backward-compatibility.
self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
self.use_ada_layer_norm_single = norm_type == "ada_norm_single"
self.use_layer_norm = norm_type == "layer_norm"
self.use_ada_layer_norm_continuous = norm_type == "ada_norm_continuous"
if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
raise ValueError(
f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
)
self.norm_type = norm_type
self.num_embeds_ada_norm = num_embeds_ada_norm
if positional_embeddings and (num_positional_embeddings is None):
raise ValueError(
"If `positional_embedding` type is defined, `num_positition_embeddings` must also be defined."
)
if positional_embeddings == "sinusoidal":
self.pos_embed = SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings)
else:
self.pos_embed = None
# Define 3 blocks. Each block has its own normalization layer.
# 1. Self-Attn
self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
self.attn1 = Attention(
query_dim=dim,
heads=num_attention_heads,
dim_head=attention_head_dim,
dropout=dropout,
bias=attention_bias,
cross_attention_dim=cross_attention_dim if only_cross_attention else None,
upcast_attention=upcast_attention,
out_bias=attention_out_bias,
)
# 2. Cross-Attn
if cross_attention_dim is not None or double_self_attention:
self.norm2 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
self.attn2 = Attention(
query_dim=dim,
cross_attention_dim=cross_attention_dim if not double_self_attention else None,
heads=num_attention_heads,
dim_head=attention_head_dim,
dropout=dropout,
bias=attention_bias,
upcast_attention=upcast_attention,
out_bias=attention_out_bias,
) # is self-attn if encoder_hidden_states is none
# 3. Feed-forward
self.ff = FeedForward(
dim,
dropout=dropout,
activation_fn=activation_fn,
final_dropout=final_dropout,
inner_dim=ff_inner_dim,
bias=ff_bias,
)
self.norm3 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
# let chunk size default to None
self._chunk_size = None
self._chunk_dim = 0
def _get_frame_indices(self, num_frames: int) -> List[Tuple[int, int]]:
frame_indices = []
for i in range(0, num_frames - self.context_length + 1, self.context_stride):
window_start = i
window_end = min(num_frames, i + self.context_length)
frame_indices.append((window_start, window_end))
return frame_indices
def _get_frame_weights(self, num_frames: int, weighting_scheme: str = "pyramid") -> List[float]:
if weighting_scheme == "pyramid":
if num_frames % 2 == 0:
# num_frames = 4 => [1, 2, 2, 1]
weights = list(range(1, num_frames // 2 + 1))
weights = weights + weights[::-1]
else:
# num_frames = 5 => [1, 2, 3, 2, 1]
weights = list(range(1, num_frames // 2 + 1))
weights = weights + [num_frames // 2 + 1] + weights[::-1]
else:
raise ValueError(f"Unsupported value for weighting_scheme={weighting_scheme}")
return weights
def set_free_noise_properties(
self, context_length: int, context_stride: int, weighting_scheme: str = "pyramid"
) -> None:
self.context_length = context_length
self.context_stride = context_stride
self.weighting_scheme = weighting_scheme
def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0) -> None:
# Sets chunk feed-forward
self._chunk_size = chunk_size
self._chunk_dim = dim
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
cross_attention_kwargs: Dict[str, Any] = None,
*args,
**kwargs,
) -> torch.Tensor:
if cross_attention_kwargs is not None:
if cross_attention_kwargs.get("scale", None) is not None:
logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
# hidden_states: [B x H x W, F, C]
device = hidden_states.device
dtype = hidden_states.dtype
num_frames = hidden_states.size(1)
frame_indices = self._get_frame_indices(num_frames)
frame_weights = self._get_frame_weights(self.context_length, self.weighting_scheme)
frame_weights = torch.tensor(frame_weights, device=device, dtype=dtype).unsqueeze(0).unsqueeze(-1)
is_last_frame_batch_complete = frame_indices[-1][1] == num_frames
# Handle out-of-bounds case if num_frames isn't perfectly divisible by context_length
# For example, num_frames=25, context_length=16, context_stride=4, then we expect the ranges:
# [(0, 16), (4, 20), (8, 24), (10, 26)]
if not is_last_frame_batch_complete:
if num_frames < self.context_length:
raise ValueError(f"Expected {num_frames=} to be greater or equal than {self.context_length=}")
last_frame_batch_length = num_frames - frame_indices[-1][1]
frame_indices.append((num_frames - self.context_length, num_frames))
num_times_accumulated = torch.zeros((1, num_frames, 1), device=device)
accumulated_values = torch.zeros_like(hidden_states)
for i, (frame_start, frame_end) in enumerate(frame_indices):
# The reason for slicing here is to ensure that if (frame_end - frame_start) is to handle
# cases like frame_indices=[(0, 16), (16, 20)], if the user provided a video with 19 frames, or
# essentially a non-multiple of `context_length`.
weights = torch.ones_like(num_times_accumulated[:, frame_start:frame_end])
weights *= frame_weights
hidden_states_chunk = hidden_states[:, frame_start:frame_end]
# Notice that normalization is always applied before the real computation in the following blocks.
# 1. Self-Attention
# assert self.norm_type == "layer_norm"
norm_hidden_states = self.norm1(hidden_states_chunk)
if self.pos_embed is not None:
norm_hidden_states = self.pos_embed(norm_hidden_states)
attn_output = self.attn1(
norm_hidden_states,
encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
attention_mask=attention_mask,
**cross_attention_kwargs,
)
hidden_states_chunk = attn_output + hidden_states_chunk
if hidden_states_chunk.ndim == 4:
hidden_states_chunk = hidden_states_chunk.squeeze(1)
# 2. Cross-Attention
if self.attn2 is not None:
norm_hidden_states = self.norm2(hidden_states_chunk)
if self.pos_embed is not None and self.norm_type != "ada_norm_single":
norm_hidden_states = self.pos_embed(norm_hidden_states)
attn_output = self.attn2(
norm_hidden_states,
encoder_hidden_states=encoder_hidden_states,
attention_mask=encoder_attention_mask,
**cross_attention_kwargs,
)
hidden_states_chunk = attn_output + hidden_states_chunk
if i == len(frame_indices) - 1 and not is_last_frame_batch_complete:
accumulated_values[:, -last_frame_batch_length:] += (
hidden_states_chunk[:, -last_frame_batch_length:] * weights[:, -last_frame_batch_length:]
)
num_times_accumulated[:, -last_frame_batch_length:] += weights[:, -last_frame_batch_length]
else:
accumulated_values[:, frame_start:frame_end] += hidden_states_chunk * weights
num_times_accumulated[:, frame_start:frame_end] += weights
hidden_states = torch.where(
num_times_accumulated > 0, accumulated_values / num_times_accumulated, accumulated_values
).to(dtype)
# 3. Feed-forward
norm_hidden_states = self.norm3(hidden_states)
if self._chunk_size is not None:
# "feed_forward_chunk_size" can be used to save memory
ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
else:
ff_output = self.ff(norm_hidden_states)
hidden_states = ff_output + hidden_states
if hidden_states.ndim == 4:
hidden_states = hidden_states.squeeze(1)
return hidden_states
class MotionModules(nn.Module):
def __init__(
self,
@@ -231,7 +529,7 @@ class MotionAdapter(ModelMixin, ConfigMixin, FromOriginalModelMixin):
pass
class UNetMotionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin, PeftAdapterMixin):
class UNetMotionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
r"""
A modified conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a
sample shaped output.
@@ -42,6 +42,7 @@ from ...utils import (
from ...utils.torch_utils import randn_tensor
from ...video_processor import VideoProcessor
from ..free_init_utils import FreeInitMixin
from ..free_noise_utils import AnimateDiffFreeNoiseMixin
from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from .pipeline_output import AnimateDiffPipelineOutput
@@ -72,6 +73,7 @@ class AnimateDiffPipeline(
IPAdapterMixin,
StableDiffusionLoraLoaderMixin,
FreeInitMixin,
AnimateDiffFreeNoiseMixin,
):
r"""
Pipeline for text-to-video generation.
@@ -394,15 +396,20 @@ class AnimateDiffPipeline(
return ip_adapter_image_embeds
# Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents
def decode_latents(self, latents):
def decode_latents(self, latents, decode_batch_size: int = 16):
latents = 1 / self.vae.config.scaling_factor * latents
batch_size, channels, num_frames, height, width = latents.shape
latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)
image = self.vae.decode(latents).sample
video = image[None, :].reshape((batch_size, num_frames, -1) + image.shape[2:]).permute(0, 2, 1, 3, 4)
video = []
for i in range(0, latents.shape[0], decode_batch_size):
batch_latents = latents[i : i + decode_batch_size]
batch_latents = self.vae.decode(batch_latents).sample
video.append(batch_latents)
video = torch.cat(video)
video = video[None, :].reshape((batch_size, num_frames, -1) + video.shape[2:]).permute(0, 2, 1, 3, 4)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
video = video.float()
return video
@@ -495,7 +502,6 @@ class AnimateDiffPipeline(
f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
)
# Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents
def prepare_latents(
self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
):
@@ -517,6 +523,22 @@ class AnimateDiffPipeline(
else:
latents = latents.to(device)
if self.free_noise_enabled and self._free_noise_shuffle:
for i in range(self._free_noise_context_length, num_frames, self._free_noise_context_stride):
# ensure window is within bounds
window_start = max(0, i - self._free_noise_context_length)
window_end = min(num_frames, window_start + self._free_noise_context_stride)
window_length = window_end - window_start
if window_length == 0:
break
indices = torch.LongTensor(list(range(window_start, window_end)))
shuffled_indices = indices[torch.randperm(window_length, generator=generator)]
# shuffle latents in every window
latents[:, :, window_start:window_end] = latents[:, :, shuffled_indices]
# scale the initial noise by the standard deviation required by the scheduler
latents = latents * self.scheduler.init_noise_sigma
return latents
@@ -569,6 +591,7 @@ class AnimateDiffPipeline(
clip_skip: Optional[int] = None,
callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
decode_batch_size: int = 16,
**kwargs,
):
r"""
@@ -637,6 +660,8 @@ class AnimateDiffPipeline(
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
`._callback_tensor_inputs` attribute of your pipeline class.
decode_batch_size (`int`, defaults to `16`):
The number of frames to decode at a time when calling `decode_latents` method.
Examples:
@@ -808,7 +833,7 @@ class AnimateDiffPipeline(
if output_type == "latent":
video = latents
else:
video_tensor = self.decode_latents(latents)
video_tensor = self.decode_latents(latents, decode_batch_size)
video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)
# 10. Offload all models
File diff suppressed because it is too large Load Diff
@@ -56,6 +56,7 @@ from ...utils import (
from ...utils.torch_utils import randn_tensor
from ...video_processor import VideoProcessor
from ..free_init_utils import FreeInitMixin
from ..free_noise_utils import AnimateDiffFreeNoiseMixin
from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from .pipeline_output import AnimateDiffPipelineOutput
@@ -194,6 +195,7 @@ class AnimateDiffSDXLPipeline(
TextualInversionLoaderMixin,
IPAdapterMixin,
FreeInitMixin,
AnimateDiffFreeNoiseMixin,
):
r"""
Pipeline for text-to-video generation using Stable Diffusion XL.
@@ -606,15 +608,21 @@ class AnimateDiffSDXLPipeline(
return ip_adapter_image_embeds
# Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents
def decode_latents(self, latents):
# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.AnimateDiffPipeline.decode_latents
def decode_latents(self, latents, decode_batch_size: int = 16):
latents = 1 / self.vae.config.scaling_factor * latents
batch_size, channels, num_frames, height, width = latents.shape
latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)
image = self.vae.decode(latents).sample
video = image[None, :].reshape((batch_size, num_frames, -1) + image.shape[2:]).permute(0, 2, 1, 3, 4)
video = []
for i in range(0, latents.shape[0], decode_batch_size):
batch_latents = latents[i : i + decode_batch_size]
batch_latents = self.vae.decode(batch_latents).sample
video.append(batch_latents)
video = torch.cat(video)
video = video[None, :].reshape((batch_size, num_frames, -1) + video.shape[2:]).permute(0, 2, 1, 3, 4)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
video = video.float()
return video
@@ -876,6 +884,7 @@ class AnimateDiffSDXLPipeline(
clip_skip: Optional[int] = None,
callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
decode_batch_size: int = 16,
):
r"""
Function invoked when calling the pipeline for generation.
@@ -1015,6 +1024,8 @@ class AnimateDiffSDXLPipeline(
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
`._callback_tensor_inputs` attribute of your pipeline class.
decode_batch_size (`int`, defaults to `16`):
The number of frames to decode at a time when calling `decode_latents` method.
Examples:
@@ -1258,7 +1269,7 @@ class AnimateDiffSDXLPipeline(
if output_type == "latent":
video = latents
else:
video_tensor = self.decode_latents(latents)
video_tensor = self.decode_latents(latents, decode_batch_size)
video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)
# cast back to fp16 if needed
@@ -38,6 +38,7 @@ from ...utils import (
from ...utils.torch_utils import is_compiled_module, randn_tensor
from ...video_processor import VideoProcessor
from ..free_init_utils import FreeInitMixin
from ..free_noise_utils import AnimateDiffFreeNoiseMixin
from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from .pipeline_output import AnimateDiffPipelineOutput
@@ -127,6 +128,7 @@ class AnimateDiffSparseControlNetPipeline(
IPAdapterMixin,
StableDiffusionLoraLoaderMixin,
FreeInitMixin,
AnimateDiffFreeNoiseMixin,
):
r"""
Pipeline for controlled text-to-video generation using the method described in [SparseCtrl: Adding Sparse Controls
@@ -448,15 +450,21 @@ class AnimateDiffSparseControlNetPipeline(
return ip_adapter_image_embeds
# Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents
def decode_latents(self, latents):
# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.AnimateDiffPipeline.decode_latents
def decode_latents(self, latents, decode_batch_size: int = 16):
latents = 1 / self.vae.config.scaling_factor * latents
batch_size, channels, num_frames, height, width = latents.shape
latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)
image = self.vae.decode(latents).sample
video = image[None, :].reshape((batch_size, num_frames, -1) + image.shape[2:]).permute(0, 2, 1, 3, 4)
video = []
for i in range(0, latents.shape[0], decode_batch_size):
batch_latents = latents[i : i + decode_batch_size]
batch_latents = self.vae.decode(batch_latents).sample
video.append(batch_latents)
video = torch.cat(video)
video = video[None, :].reshape((batch_size, num_frames, -1) + video.shape[2:]).permute(0, 2, 1, 3, 4)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
video = video.float()
return video
@@ -728,6 +736,7 @@ class AnimateDiffSparseControlNetPipeline(
clip_skip: Optional[int] = None,
callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
decode_batch_size: int = 16,
):
r"""
The call function to the pipeline for generation.
@@ -806,6 +815,8 @@ class AnimateDiffSparseControlNetPipeline(
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
`._callback_tensor_inputs` attribute of your pipeline class.
decode_batch_size (`int`, defaults to `16`):
The number of frames to decode at a time when calling `decode_latents` method.
Examples:
@@ -996,7 +1007,7 @@ class AnimateDiffSparseControlNetPipeline(
if output_type == "latent":
video = latents
else:
video_tensor = self.decode_latents(latents)
video_tensor = self.decode_latents(latents, decode_batch_size)
video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)
# 12. Offload all models
@@ -35,6 +35,7 @@ from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_
from ...utils.torch_utils import randn_tensor
from ...video_processor import VideoProcessor
from ..free_init_utils import FreeInitMixin
from ..free_noise_utils import AnimateDiffFreeNoiseMixin
from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from .pipeline_output import AnimateDiffPipelineOutput
@@ -176,6 +177,7 @@ class AnimateDiffVideoToVideoPipeline(
IPAdapterMixin,
StableDiffusionLoraLoaderMixin,
FreeInitMixin,
AnimateDiffFreeNoiseMixin,
):
r"""
Pipeline for video-to-video generation.
@@ -498,15 +500,21 @@ class AnimateDiffVideoToVideoPipeline(
return ip_adapter_image_embeds
# Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents
def decode_latents(self, latents):
# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.AnimateDiffPipeline.decode_latents
def decode_latents(self, latents, decode_batch_size: int = 16):
latents = 1 / self.vae.config.scaling_factor * latents
batch_size, channels, num_frames, height, width = latents.shape
latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)
image = self.vae.decode(latents).sample
video = image[None, :].reshape((batch_size, num_frames, -1) + image.shape[2:]).permute(0, 2, 1, 3, 4)
video = []
for i in range(0, latents.shape[0], decode_batch_size):
batch_latents = latents[i : i + decode_batch_size]
batch_latents = self.vae.decode(batch_latents).sample
video.append(batch_latents)
video = torch.cat(video)
video = video[None, :].reshape((batch_size, num_frames, -1) + video.shape[2:]).permute(0, 2, 1, 3, 4)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
video = video.float()
return video
@@ -747,6 +755,7 @@ class AnimateDiffVideoToVideoPipeline(
clip_skip: Optional[int] = None,
callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
decode_batch_size: int = 16,
):
r"""
The call function to the pipeline for generation.
@@ -822,6 +831,8 @@ class AnimateDiffVideoToVideoPipeline(
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
`._callback_tensor_inputs` attribute of your pipeline class.
decode_batch_size (`int`, defaults to `16`):
The number of frames to decode at a time when calling `decode_latents` method.
Examples:
@@ -990,7 +1001,7 @@ class AnimateDiffVideoToVideoPipeline(
if output_type == "latent":
video = latents
else:
video_tensor = self.decode_latents(latents)
video_tensor = self.decode_latents(latents, decode_batch_size)
video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)
# 10. Offload all models
+141
View File
@@ -0,0 +1,141 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional, Union
from ..models.attention import BasicTransformerBlock
from ..models.unets.unet_motion_model import (
CrossAttnDownBlockMotion,
DownBlockMotion,
FreeNoiseTransformerBlock,
TransformerTemporalModel,
UpBlockMotion,
)
class AnimateDiffFreeNoiseMixin:
r"""Mixin class for [FreeNoise](https://arxiv.org/abs/2310.15169)."""
def _enable_free_noise_in_block(self, block: Union[CrossAttnDownBlockMotion, DownBlockMotion, UpBlockMotion]):
r"""Helper function to enable FreeNoise in transformer blocks."""
for motion_module in block.motion_modules:
motion_module: TransformerTemporalModel
num_transformer_blocks = len(motion_module.transformer_blocks)
for i in range(num_transformer_blocks):
if isinstance(motion_module.transformer_blocks[i], FreeNoiseTransformerBlock):
motion_module.transformer_blocks[i].set_free_noise_properties(
self._free_noise_context_length,
self._free_noise_context_stride,
self._free_noise_weighting_scheme,
)
else:
assert isinstance(motion_module.transformer_blocks[i], BasicTransformerBlock)
basic_transfomer_block = motion_module.transformer_blocks[i]
motion_module.transformer_blocks[i] = FreeNoiseTransformerBlock(
dim=basic_transfomer_block.dim,
num_attention_heads=basic_transfomer_block.num_attention_heads,
attention_head_dim=basic_transfomer_block.attention_head_dim,
dropout=basic_transfomer_block.dropout,
cross_attention_dim=basic_transfomer_block.cross_attention_dim,
activation_fn=basic_transfomer_block.activation_fn,
attention_bias=basic_transfomer_block.attention_bias,
only_cross_attention=basic_transfomer_block.only_cross_attention,
double_self_attention=basic_transfomer_block.double_self_attention,
positional_embeddings=basic_transfomer_block.positional_embeddings,
num_positional_embeddings=basic_transfomer_block.num_positional_embeddings,
context_length=self._free_noise_context_length,
context_stride=self._free_noise_context_stride,
weighting_scheme=self._free_noise_weighting_scheme,
).to(device=self.device, dtype=self.dtype)
motion_module.transformer_blocks[i].load_state_dict(
basic_transfomer_block.state_dict(), strict=True
)
def _disable_free_noise_in_block(self, block: Union[CrossAttnDownBlockMotion, DownBlockMotion, UpBlockMotion]):
r"""Helper function to disable FreeNoise in transformer blocks."""
for motion_module in block.motion_modules:
motion_module: TransformerTemporalModel
num_transformer_blocks = len(motion_module.transformer_blocks)
for i in range(num_transformer_blocks):
if isinstance(motion_module.transformer_blocks[i], FreeNoiseTransformerBlock):
free_noise_transfomer_block = motion_module.transformer_blocks[i]
motion_module.transformer_blocks[i] = BasicTransformerBlock(
dim=free_noise_transfomer_block.dim,
num_attention_heads=free_noise_transfomer_block.num_attention_heads,
attention_head_dim=free_noise_transfomer_block.attention_head_dim,
dropout=free_noise_transfomer_block.dropout,
cross_attention_dim=free_noise_transfomer_block.cross_attention_dim,
activation_fn=free_noise_transfomer_block.activation_fn,
attention_bias=free_noise_transfomer_block.attention_bias,
only_cross_attention=free_noise_transfomer_block.only_cross_attention,
double_self_attention=free_noise_transfomer_block.double_self_attention,
positional_embeddings=free_noise_transfomer_block.positional_embeddings,
num_positional_embeddings=free_noise_transfomer_block.num_positional_embeddings,
).to(device=self.device, dtype=self.dtype)
motion_module.transformer_blocks[i].load_state_dict(
free_noise_transfomer_block.state_dict(), strict=True
)
def enable_free_noise(
self,
context_length: Optional[int] = 16,
context_stride: int = 4,
weighting_scheme: str = "pyramid",
shuffle: bool = True,
) -> None:
r"""
Enable long video generation using FreeNoise.
Args:
context_length (`int`, defaults to `16`, *optional*):
The number of video frames to process at once. It's recommended to set this to the maximum frames the
Motion Adapter was trained with (usually 16/24/32). If `None`, the default value from the motion
adapter config is used.
context_stride (`int`, *optional*):
Long videos are generated by processing many frames. FreeNoise processes these frames in sliding
windows of size `context_length`. Context stride allows you to specify how many frames to skip between
each window. For example, a context length of 16 and context stride of 4 would process 24 frames as:
[0, 15], [4, 19], [8, 23] (0-based indexing)
weighting_scheme (`str`, defaults to `4`):
TODO(aryan)
shuffle (`str`, defaults to `True`):
TODO(aryan): decide if this is even needed
"""
self._free_noise_context_length = context_length or self.motion_adapter.config.motion_max_seq_length
self._free_noise_context_stride = context_stride
self._free_noise_weighting_scheme = weighting_scheme
self._free_noise_shuffle = shuffle
blocks = [*self.unet.down_blocks, self.unet.mid_block, *self.unet.up_blocks]
for block in blocks:
self._enable_free_noise_in_block(block)
def disable_free_noise(self) -> None:
self._free_noise_context_length = None
blocks = [*self.unet.down_blocks, self.unet.mid_block, *self.unet.up_blocks]
for block in blocks:
self._disable_free_noise_in_block(block)
@property
def free_noise_enabled(self):
return hasattr(self, "_free_noise_context_length") and self._free_noise_context_length is not None
+16 -5
View File
@@ -45,6 +45,7 @@ from ...utils import (
from ...utils.torch_utils import randn_tensor
from ...video_processor import VideoProcessor
from ..free_init_utils import FreeInitMixin
from ..free_noise_utils import AnimateDiffFreeNoiseMixin
from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
@@ -131,6 +132,7 @@ class PIAPipeline(
StableDiffusionLoraLoaderMixin,
FromSingleFileMixin,
FreeInitMixin,
AnimateDiffFreeNoiseMixin,
):
r"""
Pipeline for text-to-video generation.
@@ -407,15 +409,21 @@ class PIAPipeline(
return image_embeds, uncond_image_embeds
# Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents
def decode_latents(self, latents):
# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.AnimateDiffPipeline.decode_latents
def decode_latents(self, latents, decode_batch_size: int = 16):
latents = 1 / self.vae.config.scaling_factor * latents
batch_size, channels, num_frames, height, width = latents.shape
latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)
image = self.vae.decode(latents).sample
video = image[None, :].reshape((batch_size, num_frames, -1) + image.shape[2:]).permute(0, 2, 1, 3, 4)
video = []
for i in range(0, latents.shape[0], decode_batch_size):
batch_latents = latents[i : i + decode_batch_size]
batch_latents = self.vae.decode(batch_latents).sample
video.append(batch_latents)
video = torch.cat(video)
video = video[None, :].reshape((batch_size, num_frames, -1) + video.shape[2:]).permute(0, 2, 1, 3, 4)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
video = video.float()
return video
@@ -687,6 +695,7 @@ class PIAPipeline(
clip_skip: Optional[int] = None,
callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
decode_batch_size: int = 16,
):
r"""
The call function to the pipeline for generation.
@@ -763,6 +772,8 @@ class PIAPipeline(
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
`._callback_tensor_inputs` attribute of your pipeline class.
decode_batch_size (`int`, defaults to `16`):
The number of frames to decode at a time when calling `decode_latents` method.
Examples:
@@ -931,7 +942,7 @@ class PIAPipeline(
if output_type == "latent":
video = latents
else:
video_tensor = self.decode_latents(latents)
video_tensor = self.decode_latents(latents, decode_batch_size)
video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)
# 10. Offload all models