add experimental support for num_frames not perfectly fitting context length, context stride

copy animatediff controlnet implementation from #8972
make style
2024-07-27 16:24:36 +02:00 · 2024-07-27 15:34:39 +02:00 · 2024-07-27 15:31:12 +02:00 · 2024-07-27 15:30:42 +02:00 · 2024-07-27 15:29:30 +02:00 · 2024-07-27 15:27:28 +02:00
26 changed files with 1665 additions and 275 deletions
@@ -20,8 +20,7 @@ env:

 jobs:
  test-build-docker-images:
-    runs-on:
-      group: aws-general-8-plus
+    runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
    if: github.event_name == 'pull_request'
    steps:
      - name: Set up Docker Buildx
@@ -51,8 +50,7 @@ jobs:
        if: steps.file_changes.outputs.all != ''

  build-and-push-docker-images:
-    runs-on:
-      group: aws-general-8-plus
+    runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
    if: github.event_name != 'pull_request'

    permissions:
@@ -100,4 +98,4 @@ jobs:
          slack_channel: ${{ env.CI_SLACK_CHANNEL }}
          title: "🤗 Results of the ${{ matrix.image-name }} Docker Image build"
          status: ${{ job.status }}
-          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
+          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
@@ -19,8 +19,7 @@ env:
 jobs:
  setup_torch_cuda_pipeline_matrix:
    name: Setup Torch Pipelines CUDA Slow Tests Matrix
-    runs-on:
-      group: aws-general-8-plus
+    runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
    container:
      image: diffusers/diffusers-pytorch-cpu
    outputs:
@@ -56,8 +55,7 @@ jobs:
      max-parallel: 8
      matrix:
        module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
-    runs-on:
-      group: aws-g4dn-2xlarge
+    runs-on: [single-gpu, nvidia-gpu, t4, ci]
    container:
      image: diffusers/diffusers-pytorch-cuda
      options: --shm-size "16gb" --ipc host --gpus 0
@@ -107,8 +105,7 @@ jobs:

  run_nightly_tests_for_other_torch_modules:
    name: Nightly Torch CUDA Tests
-    runs-on:
-      group: aws-g4dn-2xlarge
+    runs-on: [single-gpu, nvidia-gpu, t4, ci]
    container:
      image: diffusers/diffusers-pytorch-cuda
      options: --shm-size "16gb" --ipc host --gpus 0
@@ -237,8 +234,7 @@ jobs:

  run_nightly_onnx_tests:
    name: Nightly ONNXRuntime CUDA tests on Ubuntu
-    runs-on:
-      group: aws-g4dn-2xlarge
+    runs-on: [single-gpu, nvidia-gpu, t4, ci]
    container:
      image: diffusers/diffusers-onnxruntime-cuda
      options: --gpus 0 --shm-size "16gb" --ipc host
@@ -15,8 +15,7 @@ concurrency:
 jobs:
  setup_pr_tests:
    name: Setup PR Tests
-    runs-on:
-      group: aws-general-8-plus
+    runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
    container:
      image: diffusers/diffusers-pytorch-cpu
      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
@@ -74,8 +73,7 @@ jobs:
      max-parallel: 2
      matrix:
        modules: ${{ fromJson(needs.setup_pr_tests.outputs.matrix) }}
-    runs-on:
-      group: aws-general-8-plus
+    runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
    container:
      image: diffusers/diffusers-pytorch-cpu
      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
@@ -125,13 +123,12 @@ jobs:
        config:
          - name: Hub tests for models, schedulers, and pipelines
            framework: hub_tests_pytorch
-            runner: aws-general-8-plus
+            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
            image: diffusers/diffusers-pytorch-cpu
            report: torch_hub

    name: ${{ matrix.config.name }}
-    runs-on:
-      group: ${{ matrix.config.runner }}
+    runs-on: ${{ matrix.config.runner }}
    container:
      image: ${{ matrix.config.image }}
      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
@@ -71,8 +71,7 @@ jobs:

    name: LoRA - ${{ matrix.lib-versions }}

-    runs-on:
-      group: aws-general-8-plus
+    runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]

    container:
      image: diffusers/diffusers-pytorch-cpu
@@ -129,4 +128,4 @@ jobs:
      uses: actions/upload-artifact@v2
      with:
        name: pr_${{ matrix.config.report }}_test_reports
-        path: reports
+        path: reports
@@ -77,29 +77,28 @@ jobs:
        config:
          - name: Fast PyTorch Pipeline CPU tests
            framework: pytorch_pipelines
-            runner: aws-highmemory-32-plus
+            runner: [ self-hosted, intel-cpu, 32-cpu, 256-ram, ci ]
            image: diffusers/diffusers-pytorch-cpu
            report: torch_cpu_pipelines
          - name: Fast PyTorch Models & Schedulers CPU tests
            framework: pytorch_models
-            runner: aws-general-8-plus
+            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
            image: diffusers/diffusers-pytorch-cpu
            report: torch_cpu_models_schedulers
          - name: Fast Flax CPU tests
            framework: flax
-            runner: aws-general-8-plus
+            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
            image: diffusers/diffusers-flax-cpu
            report: flax_cpu
          - name: PyTorch Example CPU tests
            framework: pytorch_examples
-            runner: aws-general-8-plus
+            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
            image: diffusers/diffusers-pytorch-cpu
            report: torch_example_cpu

    name: ${{ matrix.config.name }}

-    runs-on:
-      group: ${{ matrix.config.runner }}
+    runs-on: ${{ matrix.config.runner }}

    container:
      image: ${{ matrix.config.image }}
@@ -181,8 +180,7 @@ jobs:
        config:
          - name: Hub tests for models, schedulers, and pipelines
            framework: hub_tests_pytorch
-            runner:
-              group: aws-general-8-plus
+            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
            image: diffusers/diffusers-pytorch-cpu
            report: torch_hub

@@ -19,8 +19,7 @@ env:
 jobs:
  setup_torch_cuda_pipeline_matrix:
    name: Setup Torch Pipelines CUDA Slow Tests Matrix
-    runs-on:
-      group: aws-general-8-plus
+    runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
    container:
      image: diffusers/diffusers-pytorch-cpu
    outputs:
@@ -58,8 +57,7 @@ jobs:
      max-parallel: 8
      matrix:
        module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
-    runs-on:
-      group: aws-g4dn-2xlarge
+    runs-on: [single-gpu, nvidia-gpu, t4, ci]
    container:
      image: diffusers/diffusers-pytorch-cuda
      options: --shm-size "16gb" --ipc host --gpus 0
@@ -103,8 +101,7 @@ jobs:

  torch_cuda_tests:
    name: Torch CUDA Tests
-    runs-on:
-      group: aws-g4dn-2xlarge
+    runs-on: [single-gpu, nvidia-gpu, t4, ci]
    container:
      image: diffusers/diffusers-pytorch-cuda
      options: --shm-size "16gb" --ipc host --gpus 0
@@ -204,8 +201,7 @@ jobs:

  onnx_cuda_tests:
    name: ONNX CUDA Tests
-    runs-on:
-      group: aws-g4dn-2xlarge
+    runs-on: [single-gpu, nvidia-gpu, t4, ci]
    container:
      image: diffusers/diffusers-onnxruntime-cuda
      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --gpus 0
@@ -253,8 +249,7 @@ jobs:
  run_torch_compile_tests:
    name: PyTorch Compile CUDA tests

-    runs-on:
-      group: aws-g4dn-2xlarge
+    runs-on: [single-gpu, nvidia-gpu, t4, ci]

    container:
      image: diffusers/diffusers-pytorch-compile-cuda
@@ -296,8 +291,7 @@ jobs:
  run_xformers_tests:
    name: PyTorch xformers CUDA tests

-    runs-on:
-      group: aws-g4dn-2xlarge
+    runs-on: [single-gpu, nvidia-gpu, t4, ci]

    container:
      image: diffusers/diffusers-pytorch-xformers-cuda
@@ -338,8 +332,7 @@ jobs:
  run_examples_tests:
    name: Examples PyTorch CUDA tests on Ubuntu

-    runs-on:
-      group: aws-g4dn-2xlarge
+    runs-on: [single-gpu, nvidia-gpu, t4, ci]

    container:
      image: diffusers/diffusers-pytorch-cuda
@@ -29,29 +29,28 @@ jobs:
        config:
          - name: Fast PyTorch CPU tests on Ubuntu
            framework: pytorch
-            runner: aws-general-8-plus
+            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
            image: diffusers/diffusers-pytorch-cpu
            report: torch_cpu
          - name: Fast Flax CPU tests on Ubuntu
            framework: flax
-            runner: aws-general-8-plus
+            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
            image: diffusers/diffusers-flax-cpu
            report: flax_cpu
          - name: Fast ONNXRuntime CPU tests on Ubuntu
            framework: onnxruntime
-            runner: aws-general-8-plus
+            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
            image: diffusers/diffusers-onnxruntime-cpu
            report: onnx_cpu
          - name: PyTorch Example CPU tests on Ubuntu
            framework: pytorch_examples
-            runner: aws-general-8-plus
+            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
            image: diffusers/diffusers-pytorch-cpu
            report: torch_example_cpu

    name: ${{ matrix.config.name }}

-    runs-on:
-      group: ${{ matrix.config.runner }}
+    runs-on: ${{ matrix.config.runner }}

    container:
      image: ${{ matrix.config.image }}
@@ -26,8 +26,7 @@ env:
 jobs:
  run_tests:
    name: "Run a test on our runner from a PR"
-    runs-on:
-      group: aws-g4dn-2xlarge
+    runs-on: [single-gpu, nvidia-gpu, t4, ci]
    container:
      image: ${{ github.event.inputs.docker_image }}
      options: --gpus 0 --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -71,4 +70,4 @@ jobs:
        env:
            PY_TEST: ${{ github.event.inputs.test }}
        run: |
-          pytest "$PY_TEST"
+          pytest "$PY_TEST"
@@ -19,8 +19,7 @@ env:
 jobs:
  ssh_runner:
    name: "SSH"
-    runs-on:
-      group: aws-highmemory-32-plus
+    runs-on: [self-hosted, intel-cpu, 32-cpu, 256-ram, ci]
    container:
      image: ${{ github.event.inputs.docker_image }}
      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --privileged
@@ -22,8 +22,7 @@ env:
 jobs:
  ssh_runner:
    name: "SSH"
-    runs-on:
-      group: "${{ github.event.inputs.runner_type }}"
+    runs-on: [single-gpu, nvidia-gpu, "${{ github.event.inputs.runner_type }}", ci]
    container:
      image: ${{ github.event.inputs.docker_image }}
      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0 --privileged
@@ -190,10 +190,6 @@
  - local: conceptual/evaluation
    title: Evaluating Diffusion Models
  title: Conceptual Guides
- sections:
-  - local: community_projects
-    title: Projects built with Diffusers
-  title: Community Projects
 - sections:
  - isExpanded: false
    sections:
@@ -24,8 +24,6 @@ The abstract from the paper is:

 **Highlights**: Latte is a latent diffusion transformer proposed as a backbone for modeling different modalities (trained for text-to-video generation here). It achieves state-of-the-art performance across four standard video benchmarks - [FaceForensics](https://arxiv.org/abs/1803.09179), [SkyTimelapse](https://arxiv.org/abs/1709.07592), [UCF101](https://arxiv.org/abs/1212.0402) and [Taichi-HD](https://arxiv.org/abs/2003.00196). To prepare and download the datasets for evaluation, please refer to [this https URL](https://github.com/Vchitect/Latte/blob/main/docs/datasets_evaluation.md).

-This pipeline was contributed by [maxin-cn](https://github.com/maxin-cn). The original codebase can be found [here](https://github.com/Vchitect/Latte). The original weights can be found under [hf.co/maxin-cn](https://huggingface.co/maxin-cn).
-
 <Tip>

 Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
@@ -43,8 +43,6 @@ Lumina-T2X has the following components:
 * It uses a Flow-based Large Diffusion Transformer as the backbone
 * It supports any modality with one backbone and the corresponding encoder and decoder.

-This pipeline was contributed by [PommesPeter](https://github.com/PommesPeter). The original codebase can be found [here](https://github.com/Alpha-VLLM/Lumina-T2X). The original weights can be found under [hf.co/Alpha-VLLM](https://huggingface.co/Alpha-VLLM).
-
 <Tip>

 Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
@@ -1,78 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Community Projects
-
-Welcome to Community Projects. This space is dedicated to showcasing the incredible work and innovative applications created by our vibrant community using the `diffusers` library.
-
-This section aims to:
-
- Highlight diverse and inspiring projects built with `diffusers`
- Foster knowledge sharing within our community
- Provide real-world examples of how `diffusers` can be leveraged
-
-Happy exploring, and thank you for being part of the Diffusers community!
-
-<table>
-    <tr>
-        <th>Project Name</th>
-        <th>Description</th>
-    </tr>
-  <tr style="border-top: 2px solid black">
-    <td><a href="https://github.com/carson-katri/dream-textures"> dream-textures </a></td>
-    <td>Stable Diffusion built-in to Blender</td>
-  </tr>
-  <tr style="border-top: 2px solid black">
-    <td><a href="https://github.com/megvii-research/HiDiffusion"> HiDiffusion </a></td>
-    <td>Increases the resolution and speed of your diffusion model by only adding a single line of code</td>
-  </tr>
-  <tr style="border-top: 2px solid black">
-    <td><a href="https://github.com/lllyasviel/IC-Light"> IC-Light </a></td>
-    <td>IC-Light is a project to manipulate the illumination of images</td>
-  </tr>
-  <tr style="border-top: 2px solid black">
-    <td><a href="https://github.com/InstantID/InstantID"> InstantID </a></td>
-    <td>InstantID : Zero-shot Identity-Preserving Generation in Seconds</td>
-  </tr>
-  <tr style="border-top: 2px solid black">
-    <td><a href="https://github.com/Sanster/IOPaint"> IOPaint </a></td>
-    <td>Image inpainting tool powered by SOTA AI Model. Remove any unwanted object, defect, people from your pictures or erase and replace(powered by stable diffusion) any thing on your pictures.</td>
-  </tr>
-  <tr style="border-top: 2px solid black">
-    <td><a href="https://github.com/bmaltais/kohya_ss"> Kohya </a></td>
-    <td>Gradio GUI for Kohya's Stable Diffusion trainers</td>
-  </tr>
-  <tr style="border-top: 2px solid black">
-    <td><a href="https://github.com/magic-research/magic-animate"> MagicAnimate </a></td>
-    <td>MagicAnimate: Temporally Consistent Human Image Animation using Diffusion Model</td>
-  </tr>
-  <tr style="border-top: 2px solid black">
-    <td><a href="https://github.com/levihsu/OOTDiffusion"> OOTDiffusion </a></td>
-    <td>Outfitting Fusion based Latent Diffusion for Controllable Virtual Try-on</td>
-  </tr>
-  <tr style="border-top: 2px solid black">
-    <td><a href="https://github.com/vladmandic/automatic"> SD.Next </a></td>
-    <td>SD.Next: Advanced Implementation of Stable Diffusion and other Diffusion-based generative image models</td>
-  </tr>
-  <tr style="border-top: 2px solid black">
-    <td><a href="https://github.com/ashawkey/stable-dreamfusion"> stable-dreamfusion </a></td>
-    <td>Text-to-3D & Image-to-3D & Mesh Exportation with NeRF + Diffusion</td>
-  </tr>
-  <tr style="border-top: 2px solid black">
-    <td><a href="https://github.com/HVision-NKU/StoryDiffusion"> StoryDiffusion </a></td>
-    <td>StoryDiffusion can create a magic story by generating consistent images and videos.</td>
-  </tr>
-  <tr style="border-top: 2px solid black">
-    <td><a href="https://github.com/cumulo-autumn/StreamDiffusion"> StreamDiffusion </a></td>
-    <td>A Pipeline-Level Solution for Real-Time Interactive Generation</td>
-  </tr>
-</table>
@@ -13,17 +13,13 @@ from diffusers.configuration_utils import FrozenDict
 from diffusers.image_processor import VaeImageProcessor
 from diffusers.loaders import FromSingleFileMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
-from diffusers.models.lora import adjust_lora_scale_text_encoder
 from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
 from diffusers.schedulers import KarrasDiffusionSchedulers
 from diffusers.utils import (
    PIL_INTERPOLATION,
-    USE_PEFT_BACKEND,
    deprecate,
    logging,
-    scale_lora_layers,
-    unscale_lora_layers,
 )
 from diffusers.utils.torch_utils import randn_tensor

@@ -203,7 +199,6 @@ def get_unweighted_text_embeddings(
    text_input: torch.Tensor,
    chunk_length: int,
    no_boseos_middle: Optional[bool] = True,
-    clip_skip: Optional[int] = None,
 ):
    """
    When the length of tokens is a multiple of the capacity of the text encoder,
@@ -219,20 +214,7 @@ def get_unweighted_text_embeddings(
            # cover the head and the tail by the starting and the ending tokens
            text_input_chunk[:, 0] = text_input[0, 0]
            text_input_chunk[:, -1] = text_input[0, -1]
-            if clip_skip is None:
-                prompt_embeds = pipe.text_encoder(text_input_chunk.to(pipe.device))
-                text_embedding = prompt_embeds[0]
-            else:
-                prompt_embeds = pipe.text_encoder(text_input_chunk.to(pipe.device), output_hidden_states=True)
-                # Access the `hidden_states` first, that contains a tuple of
-                # all the hidden states from the encoder layers. Then index into
-                # the tuple to access the hidden states from the desired layer.
-                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
-                # We also need to apply the final LayerNorm here to not mess with the
-                # representations. The `last_hidden_states` that we typically use for
-                # obtaining the final prompt representations passes through the LayerNorm
-                # layer.
-                text_embedding = pipe.text_encoder.text_model.final_layer_norm(prompt_embeds)
+            text_embedding = pipe.text_encoder(text_input_chunk)[0]

            if no_boseos_middle:
                if i == 0:
@@ -248,10 +230,7 @@ def get_unweighted_text_embeddings(
            text_embeddings.append(text_embedding)
        text_embeddings = torch.concat(text_embeddings, axis=1)
    else:
-        if clip_skip is None:
-            clip_skip = 0
-        prompt_embeds = pipe.text_encoder(text_input, output_hidden_states=True)[-1][-(clip_skip + 1)]
-        text_embeddings = pipe.text_encoder.text_model.final_layer_norm(prompt_embeds)
+        text_embeddings = pipe.text_encoder(text_input)[0]
    return text_embeddings


@@ -263,8 +242,6 @@ def get_weighted_text_embeddings(
    no_boseos_middle: Optional[bool] = False,
    skip_parsing: Optional[bool] = False,
    skip_weighting: Optional[bool] = False,
-    clip_skip=None,
-    lora_scale=None,
 ):
    r"""
    Prompts can be assigned with local weights using brackets. For example,
@@ -291,16 +268,6 @@ def get_weighted_text_embeddings(
        skip_weighting (`bool`, *optional*, defaults to `False`):
            Skip the weighting. When the parsing is skipped, it is forced True.
    """
-    # set lora scale so that monkey patched LoRA
-    # function of text encoder can correctly access it
-    if lora_scale is not None and isinstance(pipe, StableDiffusionLoraLoaderMixin):
-        pipe._lora_scale = lora_scale
-
-        # dynamically adjust the LoRA scale
-        if not USE_PEFT_BACKEND:
-            adjust_lora_scale_text_encoder(pipe.text_encoder, lora_scale)
-        else:
-            scale_lora_layers(pipe.text_encoder, lora_scale)
    max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
    if isinstance(prompt, str):
        prompt = [prompt]
@@ -367,7 +334,10 @@ def get_weighted_text_embeddings(

    # get the embeddings
    text_embeddings = get_unweighted_text_embeddings(
-        pipe, prompt_tokens, pipe.tokenizer.model_max_length, no_boseos_middle=no_boseos_middle, clip_skip=clip_skip
+        pipe,
+        prompt_tokens,
+        pipe.tokenizer.model_max_length,
+        no_boseos_middle=no_boseos_middle,
    )
    prompt_weights = torch.tensor(prompt_weights, dtype=text_embeddings.dtype, device=text_embeddings.device)
    if uncond_prompt is not None:
@@ -376,7 +346,6 @@ def get_weighted_text_embeddings(
            uncond_tokens,
            pipe.tokenizer.model_max_length,
            no_boseos_middle=no_boseos_middle,
-            clip_skip=clip_skip,
        )
        uncond_weights = torch.tensor(uncond_weights, dtype=uncond_embeddings.dtype, device=uncond_embeddings.device)

@@ -393,11 +362,6 @@ def get_weighted_text_embeddings(
            current_mean = uncond_embeddings.float().mean(axis=[-2, -1]).to(uncond_embeddings.dtype)
            uncond_embeddings *= (previous_mean / current_mean).unsqueeze(-1).unsqueeze(-1)

-    if pipe.text_encoder is not None:
-        if isinstance(pipe, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(pipe.text_encoder, lora_scale)
-
    if uncond_prompt is not None:
        return text_embeddings, uncond_embeddings
    return text_embeddings, None
@@ -585,8 +549,6 @@ class StableDiffusionLongPromptWeightingPipeline(
        max_embeddings_multiples=3,
        prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
-        clip_skip: Optional[int] = None,
-        lora_scale: Optional[float] = None,
    ):
        r"""
        Encodes the prompt into text encoder hidden states.
@@ -635,8 +597,6 @@ class StableDiffusionLongPromptWeightingPipeline(
                prompt=prompt,
                uncond_prompt=negative_prompt if do_classifier_free_guidance else None,
                max_embeddings_multiples=max_embeddings_multiples,
-                clip_skip=clip_skip,
-                lora_scale=lora_scale,
            )
            if prompt_embeds is None:
                prompt_embeds = prompt_embeds1
@@ -830,7 +790,6 @@ class StableDiffusionLongPromptWeightingPipeline(
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
        is_cancelled_callback: Optional[Callable[[], bool]] = None,
-        clip_skip: Optional[int] = None,
        callback_steps: int = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    ):
@@ -906,9 +865,6 @@ class StableDiffusionLongPromptWeightingPipeline(
            is_cancelled_callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. If the function returns
                `True`, the inference will be cancelled.
-            clip_skip (`int`, *optional*):
-                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
-                the output of the pre-final layer will be used for computing the prompt embeddings.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.
@@ -947,7 +903,6 @@ class StableDiffusionLongPromptWeightingPipeline(
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0
-        lora_scale = cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None

        # 3. Encode input prompt
        prompt_embeds = self._encode_prompt(
@@ -959,8 +914,6 @@ class StableDiffusionLongPromptWeightingPipeline(
            max_embeddings_multiples,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
-            clip_skip=clip_skip,
-            lora_scale=lora_scale,
        )
        dtype = prompt_embeds.dtype

@@ -1091,7 +1044,6 @@ class StableDiffusionLongPromptWeightingPipeline(
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
        is_cancelled_callback: Optional[Callable[[], bool]] = None,
-        clip_skip=None,
        callback_steps: int = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    ):
@@ -1149,9 +1101,6 @@ class StableDiffusionLongPromptWeightingPipeline(
            is_cancelled_callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. If the function returns
                `True`, the inference will be cancelled.
-            clip_skip (`int`, *optional*):
-                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
-                the output of the pre-final layer will be used for computing the prompt embeddings.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.
@@ -1186,7 +1135,6 @@ class StableDiffusionLongPromptWeightingPipeline(
            return_dict=return_dict,
            callback=callback,
            is_cancelled_callback=is_cancelled_callback,
-            clip_skip=clip_skip,
            callback_steps=callback_steps,
            cross_attention_kwargs=cross_attention_kwargs,
        )
@@ -25,25 +25,21 @@ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
 from diffusers.loaders import (
    FromSingleFileMixin,
    IPAdapterMixin,
-    StableDiffusionXLLoraLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
    TextualInversionLoaderMixin,
 )
 from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel
 from diffusers.models.attention_processor import AttnProcessor2_0, XFormersAttnProcessor
-from diffusers.models.lora import adjust_lora_scale_text_encoder
 from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
 from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
 from diffusers.schedulers import KarrasDiffusionSchedulers
 from diffusers.utils import (
-    USE_PEFT_BACKEND,
    deprecate,
    is_accelerate_available,
    is_accelerate_version,
    is_invisible_watermark_available,
    logging,
    replace_example_docstring,
-    scale_lora_layers,
-    unscale_lora_layers,
 )
 from diffusers.utils.torch_utils import randn_tensor

@@ -265,7 +261,6 @@ def get_weighted_text_embeddings_sdxl(
    num_images_per_prompt: int = 1,
    device: Optional[torch.device] = None,
    clip_skip: Optional[int] = None,
-    lora_scale: Optional[int] = None,
 ):
    """
    This function can process long prompt with weights, no length limitation
@@ -286,24 +281,6 @@ def get_weighted_text_embeddings_sdxl(
    """
    device = device or pipe._execution_device

-    # set lora scale so that monkey patched LoRA
-    # function of text encoder can correctly access it
-    if lora_scale is not None and isinstance(pipe, StableDiffusionXLLoraLoaderMixin):
-        pipe._lora_scale = lora_scale
-
-        # dynamically adjust the LoRA scale
-        if pipe.text_encoder is not None:
-            if not USE_PEFT_BACKEND:
-                adjust_lora_scale_text_encoder(pipe.text_encoder, lora_scale)
-            else:
-                scale_lora_layers(pipe.text_encoder, lora_scale)
-
-        if pipe.text_encoder_2 is not None:
-            if not USE_PEFT_BACKEND:
-                adjust_lora_scale_text_encoder(pipe.text_encoder_2, lora_scale)
-            else:
-                scale_lora_layers(pipe.text_encoder_2, lora_scale)
-
    if prompt_2:
        prompt = f"{prompt} {prompt_2}"

@@ -452,16 +429,6 @@ def get_weighted_text_embeddings_sdxl(
        bs_embed * num_images_per_prompt, -1
    )

-    if pipe.text_encoder is not None:
-        if isinstance(pipe, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(pipe.text_encoder, lora_scale)
-
-    if pipe.text_encoder_2 is not None:
-        if isinstance(pipe, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(pipe.text_encoder_2, lora_scale)
-
    return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds


@@ -582,7 +549,7 @@ class SDXLLongPromptWeightingPipeline(
    StableDiffusionMixin,
    FromSingleFileMixin,
    IPAdapterMixin,
-    StableDiffusionXLLoraLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
    TextualInversionLoaderMixin,
 ):
    r"""
@@ -594,8 +561,8 @@ class SDXLLongPromptWeightingPipeline(
    The pipeline also inherits the following loading methods:
        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
-        - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-        - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings

    Args:
@@ -776,7 +743,7 @@ class SDXLLongPromptWeightingPipeline(

        # set lora scale so that monkey patched LoRA
        # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
            self._lora_scale = lora_scale

        if prompt is not None and isinstance(prompt, str):
@@ -1645,9 +1612,7 @@ class SDXLLongPromptWeightingPipeline(
                image_embeds = torch.cat([negative_image_embeds, image_embeds])

        # 3. Encode input prompt
-        lora_scale = (
-            self._cross_attention_kwargs.get("scale", None) if self._cross_attention_kwargs is not None else None
-        )
+        (self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None)

        negative_prompt = negative_prompt if negative_prompt is not None else ""

@@ -1662,7 +1627,6 @@ class SDXLLongPromptWeightingPipeline(
            neg_prompt=negative_prompt,
            num_images_per_prompt=num_images_per_prompt,
            clip_skip=clip_skip,
-            lora_scale=lora_scale,
        )
        dtype = prompt_embeds.dtype

@@ -30,7 +30,6 @@ from .unet_loader_utils import _maybe_expand_lora_scales

 _SET_ADAPTER_SCALE_FN_MAPPING = {
    "UNet2DConditionModel": _maybe_expand_lora_scales,
-    "UNetMotionModel": _maybe_expand_lora_scales,
    "SD3Transformer2DModel": lambda model_cls, weights: weights,
 }

@@ -272,6 +272,17 @@ class BasicTransformerBlock(nn.Module):
        attention_out_bias: bool = True,
    ):
        super().__init__()
+        self.dim = dim
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_dim = attention_head_dim
+        self.dropout = dropout
+        self.cross_attention_dim = cross_attention_dim
+        self.activation_fn = activation_fn
+        self.attention_bias = attention_bias
+        self.double_self_attention = double_self_attention
+        self.norm_elementwise_affine = norm_elementwise_affine
+        self.positional_embeddings = positional_embeddings
+        self.num_positional_embeddings = num_positional_embeddings
        self.only_cross_attention = only_cross_attention

        # We keep these boolean flags for backward-compatibility.
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Any, Dict, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union

 import torch
 import torch.nn as nn
@@ -19,8 +19,10 @@ import torch.nn.functional as F
 import torch.utils.checkpoint

 from ...configuration_utils import ConfigMixin, FrozenDict, register_to_config
-from ...loaders import FromOriginalModelMixin, PeftAdapterMixin, UNet2DConditionLoadersMixin
+from ...loaders import FromOriginalModelMixin, UNet2DConditionLoadersMixin
 from ...utils import logging
+from ...utils.torch_utils import maybe_allow_in_graph
+from ..attention import FeedForward, _chunked_feed_forward
 from ..attention_processor import (
    ADDED_KV_ATTENTION_PROCESSORS,
    CROSS_ATTENTION_PROCESSORS,
@@ -33,7 +35,7 @@ from ..attention_processor import (
    IPAdapterAttnProcessor,
    IPAdapterAttnProcessor2_0,
 )
-from ..embeddings import TimestepEmbedding, Timesteps
+from ..embeddings import SinusoidalPositionalEmbedding, TimestepEmbedding, Timesteps
 from ..modeling_utils import ModelMixin
 from ..transformers.transformer_temporal import TransformerTemporalModel
 from .unet_2d_blocks import UNetMidBlock2DCrossAttn
@@ -53,6 +55,348 @@ from .unet_3d_condition import UNet3DConditionOutput
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


+@maybe_allow_in_graph
+class FreeNoiseTransformerBlock(nn.Module):
+    r"""
+    A FreeNoise Transformer block.
+
+    Attention is computed on sliding windows of `context_length` frames that start every `context_stride` frames.
+    The window outputs are blended with per-frame weights before a single feed-forward pass over all frames.
+
+    Parameters:
+        dim (`int`): The number of channels in the input and output.
+        num_attention_heads (`int`): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`): The number of channels in each head.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
+        num_embeds_ada_norm (`int`, *optional*):
+            The number of diffusion steps used during training. See `Transformer2DModel`.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Configure if the attentions should contain a bias parameter.
+        only_cross_attention (`bool`, *optional*):
+            Whether to use only cross-attention layers. In this case two cross attention layers are used.
+        double_self_attention (`bool`, *optional*):
+            Whether to use two self-attention layers. In this case no cross attention layers are used.
+        upcast_attention (`bool`, *optional*):
+            Whether to upcast the attention computation to float32. This is useful for mixed precision training.
+        norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
+            Whether to use learnable elementwise affine parameters for normalization.
+        norm_type (`str`, *optional*, defaults to `"layer_norm"`):
+            The normalization type. It is validated and stored, but every normalization layer created by this
+            block is an `nn.LayerNorm`.
+        norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon passed to the normalization layers.
+        final_dropout (`bool` *optional*, defaults to False):
+            Whether to apply a final dropout after the last feed-forward layer.
+        positional_embeddings (`str`, *optional*, defaults to `None`):
+            The type of positional embeddings to apply to.
+        num_positional_embeddings (`int`, *optional*, defaults to `None`):
+            The maximum number of positional embeddings to apply.
+        ff_inner_dim (`int`, *optional*): Passed to the feed-forward layer as `inner_dim`.
+        ff_bias (`bool`, *optional*, defaults to `True`): Passed to the feed-forward layer as `bias`.
+        attention_out_bias (`bool`, *optional*, defaults to `True`): Passed to the attention layers as `out_bias`.
+        context_length (`int`, *optional*, defaults to 16): The number of frames in each attention window.
+        context_stride (`int`, *optional*, defaults to 4): The number of frames between consecutive window starts.
+        weighting_scheme (`str`, *optional*, defaults to `"pyramid"`):
+            How frames inside a window are weighted when windows are blended. Only `"pyramid"` is supported.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        dropout: float = 0.0,
+        cross_attention_dim: Optional[int] = None,
+        activation_fn: str = "geglu",
+        num_embeds_ada_norm: Optional[int] = None,
+        attention_bias: bool = False,
+        only_cross_attention: bool = False,
+        double_self_attention: bool = False,
+        upcast_attention: bool = False,
+        norm_elementwise_affine: bool = True,
+        norm_type: str = "layer_norm",
+        norm_eps: float = 1e-5,
+        final_dropout: bool = False,
+        positional_embeddings: Optional[str] = None,
+        num_positional_embeddings: Optional[int] = None,
+        ff_inner_dim: Optional[int] = None,
+        ff_bias: bool = True,
+        attention_out_bias: bool = True,
+        context_length: int = 16,
+        context_stride: int = 4,
+        weighting_scheme: str = "pyramid",
+    ):
+        super().__init__()
+        self.dim = dim
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_dim = attention_head_dim
+        self.dropout = dropout
+        self.cross_attention_dim = cross_attention_dim
+        self.activation_fn = activation_fn
+        self.attention_bias = attention_bias
+        self.double_self_attention = double_self_attention
+        self.norm_elementwise_affine = norm_elementwise_affine
+        self.positional_embeddings = positional_embeddings
+        self.num_positional_embeddings = num_positional_embeddings
+        self.only_cross_attention = only_cross_attention
+
+        self.set_free_noise_properties(context_length, context_stride, weighting_scheme)
+
+        # We keep these boolean flags for backward-compatibility.
+        self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
+        self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
+        self.use_ada_layer_norm_single = norm_type == "ada_norm_single"
+        self.use_layer_norm = norm_type == "layer_norm"
+        self.use_ada_layer_norm_continuous = norm_type == "ada_norm_continuous"
+
+        if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
+            raise ValueError(
+                f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
+                f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
+            )
+
+        self.norm_type = norm_type
+        self.num_embeds_ada_norm = num_embeds_ada_norm
+
+        if positional_embeddings and (num_positional_embeddings is None):
+            raise ValueError(
+                "If `positional_embeddings` type is defined, `num_positional_embeddings` must also be defined."
+            )
+
+        if positional_embeddings == "sinusoidal":
+            self.pos_embed = SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings)
+        else:
+            self.pos_embed = None
+
+        # Define 3 blocks. Each block has its own normalization layer.
+        # 1. Self-Attn
+        self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
+
+        self.attn1 = Attention(
+            query_dim=dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            dropout=dropout,
+            bias=attention_bias,
+            cross_attention_dim=cross_attention_dim if only_cross_attention else None,
+            upcast_attention=upcast_attention,
+            out_bias=attention_out_bias,
+        )
+
+        # 2. Cross-Attn
+        if cross_attention_dim is not None or double_self_attention:
+            self.norm2 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
+
+            self.attn2 = Attention(
+                query_dim=dim,
+                cross_attention_dim=cross_attention_dim if not double_self_attention else None,
+                heads=num_attention_heads,
+                dim_head=attention_head_dim,
+                dropout=dropout,
+                bias=attention_bias,
+                upcast_attention=upcast_attention,
+                out_bias=attention_out_bias,
+            )  # is self-attn if encoder_hidden_states is none
+        else:
+            # Always define both attributes so that `forward` can test `self.attn2` without an AttributeError.
+            self.norm2 = None
+            self.attn2 = None
+
+        # 3. Feed-forward
+        self.ff = FeedForward(
+            dim,
+            dropout=dropout,
+            activation_fn=activation_fn,
+            final_dropout=final_dropout,
+            inner_dim=ff_inner_dim,
+            bias=ff_bias,
+        )
+
+        self.norm3 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
+
+        # let chunk size default to None
+        self._chunk_size = None
+        self._chunk_dim = 0
+
+    def _get_frame_indices(self, num_frames: int) -> List[Tuple[int, int]]:
+        r"""
+        Return the `(start, end)` frame ranges of all full windows that fit into `num_frames` frames.
+
+        Window starts advance by `self.context_stride`. The list is empty when `num_frames` is smaller than
+        `self.context_length`, and the last window may end before the final frame.
+        """
+        frame_indices = []
+        for i in range(0, num_frames - self.context_length + 1, self.context_stride):
+            window_start = i
+            window_end = min(num_frames, i + self.context_length)
+            frame_indices.append((window_start, window_end))
+
+        return frame_indices
+
+    def _get_frame_weights(self, num_frames: int, weighting_scheme: str = "pyramid") -> List[float]:
+        r"""
+        Return one blending weight per frame of a window that is `num_frames` frames long.
+
+        Raises:
+            ValueError: If `weighting_scheme` is not `"pyramid"`.
+        """
+        if weighting_scheme != "pyramid":
+            raise ValueError(f"Unsupported value for weighting_scheme={weighting_scheme}")
+
+        # Weights rise linearly towards the middle of the window and fall off symmetrically:
+        #   num_frames = 4 => [1, 2, 2, 1]
+        #   num_frames = 5 => [1, 2, 3, 2, 1]
+        rising = list(range(1, num_frames // 2 + 1))
+        peak = [num_frames // 2 + 1] if num_frames % 2 == 1 else []
+        return rising + peak + rising[::-1]
+
+    def set_free_noise_properties(
+        self, context_length: int, context_stride: int, weighting_scheme: str = "pyramid"
+    ) -> None:
+        r"""Store the window length, window stride and weighting scheme that `forward` uses."""
+        self.context_length = context_length
+        self.context_stride = context_stride
+        self.weighting_scheme = weighting_scheme
+
+    def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0) -> None:
+        r"""Set the chunk size and chunk dimension used for the chunked feed-forward path in `forward`."""
+        self._chunk_size = chunk_size
+        self._chunk_dim = dim
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        *args,
+        **kwargs,
+    ) -> torch.Tensor:
+        r"""
+        Apply the attention layers window by window along the frame axis, blend the windows, then feed-forward.
+
+        Args:
+            hidden_states (`torch.Tensor`): The input; dimension 1 is read as the frame axis.
+            attention_mask (`torch.Tensor`, *optional*): Passed to the first attention layer.
+            encoder_hidden_states (`torch.Tensor`, *optional*):
+                Passed to the second attention layer, and to the first one if `only_cross_attention` is set.
+            encoder_attention_mask (`torch.Tensor`, *optional*): Passed to the second attention layer.
+            cross_attention_kwargs (`dict`, *optional*):
+                Keyword arguments forwarded to both attention layers. A non-`None` `scale` entry logs a
+                deprecation warning.
+
+        Returns:
+            `torch.Tensor`: The output of the block.
+
+        Raises:
+            ValueError: If the input has fewer frames than `context_length`.
+        """
+        if cross_attention_kwargs is not None:
+            if cross_attention_kwargs.get("scale", None) is not None:
+                logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
+
+        cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
+
+        # hidden_states: [B x H x W, F, C]
+        device = hidden_states.device
+        dtype = hidden_states.dtype
+
+        num_frames = hidden_states.size(1)
+        # Validate before building windows: with fewer frames than one window, `_get_frame_indices` returns an
+        # empty list and reading its last element below would raise an unrelated IndexError.
+        if num_frames < self.context_length:
+            raise ValueError(f"Expected {num_frames=} to be greater or equal than {self.context_length=}")
+
+        frame_indices = self._get_frame_indices(num_frames)
+        frame_weights = self._get_frame_weights(self.context_length, self.weighting_scheme)
+        frame_weights = torch.tensor(frame_weights, device=device, dtype=dtype).unsqueeze(0).unsqueeze(-1)
+        is_last_frame_batch_complete = frame_indices[-1][1] == num_frames
+
+        # The strided windows may stop short of the final frame. In that case one extra window aligned to the
+        # end of the sequence is appended, and only its trailing, not-yet-covered frames are accumulated.
+        # For example, num_frames=25, context_length=16, context_stride=4 gives the ranges:
+        #    [(0, 16), (4, 20), (8, 24), (9, 25)]
+        if not is_last_frame_batch_complete:
+            last_frame_batch_length = num_frames - frame_indices[-1][1]
+            frame_indices.append((num_frames - self.context_length, num_frames))
+
+        num_times_accumulated = torch.zeros((1, num_frames, 1), device=device)
+        accumulated_values = torch.zeros_like(hidden_states)
+
+        for i, (frame_start, frame_end) in enumerate(frame_indices):
+            # Every window spans exactly `context_length` frames, so the slice of the counter is only used to
+            # get a tensor of ones with the right shape and device for the per-frame window weights.
+            weights = torch.ones_like(num_times_accumulated[:, frame_start:frame_end])
+            weights *= frame_weights
+
+            hidden_states_chunk = hidden_states[:, frame_start:frame_end]
+
+            # Notice that normalization is always applied before the real computation in the following blocks.
+            # 1. Self-Attention
+            # assert self.norm_type == "layer_norm"
+            norm_hidden_states = self.norm1(hidden_states_chunk)
+
+            if self.pos_embed is not None:
+                norm_hidden_states = self.pos_embed(norm_hidden_states)
+
+            attn_output = self.attn1(
+                norm_hidden_states,
+                encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
+                attention_mask=attention_mask,
+                **cross_attention_kwargs,
+            )
+
+            hidden_states_chunk = attn_output + hidden_states_chunk
+            if hidden_states_chunk.ndim == 4:
+                hidden_states_chunk = hidden_states_chunk.squeeze(1)
+
+            # 2. Cross-Attention
+            if self.attn2 is not None:
+                norm_hidden_states = self.norm2(hidden_states_chunk)
+
+                if self.pos_embed is not None and self.norm_type != "ada_norm_single":
+                    norm_hidden_states = self.pos_embed(norm_hidden_states)
+
+                attn_output = self.attn2(
+                    norm_hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=encoder_attention_mask,
+                    **cross_attention_kwargs,
+                )
+                hidden_states_chunk = attn_output + hidden_states_chunk
+
+            if i == len(frame_indices) - 1 and not is_last_frame_batch_complete:
+                # Only the trailing frames that no earlier window covered are taken from the extra window.
+                accumulated_values[:, -last_frame_batch_length:] += (
+                    hidden_states_chunk[:, -last_frame_batch_length:] * weights[:, -last_frame_batch_length:]
+                )
+                num_times_accumulated[:, -last_frame_batch_length:] += weights[:, -last_frame_batch_length:]
+            else:
+                accumulated_values[:, frame_start:frame_end] += hidden_states_chunk * weights
+                num_times_accumulated[:, frame_start:frame_end] += weights
+
+        hidden_states = torch.where(
+            num_times_accumulated > 0, accumulated_values / num_times_accumulated, accumulated_values
+        ).to(dtype)
+
+        # 3. Feed-forward
+        norm_hidden_states = self.norm3(hidden_states)
+
+        if self._chunk_size is not None:
+            # "feed_forward_chunk_size" can be used to save memory
+            ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
+        else:
+            ff_output = self.ff(norm_hidden_states)
+
+        hidden_states = ff_output + hidden_states
+        if hidden_states.ndim == 4:
+            hidden_states = hidden_states.squeeze(1)
+
+        return hidden_states
+
+
 class MotionModules(nn.Module):
     def __init__(
         self,
@@ -231,7 +529,7 @@ class MotionAdapter(ModelMixin, ConfigMixin, FromOriginalModelMixin):
        pass


-class UNetMotionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin, PeftAdapterMixin):
+class UNetMotionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
    r"""
    A modified conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a
    sample shaped output.
@@ -42,6 +42,7 @@ from ...utils import (
 from ...utils.torch_utils import randn_tensor
 from ...video_processor import VideoProcessor
 from ..free_init_utils import FreeInitMixin
+from ..free_noise_utils import AnimateDiffFreeNoiseMixin
 from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from .pipeline_output import AnimateDiffPipelineOutput

@@ -72,6 +73,7 @@ class AnimateDiffPipeline(
    IPAdapterMixin,
    StableDiffusionLoraLoaderMixin,
    FreeInitMixin,
+    AnimateDiffFreeNoiseMixin,
 ):
    r"""
    Pipeline for text-to-video generation.
@@ -394,15 +396,20 @@ class AnimateDiffPipeline(

        return ip_adapter_image_embeds

-    # Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents
-    def decode_latents(self, latents):
+    def decode_latents(self, latents, decode_batch_size: int = 16):
        latents = 1 / self.vae.config.scaling_factor * latents

        batch_size, channels, num_frames, height, width = latents.shape
        latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)

-        image = self.vae.decode(latents).sample
-        video = image[None, :].reshape((batch_size, num_frames, -1) + image.shape[2:]).permute(0, 2, 1, 3, 4)
+        video = []
+        for i in range(0, latents.shape[0], decode_batch_size):
+            batch_latents = latents[i : i + decode_batch_size]
+            batch_latents = self.vae.decode(batch_latents).sample
+            video.append(batch_latents)
+
+        video = torch.cat(video)
+        video = video[None, :].reshape((batch_size, num_frames, -1) + video.shape[2:]).permute(0, 2, 1, 3, 4)
        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
        video = video.float()
        return video
@@ -495,7 +502,6 @@ class AnimateDiffPipeline(
                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

-    # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents
    def prepare_latents(
        self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
    ):
@@ -517,6 +523,22 @@ class AnimateDiffPipeline(
        else:
            latents = latents.to(device)

+        if self.free_noise_enabled and self._free_noise_shuffle:
+            for i in range(self._free_noise_context_length, num_frames, self._free_noise_context_stride):
+                # ensure window is within bounds
+                window_start = max(0, i - self._free_noise_context_length)
+                window_end = min(num_frames, window_start + self._free_noise_context_stride)
+                window_length = window_end - window_start
+
+                if window_length == 0:
+                    break
+
+                indices = torch.LongTensor(list(range(window_start, window_end)))
+                shuffled_indices = indices[torch.randperm(window_length, generator=generator)]
+
+                # shuffle latents in every window
+                latents[:, :, window_start:window_end] = latents[:, :, shuffled_indices]
+
        # scale the initial noise by the standard deviation required by the scheduler
        latents = latents * self.scheduler.init_noise_sigma
        return latents
@@ -569,6 +591,7 @@ class AnimateDiffPipeline(
        clip_skip: Optional[int] = None,
        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        decode_batch_size: int = 16,
        **kwargs,
    ):
        r"""
@@ -637,6 +660,8 @@ class AnimateDiffPipeline(
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.
+            decode_batch_size (`int`, defaults to `16`):
+                The number of frames to decode at a time when calling `decode_latents` method.

        Examples:

@@ -808,7 +833,7 @@ class AnimateDiffPipeline(
        if output_type == "latent":
            video = latents
        else:
-            video_tensor = self.decode_latents(latents)
+            video_tensor = self.decode_latents(latents, decode_batch_size)
            video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)

        # 10. Offload all models
@@ -56,6 +56,7 @@ from ...utils import (
 from ...utils.torch_utils import randn_tensor
 from ...video_processor import VideoProcessor
 from ..free_init_utils import FreeInitMixin
+from ..free_noise_utils import AnimateDiffFreeNoiseMixin
 from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from .pipeline_output import AnimateDiffPipelineOutput

@@ -194,6 +195,7 @@ class AnimateDiffSDXLPipeline(
    TextualInversionLoaderMixin,
    IPAdapterMixin,
    FreeInitMixin,
+    AnimateDiffFreeNoiseMixin,
 ):
    r"""
    Pipeline for text-to-video generation using Stable Diffusion XL.
@@ -606,15 +608,21 @@ class AnimateDiffSDXLPipeline(

        return ip_adapter_image_embeds

-    # Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents
-    def decode_latents(self, latents):
+    # Copied from diffusers.pipelines.animatediff.pipeline_animatediff.AnimateDiffPipeline.decode_latents
+    def decode_latents(self, latents, decode_batch_size: int = 16):
        latents = 1 / self.vae.config.scaling_factor * latents

        batch_size, channels, num_frames, height, width = latents.shape
        latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)

-        image = self.vae.decode(latents).sample
-        video = image[None, :].reshape((batch_size, num_frames, -1) + image.shape[2:]).permute(0, 2, 1, 3, 4)
+        video = []
+        for i in range(0, latents.shape[0], decode_batch_size):
+            batch_latents = latents[i : i + decode_batch_size]
+            batch_latents = self.vae.decode(batch_latents).sample
+            video.append(batch_latents)
+
+        video = torch.cat(video)
+        video = video[None, :].reshape((batch_size, num_frames, -1) + video.shape[2:]).permute(0, 2, 1, 3, 4)
        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
        video = video.float()
        return video
@@ -876,6 +884,7 @@ class AnimateDiffSDXLPipeline(
        clip_skip: Optional[int] = None,
        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        decode_batch_size: int = 16,
    ):
        r"""
        Function invoked when calling the pipeline for generation.
@@ -1015,6 +1024,8 @@ class AnimateDiffSDXLPipeline(
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.
+            decode_batch_size (`int`, defaults to `16`):
+                The number of frames to decode at a time when calling `decode_latents` method.

        Examples:

@@ -1258,7 +1269,7 @@ class AnimateDiffSDXLPipeline(
        if output_type == "latent":
            video = latents
        else:
-            video_tensor = self.decode_latents(latents)
+            video_tensor = self.decode_latents(latents, decode_batch_size)
            video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)

        # cast back to fp16 if needed
@@ -38,6 +38,7 @@ from ...utils import (
 from ...utils.torch_utils import is_compiled_module, randn_tensor
 from ...video_processor import VideoProcessor
 from ..free_init_utils import FreeInitMixin
+from ..free_noise_utils import AnimateDiffFreeNoiseMixin
 from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from .pipeline_output import AnimateDiffPipelineOutput

@@ -127,6 +128,7 @@ class AnimateDiffSparseControlNetPipeline(
    IPAdapterMixin,
    StableDiffusionLoraLoaderMixin,
    FreeInitMixin,
+    AnimateDiffFreeNoiseMixin,
 ):
    r"""
    Pipeline for controlled text-to-video generation using the method described in [SparseCtrl: Adding Sparse Controls
@@ -448,15 +450,21 @@ class AnimateDiffSparseControlNetPipeline(

        return ip_adapter_image_embeds

-    # Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents
-    def decode_latents(self, latents):
+    # Copied from diffusers.pipelines.animatediff.pipeline_animatediff.AnimateDiffPipeline.decode_latents
+    def decode_latents(self, latents, decode_batch_size: int = 16):
        latents = 1 / self.vae.config.scaling_factor * latents

        batch_size, channels, num_frames, height, width = latents.shape
        latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)

-        image = self.vae.decode(latents).sample
-        video = image[None, :].reshape((batch_size, num_frames, -1) + image.shape[2:]).permute(0, 2, 1, 3, 4)
+        video = []
+        for i in range(0, latents.shape[0], decode_batch_size):
+            batch_latents = latents[i : i + decode_batch_size]
+            batch_latents = self.vae.decode(batch_latents).sample
+            video.append(batch_latents)
+
+        video = torch.cat(video)
+        video = video[None, :].reshape((batch_size, num_frames, -1) + video.shape[2:]).permute(0, 2, 1, 3, 4)
        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
        video = video.float()
        return video
@@ -728,6 +736,7 @@ class AnimateDiffSparseControlNetPipeline(
        clip_skip: Optional[int] = None,
        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        decode_batch_size: int = 16,
    ):
        r"""
        The call function to the pipeline for generation.
@@ -806,6 +815,8 @@ class AnimateDiffSparseControlNetPipeline(
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.
+            decode_batch_size (`int`, defaults to `16`):
+                The number of frames to decode at a time when calling `decode_latents` method.

        Examples:

@@ -996,7 +1007,7 @@ class AnimateDiffSparseControlNetPipeline(
        if output_type == "latent":
            video = latents
        else:
-            video_tensor = self.decode_latents(latents)
+            video_tensor = self.decode_latents(latents, decode_batch_size)
            video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)

        # 12. Offload all models
@@ -35,6 +35,7 @@ from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_
 from ...utils.torch_utils import randn_tensor
 from ...video_processor import VideoProcessor
 from ..free_init_utils import FreeInitMixin
+from ..free_noise_utils import AnimateDiffFreeNoiseMixin
 from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from .pipeline_output import AnimateDiffPipelineOutput

@@ -176,6 +177,7 @@ class AnimateDiffVideoToVideoPipeline(
    IPAdapterMixin,
    StableDiffusionLoraLoaderMixin,
    FreeInitMixin,
+    AnimateDiffFreeNoiseMixin,
 ):
    r"""
    Pipeline for video-to-video generation.
@@ -498,15 +500,21 @@ class AnimateDiffVideoToVideoPipeline(

        return ip_adapter_image_embeds

-    # Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents
-    def decode_latents(self, latents):
+    # Copied from diffusers.pipelines.animatediff.pipeline_animatediff.AnimateDiffPipeline.decode_latents
+    def decode_latents(self, latents, decode_batch_size: int = 16):
        latents = 1 / self.vae.config.scaling_factor * latents

        batch_size, channels, num_frames, height, width = latents.shape
        latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)

-        image = self.vae.decode(latents).sample
-        video = image[None, :].reshape((batch_size, num_frames, -1) + image.shape[2:]).permute(0, 2, 1, 3, 4)
+        video = []
+        for i in range(0, latents.shape[0], decode_batch_size):
+            batch_latents = latents[i : i + decode_batch_size]
+            batch_latents = self.vae.decode(batch_latents).sample
+            video.append(batch_latents)
+
+        video = torch.cat(video)
+        video = video[None, :].reshape((batch_size, num_frames, -1) + video.shape[2:]).permute(0, 2, 1, 3, 4)
        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
        video = video.float()
        return video
@@ -747,6 +755,7 @@ class AnimateDiffVideoToVideoPipeline(
        clip_skip: Optional[int] = None,
        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        decode_batch_size: int = 16,
    ):
        r"""
        The call function to the pipeline for generation.
@@ -822,6 +831,8 @@ class AnimateDiffVideoToVideoPipeline(
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.
+            decode_batch_size (`int`, defaults to `16`):
+                The number of frames to decode at a time when calling `decode_latents` method.

        Examples:

@@ -990,7 +1001,7 @@ class AnimateDiffVideoToVideoPipeline(
        if output_type == "latent":
            video = latents
        else:
-            video_tensor = self.decode_latents(latents)
+            video_tensor = self.decode_latents(latents, decode_batch_size)
            video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)

        # 10. Offload all models
@@ -0,0 +1,141 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Union
+
+from ..models.attention import BasicTransformerBlock
+from ..models.unets.unet_motion_model import (
+    CrossAttnDownBlockMotion,
+    DownBlockMotion,
+    FreeNoiseTransformerBlock,
+    TransformerTemporalModel,
+    UpBlockMotion,
+)
+
+
+class AnimateDiffFreeNoiseMixin:
+    r"""Mixin class for [FreeNoise](https://arxiv.org/abs/2310.15169)."""
+
+    def _enable_free_noise_in_block(self, block: Union[CrossAttnDownBlockMotion, DownBlockMotion, UpBlockMotion]):
+        r"""Helper function to enable FreeNoise in transformer blocks."""
+
+        for motion_module in block.motion_modules:
+            motion_module: TransformerTemporalModel
+            num_transformer_blocks = len(motion_module.transformer_blocks)
+
+            for i in range(num_transformer_blocks):
+                if isinstance(motion_module.transformer_blocks[i], FreeNoiseTransformerBlock):
+                    motion_module.transformer_blocks[i].set_free_noise_properties(
+                        self._free_noise_context_length,
+                        self._free_noise_context_stride,
+                        self._free_noise_weighting_scheme,
+                    )
+                else:
+                    assert isinstance(motion_module.transformer_blocks[i], BasicTransformerBlock)
+                    basic_transfomer_block = motion_module.transformer_blocks[i]
+
+                    motion_module.transformer_blocks[i] = FreeNoiseTransformerBlock(
+                        dim=basic_transfomer_block.dim,
+                        num_attention_heads=basic_transfomer_block.num_attention_heads,
+                        attention_head_dim=basic_transfomer_block.attention_head_dim,
+                        dropout=basic_transfomer_block.dropout,
+                        cross_attention_dim=basic_transfomer_block.cross_attention_dim,
+                        activation_fn=basic_transfomer_block.activation_fn,
+                        attention_bias=basic_transfomer_block.attention_bias,
+                        only_cross_attention=basic_transfomer_block.only_cross_attention,
+                        double_self_attention=basic_transfomer_block.double_self_attention,
+                        positional_embeddings=basic_transfomer_block.positional_embeddings,
+                        num_positional_embeddings=basic_transfomer_block.num_positional_embeddings,
+                        context_length=self._free_noise_context_length,
+                        context_stride=self._free_noise_context_stride,
+                        weighting_scheme=self._free_noise_weighting_scheme,
+                    ).to(device=self.device, dtype=self.dtype)
+
+                    motion_module.transformer_blocks[i].load_state_dict(
+                        basic_transfomer_block.state_dict(), strict=True
+                    )
+
+    def _disable_free_noise_in_block(self, block: Union[CrossAttnDownBlockMotion, DownBlockMotion, UpBlockMotion]):
+        r"""Helper function to disable FreeNoise in transformer blocks."""
+
+        for motion_module in block.motion_modules:
+            motion_module: TransformerTemporalModel
+            num_transformer_blocks = len(motion_module.transformer_blocks)
+
+            for i in range(num_transformer_blocks):
+                if isinstance(motion_module.transformer_blocks[i], FreeNoiseTransformerBlock):
+                    free_noise_transfomer_block = motion_module.transformer_blocks[i]
+
+                    motion_module.transformer_blocks[i] = BasicTransformerBlock(
+                        dim=free_noise_transfomer_block.dim,
+                        num_attention_heads=free_noise_transfomer_block.num_attention_heads,
+                        attention_head_dim=free_noise_transfomer_block.attention_head_dim,
+                        dropout=free_noise_transfomer_block.dropout,
+                        cross_attention_dim=free_noise_transfomer_block.cross_attention_dim,
+                        activation_fn=free_noise_transfomer_block.activation_fn,
+                        attention_bias=free_noise_transfomer_block.attention_bias,
+                        only_cross_attention=free_noise_transfomer_block.only_cross_attention,
+                        double_self_attention=free_noise_transfomer_block.double_self_attention,
+                        positional_embeddings=free_noise_transfomer_block.positional_embeddings,
+                        num_positional_embeddings=free_noise_transfomer_block.num_positional_embeddings,
+                    ).to(device=self.device, dtype=self.dtype)
+
+                    motion_module.transformer_blocks[i].load_state_dict(
+                        free_noise_transfomer_block.state_dict(), strict=True
+                    )
+
+    def enable_free_noise(
+        self,
+        context_length: Optional[int] = 16,
+        context_stride: int = 4,
+        weighting_scheme: str = "pyramid",
+        shuffle: bool = True,
+    ) -> None:
+        r"""
+        Enable long video generation using FreeNoise.
+
+        Args:
+            context_length (`int`, defaults to `16`, *optional*):
+                The number of video frames to process at once. It's recommended to set this to the maximum frames the
+                Motion Adapter was trained with (usually 16/24/32). If `None`, the default value from the motion
+                adapter config is used.
+            context_stride (`int`, *optional*, defaults to `4`):
+                Long videos are generated by processing many frames. FreeNoise processes these frames in sliding
+                windows of size `context_length`. Context stride allows you to specify how many frames to skip between
+                each window. For example, a context length of 16 and context stride of 4 would process 24 frames as:
+                    [0, 15], [4, 19], [8, 23] (0-based indexing)
+            weighting_scheme (`str`, defaults to `"pyramid"`):
+                TODO(aryan)
+            shuffle (`bool`, defaults to `True`):
+                TODO(aryan): decide if this is even needed
+        """
+        self._free_noise_context_length = context_length or self.motion_adapter.config.motion_max_seq_length
+        self._free_noise_context_stride = context_stride
+        self._free_noise_weighting_scheme = weighting_scheme
+        self._free_noise_shuffle = shuffle
+
+        blocks = [*self.unet.down_blocks, self.unet.mid_block, *self.unet.up_blocks]
+        for block in blocks:
+            self._enable_free_noise_in_block(block)
+
+    def disable_free_noise(self) -> None:
+        self._free_noise_context_length = None
+
+        blocks = [*self.unet.down_blocks, self.unet.mid_block, *self.unet.up_blocks]
+        for block in blocks:
+            self._disable_free_noise_in_block(block)
+
+    @property
+    def free_noise_enabled(self):
+        return hasattr(self, "_free_noise_context_length") and self._free_noise_context_length is not None
@@ -45,6 +45,7 @@ from ...utils import (
 from ...utils.torch_utils import randn_tensor
 from ...video_processor import VideoProcessor
 from ..free_init_utils import FreeInitMixin
+from ..free_noise_utils import AnimateDiffFreeNoiseMixin
 from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin


@@ -131,6 +132,7 @@ class PIAPipeline(
    StableDiffusionLoraLoaderMixin,
    FromSingleFileMixin,
    FreeInitMixin,
+    AnimateDiffFreeNoiseMixin,
 ):
    r"""
    Pipeline for text-to-video generation.
@@ -407,15 +409,21 @@ class PIAPipeline(

            return image_embeds, uncond_image_embeds

-    # Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents
-    def decode_latents(self, latents):
+    # Copied from diffusers.pipelines.animatediff.pipeline_animatediff.AnimateDiffPipeline.decode_latents
+    def decode_latents(self, latents, decode_batch_size: int = 16):
        latents = 1 / self.vae.config.scaling_factor * latents

        batch_size, channels, num_frames, height, width = latents.shape
        latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)

-        image = self.vae.decode(latents).sample
-        video = image[None, :].reshape((batch_size, num_frames, -1) + image.shape[2:]).permute(0, 2, 1, 3, 4)
+        video = []
+        for i in range(0, latents.shape[0], decode_batch_size):
+            batch_latents = latents[i : i + decode_batch_size]
+            batch_latents = self.vae.decode(batch_latents).sample
+            video.append(batch_latents)
+
+        video = torch.cat(video)
+        video = video[None, :].reshape((batch_size, num_frames, -1) + video.shape[2:]).permute(0, 2, 1, 3, 4)
        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
        video = video.float()
        return video
@@ -687,6 +695,7 @@ class PIAPipeline(
        clip_skip: Optional[int] = None,
        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        decode_batch_size: int = 16,
    ):
        r"""
        The call function to the pipeline for generation.
@@ -763,6 +772,8 @@ class PIAPipeline(
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.
+            decode_batch_size (`int`, defaults to `16`):
+                The number of frames to decode at a time when calling the `decode_latents` method.

        Examples:

@@ -931,7 +942,7 @@ class PIAPipeline(
        if output_type == "latent":
            video = latents
        else:
-            video_tensor = self.decode_latents(latents)
+            video_tensor = self.decode_latents(latents, decode_batch_size)
            video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)

        # 10. Offload all models
Author	SHA1	Message	Date
Aryan	5a60a62c47	add experimental support for num_frames not perfectly fitting context length, ocntext stride	2024-07-27 16:24:36 +02:00
Aryan	691facfc2e	copy animatediff controlnet implementation from #8972	2024-07-27 15:34:39 +02:00
Aryan	dc96a8d5cd	make style	2024-07-27 15:31:12 +02:00
Aryan	1b7bc007d8	make fix-copies	2024-07-27 15:30:42 +02:00
Aryan	1bb09845bf	fix copied from comments	2024-07-27 15:29:30 +02:00
Aryan	024e2da864	make style	2024-07-27 15:27:28 +02:00
Aryan	f6897ae46a	add decode batch size param to all pipelines	2024-07-27 15:26:14 +02:00
Aryan	a41f843dba	remove old helper functions	2024-07-27 15:16:45 +02:00
Aryan	10b65b310c	add freenoise	2024-07-27 15:16:02 +02:00
Aryan	610f433d1c	revert attention changes	2024-07-27 15:15:34 +02:00
Aryan	690dad693f	Merge branch 'main' into freenoise	2024-07-27 13:05:34 +02:00
Aryan	2e97ba7ccb	Merge branch 'main' into freenoise	2024-07-25 03:55:41 +05:30
Aryan	5d0f4c3407	add animatediff controlnet implementation	2024-07-24 23:54:42 +02:00
Aryan	441d321152	fix freeinit bug	2024-07-24 23:54:29 +02:00
Aryan	80e530fbfa	initial work draft for freenoise; needs massive cleanup	2024-07-24 01:38:18 +02:00