update

2024-03-01 04:49:56 +00:00
112 changed files with 852 additions and 10396 deletions
@@ -1,7 +1,6 @@
 name: Benchmarking tests

 on:
-  workflow_dispatch:
  schedule:
    - cron: "30 1 1,15 * *" # every 2 weeks on the 1st and the 15th of every month at 1:30 AM

@@ -141,7 +141,6 @@ class LCMLoRATextToImageBenchmark(TextToImageBenchmark):
        super().__init__(args)
        self.pipe.load_lora_weights(self.lora_id)
        self.pipe.fuse_lora()
-        self.pipe.unload_lora_weights()
        self.pipe.scheduler = LCMScheduler.from_config(self.pipe.scheduler.config)

    def get_result_filepath(self, args):
@@ -236,35 +235,6 @@ class InpaintingBenchmark(ImageToImageBenchmark):
        )


-class IPAdapterTextToImageBenchmark(TextToImageBenchmark):
-    url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_neg_embed.png"
-    image = load_image(url)
-
-    def __init__(self, args):
-        pipe = self.pipeline_class.from_pretrained(args.ckpt, torch_dtype=torch.float16).to("cuda")
-        pipe.load_ip_adapter(
-            args.ip_adapter_id[0],
-            subfolder="models" if "sdxl" not in args.ip_adapter_id[1] else "sdxl_models",
-            weight_name=args.ip_adapter_id[1],
-        )
-
-        if args.run_compile:
-            pipe.unet.to(memory_format=torch.channels_last)
-            print("Run torch compile")
-            pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
-
-        pipe.set_progress_bar_config(disable=True)
-        self.pipe = pipe
-
-    def run_inference(self, pipe, args):
-        _ = pipe(
-            prompt=PROMPT,
-            ip_adapter_image=self.image,
-            num_inference_steps=args.num_inference_steps,
-            num_images_per_prompt=args.batch_size,
-        )
-
-
 class ControlNetBenchmark(TextToImageBenchmark):
    pipeline_class = StableDiffusionControlNetPipeline
    aux_network_class = ControlNetModel
@@ -1,32 +0,0 @@
-import argparse
-import sys
-
-
-sys.path.append(".")
-from base_classes import IPAdapterTextToImageBenchmark  # noqa: E402
-
-
-IP_ADAPTER_CKPTS = {
-    "runwayml/stable-diffusion-v1-5": ("h94/IP-Adapter", "ip-adapter_sd15.bin"),
-    "stabilityai/stable-diffusion-xl-base-1.0": ("h94/IP-Adapter", "ip-adapter_sdxl.bin"),
-}
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--ckpt",
-        type=str,
-        default="runwayml/stable-diffusion-v1-5",
-        choices=list(IP_ADAPTER_CKPTS.keys()),
-    )
-    parser.add_argument("--batch_size", type=int, default=1)
-    parser.add_argument("--num_inference_steps", type=int, default=50)
-    parser.add_argument("--model_cpu_offload", action="store_true")
-    parser.add_argument("--run_compile", action="store_true")
-    args = parser.parse_args()
-
-    args.ip_adapter_id = IP_ADAPTER_CKPTS[args.ckpt]
-    benchmark_pipe = IPAdapterTextToImageBenchmark(args)
-    args.ckpt = f"{args.ckpt} (IP-Adapter)"
-    benchmark_pipe.benchmark(args)
@@ -72,7 +72,7 @@ def main():
                command += " --run_compile"
                run_command(command.split())

-        elif file in ["benchmark_sd_inpainting.py", "benchmark_ip_adapters.py"]:
+        elif file == "benchmark_sd_inpainting.py":
            sdxl_ckpt = "stabilityai/stable-diffusion-xl-base-1.0"
            command = f"python {file} --ckpt {sdxl_ckpt}"
            run_command(command.split())
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:12.1.0-runtime-ubuntu20.04
+FROM nvidia/cuda:11.6.2-cudnn8-devel-ubuntu20.04
 LABEL maintainer="Hugging Face"
 LABEL repository="diffusers"

@@ -24,9 +24,9 @@ ENV PATH="/opt/venv/bin:$PATH"
 # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
 RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
    python3 -m uv pip install --no-cache-dir \
-        torch \
-        torchvision \
-        torchaudio \
+        torch==2.1.2 \
+        torchvision==0.16.2 \
+        torchaudio==2.1.2 \
        "onnxruntime-gpu>=1.13.1" \
        --extra-index-url https://download.pytorch.org/whl/cu117 && \
    python3 -m uv pip install --no-cache-dir \
@@ -318,8 +318,6 @@
      title: Semantic Guidance
    - local: api/pipelines/shap_e
      title: Shap-E
-    - local: api/pipelines/stable_cascade
-      title: Stable Cascade
    - sections:
      - local: api/pipelines/stable_diffusion/overview
        title: Overview
@@ -420,8 +418,6 @@
      title: ScoreSdeVeScheduler
    - local: api/schedulers/score_sde_vp
      title: ScoreSdeVpScheduler
-    - local: api/schedulers/tcd
-      title: TCDScheduler
    - local: api/schedulers/unipc
      title: UniPCMultistepScheduler
    - local: api/schedulers/vq_diffusion
@@ -1,88 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Stable Cascade
-
-This model is built upon the [Würstchen](https://openreview.net/forum?id=gU58d5QeGv) architecture and its main 
-difference to other models like Stable Diffusion is that it is working at a much smaller latent space. Why is this 
-important? The smaller the latent space, the **faster** you can run inference and the **cheaper** the training becomes. 
-How small is the latent space? Stable Diffusion uses a compression factor of 8, resulting in a 1024x1024 image being 
-encoded to 128x128. Stable Cascade achieves a compression factor of 42, meaning that it is possible to encode a 
-1024x1024 image to 24x24, while maintaining crisp reconstructions. The text-conditional model is then trained in the 
-highly compressed latent space. Previous versions of this architecture, achieved a 16x cost reduction over Stable 
-Diffusion 1.5.
-
-Therefore, this kind of model is well suited for usages where efficiency is important. Furthermore, all known extensions
-like finetuning, LoRA, ControlNet, IP-Adapter, LCM etc. are possible with this method as well.
-
-The original codebase can be found at [Stability-AI/StableCascade](https://github.com/Stability-AI/StableCascade).
-
-## Model Overview
-Stable Cascade consists of three models: Stage A, Stage B and Stage C, representing a cascade to generate images,
-hence the name "Stable Cascade".
-
-Stage A & B are used to compress images, similar to what the job of the VAE is in Stable Diffusion. 
-However, with this setup, a much higher compression of images can be achieved. While the Stable Diffusion models use a 
-spatial compression factor of 8, encoding an image with resolution of 1024 x 1024 to 128 x 128, Stable Cascade achieves 
-a compression factor of 42. This encodes a 1024 x 1024 image to 24 x 24, while being able to accurately decode the 
-image. This comes with the great benefit of cheaper training and inference. Furthermore, Stage C is responsible 
-for generating the small 24 x 24 latents given a text prompt.
-
-## Uses
-
-### Direct Use
-
-The model is intended for research purposes for now. Possible research areas and tasks include
-
- Research on generative models.
- Safe deployment of models which have the potential to generate harmful content.
- Probing and understanding the limitations and biases of generative models.
- Generation of artworks and use in design and other artistic processes.
- Applications in educational or creative tools.
-
-Excluded uses are described below.
-
-### Out-of-Scope Use
-
-The model was not trained to be factual or true representations of people or events, 
-and therefore using the model to generate such content is out-of-scope for the abilities of this model.
-The model should not be used in any way that violates Stability AI's [Acceptable Use Policy](https://stability.ai/use-policy).
-
-## Limitations and Bias
-
-### Limitations
- Faces and people in general may not be generated properly.
- The autoencoding part of the model is lossy.
-
-
-## StableCascadeCombinedPipeline
-
-[[autodoc]] StableCascadeCombinedPipeline
-	- all
-	- __call__
-
-## StableCascadePriorPipeline
-
-[[autodoc]] StableCascadePriorPipeline
-	- all
-	- __call__
-
-## StableCascadePriorPipelineOutput
-
-[[autodoc]] pipelines.stable_cascade.pipeline_stable_cascade_prior.StableCascadePriorPipelineOutput
-
-## StableCascadeDecoderPipeline
-
-[[autodoc]] StableCascadeDecoderPipeline
-	- all
-	- __call__
-
@@ -1,29 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# TCDScheduler 
-
-[Trajectory Consistency Distillation](https://huggingface.co/papers/2402.19159) by Jianbin Zheng, Minghui Hu, Zhongyi Fan, Chaoyue Wang, Changxing Ding, Dacheng Tao and Tat-Jen Cham introduced a Strategic Stochastic Sampling (Algorithm 4) that is capable of generating good samples in a small number of steps. Distinguishing it as an advanced iteration of the multistep scheduler (Algorithm 1) in the [Consistency Models](https://huggingface.co/papers/2303.01469), Strategic Stochastic Sampling specifically tailored for the trajectory consistency function.
-
-The abstract from the paper is:
-
-*Latent Consistency Model (LCM) extends the Consistency Model to the latent space and leverages the guided consistency distillation technique to achieve impressive performance in accelerating text-to-image synthesis. However, we observed that LCM struggles to generate images with both clarity and detailed intricacy. To address this limitation, we initially delve into and elucidate the underlying causes. Our investigation identifies that the primary issue stems from errors in three distinct areas. Consequently, we introduce Trajectory Consistency Distillation (TCD), which encompasses trajectory consistency function and strategic stochastic sampling. The trajectory consistency function diminishes the distillation errors by broadening the scope of the self-consistency boundary condition and endowing the TCD with the ability to accurately trace the entire trajectory of the Probability Flow ODE. Additionally, strategic stochastic sampling is specifically designed to circumvent the accumulated errors inherent in multi-step consistency sampling, which is meticulously tailored to complement the TCD model. Experiments demonstrate that TCD not only significantly enhances image quality at low NFEs but also yields more detailed results compared to the teacher model at high NFEs.*
-
-The original codebase can be found at [jabir-zheng/TCD](https://github.com/jabir-zheng/TCD).
-
-## TCDScheduler
-[[autodoc]] TCDScheduler
-
-
-## TCDSchedulerOutput
-[[autodoc]] schedulers.scheduling_tcd.TCDSchedulerOutput
-
@@ -77,7 +77,7 @@ accelerate config default

 Or if your environment doesn't support an interactive shell, like a notebook, you can use:

-```py
+```py
 from accelerate.utils import write_basic_config

 write_basic_config()
@@ -170,7 +170,7 @@ Aside from setting up the LoRA layers, the training script is more or less the s

 Once you've made all your changes or you're okay with the default configuration, you're ready to launch the training script! 🚀

-Let's train on the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset to generate our own Pokémon. Set the environment variables `MODEL_NAME` and `DATASET_NAME` to the model and dataset respectively. You should also specify where to save the model in `OUTPUT_DIR`, and the name of the model to save to on the Hub with `HUB_MODEL_ID`. The script creates and saves the following files to your repository:
+Let's train on the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset to generate our own Pokémon. Set the environment variables `MODEL_NAME` and `DATASET_NAME` to the model and dataset respectively. You should also specify where to save the model in `OUTPUT_DIR`, and the name of the model to save to on the Hub with `HUB_MODEL_ID`. The script creates and saves the following files to your repository:

 - saved model checkpoints
 - `pytorch_lora_weights.safetensors` (the trained LoRA weights)
@@ -169,7 +169,7 @@ list_adapters_component_wise

 If you want to compile your model with `torch.compile` make sure to first fuse the LoRA weights into the base model and unload them.

-```diff
+```py
 pipe.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
 pipe.load_lora_weights("CiroN2022/toy-face", weight_name="toy_face_sdxl.safetensors", adapter_name="toy")

@@ -178,16 +178,12 @@ pipe.set_adapters(["pixel", "toy"], adapter_weights=[0.5, 1.0])
 pipe.fuse_lora()
 pipe.unload_lora_weights()

-+ pipe.unet.to(memory_format=torch.channels_last)
-+ pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+pipe = torch.compile(pipe)

 prompt = "toy_face of a hacker with a hoodie, pixel art"
 image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
 ```

-> [!TIP]
-> You can refer to the `torch.compile()` section [here](https://huggingface.co/docs/diffusers/main/en/optimization/torch2.0#torchcompile) and [here](https://huggingface.co/docs/diffusers/main/en/tutorials/fast_diffusion#torchcompile) for more elaborate examples.
-
 ## Fusing adapters into the model

 You can use PEFT to easily fuse/unfuse multiple adapters directly into the model weights (both UNet and text encoder) using the [`~diffusers.loaders.LoraLoaderMixin.fuse_lora`] method, which can lead to a speed-up in inference and lower VRAM usage.
@@ -31,9 +31,9 @@ def callback_dynamic_cfg(pipe, step_index, timestep, callback_kwargs):
                prompt_embeds = callback_kwargs["prompt_embeds"]
                prompt_embeds = prompt_embeds.chunk(2)[-1]

-                # update guidance_scale and prompt_embeds
-                pipe._guidance_scale = 0.0
-                callback_kwargs["prompt_embeds"] = prompt_embeds
+        # update guidance_scale and prompt_embeds
+        pipe._guidance_scale = 0.0
+        callback_kwargs["prompt_embeds"] = prompt_embeds
        return callback_kwargs
 ```

@@ -128,7 +128,7 @@ seed = 2023
 # The values come from
 # https://github.com/lyn-rgb/FreeU_Diffusers#video-pipelines
 pipe.enable_freeu(b1=1.2, b2=1.4, s1=0.9, s2=0.2)
-video_frames = pipe(prompt, height=320, width=576, num_frames=30, generator=torch.manual_seed(seed)).frames[0]
+video_frames = pipe(prompt, height=320, width=576, num_frames=30, generator=torch.manual_seed(seed)).frames
 export_to_video(video_frames, "astronaut_rides_horse.mp4")
 ```

@@ -80,7 +80,8 @@ To do so, just specify `--train_text_encoder_ti` while launching training (for r
 Please keep the following points in mind:

 * SDXL has two text encoders. So, we fine-tune both using LoRA.
-* When not fine-tuning the text encoders, we ALWAYS precompute the text embeddings to save memory.
+* When not fine-tuning the text encoders, we ALWAYS precompute the text embeddings to save memory.
+

 ### 3D icon example

@@ -233,32 +234,6 @@ In ComfyUI we will load a LoRA and a textual embedding at the same time.

 SDXL's VAE is known to suffer from numerical instability issues. This is why we also expose a CLI argument namely `--pretrained_vae_model_name_or_path` that lets you specify the location of a better VAE (such as [this one](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix)).

-### DoRA training 
-The advanced script now supports DoRA training too!
-> Proposed in [DoRA: Weight-Decomposed Low-Rank Adaptation](https://arxiv.org/abs/2402.09353), 
-**DoRA** is very similar to LoRA, except it decomposes the pre-trained weight into two components, **magnitude** and **direction** and employs LoRA for _directional_ updates to efficiently minimize the number of trainable parameters. 
-The authors found that by using DoRA, both the learning capacity and training stability of LoRA are enhanced without any additional overhead during inference. 
-
-> [!NOTE]
-> 💡DoRA training is still _experimental_  
-> and is likely to require different hyperparameter values to perform best compared to a LoRA.
-> Specifically, we've noticed 2 differences to take into account your training: 
-> 1. **LoRA seem to converge faster than DoRA** (so a set of parameters that may lead to overfitting when training a LoRA may be working well for a DoRA)
-> 2. **DoRA quality superior to LoRA especially in lower ranks** the difference in quality of DoRA of rank 8 and LoRA of rank 8 appears to be more significant than when training ranks of 32 or 64 for example.  
-> This is also aligned with some of the quantitative analysis shown in the paper. 
-
-**Usage**
-1. To use DoRA you need to install `peft` from main: 
-```bash
-pip install git+https://github.com/huggingface/peft.git
-```
-2. Enable DoRA training by adding this flag
-```bash
--use_dora
-```
-**Inference** 
-The inference is the same as if you train a regular LoRA 🤗
-

 ### Tips and Tricks
 Check out [these recommended practices](https://huggingface.co/blog/sdxl_lora_advanced_script#additional-good-practices)
@@ -77,7 +77,6 @@ logger = get_logger(__name__)

 def save_model_card(
    repo_id: str,
-    use_dora: bool,
    images=None,
    base_model=str,
    train_text_encoder=False,
@@ -89,7 +88,6 @@ def save_model_card(
    vae_path=None,
 ):
    img_str = "widget:\n"
-    lora = "lora" if not use_dora else "dora"
    for i, image in enumerate(images):
        image.save(os.path.join(repo_folder, f"image_{i}.png"))
        img_str += f"""
@@ -141,10 +139,9 @@ to trigger concept `{key}` → use `{tokens}` in your prompt \n
 tags:
 - stable-diffusion
 - stable-diffusion-diffusers
- diffusers-training
 - text-to-image
 - diffusers
- {lora}
+- lora
 - template:sd-lora
 {img_str}
 base_model: {base_model}
@@ -654,16 +651,6 @@ def parse_args(input_args=None):
        default=4,
        help=("The dimension of the LoRA update matrices."),
    )
-    parser.add_argument(
-        "--use_dora",
-        type=bool,
-        action="store_true",
-        default=False,
-        help=(
-            "Wether to train a DoRA as proposed in- DoRA: Weight-Decomposed Low-Rank Adaptation https://arxiv.org/abs/2402.09353. "
-            "Note: to use DoRA you need to install peft from main, `pip install git+https://github.com/huggingface/peft.git`"
-        ),
-    )
    parser.add_argument(
        "--cache_latents",
        action="store_true",
@@ -1232,7 +1219,6 @@ def main(args):
    unet_lora_config = LoraConfig(
        r=args.rank,
        lora_alpha=args.rank,
-        use_dora=args.use_dora,
        init_lora_weights="gaussian",
        target_modules=["to_k", "to_q", "to_v", "to_out.0"],
    )
@@ -1244,7 +1230,6 @@ def main(args):
        text_lora_config = LoraConfig(
            r=args.rank,
            lora_alpha=args.rank,
-            use_dora=args.use_dora,
            init_lora_weights="gaussian",
            target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
        )
@@ -1970,7 +1955,6 @@ def main(args):

        save_model_card(
            model_id if not args.push_to_hub else repo_id,
-            use_dora=args.use_dora,
            images=images,
            base_model=args.pretrained_model_name_or_path,
            train_text_encoder=args.train_text_encoder,
@@ -81,7 +81,6 @@ logger = get_logger(__name__)

 def save_model_card(
    repo_id: str,
-    use_dora: bool,
    images=None,
    base_model=str,
    train_text_encoder=False,
@@ -93,7 +92,6 @@ def save_model_card(
    vae_path=None,
 ):
    img_str = "widget:\n"
-    lora = "lora" if not use_dora else "dora"
    for i, image in enumerate(images):
        image.save(os.path.join(repo_folder, f"image_{i}.png"))
        img_str += f"""
@@ -146,10 +144,9 @@ to trigger concept `{key}` → use `{tokens}` in your prompt \n
 tags:
 - stable-diffusion-xl
 - stable-diffusion-xl-diffusers
- diffusers-training
 - text-to-image
 - diffusers
- {lora}
+- lora
 - template:sd-lora
 {img_str}
 base_model: {base_model}
@@ -664,15 +661,6 @@ def parse_args(input_args=None):
        default=4,
        help=("The dimension of the LoRA update matrices."),
    )
-    parser.add_argument(
-        "--use_dora",
-        action="store_true",
-        default=False,
-        help=(
-            "Wether to train a DoRA as proposed in- DoRA: Weight-Decomposed Low-Rank Adaptation https://arxiv.org/abs/2402.09353. "
-            "Note: to use DoRA you need to install peft from main, `pip install git+https://github.com/huggingface/peft.git`"
-        ),
-    )
    parser.add_argument(
        "--cache_latents",
        action="store_true",
@@ -1335,7 +1323,6 @@ def main(args):
    unet_lora_config = LoraConfig(
        r=args.rank,
        lora_alpha=args.rank,
-        use_dora=args.use_dora,
        init_lora_weights="gaussian",
        target_modules=["to_k", "to_q", "to_v", "to_out.0"],
    )
@@ -1347,7 +1334,6 @@ def main(args):
        text_lora_config = LoraConfig(
            r=args.rank,
            lora_alpha=args.rank,
-            use_dora=args.use_dora,
            init_lora_weights="gaussian",
            target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
        )
@@ -2206,7 +2192,6 @@ def main(args):

        save_model_card(
            model_id if not args.push_to_hub else repo_id,
-            use_dora=args.use_dora,
            images=images,
            base_model=args.pretrained_model_name_or_path,
            train_text_encoder=args.train_text_encoder,
@@ -750,7 +750,7 @@ This example produces the following images:
 ![image](https://user-images.githubusercontent.com/4313860/198328706-295824a4-9856-4ce5-8e66-278ceb42fd29.png)

 ### GlueGen Stable Diffusion Pipeline
-GlueGen is a minimal adapter that allow alignment between any encoder (Text Encoder of different language, Multilingual Roberta, AudioClip) and CLIP text encoder used in standard Stable Diffusion model. This method allows easy language adaptation to available english Stable Diffusion checkpoints without the need of an image captioning dataset as well as long training hours.
+GlueGen is a minimal adapter that allow alignment between any encoder (Text Encoder of different language, Multilingual Roberta, AudioClip) and CLIP text encoder used in standard Stable Diffusion model. This method allows easy language adaptation to available english Stable Diffusion checkpoints without the need of an image captioning dataset as well as long training hours. 

 Make sure you downloaded `gluenet_French_clip_overnorm_over3_noln.ckpt` for French (there are also pre-trained weights for Chinese, Italian, Japanese, Spanish or train your own) at [GlueGen's official repo](https://github.com/salesforce/GlueGen/tree/main)

@@ -782,9 +782,9 @@ if __name__ == "__main__":
    ).to(device)
    pipeline.load_language_adapter("gluenet_French_clip_overnorm_over3_noln.ckpt", num_token=token_max_length, dim=1024, dim_out=768, tensor_norm=tensor_norm)

-    prompt = "une voiture sur la plage"
+    prompt = "une voiture sur la plage" 

-    generator = torch.Generator(device=device).manual_seed(42)
+    generator = torch.Generator(device=device).manual_seed(42) 
    image = pipeline(prompt, generator=generator).images[0]
    image.save("gluegen_output_fr.png")
 ```
@@ -1755,7 +1755,7 @@ with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
 ```

 The following code compares the performance of the original stable diffusion xl pipeline with the ipex-optimized pipeline.
-By using this optimized pipeline, we can get about 1.4-2 times performance boost with BFloat16 on fourth generation of Intel Xeon CPUs,
+By using this optimized pipeline, we can get about 1.4-2 times performance boost with BFloat16 on fourth generation of Intel Xeon CPUs, 
 code-named Sapphire Rapids.

 ```python
@@ -1826,7 +1826,7 @@ This approach is using (optional) CoCa model to avoid writing image description.

 This SDXL pipeline support unlimited length prompt and negative prompt, compatible with A1111 prompt weighted style.

-You can provide both `prompt` and `prompt_2`. If only one prompt is provided, `prompt_2` will be a copy of the provided `prompt`. Here is a sample code to use this pipeline.
+You can provide both `prompt` and `prompt_2`. If only one prompt is provided, `prompt_2` will be a copy of the provided `prompt`. Here is a sample code to use this pipeline. 

 ```python
 from diffusers import DiffusionPipeline
@@ -3397,7 +3397,7 @@ invert_prompt = "A lying cat"
 input_image = "siamese.jpg"
 steps = 50

-# Provide prompt used for generation. Same if reconstruction
+# Provide prompt used for generation. Same if reconstruction 
 prompt = "A lying cat"
 # or different if editing.
 prompt = "A lying dog"
@@ -3493,7 +3493,7 @@ output_frames = pipe(
    mask_end=0.8,
    mask_strength=0.5,
    negative_prompt='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'
-).frames[0]
+).frames

 export_to_video(
    output_frames, "/path/to/video.mp4", 5)
@@ -3636,8 +3636,8 @@ image = torch.from_numpy(faces[0].normed_embedding).unsqueeze(0)
 images = pipeline(
    prompt="A photo of a girl wearing a black dress, holding red roses in hand, upper body, behind is the Eiffel Tower",
    image_embeds=image,
-    negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
-    num_inference_steps=20, num_images_per_prompt=num_images, width=512, height=704,
+    negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality", 
+    num_inference_steps=20, num_images_per_prompt=num_images, width=512, height=704, 
    generator=generator
 ).images

@@ -15,46 +15,17 @@
 from __future__ import annotations

 import abc
-import inspect
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union

 import numpy as np
 import torch
 import torch.nn.functional as F
-from packaging import version
-from transformers import (
-    CLIPImageProcessor,
-    CLIPTextModel,
-    CLIPTokenizer,
-    CLIPVisionModelWithProjection,
-)

-from diffusers import AutoencoderKL, DiffusionPipeline, UNet2DConditionModel
-from diffusers.configuration_utils import FrozenDict, deprecate
-from diffusers.image_processor import VaeImageProcessor
-from diffusers.loaders import (
-    FromSingleFileMixin,
-    IPAdapterMixin,
-    LoraLoaderMixin,
-    TextualInversionLoaderMixin,
-)
 from diffusers.models.attention import Attention
-from diffusers.models.lora import adjust_lora_scale_text_encoder
-from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
-from diffusers.pipelines.stable_diffusion.safety_checker import (
-    StableDiffusionSafetyChecker,
+from diffusers.pipelines.stable_diffusion import (
+    StableDiffusionPipeline,
+    StableDiffusionPipelineOutput,
 )
-from diffusers.schedulers import KarrasDiffusionSchedulers
-from diffusers.utils import (
-    USE_PEFT_BACKEND,
-    logging,
-    scale_lora_layers,
-    unscale_lora_layers,
-)
-from diffusers.utils.torch_utils import randn_tensor
-
-
-logger = logging.get_logger(__name__)


 # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
@@ -72,486 +43,34 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
    return noise_cfg


-class Prompt2PromptPipeline(
-    DiffusionPipeline,
-    TextualInversionLoaderMixin,
-    LoraLoaderMixin,
-    IPAdapterMixin,
-    FromSingleFileMixin,
-):
+class Prompt2PromptPipeline(StableDiffusionPipeline):
    r"""
-    Pipeline for text-to-image generation using Stable Diffusion.
-
-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
-    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
-
-    The pipeline also inherits the following loading methods:
-        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
-        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
-        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
-        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
-
    Args:
+    Prompt-to-Prompt-Pipeline for text-to-image generation using Stable Diffusion. This model inherits from
+    [`StableDiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for
+    all the pipelines (such as downloading or saving, running on a particular device, etc.)
        vae ([`AutoencoderKL`]):
-            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
-        text_encoder ([`~transformers.CLIPTextModel`]):
-            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
-        tokenizer ([`~transformers.CLIPTokenizer`]):
-            A `CLIPTokenizer` to tokenize text.
-        unet ([`UNet2DConditionModel`]):
-            A `UNet2DConditionModel` to denoise the encoded image latents.
-        scheduler ([`SchedulerMixin`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
        safety_checker ([`StableDiffusionSafetyChecker`]):
            Classification module that estimates whether generated images could be considered offensive or harmful.
-            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
-            about a model's potential harms.
-        feature_extractor ([`~transformers.CLIPImageProcessor`]):
-            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPFeatureExtractor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
    """

-    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
-    _exclude_from_cpu_offload = ["safety_checker"]
-    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
    _optional_components = ["safety_checker", "feature_extractor"]

-    def __init__(
-        self,
-        vae: AutoencoderKL,
-        text_encoder: CLIPTextModel,
-        tokenizer: CLIPTokenizer,
-        unet: UNet2DConditionModel,
-        scheduler: KarrasDiffusionSchedulers,
-        safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPImageProcessor,
-        image_encoder: CLIPVisionModelWithProjection = None,
-        requires_safety_checker: bool = True,
-    ):
-        super().__init__()
-
-        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
-            deprecation_message = (
-                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
-                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
-                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
-                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
-                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
-                " file"
-            )
-            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
-            new_config = dict(scheduler.config)
-            new_config["steps_offset"] = 1
-            scheduler._internal_dict = FrozenDict(new_config)
-
-        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
-            deprecation_message = (
-                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
-                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
-                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
-                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
-                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
-            )
-            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
-            new_config = dict(scheduler.config)
-            new_config["clip_sample"] = False
-            scheduler._internal_dict = FrozenDict(new_config)
-
-        if safety_checker is None and requires_safety_checker:
-            logger.warning(
-                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
-                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
-                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
-                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
-                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
-                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
-            )
-
-        if safety_checker is not None and feature_extractor is None:
-            raise ValueError(
-                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
-                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
-            )
-
-        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
-            version.parse(unet.config._diffusers_version).base_version
-        ) < version.parse("0.9.0.dev0")
-        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
-        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
-            deprecation_message = (
-                "The configuration file of the unet has set the default `sample_size` to smaller than"
-                " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
-                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
-                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
-                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
-                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
-                " in the config might lead to incorrect results in future versions. If you have downloaded this"
-                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
-                " the `unet/config.json` file"
-            )
-            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
-            new_config = dict(unet.config)
-            new_config["sample_size"] = 64
-            unet._internal_dict = FrozenDict(new_config)
-
-        self.register_modules(
-            vae=vae,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            unet=unet,
-            scheduler=scheduler,
-            safety_checker=safety_checker,
-            feature_extractor=feature_extractor,
-            image_encoder=image_encoder,
-        )
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
-        self.register_to_config(requires_safety_checker=requires_safety_checker)
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
-    def _encode_prompt(
-        self,
-        prompt,
-        device,
-        num_images_per_prompt,
-        do_classifier_free_guidance,
-        negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        lora_scale: Optional[float] = None,
-        **kwargs,
-    ):
-        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
-        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
-
-        prompt_embeds_tuple = self.encode_prompt(
-            prompt=prompt,
-            device=device,
-            num_images_per_prompt=num_images_per_prompt,
-            do_classifier_free_guidance=do_classifier_free_guidance,
-            negative_prompt=negative_prompt,
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
-            lora_scale=lora_scale,
-            **kwargs,
-        )
-
-        # concatenate for backwards comp
-        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
-
-        return prompt_embeds
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
-    def encode_prompt(
-        self,
-        prompt,
-        device,
-        num_images_per_prompt,
-        do_classifier_free_guidance,
-        negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        lora_scale: Optional[float] = None,
-        clip_skip: Optional[int] = None,
-    ):
-        r"""
-        Encodes the prompt into text encoder hidden states.
-
-        Args:
-            prompt (`str` or `List[str]`, *optional*):
-                prompt to be encoded
-            device: (`torch.device`):
-                torch device
-            num_images_per_prompt (`int`):
-                number of images that should be generated per prompt
-            do_classifier_free_guidance (`bool`):
-                whether to use classifier free guidance or not
-            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
-                less than `1`).
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
-            lora_scale (`float`, *optional*):
-                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
-            clip_skip (`int`, *optional*):
-                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
-                the output of the pre-final layer will be used for computing the prompt embeddings.
-        """
-        # set lora scale so that monkey patched LoRA
-        # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
-            self._lora_scale = lora_scale
-
-            # dynamically adjust the LoRA scale
-            if not USE_PEFT_BACKEND:
-                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
-            else:
-                scale_lora_layers(self.text_encoder, lora_scale)
-
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            batch_size = prompt_embeds.shape[0]
-
-        if prompt_embeds is None:
-            # textual inversion: process multi-vector tokens if necessary
-            if isinstance(self, TextualInversionLoaderMixin):
-                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
-
-            text_inputs = self.tokenizer(
-                prompt,
-                padding="max_length",
-                max_length=self.tokenizer.model_max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-            text_input_ids = text_inputs.input_ids
-            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
-
-            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
-                text_input_ids, untruncated_ids
-            ):
-                removed_text = self.tokenizer.batch_decode(
-                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
-                )
-                logger.warning(
-                    "The following part of your input was truncated because CLIP can only handle sequences up to"
-                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-                )
-
-            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
-                attention_mask = text_inputs.attention_mask.to(device)
-            else:
-                attention_mask = None
-
-            if clip_skip is None:
-                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
-                prompt_embeds = prompt_embeds[0]
-            else:
-                prompt_embeds = self.text_encoder(
-                    text_input_ids.to(device),
-                    attention_mask=attention_mask,
-                    output_hidden_states=True,
-                )
-                # Access the `hidden_states` first, that contains a tuple of
-                # all the hidden states from the encoder layers. Then index into
-                # the tuple to access the hidden states from the desired layer.
-                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
-                # We also need to apply the final LayerNorm here to not mess with the
-                # representations. The `last_hidden_states` that we typically use for
-                # obtaining the final prompt representations passes through the LayerNorm
-                # layer.
-                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
-
-        if self.text_encoder is not None:
-            prompt_embeds_dtype = self.text_encoder.dtype
-        elif self.unet is not None:
-            prompt_embeds_dtype = self.unet.dtype
-        else:
-            prompt_embeds_dtype = prompt_embeds.dtype
-
-        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
-
-        bs_embed, seq_len, _ = prompt_embeds.shape
-        # duplicate text embeddings for each generation per prompt, using mps friendly method
-        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
-        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
-
-        # get unconditional embeddings for classifier free guidance
-        if do_classifier_free_guidance and negative_prompt_embeds is None:
-            uncond_tokens: List[str]
-            if negative_prompt is None:
-                uncond_tokens = [""] * batch_size
-            elif prompt is not None and type(prompt) is not type(negative_prompt):
-                raise TypeError(
-                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
-                    f" {type(prompt)}."
-                )
-            elif isinstance(negative_prompt, str):
-                uncond_tokens = [negative_prompt]
-            elif batch_size != len(negative_prompt):
-                raise ValueError(
-                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
-                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
-                    " the batch size of `prompt`."
-                )
-            else:
-                uncond_tokens = negative_prompt
-
-            # textual inversion: process multi-vector tokens if necessary
-            if isinstance(self, TextualInversionLoaderMixin):
-                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
-
-            max_length = prompt_embeds.shape[1]
-            uncond_input = self.tokenizer(
-                uncond_tokens,
-                padding="max_length",
-                max_length=max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-
-            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
-                attention_mask = uncond_input.attention_mask.to(device)
-            else:
-                attention_mask = None
-
-            negative_prompt_embeds = self.text_encoder(
-                uncond_input.input_ids.to(device),
-                attention_mask=attention_mask,
-            )
-            negative_prompt_embeds = negative_prompt_embeds[0]
-
-        if do_classifier_free_guidance:
-            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
-            seq_len = negative_prompt_embeds.shape[1]
-
-            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
-
-            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
-            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
-
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
-
-        return prompt_embeds, negative_prompt_embeds
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
-    def run_safety_checker(self, image, device, dtype):
-        if self.safety_checker is None:
-            has_nsfw_concept = None
-        else:
-            if torch.is_tensor(image):
-                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
-            else:
-                feature_extractor_input = self.image_processor.numpy_to_pil(image)
-            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
-            image, has_nsfw_concept = self.safety_checker(
-                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
-            )
-        return image, has_nsfw_concept
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
-    def prepare_extra_step_kwargs(self, generator, eta):
-        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
-        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
-        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
-        # and should be between [0, 1]
-
-        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
-        extra_step_kwargs = {}
-        if accepts_eta:
-            extra_step_kwargs["eta"] = eta
-
-        # check if the scheduler accepts generator
-        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
-        if accepts_generator:
-            extra_step_kwargs["generator"] = generator
-        return extra_step_kwargs
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs
-    def check_inputs(
-        self,
-        prompt,
-        height,
-        width,
-        callback_steps,
-        negative_prompt=None,
-        prompt_embeds=None,
-        negative_prompt_embeds=None,
-        ip_adapter_image=None,
-        ip_adapter_image_embeds=None,
-        callback_on_step_end_tensor_inputs=None,
-    ):
-        if height % 8 != 0 or width % 8 != 0:
-            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
-
-        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
-            raise ValueError(
-                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
-                f" {type(callback_steps)}."
-            )
-        if callback_on_step_end_tensor_inputs is not None and not all(
-            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
-        ):
-            raise ValueError(
-                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
-            )
-
-        if prompt is not None and prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
-                " only forward one of the two."
-            )
-        elif prompt is None and prompt_embeds is None:
-            raise ValueError(
-                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
-            )
-        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-        if negative_prompt is not None and negative_prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
-                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
-            )
-
-        if prompt_embeds is not None and negative_prompt_embeds is not None:
-            if prompt_embeds.shape != negative_prompt_embeds.shape:
-                raise ValueError(
-                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
-                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
-                    f" {negative_prompt_embeds.shape}."
-                )
-
-        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
-            raise ValueError(
-                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
-            )
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
-    def prepare_latents(
-        self,
-        batch_size,
-        num_channels_latents,
-        height,
-        width,
-        dtype,
-        device,
-        generator,
-        latents=None,
-    ):
-        shape = (
-            batch_size,
-            num_channels_latents,
-            height // self.vae_scale_factor,
-            width // self.vae_scale_factor,
-        )
-        if isinstance(generator, list) and len(generator) != batch_size:
-            raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-            )
-
-        if latents is None:
-            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
-        else:
-            latents = latents.to(device)
-
-        # scale the initial noise by the standard deviation required by the scheduler
-        latents = latents * self.scheduler.init_noise_sigma
-        return latents
-
    @torch.no_grad()
    def __call__(
        self,
@@ -242,7 +242,6 @@ These are controlnet weights trained on {base_model} with new type of conditioni
        "text-to-image",
        "diffusers",
        "controlnet",
-        "diffusers-training",
    ]
    model_card = populate_model_card(model_card, tags=tags)

@@ -169,7 +169,6 @@ These are controlnet weights trained on {base_model} with new type of conditioni
        "diffusers",
        "controlnet",
        "jax-diffusers-event",
-        "diffusers-training",
    ]
    model_card = populate_model_card(model_card, tags=tags)

@@ -243,7 +243,6 @@ These are controlnet weights trained on {base_model} with new type of conditioni
        "text-to-image",
        "diffusers",
        "controlnet",
-        "diffusers-training",
    ]
    model_card = populate_model_card(model_card, tags=tags)

@@ -97,14 +97,7 @@ These are Custom Diffusion adaption weights for {base_model}. The weights were t
        inference=True,
    )

-    tags = [
-        "text-to-image",
-        "diffusers",
-        "stable-diffusion",
-        "stable-diffusion-diffusers",
-        "custom-diffusion",
-        "diffusers-training",
-    ]
+    tags = ["text-to-image", "diffusers", "stable-diffusion", "stable-diffusion-diffusers", "custom-diffusion"]
    model_card = populate_model_card(model_card, tags=tags)

    model_card.save(os.path.join(repo_folder, "README.md"))
@@ -206,66 +206,3 @@ You can explore the results from a couple of our internal experiments by checkin
 ## Running on a free-tier Colab Notebook

 Check out [this notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/SDXL_DreamBooth_LoRA_.ipynb). 
-
-## Conducting EDM-style training
-
-It's now possible to perform EDM-style training as proposed in [Elucidating the Design Space of Diffusion-Based Generative Models](https://arxiv.org/abs/2206.00364). 
-
-For the SDXL model, simple set:
-
-```diff
-+  --do_edm_style_training \
-```
-
-Other SDXL-like models that use the EDM formulation, such as [playgroundai/playground-v2.5-1024px-aesthetic](https://huggingface.co/playgroundai/playground-v2.5-1024px-aesthetic), can also be DreamBooth'd with the script. Below is an example command:
-
-```bash
-accelerate launch train_dreambooth_lora_sdxl.py \
-  --pretrained_model_name_or_path="playgroundai/playground-v2.5-1024px-aesthetic"  \
-  --instance_data_dir="dog" \
-  --output_dir="dog-playground-lora" \
-  --mixed_precision="fp16" \
-  --instance_prompt="a photo of sks dog" \
-  --resolution=1024 \
-  --train_batch_size=1 \
-  --gradient_accumulation_steps=4 \
-  --learning_rate=1e-4 \
-  --use_8bit_adam \
-  --report_to="wandb" \
-  --lr_scheduler="constant" \
-  --lr_warmup_steps=0 \
-  --max_train_steps=500 \
-  --validation_prompt="A photo of sks dog in a bucket" \
-  --validation_epochs=25 \
-  --seed="0" \
-  --push_to_hub
-```
-
-> [!CAUTION]
-> Min-SNR gamma is not supported with the EDM-style training yet. When training with the PlaygroundAI model, it's recommended to not pass any "variant".
-
-### DoRA training 
-The script now supports DoRA training too!
-> Proposed in [DoRA: Weight-Decomposed Low-Rank Adaptation](https://arxiv.org/abs/2402.09353), 
-**DoRA** is very similar to LoRA, except it decomposes the pre-trained weight into two components, **magnitude** and **direction** and employs LoRA for _directional_ updates to efficiently minimize the number of trainable parameters. 
-The authors found that by using DoRA, both the learning capacity and training stability of LoRA are enhanced without any additional overhead during inference. 
-
-> [!NOTE]
-> 💡DoRA training is still _experimental_  
-> and is likely to require different hyperparameter values to perform best compared to a LoRA.
-> Specifically, we've noticed 2 differences to take into account your training: 
-> 1. **LoRA seem to converge faster than DoRA** (so a set of parameters that may lead to overfitting when training a LoRA may be working well for a DoRA)
-> 2. **DoRA quality superior to LoRA especially in lower ranks** the difference in quality of DoRA of rank 8 and LoRA of rank 8 appears to be more significant than when training ranks of 32 or 64 for example.  
-> This is also aligned with some of the quantitative analysis shown in the paper. 
-
-**Usage**
-1. To use DoRA you need to install `peft` from main: 
-```bash
-pip install git+https://github.com/huggingface/peft.git
-```
-2. Enable DoRA training by adding this flag
-```bash
--use_dora
-```
-**Inference** 
-The inference is the same as if you train a regular LoRA 🤗
@@ -1,99 +0,0 @@
-# coding=utf-8
-# Copyright 2024 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import os
-import sys
-import tempfile
-
-import safetensors
-
-
-sys.path.append("..")
-from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
-
-
-logging.basicConfig(level=logging.DEBUG)
-
-logger = logging.getLogger()
-stream_handler = logging.StreamHandler(sys.stdout)
-logger.addHandler(stream_handler)
-
-
-class DreamBoothLoRASDXLWithEDM(ExamplesTestsAccelerate):
-    """Smoke tests that launch `train_dreambooth_lora_sdxl.py` and inspect the LoRA weights it saves."""
-
-    def test_dreambooth_lora_sdxl_with_edm(self):
-        """Train for two steps with `--do_edm_style_training` on the tiny SDXL pipeline and check the saved LoRA state dict."""
-        with tempfile.TemporaryDirectory() as tmpdir:
-            # The multi-line string is whitespace-split into an argv-style list.
-            test_args = f"""
-                examples/dreambooth/train_dreambooth_lora_sdxl.py
-                --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-xl-pipe
-                --do_edm_style_training
-                --instance_data_dir docs/source/en/imgs
-                --instance_prompt photo
-                --resolution 64
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 2
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                """.split()
-
-            run_command(self._launch_args + test_args)
-            # save_pretrained smoke test
-            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
-
-            # make sure the state_dict has the correct naming in the parameters.
-            # NOTE(review): only `import safetensors` is visible here; confirm `safetensors.torch` is
-            # reachable as an attribute at this point.
-            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
-            is_lora = all("lora" in k for k in lora_state_dict.keys())
-            self.assertTrue(is_lora)
-
-            # when not training the text encoder, all the parameters in the state dict should start
-            # with `"unet"` in their names.
-            starts_with_unet = all(key.startswith("unet") for key in lora_state_dict.keys())
-            self.assertTrue(starts_with_unet)
-
-    def test_dreambooth_lora_playground(self):
-        """Train for two steps on the tiny Playground v2.5 pipeline and check the saved LoRA state dict.
-
-        No `--do_edm_style_training` flag is passed here; presumably this exercises the script's
-        scheduler-based EDM detection (TODO confirm against the checkpoint's scheduler config).
-        """
-        with tempfile.TemporaryDirectory() as tmpdir:
-            # The multi-line string is whitespace-split into an argv-style list.
-            test_args = f"""
-                examples/dreambooth/train_dreambooth_lora_sdxl.py
-                --pretrained_model_name_or_path hf-internal-testing/tiny-playground-v2-5-pipe
-                --instance_data_dir docs/source/en/imgs
-                --instance_prompt photo
-                --resolution 64
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 2
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                """.split()
-
-            run_command(self._launch_args + test_args)
-            # save_pretrained smoke test
-            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
-
-            # make sure the state_dict has the correct naming in the parameters.
-            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
-            is_lora = all("lora" in k for k in lora_state_dict.keys())
-            self.assertTrue(is_lora)
-
-            # when not training the text encoder, all the parameters in the state dict should start
-            # with `"unet"` in their names.
-            starts_with_unet = all(key.startswith("unet") for key in lora_state_dict.keys())
-            self.assertTrue(starts_with_unet)
@@ -102,7 +102,7 @@ DreamBooth for the text encoder was enabled: {train_text_encoder}.
        inference=True,
    )

-    tags = ["text-to-image", "dreambooth", "diffusers-training"]
+    tags = ["text-to-image", "dreambooth"]
    if isinstance(pipeline, StableDiffusionPipeline):
        tags.extend(["stable-diffusion", "stable-diffusion-diffusers"])
    else:
@@ -106,7 +106,7 @@ LoRA for the text encoder was enabled: {train_text_encoder}.
        model_description=model_description,
        inference=True,
    )
-    tags = ["text-to-image", "diffusers", "lora", "diffusers-training"]
+    tags = ["text-to-image", "diffusers", "lora"]
    if isinstance(pipeline, StableDiffusionPipeline):
        tags.extend(["stable-diffusion", "stable-diffusion-diffusers"])
    else:
@@ -14,10 +14,8 @@
 # See the License for the specific language governing permissions and

 import argparse
-import contextlib
 import gc
 import itertools
-import json
 import logging
 import math
 import os
@@ -34,7 +32,7 @@ import transformers
 from accelerate import Accelerator
 from accelerate.logging import get_logger
 from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
-from huggingface_hub import create_repo, hf_hub_download, upload_folder
+from huggingface_hub import create_repo, upload_folder
 from huggingface_hub.utils import insecure_hashlib
 from packaging import version
 from peft import LoraConfig, set_peft_model_state_dict
@@ -52,8 +50,6 @@ from diffusers import (
    AutoencoderKL,
    DDPMScheduler,
    DPMSolverMultistepScheduler,
-    EDMEulerScheduler,
-    EulerDiscreteScheduler,
    StableDiffusionXLPipeline,
    UNet2DConditionModel,
 )
@@ -80,23 +76,8 @@ check_min_version("0.27.0.dev0")
 logger = get_logger(__name__)


-def determine_scheduler_type(pretrained_model_name_or_path, revision):
-    """Return the second element of the `scheduler` entry in the model's `model_index.json`.
-
-    Args:
-        pretrained_model_name_or_path: A local directory containing `model_index.json`; anything that
-            is not an existing directory is treated as a Hub repo id and the file is downloaded.
-        revision: Revision forwarded to `hf_hub_download`; unused when the path is a local directory.
-
-    Returns:
-        `json.load(f)["scheduler"][1]`, presumably the scheduler class name (TODO confirm the layout
-        of `model_index.json`).
-    """
-    model_index_filename = "model_index.json"
-    if os.path.isdir(pretrained_model_name_or_path):
-        model_index = os.path.join(pretrained_model_name_or_path, model_index_filename)
-    else:
-        model_index = hf_hub_download(
-            repo_id=pretrained_model_name_or_path, filename=model_index_filename, revision=revision
-        )
-
-    with open(model_index, "r") as f:
-        scheduler_type = json.load(f)["scheduler"][1]
-    return scheduler_type
-
-
 def save_model_card(
    repo_id: str,
-    use_dora: bool,
    images=None,
    base_model: str = None,
    train_text_encoder=False,
@@ -114,7 +95,7 @@ def save_model_card(
            )

    model_description = f"""
-# {'SDXL' if 'playgroundai' not in base_model else 'Playground'} LoRA DreamBooth - {repo_id}
+# SDXL LoRA DreamBooth - {repo_id}

 <Gallery />

@@ -138,17 +119,11 @@ Weights for this model are available in Safetensors format.

 [Download]({repo_id}/tree/main) them in the Files & versions tab.

-"""
-    if "playgroundai" in args.pretrained_model_name_or_path:
-        model_description += """\n
-## License
-
-Please adhere to the licensing terms as described [here](https://huggingface.co/playgroundai/playground-v2.5-1024px-aesthetic/blob/main/LICENSE.md).
 """
    model_card = load_or_create_model_card(
        repo_id_or_path=repo_id,
        from_training=True,
-        license="openrail++" if "playgroundai" not in base_model else "playground-v2dot5-community",
+        license="openrail++",
        base_model=base_model,
        prompt=instance_prompt,
        model_description=model_description,
@@ -156,18 +131,15 @@ Please adhere to the licensing terms as described [here](https://huggingface.co/
    )
    tags = [
        "text-to-image",
+        "stable-diffusion-xl",
+        "stable-diffusion-xl-diffusers",
        "text-to-image",
-        "diffusers-training",
        "diffusers",
-        "lora" if not use_dora else "dora",
+        "lora",
        "template:sd-lora",
    ]
-    if "playgroundai" in base_model:
-        tags.extend(["playground", "playground-diffusers"])
-    else:
-        tags.extend(["stable-diffusion-xl", "stable-diffusion-xl-diffusers"])
-
    model_card = populate_model_card(model_card, tags=tags)
+
    model_card.save(os.path.join(repo_folder, "README.md"))


@@ -187,29 +159,23 @@ def log_validation(
    # We train on the simplified learning objective. If we were previously predicting a variance, we need the scheduler to ignore it
    scheduler_args = {}

-    if not args.do_edm_style_training:
-        if "variance_type" in pipeline.scheduler.config:
-            variance_type = pipeline.scheduler.config.variance_type
+    if "variance_type" in pipeline.scheduler.config:
+        variance_type = pipeline.scheduler.config.variance_type

-            if variance_type in ["learned", "learned_range"]:
-                variance_type = "fixed_small"
+        if variance_type in ["learned", "learned_range"]:
+            variance_type = "fixed_small"

-            scheduler_args["variance_type"] = variance_type
+        scheduler_args["variance_type"] = variance_type

-        pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, **scheduler_args)
+    pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, **scheduler_args)

    pipeline = pipeline.to(accelerator.device)
    pipeline.set_progress_bar_config(disable=True)

    # run inference
    generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
-    # Currently the context determination is a bit hand-wavy. We can improve it in the future if there's a better
-    # way to condition it. Reference: https://github.com/huggingface/diffusers/pull/7126#issuecomment-1968523051
-    inference_ctx = (
-        contextlib.nullcontext() if "playgroundai" in args.pretrained_model_name_or_path else torch.cuda.amp.autocast()
-    )

-    with inference_ctx:
+    with torch.cuda.amp.autocast():
        images = [pipeline(**pipeline_args, generator=generator).images[0] for _ in range(args.num_validation_images)]

    for tracker in accelerator.trackers:
@@ -368,12 +334,6 @@ def parse_args(input_args=None):
            " `args.validation_prompt` multiple times: `args.num_validation_images`."
        ),
    )
-    parser.add_argument(
-        "--do_edm_style_training",
-        default=False,
-        action="store_true",
-        help="Flag to conduct training using the EDM formulation as introduced in https://arxiv.org/abs/2206.00364.",
-    )
    parser.add_argument(
        "--with_prior_preservation",
        default=False,
@@ -647,15 +607,6 @@ def parse_args(input_args=None):
        default=4,
        help=("The dimension of the LoRA update matrices."),
    )
-    parser.add_argument(
-        "--use_dora",
-        action="store_true",
-        default=False,
-        help=(
-            "Wether to train a DoRA as proposed in- DoRA: Weight-Decomposed Low-Rank Adaptation https://arxiv.org/abs/2402.09353. "
-            "Note: to use DoRA you need to install peft from main, `pip install git+https://github.com/huggingface/peft.git`"
-        ),
-    )

    if input_args is not None:
        args = parser.parse_args(input_args)
@@ -877,8 +828,6 @@ def collate_fn(examples, with_prior_preservation=False):
    if with_prior_preservation:
        pixel_values += [example["class_images"] for example in examples]
        prompts += [example["class_prompt"] for example in examples]
-        original_sizes += [example["original_size"] for example in examples]
-        crop_top_lefts += [example["crop_top_left"] for example in examples]

    pixel_values = torch.stack(pixel_values)
    pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
@@ -956,9 +905,6 @@ def main(args):
            " Please use `huggingface-cli login` to authenticate with the Hub."
        )

-    if args.do_edm_style_training and args.snr_gamma is not None:
-        raise ValueError("Min-SNR formulation is not supported when conducting EDM-style training.")
-
    logging_dir = Path(args.output_dir, args.logging_dir)

    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
@@ -1072,19 +1018,7 @@ def main(args):
    )

    # Load scheduler and models
-    scheduler_type = determine_scheduler_type(args.pretrained_model_name_or_path, args.revision)
-    if "EDM" in scheduler_type:
-        args.do_edm_style_training = True
-        noise_scheduler = EDMEulerScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
-        logger.info("Performing EDM-style training!")
-    elif args.do_edm_style_training:
-        noise_scheduler = EulerDiscreteScheduler.from_pretrained(
-            args.pretrained_model_name_or_path, subfolder="scheduler"
-        )
-        logger.info("Performing EDM-style training!")
-    else:
-        noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
-
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
    text_encoder_one = text_encoder_cls_one.from_pretrained(
        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
    )
@@ -1102,12 +1036,6 @@ def main(args):
        revision=args.revision,
        variant=args.variant,
    )
-    latents_mean = latents_std = None
-    if hasattr(vae.config, "latents_mean") and vae.config.latents_mean is not None:
-        latents_mean = torch.tensor(vae.config.latents_mean).view(1, 4, 1, 1)
-    if hasattr(vae.config, "latents_std") and vae.config.latents_std is not None:
-        latents_std = torch.tensor(vae.config.latents_std).view(1, 4, 1, 1)
-
    unet = UNet2DConditionModel.from_pretrained(
        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
    )
@@ -1158,7 +1086,6 @@ def main(args):
    # now we will add new LoRA weights to the attention layers
    unet_lora_config = LoraConfig(
        r=args.rank,
-        use_dora=args.use_dora,
        lora_alpha=args.rank,
        init_lora_weights="gaussian",
        target_modules=["to_k", "to_q", "to_v", "to_out.0"],
@@ -1170,7 +1097,6 @@ def main(args):
    if args.train_text_encoder:
        text_lora_config = LoraConfig(
            r=args.rank,
-            use_dora=args.use_dora,
            lora_alpha=args.rank,
            init_lora_weights="gaussian",
            target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
@@ -1252,7 +1178,7 @@ def main(args):
            _set_state_dict_into_text_encoder(lora_state_dict, prefix="text_encoder.", text_encoder=text_encoder_one_)

            _set_state_dict_into_text_encoder(
-                lora_state_dict, prefix="text_encoder_2.", text_encoder=text_encoder_two_
+                lora_state_dict, prefix="text_encoder_2.", text_encoder=text_encoder_one_
            )

        # Make sure the trainable params are in float32. This is again needed since the base models
@@ -1507,12 +1433,7 @@ def main(args):
    # We need to initialize the trackers we use, and also store our configuration.
    # The trackers initializes automatically on the main process.
    if accelerator.is_main_process:
-        tracker_name = (
-            "dreambooth-lora-sd-xl"
-            if "playgroundai" not in args.pretrained_model_name_or_path
-            else "dreambooth-lora-playground"
-        )
-        accelerator.init_trackers(tracker_name, config=vars(args))
+        accelerator.init_trackers("dreambooth-lora-sd-xl", config=vars(args))

    # Train!
    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
@@ -1564,18 +1485,6 @@ def main(args):
        disable=not accelerator.is_local_main_process,
    )

-    def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
-        """Look up the scheduler sigma for each entry of `timesteps` and pad it with trailing singleton dims.
-
-        Closes over `noise_scheduler` and `accelerator` from the enclosing scope.
-
-        Args:
-            timesteps: Tensor of timestep values; each must occur in `noise_scheduler.timesteps`.
-            n_dim: Number of dimensions the returned tensor should have.
-            dtype: dtype the scheduler sigmas are cast to.
-
-        Returns:
-            Tensor of sigmas on `accelerator.device` with at least `n_dim` dimensions.
-        """
-        sigmas = noise_scheduler.sigmas.to(device=accelerator.device, dtype=dtype)
-        schedule_timesteps = noise_scheduler.timesteps.to(accelerator.device)
-        timesteps = timesteps.to(accelerator.device)
-
-        # `.item()` requires exactly one matching position per timestep; zero or several matches raise.
-        step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
-
-        sigma = sigmas[step_indices].flatten()
-        # Append trailing singleton dimensions until the tensor has `n_dim` dimensions.
-        while len(sigma.shape) < n_dim:
-            sigma = sigma.unsqueeze(-1)
-        return sigma
-
    for epoch in range(first_epoch, args.num_train_epochs):
        unet.train()
        if args.train_text_encoder:
@@ -1603,46 +1512,22 @@ def main(args):

                # Convert images to latent space
                model_input = vae.encode(pixel_values).latent_dist.sample()
-
-                if latents_mean is None and latents_std is None:
-                    model_input = model_input * vae.config.scaling_factor
-                    if args.pretrained_vae_model_name_or_path is None:
-                        model_input = model_input.to(weight_dtype)
-                else:
-                    latents_mean = latents_mean.to(device=model_input.device, dtype=model_input.dtype)
-                    latents_std = latents_std.to(device=model_input.device, dtype=model_input.dtype)
-                    model_input = (model_input - latents_mean) * vae.config.scaling_factor / latents_std
-                    model_input = model_input.to(dtype=weight_dtype)
+                model_input = model_input * vae.config.scaling_factor
+                if args.pretrained_vae_model_name_or_path is None:
+                    model_input = model_input.to(weight_dtype)

                # Sample noise that we'll add to the latents
                noise = torch.randn_like(model_input)
                bsz = model_input.shape[0]
-
                # Sample a random timestep for each image
-                if not args.do_edm_style_training:
-                    timesteps = torch.randint(
-                        0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device
-                    )
-                    timesteps = timesteps.long()
-                else:
-                    # in EDM formulation, the model is conditioned on the pre-conditioned noise levels
-                    # instead of discrete timesteps, so here we sample indices to get the noise levels
-                    # from `scheduler.timesteps`
-                    indices = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,))
-                    timesteps = noise_scheduler.timesteps[indices].to(device=model_input.device)
+                timesteps = torch.randint(
+                    0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device
+                )
+                timesteps = timesteps.long()

                # Add noise to the model input according to the noise magnitude at each timestep
                # (this is the forward diffusion process)
                noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps)
-                # For EDM-style training, we first obtain the sigmas based on the continuous timesteps.
-                # We then precondition the final model inputs based on these sigmas instead of the timesteps.
-                # Follow: Section 5 of https://arxiv.org/abs/2206.00364.
-                if args.do_edm_style_training:
-                    sigmas = get_sigmas(timesteps, len(noisy_model_input.shape), noisy_model_input.dtype)
-                    if "EDM" in scheduler_type:
-                        inp_noisy_latents = noise_scheduler.precondition_inputs(noisy_model_input, sigmas)
-                    else:
-                        inp_noisy_latents = noisy_model_input / ((sigmas**2 + 1) ** 0.5)

                # time ids
                add_time_ids = torch.cat(
@@ -1666,7 +1551,7 @@ def main(args):
                    }
                    prompt_embeds_input = prompt_embeds.repeat(elems_to_repeat_text_embeds, 1, 1)
                    model_pred = unet(
-                        inp_noisy_latents if args.do_edm_style_training else noisy_model_input,
+                        noisy_model_input,
                        timesteps,
                        prompt_embeds_input,
                        added_cond_kwargs=unet_added_conditions,
@@ -1685,43 +1570,18 @@ def main(args):
                    )
                    prompt_embeds_input = prompt_embeds.repeat(elems_to_repeat_text_embeds, 1, 1)
                    model_pred = unet(
-                        inp_noisy_latents if args.do_edm_style_training else noisy_model_input,
+                        noisy_model_input,
                        timesteps,
                        prompt_embeds_input,
                        added_cond_kwargs=unet_added_conditions,
                        return_dict=False,
                    )[0]

-                weighting = None
-                if args.do_edm_style_training:
-                    # Similar to the input preconditioning, the model predictions are also preconditioned
-                    # on noised model inputs (before preconditioning) and the sigmas.
-                    # Follow: Section 5 of https://arxiv.org/abs/2206.00364.
-                    if "EDM" in scheduler_type:
-                        model_pred = noise_scheduler.precondition_outputs(noisy_model_input, model_pred, sigmas)
-                    else:
-                        if noise_scheduler.config.prediction_type == "epsilon":
-                            model_pred = model_pred * (-sigmas) + noisy_model_input
-                        elif noise_scheduler.config.prediction_type == "v_prediction":
-                            model_pred = model_pred * (-sigmas / (sigmas**2 + 1) ** 0.5) + (
-                                noisy_model_input / (sigmas**2 + 1)
-                            )
-                    # We are not doing weighting here because it tends result in numerical problems.
-                    # See: https://github.com/huggingface/diffusers/pull/7126#issuecomment-1968523051
-                    # There might be other alternatives for weighting as well:
-                    # https://github.com/huggingface/diffusers/pull/7126#discussion_r1505404686
-                    if "EDM" not in scheduler_type:
-                        weighting = (sigmas**-2.0).float()
-
                # Get the target for loss depending on the prediction type
                if noise_scheduler.config.prediction_type == "epsilon":
-                    target = model_input if args.do_edm_style_training else noise
+                    target = noise
                elif noise_scheduler.config.prediction_type == "v_prediction":
-                    target = (
-                        model_input
-                        if args.do_edm_style_training
-                        else noise_scheduler.get_velocity(model_input, noise, timesteps)
-                    )
+                    target = noise_scheduler.get_velocity(model_input, noise, timesteps)
                else:
                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")

@@ -1731,28 +1591,10 @@ def main(args):
                    target, target_prior = torch.chunk(target, 2, dim=0)

                    # Compute prior loss
-                    if weighting is not None:
-                        prior_loss = torch.mean(
-                            (weighting.float() * (model_pred_prior.float() - target_prior.float()) ** 2).reshape(
-                                target_prior.shape[0], -1
-                            ),
-                            1,
-                        )
-                        prior_loss = prior_loss.mean()
-                    else:
-                        prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean")
+                    prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean")

                if args.snr_gamma is None:
-                    if weighting is not None:
-                        loss = torch.mean(
-                            (weighting.float() * (model_pred.float() - target.float()) ** 2).reshape(
-                                target.shape[0], -1
-                            ),
-                            1,
-                        )
-                        loss = loss.mean()
-                    else:
-                        loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
                else:
                    # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
                    # Since we predict the noise instead of x_0, the original formulation is slightly changed.
@@ -1854,6 +1696,7 @@ def main(args):
                    variant=args.variant,
                    torch_dtype=weight_dtype,
                )
+
                pipeline_args = {"prompt": args.validation_prompt}

                images = log_validation(
@@ -1927,7 +1770,6 @@ def main(args):
        if args.push_to_hub:
            save_model_card(
                repo_id,
-                use_dora=args.use_dora,
                images=images,
                base_model=args.pretrained_model_name_or_path,
                train_text_encoder=args.train_text_encoder,
@@ -81,7 +81,6 @@ tags:
 - kandinsky
 - text-to-image
 - diffusers
- diffusers-training
 inference: true
 ---
    """
@@ -65,7 +65,6 @@ tags:
 - kandinsky
 - text-to-image
 - diffusers
- diffusers-training
 - lora
 inference: true
 ---
@@ -65,7 +65,6 @@ tags:
 - kandinsky
 - text-to-image
 - diffusers
- diffusers-training
 - lora
 inference: true
 ---
@@ -82,7 +82,6 @@ tags:
 - kandinsky
 - text-to-image
 - diffusers
- diffusers-training
 inference: true
 ---
    """
@@ -460,8 +460,6 @@ tags:
 - text-to-image
 - diffusers
 - controlnet
- diffusers-training
- webdataset
 inference: true
 ---
    """
@@ -61,34 +61,6 @@ accelerate launch train_diffusion_dpo_sdxl.py \
  --push_to_hub
 ```

-## SDXL Turbo training command
-
-```bash
-accelerate launch train_diffusion_dpo_sdxl.py \
-  --pretrained_model_name_or_path=stabilityai/sdxl-turbo \
-  --pretrained_vae_model_name_or_path=madebyollin/sdxl-vae-fp16-fix \
-  --output_dir="diffusion-sdxl-turbo-dpo" \
-  --mixed_precision="fp16" \
-  --dataset_name=kashif/pickascore \
-  --train_batch_size=8 \
-  --gradient_accumulation_steps=2 \
-  --gradient_checkpointing \
-  --use_8bit_adam \
-  --rank=8 \
-  --learning_rate=1e-5 \
-  --report_to="wandb" \
-  --lr_scheduler="constant" \
-  --lr_warmup_steps=0 \
-  --max_train_steps=2000 \
-  --checkpointing_steps=500 \
-  --run_validation --validation_steps=50 \
-  --seed="0" \
-  --report_to="wandb" \
-  --is_turbo --resolution 512 \
-  --push_to_hub
-```
-
-
 ## Acknowledgements

 This is based on the amazing work done by [Bram](https://github.com/bram-w) here for Diffusion DPO: https://github.com/bram-w/trl/blob/dpo/. 
@@ -118,16 +118,9 @@ def log_validation(args, unet, vae, accelerator, weight_dtype, epoch, is_final_v
    images = []
    context = contextlib.nullcontext() if is_final_validation else torch.cuda.amp.autocast()

-    guidance_scale = 5.0
-    num_inference_steps = 25
-    if args.is_turbo:
-        guidance_scale = 0.0
-        num_inference_steps = 4
    for prompt in VALIDATION_PROMPTS:
        with context:
-            image = pipeline(
-                prompt, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale, generator=generator
-            ).images[0]
+            image = pipeline(prompt, num_inference_steps=25, generator=generator).images[0]
            images.append(image)

    tracker_key = "test" if is_final_validation else "validation"
@@ -148,10 +141,7 @@ def log_validation(args, unet, vae, accelerator, weight_dtype, epoch, is_final_v
    if is_final_validation:
        pipeline.disable_lora()
        no_lora_images = [
-            pipeline(
-                prompt, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale, generator=generator
-            ).images[0]
-            for prompt in VALIDATION_PROMPTS
+            pipeline(prompt, num_inference_steps=25, generator=generator).images[0] for prompt in VALIDATION_PROMPTS
        ]

        for tracker in accelerator.trackers:
@@ -433,11 +423,6 @@ def parse_args(input_args=None):
    parser.add_argument(
        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
    )
-    parser.add_argument(
-        "--is_turbo",
-        action="store_true",
-        help=("Use if tuning SDXL Turbo instead of SDXL"),
-    )
    parser.add_argument(
        "--rank",
        type=int,
@@ -459,9 +444,6 @@ def parse_args(input_args=None):
    if args.dataset_name is None:
        raise ValueError("Must provide a `dataset_name`.")

-    if args.is_turbo:
-        assert "turbo" in args.pretrained_model_name_or_path
-
    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
    if env_local_rank != -1 and env_local_rank != args.local_rank:
        args.local_rank = env_local_rank
@@ -578,36 +560,6 @@ def main(args):

    # Load scheduler and models
    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
-
-    def enforce_zero_terminal_snr(scheduler):
-        """Rescale the scheduler's cumulative alphas so the last timestep's value becomes zero.
-
-        Mutates `scheduler.alphas_cumprod` in place and returns None. Only `alphas_cumprod` is
-        reassigned; `scheduler.betas` is read but left unchanged.
-
-        Args:
-            scheduler: Scheduler object exposing a `betas` tensor.
-        """
-        # Modified from https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_ddpm.py#L93
-        # Original implementation https://arxiv.org/pdf/2305.08891.pdf
-        # Turbo needs zero terminal SNR
-        # Turbo: https://static1.squarespace.com/static/6213c340453c3f502425776e/t/65663480a92fba51d0e1023f/1701197769659/adversarial_diffusion_distillation.pdf
-        # Convert betas to alphas_bar_sqrt
-        alphas = 1 - scheduler.betas
-        alphas_bar = alphas.cumprod(0)
-        alphas_bar_sqrt = alphas_bar.sqrt()
-
-        # Store old values.
-        alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
-        alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
-        # Shift so last timestep is zero.
-        alphas_bar_sqrt -= alphas_bar_sqrt_T
-        # Scale so first timestep is back to old value.
-        alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
-
-        # Convert the rescaled sqrt values back to per-step alphas, then rebuild the cumulative product.
-        alphas_bar = alphas_bar_sqrt**2
-        alphas = alphas_bar[1:] / alphas_bar[:-1]
-        alphas = torch.cat([alphas_bar[0:1], alphas])
-
-        alphas_cumprod = torch.cumprod(alphas, dim=0)
-        scheduler.alphas_cumprod = alphas_cumprod
-        return
-
-    if args.is_turbo:
-        enforce_zero_terminal_snr(noise_scheduler)
-
    text_encoder_one = text_encoder_cls_one.from_pretrained(
        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
    )
@@ -957,10 +909,6 @@ def main(args):
                timesteps = torch.randint(
                    0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device, dtype=torch.long
                ).repeat(2)
-                if args.is_turbo:
-                    # Learn a 4 timestep schedule
-                    timesteps_0_to_3 = timesteps % 4
-                    timesteps = 250 * timesteps_0_to_3 + 249

                # Add noise to the model input according to the noise magnitude at each timestep
                # (this is the forward diffusion process)
@@ -69,7 +69,6 @@ tags:
 - stable-diffusion-diffusers
 - text-to-image
 - diffusers
- diffusers-training
 - lora
 inference: true
 ---
@@ -100,8 +100,6 @@ tags:
 - text-to-image
 - diffusers
 - textual_inversion
- diffusers-training
- onxruntime
 inference: true
 ---
    """
@@ -1,50 +0,0 @@
-# PromptDiffusion Pipeline
-
-From the [project page](https://zhendong-wang.github.io/prompt-diffusion.github.io/):
-
-"With a prompt consisting of a task-specific example pair of images and text guidance, and a new query image, Prompt Diffusion can comprehend the desired task and generate the corresponding output image on both seen (trained) and unseen (new) task types."
-
-For any usage questions, please refer to the [paper](https://arxiv.org/abs/2305.01115).
-
-Prepare models by converting them from the [checkpoint](https://huggingface.co/zhendongw/prompt-diffusion).
-
-To convert the controlnet, use cldm_v15.yaml from the [repository](https://github.com/Zhendong-Wang/Prompt-Diffusion/tree/main/models/):
-
-```bash
-python convert_original_promptdiffusion_to_diffusers.py --checkpoint_path path-to-network-step04999.ckpt --original_config_file path-to-cldm_v15.yaml --dump_path path-to-output-directory
-```
-
-To learn how to convert the fine-tuned Stable Diffusion model, see the [Load different Stable Diffusion formats guide](https://huggingface.co/docs/diffusers/main/en/using-diffusers/other-formats).
-
-
-```py
-import torch
-from diffusers import UniPCMultistepScheduler
-from diffusers.utils import load_image
-from promptdiffusioncontrolnet import PromptDiffusionControlNetModel
-from pipeline_prompt_diffusion import PromptDiffusionPipeline
-
-
-from PIL import ImageOps
-
-image_a = ImageOps.invert(load_image("https://github.com/Zhendong-Wang/Prompt-Diffusion/blob/main/images_to_try/house_line.png?raw=true"))
-
-image_b = load_image("https://github.com/Zhendong-Wang/Prompt-Diffusion/blob/main/images_to_try/house.png?raw=true")
-query = ImageOps.invert(load_image("https://github.com/Zhendong-Wang/Prompt-Diffusion/blob/main/images_to_try/new_01.png?raw=true"))
-
-# load prompt diffusion controlnet and prompt diffusion
-
-controlnet = PromptDiffusionControlNetModel.from_pretrained("iczaw/prompt-diffusion-diffusers", subfolder="controlnet", torch_dtype=torch.float16)
-model_id = "path-to-model"
-pipe = PromptDiffusionPipeline.from_pretrained("iczaw/prompt-diffusion-diffusers", subfolder="base", controlnet=controlnet, torch_dtype=torch.float16, variant="fp16")
-
-# speed up diffusion process with faster scheduler and memory optimization
-pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
-# remove following line if xformers is not installed
-pipe.enable_xformers_memory_efficient_attention()
-pipe.enable_model_cpu_offload()
-# generate image
-generator = torch.manual_seed(0)
-image = pipe("a tortoise", num_inference_steps=20, generator=generator, image_pair=[image_a,image_b], image=query).images[0]
-
-```
@@ -1,385 +0,0 @@
-# Copyright 2023 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from typing import Any, Dict, Optional, Tuple, Union
-
-import torch
-
-from diffusers.configuration_utils import register_to_config
-from diffusers.models.controlnet import (
-    ControlNetConditioningEmbedding,
-    ControlNetModel,
-    ControlNetOutput,
-)
-from diffusers.utils import logging
-
-
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-
-
-class PromptDiffusionControlNetModel(ControlNetModel):
-    """
-    A PromptDiffusionControlNet model.
-
-    Args:
-        in_channels (`int`, defaults to 4):
-            The number of channels in the input sample.
-        flip_sin_to_cos (`bool`, defaults to `True`):
-            Whether to flip the sin to cos in the time embedding.
-        freq_shift (`int`, defaults to 0):
-            The frequency shift to apply to the time embedding.
-        down_block_types (`tuple[str]`, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
-            The tuple of downsample blocks to use.
-        only_cross_attention (`Union[bool, Tuple[bool]]`, defaults to `False`):
-        block_out_channels (`tuple[int]`, defaults to `(320, 640, 1280, 1280)`):
-            The tuple of output channels for each block.
-        layers_per_block (`int`, defaults to 2):
-            The number of layers per block.
-        downsample_padding (`int`, defaults to 1):
-            The padding to use for the downsampling convolution.
-        mid_block_scale_factor (`float`, defaults to 1):
-            The scale factor to use for the mid block.
-        act_fn (`str`, defaults to "silu"):
-            The activation function to use.
-        norm_num_groups (`int`, *optional*, defaults to 32):
-            The number of groups to use for the normalization. If None, normalization and activation layers are skipped
-            in post-processing.
-        norm_eps (`float`, defaults to 1e-5):
-            The epsilon to use for the normalization.
-        cross_attention_dim (`int`, defaults to 1280):
-            The dimension of the cross attention features.
-        transformer_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 1):
-            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
-            [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
-            [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
-        encoder_hid_dim (`int`, *optional*, defaults to None):
-            If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
-            dimension to `cross_attention_dim`.
-        encoder_hid_dim_type (`str`, *optional*, defaults to `None`):
-            If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text
-            embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`.
-        attention_head_dim (`Union[int, Tuple[int]]`, defaults to 8):
-            The dimension of the attention heads.
-        use_linear_projection (`bool`, defaults to `False`):
-        class_embed_type (`str`, *optional*, defaults to `None`):
-            The type of class embedding to use which is ultimately summed with the time embeddings. Choose from None,
-            `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`.
-        addition_embed_type (`str`, *optional*, defaults to `None`):
-            Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or
-            "text". "text" will use the `TextTimeEmbedding` layer.
-        num_class_embeds (`int`, *optional*, defaults to 0):
-            Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
-            class conditioning with `class_embed_type` equal to `None`.
-        upcast_attention (`bool`, defaults to `False`):
-        resnet_time_scale_shift (`str`, defaults to `"default"`):
-            Time scale shift config for ResNet blocks (see `ResnetBlock2D`). Choose from `default` or `scale_shift`.
-        projection_class_embeddings_input_dim (`int`, *optional*, defaults to `None`):
-            The dimension of the `class_labels` input when `class_embed_type="projection"`. Required when
-            `class_embed_type="projection"`.
-        controlnet_conditioning_channel_order (`str`, defaults to `"rgb"`):
-            The channel order of conditional image. Will convert to `rgb` if it's `bgr`.
-        conditioning_embedding_out_channels (`tuple[int]`, *optional*, defaults to `(16, 32, 96, 256)`):
-            The tuple of output channel for each block in the `conditioning_embedding` layer.
-        global_pool_conditions (`bool`, defaults to `False`):
-            TODO(Patrick) - unused parameter.
-        addition_embed_type_num_heads (`int`, defaults to 64):
-            The number of heads to use for the `TextTimeEmbedding` layer.
-    """
-
-    _supports_gradient_checkpointing = True
-
-    @register_to_config
-    def __init__(
-        self,
-        in_channels: int = 4,
-        conditioning_channels: int = 3,
-        flip_sin_to_cos: bool = True,
-        freq_shift: int = 0,
-        down_block_types: Tuple[str, ...] = (
-            "CrossAttnDownBlock2D",
-            "CrossAttnDownBlock2D",
-            "CrossAttnDownBlock2D",
-            "DownBlock2D",
-        ),
-        mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
-        only_cross_attention: Union[bool, Tuple[bool]] = False,
-        block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
-        layers_per_block: int = 2,
-        downsample_padding: int = 1,
-        mid_block_scale_factor: float = 1,
-        act_fn: str = "silu",
-        norm_num_groups: Optional[int] = 32,
-        norm_eps: float = 1e-5,
-        cross_attention_dim: int = 1280,
-        transformer_layers_per_block: Union[int, Tuple[int, ...]] = 1,
-        encoder_hid_dim: Optional[int] = None,
-        encoder_hid_dim_type: Optional[str] = None,
-        attention_head_dim: Union[int, Tuple[int, ...]] = 8,
-        num_attention_heads: Optional[Union[int, Tuple[int, ...]]] = None,
-        use_linear_projection: bool = False,
-        class_embed_type: Optional[str] = None,
-        addition_embed_type: Optional[str] = None,
-        addition_time_embed_dim: Optional[int] = None,
-        num_class_embeds: Optional[int] = None,
-        upcast_attention: bool = False,
-        resnet_time_scale_shift: str = "default",
-        projection_class_embeddings_input_dim: Optional[int] = None,
-        controlnet_conditioning_channel_order: str = "rgb",
-        conditioning_embedding_out_channels: Optional[Tuple[int, ...]] = (16, 32, 96, 256),
-        global_pool_conditions: bool = False,
-        addition_embed_type_num_heads: int = 64,
-    ):
-        super().__init__(
-            in_channels,
-            conditioning_channels,
-            flip_sin_to_cos,
-            freq_shift,
-            down_block_types,
-            mid_block_type,
-            only_cross_attention,
-            block_out_channels,
-            layers_per_block,
-            downsample_padding,
-            mid_block_scale_factor,
-            act_fn,
-            norm_num_groups,
-            norm_eps,
-            cross_attention_dim,
-            transformer_layers_per_block,
-            encoder_hid_dim,
-            encoder_hid_dim_type,
-            attention_head_dim,
-            num_attention_heads,
-            use_linear_projection,
-            class_embed_type,
-            addition_embed_type,
-            addition_time_embed_dim,
-            num_class_embeds,
-            upcast_attention,
-            resnet_time_scale_shift,
-            projection_class_embeddings_input_dim,
-            controlnet_conditioning_channel_order,
-            conditioning_embedding_out_channels,
-            global_pool_conditions,
-            addition_embed_type_num_heads,
-        )
-        self.controlnet_query_cond_embedding = ControlNetConditioningEmbedding(
-            conditioning_embedding_channels=block_out_channels[0],
-            block_out_channels=conditioning_embedding_out_channels,
-            conditioning_channels=3,
-        )
-
-    def forward(
-        self,
-        sample: torch.FloatTensor,
-        timestep: Union[torch.Tensor, float, int],
-        encoder_hidden_states: torch.Tensor,
-        controlnet_cond: torch.FloatTensor,
-        controlnet_query_cond: torch.FloatTensor,
-        conditioning_scale: float = 1.0,
-        class_labels: Optional[torch.Tensor] = None,
-        timestep_cond: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
-        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
-        guess_mode: bool = False,
-        return_dict: bool = True,
-    ) -> Union[ControlNetOutput, Tuple[Tuple[torch.FloatTensor, ...], torch.FloatTensor]]:
-        """
-        The [`~PromptDiffusionControlNetModel`] forward method.
-
-        Args:
-            sample (`torch.FloatTensor`):
-                The noisy input tensor.
-            timestep (`Union[torch.Tensor, float, int]`):
-                The current denoising timestep. A scalar or 0-d tensor is broadcast to the batch dimension.
-            encoder_hidden_states (`torch.Tensor`):
-                The encoder hidden states.
-            controlnet_cond (`torch.FloatTensor`):
-                The conditioning image tensor of shape `(batch_size, channels, height, width)`.
-            controlnet_query_cond (`torch.FloatTensor`):
-                The query conditioning image tensor of shape `(batch_size, 3, height, width)`.
-            conditioning_scale (`float`, defaults to `1.0`):
-                The scale factor for ControlNet outputs.
-            class_labels (`torch.Tensor`, *optional*, defaults to `None`):
-                Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
-            timestep_cond (`torch.Tensor`, *optional*, defaults to `None`):
-                Additional conditional embeddings for timestep. If provided, the embeddings will be summed with the
-                timestep_embedding passed through the `self.time_embedding` layer to obtain the final timestep
-                embeddings.
-            attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
-                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
-                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
-                negative values to the attention scores corresponding to "discard" tokens.
-            added_cond_kwargs (`dict`):
-                Additional conditions for the Stable Diffusion XL UNet.
-            cross_attention_kwargs (`dict[str]`, *optional*, defaults to `None`):
-                A kwargs dictionary that if specified is passed along to the `AttnProcessor`.
-            guess_mode (`bool`, defaults to `False`):
-                In this mode, the ControlNet encoder tries its best to recognize the content of the input even if
-                you remove all prompts. A `guidance_scale` between 3.0 and 5.0 is recommended.
-            return_dict (`bool`, defaults to `True`):
-                Whether or not to return a [`~models.controlnet.ControlNetOutput`] instead of a plain tuple.
-
-        Returns:
-            [`~models.controlnet.ControlNetOutput`] **or** `tuple`:
-                If `return_dict` is `True`, a [`~models.controlnet.ControlNetOutput`] is returned, otherwise a tuple is
-                returned where the first element is the sample tensor.
-        """
-        # check channel order
-        channel_order = self.config.controlnet_conditioning_channel_order
-
-        if channel_order == "rgb":
-            # in rgb order by default
-            ...
-        elif channel_order == "bgr":
-            controlnet_cond = torch.flip(controlnet_cond, dims=[1])
-        else:
-            raise ValueError(f"unknown `controlnet_conditioning_channel_order`: {channel_order}")
-
-        # prepare attention_mask
-        if attention_mask is not None:
-            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
-            attention_mask = attention_mask.unsqueeze(1)
-
-        # 1. time
-        timesteps = timestep
-        if not torch.is_tensor(timesteps):
-            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
-            # This would be a good case for the `match` statement (Python 3.10+)
-            is_mps = sample.device.type == "mps"
-            if isinstance(timestep, float):
-                dtype = torch.float32 if is_mps else torch.float64
-            else:
-                dtype = torch.int32 if is_mps else torch.int64
-            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
-        elif len(timesteps.shape) == 0:
-            timesteps = timesteps[None].to(sample.device)
-
-        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
-        timesteps = timesteps.expand(sample.shape[0])
-
-        t_emb = self.time_proj(timesteps)
-
-        # timesteps does not contain any weights and will always return f32 tensors
-        # but time_embedding might actually be running in fp16. so we need to cast here.
-        # there might be better ways to encapsulate this.
-        t_emb = t_emb.to(dtype=sample.dtype)
-
-        emb = self.time_embedding(t_emb, timestep_cond)
-        aug_emb = None
-
-        if self.class_embedding is not None:
-            if class_labels is None:
-                raise ValueError("class_labels should be provided when num_class_embeds > 0")
-
-            if self.config.class_embed_type == "timestep":
-                class_labels = self.time_proj(class_labels)
-
-            class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
-            emb = emb + class_emb
-
-        if self.config.addition_embed_type is not None:
-            if self.config.addition_embed_type == "text":
-                aug_emb = self.add_embedding(encoder_hidden_states)
-
-            elif self.config.addition_embed_type == "text_time":
-                if "text_embeds" not in added_cond_kwargs:
-                    raise ValueError(
-                        f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
-                    )
-                text_embeds = added_cond_kwargs.get("text_embeds")
-                if "time_ids" not in added_cond_kwargs:
-                    raise ValueError(
-                        f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
-                    )
-                time_ids = added_cond_kwargs.get("time_ids")
-                time_embeds = self.add_time_proj(time_ids.flatten())
-                time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
-
-                add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
-                add_embeds = add_embeds.to(emb.dtype)
-                aug_emb = self.add_embedding(add_embeds)
-
-        emb = emb + aug_emb if aug_emb is not None else emb
-
-        # 2. pre-process
-        sample = self.conv_in(sample)
-
-        controlnet_cond = self.controlnet_cond_embedding(controlnet_cond)
-        controlnet_query_cond = self.controlnet_query_cond_embedding(controlnet_query_cond)
-        sample = sample + controlnet_cond + controlnet_query_cond
-
-        # 3. down
-        down_block_res_samples = (sample,)
-        for downsample_block in self.down_blocks:
-            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
-                sample, res_samples = downsample_block(
-                    hidden_states=sample,
-                    temb=emb,
-                    encoder_hidden_states=encoder_hidden_states,
-                    attention_mask=attention_mask,
-                    cross_attention_kwargs=cross_attention_kwargs,
-                )
-            else:
-                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
-
-            down_block_res_samples += res_samples
-
-        # 4. mid
-        if self.mid_block is not None:
-            if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
-                sample = self.mid_block(
-                    sample,
-                    emb,
-                    encoder_hidden_states=encoder_hidden_states,
-                    attention_mask=attention_mask,
-                    cross_attention_kwargs=cross_attention_kwargs,
-                )
-            else:
-                sample = self.mid_block(sample, emb)
-
-        # 5. Control net blocks
-
-        controlnet_down_block_res_samples = ()
-
-        for down_block_res_sample, controlnet_block in zip(down_block_res_samples, self.controlnet_down_blocks):
-            down_block_res_sample = controlnet_block(down_block_res_sample)
-            controlnet_down_block_res_samples = controlnet_down_block_res_samples + (down_block_res_sample,)
-
-        down_block_res_samples = controlnet_down_block_res_samples
-
-        mid_block_res_sample = self.controlnet_mid_block(sample)
-
-        # 6. scaling
-        if guess_mode and not self.config.global_pool_conditions:
-            scales = torch.logspace(-1, 0, len(down_block_res_samples) + 1, device=sample.device)  # 0.1 to 1.0
-            scales = scales * conditioning_scale
-            down_block_res_samples = [sample * scale for sample, scale in zip(down_block_res_samples, scales)]
-            mid_block_res_sample = mid_block_res_sample * scales[-1]  # last one
-        else:
-            down_block_res_samples = [sample * conditioning_scale for sample in down_block_res_samples]
-            mid_block_res_sample = mid_block_res_sample * conditioning_scale
-
-        if self.config.global_pool_conditions:
-            down_block_res_samples = [
-                torch.mean(sample, dim=(2, 3), keepdim=True) for sample in down_block_res_samples
-            ]
-            mid_block_res_sample = torch.mean(mid_block_res_sample, dim=(2, 3), keepdim=True)
-
-        if not return_dict:
-            return (down_block_res_samples, mid_block_res_sample)
-
-        return ControlNetOutput(
-            down_block_res_samples=down_block_res_samples, mid_block_res_sample=mid_block_res_sample
-        )
@@ -87,7 +87,6 @@ tags:
 - text-to-image
 - diffusers
 - realfill
- diffusers-training
 inference: true
 ---
    """
@@ -225,14 +225,7 @@ These are t2iadapter weights trained on {base_model} with new type of conditioni
        inference=True,
    )

-    tags = [
-        "stable-diffusion-xl",
-        "stable-diffusion-xl-diffusers",
-        "text-to-image",
-        "diffusers",
-        "t2iadapter",
-        "diffusers-training",
-    ]
+    tags = ["stable-diffusion-xl", "stable-diffusion-xl-diffusers", "text-to-image", "diffusers", "t2iadapter"]
    model_card = populate_model_card(model_card, tags=tags)

    model_card.save(os.path.join(repo_folder, "README.md"))
@@ -131,7 +131,7 @@ More information on all the CLI arguments and the environment are available on y
        inference=True,
    )

-    tags = ["stable-diffusion", "stable-diffusion-diffusers", "text-to-image", "diffusers", "diffusers-training"]
+    tags = ["stable-diffusion", "stable-diffusion-diffusers", "text-to-image", "diffusers"]
    model_card = populate_model_card(model_card, tags=tags)

    model_card.save(os.path.join(repo_folder, "README.md"))
@@ -90,7 +90,6 @@ These are LoRA adaption weights for {base_model}. The weights were fine-tuned on
        "stable-diffusion-diffusers",
        "text-to-image",
        "diffusers",
-        "diffusers-training",
        "lora",
    ]
    model_card = populate_model_card(model_card, tags=tags)
@@ -103,14 +103,7 @@ Special VAE used for training: {vae_path}.
        inference=True,
    )

-    tags = [
-        "stable-diffusion-xl",
-        "stable-diffusion-xl-diffusers",
-        "text-to-image",
-        "diffusers",
-        "diffusers-training",
-        "lora",
-    ]
+    tags = ["stable-diffusion-xl", "stable-diffusion-xl-diffusers", "text-to-image", "diffusers", "lora"]
    model_card = populate_model_card(model_card, tags=tags)

    model_card.save(os.path.join(repo_folder, "README.md"))
@@ -35,7 +35,7 @@ import transformers
 from accelerate import Accelerator
 from accelerate.logging import get_logger
 from accelerate.utils import ProjectConfiguration, set_seed
-from datasets import concatenate_datasets, load_dataset
+from datasets import load_dataset
 from huggingface_hub import create_repo, upload_folder
 from packaging import version
 from torchvision import transforms
@@ -101,7 +101,6 @@ Special VAE used for training: {vae_path}.
        "stable-diffusion-xl",
        "stable-diffusion-xl-diffusers",
        "text-to-image",
-        "diffusers-training",
        "diffusers",
    ]
    model_card = populate_model_card(model_card, tags=tags)
@@ -896,20 +895,14 @@ def main(args):
        # fingerprint used by the cache for the other processes to load the result
        # details: https://github.com/huggingface/diffusers/pull/4038#discussion_r1266078401
        new_fingerprint = Hasher.hash(args)
-        new_fingerprint_for_vae = Hasher.hash(vae_path)
-        train_dataset_with_embeddings = train_dataset.map(
-            compute_embeddings_fn, batched=True, new_fingerprint=new_fingerprint
-        )
-        train_dataset_with_vae = train_dataset.map(
+        new_fingerprint_for_vae = Hasher.hash("vae")
+        train_dataset = train_dataset.map(compute_embeddings_fn, batched=True, new_fingerprint=new_fingerprint)
+        train_dataset = train_dataset.map(
            compute_vae_encodings_fn,
            batched=True,
            batch_size=args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps,
            new_fingerprint=new_fingerprint_for_vae,
        )
-        precomputed_dataset = concatenate_datasets(
-            [train_dataset_with_embeddings, train_dataset_with_vae.remove_columns(["image", "text"])], axis=1
-        )
-        precomputed_dataset = precomputed_dataset.with_transform(preprocess_train)

    del text_encoders, tokenizers, vae
    gc.collect()
@@ -932,7 +925,7 @@ def main(args):

    # DataLoaders creation:
    train_dataloader = torch.utils.data.DataLoader(
-        precomputed_dataset,
+        train_dataset,
        shuffle=True,
        collate_fn=collate_fn,
        batch_size=args.train_batch_size,
@@ -983,7 +976,7 @@ def main(args):
    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
-    logger.info(f"  Num examples = {len(precomputed_dataset)}")
+    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
@@ -105,14 +105,7 @@ These are textual inversion adaption weights for {base_model}. You can find some
        inference=True,
    )

-    tags = [
-        "stable-diffusion",
-        "stable-diffusion-diffusers",
-        "text-to-image",
-        "diffusers",
-        "textual_inversion",
-        "diffusers-training",
-    ]
+    tags = ["stable-diffusion", "stable-diffusion-diffusers", "text-to-image", "diffusers", "textual_inversion"]
    model_card = populate_model_card(model_card, tags=tags)

    model_card.save(os.path.join(repo_folder, "README.md"))
@@ -106,7 +106,6 @@ These are textual inversion adaption weights for {base_model}. You can find some
        "stable-diffusion-xl-diffusers",
        "text-to-image",
        "diffusers",
-        "diffusers-training",
        "textual_inversion",
    ]

@@ -2,6 +2,7 @@ accelerate>=0.16.0
 torchvision
 transformers>=4.25.1
 wandb
+huggingface-cli
 bitsandbytes
 deepspeed
 peft>=0.6.0
@@ -81,7 +81,6 @@ tags:
 - wuerstchen
 - text-to-image
 - diffusers
- diffusers-training
 - lora
 inference: true
 ---
@@ -82,7 +82,6 @@ tags:
 - wuerstchen
 - text-to-image
 - diffusers
- diffusers-training
 inference: true
 ---
    """
@@ -9,11 +9,11 @@ from diffusers import AutoencoderKL, DPMSolverMultistepScheduler, PixArtAlphaPip

 ckpt_id = "PixArt-alpha/PixArt-alpha"
 # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/scripts/inference.py#L125
-interpolation_scale = {256: 0.5, 512: 1, 1024: 2}
+interpolation_scale = {512: 1, 1024: 2}


 def main(args):
-    all_state_dict = torch.load(args.orig_ckpt_path, map_location="cpu")
+    all_state_dict = torch.load(args.orig_ckpt_path)
    state_dict = all_state_dict.pop("state_dict")
    converted_state_dict = {}

@@ -22,6 +22,7 @@ def main(args):
    converted_state_dict["pos_embed.proj.bias"] = state_dict.pop("x_embedder.proj.bias")

    # Caption projection.
+    converted_state_dict["caption_projection.y_embedding"] = state_dict.pop("y_embedder.y_embedding")
    converted_state_dict["caption_projection.linear_1.weight"] = state_dict.pop("y_embedder.y_proj.fc1.weight")
    converted_state_dict["caption_projection.linear_1.bias"] = state_dict.pop("y_embedder.y_proj.fc1.bias")
    converted_state_dict["caption_projection.linear_2.weight"] = state_dict.pop("y_embedder.y_proj.fc2.weight")
@@ -154,7 +155,6 @@ def main(args):

    assert transformer.pos_embed.pos_embed is not None
    state_dict.pop("pos_embed")
-    state_dict.pop("y_embedder.y_embedding")
    assert len(state_dict) == 0, f"State dict is not empty, {state_dict.keys()}"

    num_model_params = sum(p.numel() for p in transformer.parameters())
@@ -187,7 +187,7 @@ if __name__ == "__main__":
        "--image_size",
        default=1024,
        type=int,
-        choices=[256, 512, 1024],
+        choices=[512, 1024],
        required=False,
        help="Image size of pretrained model, either 512 or 1024.",
    )
@@ -1,215 +0,0 @@
-# Run this script to convert the Stable Cascade model weights to a diffusers pipeline.
-import argparse
-
-import accelerate
-import torch
-from safetensors.torch import load_file
-from transformers import (
-    AutoTokenizer,
-    CLIPConfig,
-    CLIPImageProcessor,
-    CLIPTextModelWithProjection,
-    CLIPVisionModelWithProjection,
-)
-
-from diffusers import (
-    DDPMWuerstchenScheduler,
-    StableCascadeCombinedPipeline,
-    StableCascadeDecoderPipeline,
-    StableCascadePriorPipeline,
-)
-from diffusers.models import StableCascadeUNet
-from diffusers.models.modeling_utils import load_model_dict_into_meta
-from diffusers.pipelines.wuerstchen import PaellaVQModel
-
-
-parser = argparse.ArgumentParser(description="Convert Stable Cascade model weights to a diffusers pipeline")
-parser.add_argument("--model_path", type=str, default="../StableCascade", help="Location of Stable Cascade weights")
-parser.add_argument("--stage_c_name", type=str, default="stage_c.safetensors", help="Name of stage c checkpoint file")
-parser.add_argument("--stage_b_name", type=str, default="stage_b.safetensors", help="Name of stage b checkpoint file")
-parser.add_argument("--use_safetensors", action="store_true", help="Use SafeTensors for conversion")
-parser.add_argument("--save_org", type=str, default="diffusers", help="Hub organization to save the pipelines to")
-parser.add_argument("--push_to_hub", action="store_true", help="Push to hub")
-
-args = parser.parse_args()
-model_path = args.model_path
-
-device = "cpu"
-
-# set paths to model weights
-prior_checkpoint_path = f"{model_path}/{args.stage_c_name}"
-decoder_checkpoint_path = f"{model_path}/{args.stage_b_name}"
-
-# Clip Text encoder and tokenizer
-config = CLIPConfig.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k")
-config.text_config.projection_dim = config.projection_dim
-text_encoder = CLIPTextModelWithProjection.from_pretrained(
-    "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", config=config.text_config
-)
-tokenizer = AutoTokenizer.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k")
-
-# image processor
-feature_extractor = CLIPImageProcessor()
-image_encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")
-
-# Prior
-if args.use_safetensors:
-    orig_state_dict = load_file(prior_checkpoint_path, device=device)
-else:
-    orig_state_dict = torch.load(prior_checkpoint_path, map_location=device)
-
-state_dict = {}
-for key in orig_state_dict.keys():
-    if key.endswith("in_proj_weight"):
-        weights = orig_state_dict[key].chunk(3, 0)
-        state_dict[key.replace("attn.in_proj_weight", "to_q.weight")] = weights[0]
-        state_dict[key.replace("attn.in_proj_weight", "to_k.weight")] = weights[1]
-        state_dict[key.replace("attn.in_proj_weight", "to_v.weight")] = weights[2]
-    elif key.endswith("in_proj_bias"):
-        weights = orig_state_dict[key].chunk(3, 0)
-        state_dict[key.replace("attn.in_proj_bias", "to_q.bias")] = weights[0]
-        state_dict[key.replace("attn.in_proj_bias", "to_k.bias")] = weights[1]
-        state_dict[key.replace("attn.in_proj_bias", "to_v.bias")] = weights[2]
-    elif key.endswith("out_proj.weight"):
-        weights = orig_state_dict[key]
-        state_dict[key.replace("attn.out_proj.weight", "to_out.0.weight")] = weights
-    elif key.endswith("out_proj.bias"):
-        weights = orig_state_dict[key]
-        state_dict[key.replace("attn.out_proj.bias", "to_out.0.bias")] = weights
-    else:
-        state_dict[key] = orig_state_dict[key]
-
-
-with accelerate.init_empty_weights():
-    prior_model = StableCascadeUNet(
-        in_channels=16,
-        out_channels=16,
-        timestep_ratio_embedding_dim=64,
-        patch_size=1,
-        conditioning_dim=2048,
-        block_out_channels=[2048, 2048],
-        num_attention_heads=[32, 32],
-        down_num_layers_per_block=[8, 24],
-        up_num_layers_per_block=[24, 8],
-        down_blocks_repeat_mappers=[1, 1],
-        up_blocks_repeat_mappers=[1, 1],
-        block_types_per_layer=[
-            ["SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"],
-            ["SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"],
-        ],
-        clip_text_in_channels=1280,
-        clip_text_pooled_in_channels=1280,
-        clip_image_in_channels=768,
-        clip_seq=4,
-        kernel_size=3,
-        dropout=[0.1, 0.1],
-        self_attn=True,
-        timestep_conditioning_type=["sca", "crp"],
-        switch_level=[False],
-    )
-load_model_dict_into_meta(prior_model, state_dict)
-
-# scheduler for prior and decoder
-scheduler = DDPMWuerstchenScheduler()
-
-# Prior pipeline
-prior_pipeline = StableCascadePriorPipeline(
-    prior=prior_model,
-    tokenizer=tokenizer,
-    text_encoder=text_encoder,
-    image_encoder=image_encoder,
-    scheduler=scheduler,
-    feature_extractor=feature_extractor,
-)
-prior_pipeline.save_pretrained(f"{args.save_org}/StableCascade-prior", push_to_hub=args.push_to_hub)
-
-# Decoder
-if args.use_safetensors:
-    orig_state_dict = load_file(decoder_checkpoint_path, device=device)
-else:
-    orig_state_dict = torch.load(decoder_checkpoint_path, map_location=device)
-
-state_dict = {}
-for key in orig_state_dict.keys():
-    if key.endswith("in_proj_weight"):
-        weights = orig_state_dict[key].chunk(3, 0)
-        state_dict[key.replace("attn.in_proj_weight", "to_q.weight")] = weights[0]
-        state_dict[key.replace("attn.in_proj_weight", "to_k.weight")] = weights[1]
-        state_dict[key.replace("attn.in_proj_weight", "to_v.weight")] = weights[2]
-    elif key.endswith("in_proj_bias"):
-        weights = orig_state_dict[key].chunk(3, 0)
-        state_dict[key.replace("attn.in_proj_bias", "to_q.bias")] = weights[0]
-        state_dict[key.replace("attn.in_proj_bias", "to_k.bias")] = weights[1]
-        state_dict[key.replace("attn.in_proj_bias", "to_v.bias")] = weights[2]
-    elif key.endswith("out_proj.weight"):
-        weights = orig_state_dict[key]
-        state_dict[key.replace("attn.out_proj.weight", "to_out.0.weight")] = weights
-    elif key.endswith("out_proj.bias"):
-        weights = orig_state_dict[key]
-        state_dict[key.replace("attn.out_proj.bias", "to_out.0.bias")] = weights
-    # rename clip_mapper to clip_txt_pooled_mapper
-    elif key.endswith("clip_mapper.weight"):
-        weights = orig_state_dict[key]
-        state_dict[key.replace("clip_mapper.weight", "clip_txt_pooled_mapper.weight")] = weights
-    elif key.endswith("clip_mapper.bias"):
-        weights = orig_state_dict[key]
-        state_dict[key.replace("clip_mapper.bias", "clip_txt_pooled_mapper.bias")] = weights
-    else:
-        state_dict[key] = orig_state_dict[key]
-
-with accelerate.init_empty_weights():
-    decoder = StableCascadeUNet(
-        in_channels=4,
-        out_channels=4,
-        timestep_ratio_embedding_dim=64,
-        patch_size=2,
-        conditioning_dim=1280,
-        block_out_channels=[320, 640, 1280, 1280],
-        down_num_layers_per_block=[2, 6, 28, 6],
-        up_num_layers_per_block=[6, 28, 6, 2],
-        down_blocks_repeat_mappers=[1, 1, 1, 1],
-        up_blocks_repeat_mappers=[3, 3, 2, 2],
-        num_attention_heads=[0, 0, 20, 20],
-        block_types_per_layer=[
-            ["SDCascadeResBlock", "SDCascadeTimestepBlock"],
-            ["SDCascadeResBlock", "SDCascadeTimestepBlock"],
-            ["SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"],
-            ["SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"],
-        ],
-        clip_text_pooled_in_channels=1280,
-        clip_seq=4,
-        effnet_in_channels=16,
-        pixel_mapper_in_channels=3,
-        kernel_size=3,
-        dropout=[0, 0, 0.1, 0.1],
-        self_attn=True,
-        timestep_conditioning_type=["sca"],
-    )
-load_model_dict_into_meta(decoder, state_dict)
-
-# VQGAN from Wuerstchen-V2
-vqmodel = PaellaVQModel.from_pretrained("warp-ai/wuerstchen", subfolder="vqgan")
-
-# Decoder pipeline
-decoder_pipeline = StableCascadeDecoderPipeline(
-    decoder=decoder, text_encoder=text_encoder, tokenizer=tokenizer, vqgan=vqmodel, scheduler=scheduler
-)
-decoder_pipeline.save_pretrained(f"{args.save_org}/StableCascade-decoder", push_to_hub=args.push_to_hub)
-
-# Stable Cascade combined pipeline
-stable_cascade_pipeline = StableCascadeCombinedPipeline(
-    # Decoder
-    text_encoder=text_encoder,
-    tokenizer=tokenizer,
-    decoder=decoder,
-    scheduler=scheduler,
-    vqgan=vqmodel,
-    # Prior
-    prior_text_encoder=text_encoder,
-    prior_tokenizer=tokenizer,
-    prior_prior=prior_model,
-    prior_scheduler=scheduler,
-    prior_image_encoder=image_encoder,
-    prior_feature_extractor=feature_extractor,
-)
-stable_cascade_pipeline.save_pretrained(f"{args.save_org}/StableCascade", push_to_hub=args.push_to_hub)
@@ -86,7 +86,6 @@ else:
            "MotionAdapter",
            "MultiAdapter",
            "PriorTransformer",
-            "StableCascadeUNet",
            "T2IAdapter",
            "T5FilmDecoder",
            "Transformer2DModel",
@@ -161,7 +160,6 @@ else:
            "SASolverScheduler",
            "SchedulerMixin",
            "ScoreSdeVeScheduler",
-            "TCDScheduler",
            "UnCLIPScheduler",
            "UniPCMultistepScheduler",
            "VQDiffusionScheduler",
@@ -260,9 +258,6 @@ else:
            "SemanticStableDiffusionPipeline",
            "ShapEImg2ImgPipeline",
            "ShapEPipeline",
-            "StableCascadeCombinedPipeline",
-            "StableCascadeDecoderPipeline",
-            "StableCascadePriorPipeline",
            "StableDiffusionAdapterPipeline",
            "StableDiffusionAttendAndExcitePipeline",
            "StableDiffusionControlNetImg2ImgPipeline",
@@ -550,7 +545,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            SASolverScheduler,
            SchedulerMixin,
            ScoreSdeVeScheduler,
-            TCDScheduler,
            UnCLIPScheduler,
            UniPCMultistepScheduler,
            VQDiffusionScheduler,
@@ -630,9 +624,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            SemanticStableDiffusionPipeline,
            ShapEImg2ImgPipeline,
            ShapEPipeline,
-            StableCascadeCombinedPipeline,
-            StableCascadeDecoderPipeline,
-            StableCascadePriorPipeline,
            StableDiffusionAdapterPipeline,
            StableDiffusionAttendAndExcitePipeline,
            StableDiffusionControlNetImg2ImgPipeline,
@@ -127,7 +127,7 @@ class ConfigMixin:
        """The only reason we overwrite `getattr` here is to gracefully deprecate accessing
        config attributes directly. See https://github.com/huggingface/diffusers/pull/3129

-        This function is mostly copied from PyTorch's __getattr__ overwrite:
+        This function is mostly copied from PyTorch's __getattr__ overwrite:
        https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module
        """

@@ -259,10 +259,6 @@ class ConfigMixin:
        model = cls(**init_dict)

        # make sure to also save config parameters that might be used for compatible classes
-        # update _class_name
-        if "_class_name" in hidden_dict:
-            hidden_dict["_class_name"] = cls.__name__
-
        model.register_to_config(**hidden_dict)

        # add hidden kwargs of compatible classes to unused_kwargs
@@ -533,7 +529,7 @@ class ConfigMixin:
                f"{cls.config_name} configuration file."
            )

-        # 5. Give nice info if config attributes are initialized to default because they have not been passed
+        # 5. Give nice info if config attributes are initialized to default because they have not been passed
        passed_keys = set(init_dict.keys())
        if len(expected_keys - passed_keys) > 0:
            logger.info(
@@ -332,7 +332,7 @@ class VaeImageProcessor(ConfigMixin):
        image: Union[PIL.Image.Image, np.ndarray, torch.Tensor],
        height: int,
        width: int,
-        resize_mode: str = "default",  # "default", "fill", "crop"
+        resize_mode: str = "default",  # "default", "fill", "crop"
    ) -> Union[PIL.Image.Image, np.ndarray, torch.Tensor]:
        """
        Resize image.
@@ -448,7 +448,7 @@ class VaeImageProcessor(ConfigMixin):
        image: PipelineImageInput,
        height: Optional[int] = None,
        width: Optional[int] = None,
-        resize_mode: str = "default",  # "default", "fill", "crop"
+        resize_mode: str = "default",  # "default", "fill", "crop"
        crops_coords: Optional[Tuple[int, int, int, int]] = None,
    ) -> torch.Tensor:
        """
@@ -479,7 +479,7 @@ class VaeImageProcessor(ConfigMixin):
            if isinstance(image, torch.Tensor):
                # if image is a pytorch tensor could have 2 possible shapes:
                #    1. batch x height x width: we should insert the channel dimension at position 1
-                #    2. channel x height x width: we should insert batch dimension at position 0,
+                #    2. channel x height x width: we should insert batch dimension at position 0,
                #       however, since both channel and batch dimension has same size 1, it is same to insert at position 1
                #    for simplicity, we insert a dimension of size 1 at position 1 for both cases
                image = image.unsqueeze(1)
@@ -63,20 +63,13 @@ def build_sub_model_components(
            num_in_channels=num_in_channels,
            image_size=image_size,
            torch_dtype=torch_dtype,
-            model_type=model_type,
        )
        return unet_components

    if component_name == "vae":
        scaling_factor = kwargs.get("scaling_factor", None)
        vae_components = create_diffusers_vae_model_from_ldm(
-            pipeline_class_name,
-            original_config,
-            checkpoint,
-            image_size,
-            scaling_factor,
-            torch_dtype,
-            model_type=model_type,
+            pipeline_class_name, original_config, checkpoint, image_size, scaling_factor, torch_dtype
        )
        return vae_components

@@ -131,12 +124,11 @@ def build_sub_model_components(
 def set_additional_components(
    pipeline_class_name,
    original_config,
-    checkpoint=None,
    model_type=None,
 ):
    components = {}
    if pipeline_class_name in REFINER_PIPELINES:
-        model_type = infer_model_type(original_config, checkpoint=checkpoint, model_type=model_type)
+        model_type = infer_model_type(original_config, model_type=model_type)
        is_refiner = model_type == "SDXL-Refiner"
        components.update(
            {
@@ -28,7 +28,6 @@ from ..schedulers import (
    DDIMScheduler,
    DDPMScheduler,
    DPMSolverMultistepScheduler,
-    EDMDPMSolverMultistepScheduler,
    EulerAncestralDiscreteScheduler,
    EulerDiscreteScheduler,
    HeunDiscreteScheduler,
@@ -176,7 +175,6 @@ DIFFUSERS_TO_LDM_MAPPING = {

 LDM_VAE_KEY = "first_stage_model."
 LDM_VAE_DEFAULT_SCALING_FACTOR = 0.18215
-PLAYGROUND_VAE_SCALING_FACTOR = 0.5
 LDM_UNET_KEY = "model.diffusion_model."
 LDM_CONTROLNET_KEY = "control_model."
 LDM_CLIP_PREFIX_TO_REMOVE = ["cond_stage_model.transformer.", "conditioner.embedders.0.transformer."]
@@ -307,7 +305,7 @@ def fetch_original_config(pipeline_class_name, checkpoint, original_config_file=
    return original_config


-def infer_model_type(original_config, checkpoint=None, model_type=None):
+def infer_model_type(original_config, model_type=None):
    if model_type is not None:
        return model_type

@@ -325,9 +323,7 @@ def infer_model_type(original_config, checkpoint=None, model_type=None):

    elif has_network_config:
        context_dim = original_config["model"]["params"]["network_config"]["params"]["context_dim"]
-        if "edm_mean" in checkpoint and "edm_std" in checkpoint:
-            model_type = "Playground"
-        elif context_dim == 2048:
+        if context_dim == 2048:
            model_type = "SDXL"
        else:
            model_type = "SDXL-Refiner"
@@ -348,13 +344,13 @@ def set_image_size(pipeline_class_name, original_config, checkpoint, image_size=
        return image_size

    global_step = checkpoint["global_step"] if "global_step" in checkpoint else None
-    model_type = infer_model_type(original_config, checkpoint, model_type)
+    model_type = infer_model_type(original_config, model_type)

    if pipeline_class_name == "StableDiffusionUpscalePipeline":
        image_size = original_config["model"]["params"]["unet_config"]["params"]["image_size"]
        return image_size

-    elif model_type in ["SDXL", "SDXL-Refiner", "Playground"]:
+    elif model_type in ["SDXL", "SDXL-Refiner"]:
        image_size = 1024
        return image_size

@@ -510,14 +506,12 @@ def create_controlnet_diffusers_config(original_config, image_size: int):
    return controlnet_config


-def create_vae_diffusers_config(original_config, image_size, scaling_factor=None, latents_mean=None, latents_std=None):
+def create_vae_diffusers_config(original_config, image_size, scaling_factor=None):
    """
    Creates a config for the diffusers based on the config of the LDM model.
    """
    vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"]
-    if (scaling_factor is None) and (latents_mean is not None) and (latents_std is not None):
-        scaling_factor = PLAYGROUND_VAE_SCALING_FACTOR
-    elif (scaling_factor is None) and ("scale_factor" in original_config["model"]["params"]):
+    if scaling_factor is None and "scale_factor" in original_config["model"]["params"]:
        scaling_factor = original_config["model"]["params"]["scale_factor"]
    elif scaling_factor is None:
        scaling_factor = LDM_VAE_DEFAULT_SCALING_FACTOR
@@ -537,8 +531,6 @@ def create_vae_diffusers_config(original_config, image_size, scaling_factor=None
        "layers_per_block": vae_params["num_res_blocks"],
        "scaling_factor": scaling_factor,
    }
-    if latents_mean is not None and latents_std is not None:
-        config.update({"latents_mean": latents_mean, "latents_std": latents_std})

    return config

@@ -1180,7 +1172,6 @@ def create_diffusers_unet_model_from_ldm(
    extract_ema=False,
    image_size=None,
    torch_dtype=None,
-    model_type=None,
 ):
    from ..models import UNet2DConditionModel

@@ -1199,9 +1190,7 @@ def create_diffusers_unet_model_from_ldm(
        else:
            num_in_channels = 4

-    image_size = set_image_size(
-        pipeline_class_name, original_config, checkpoint, image_size=image_size, model_type=model_type
-    )
+    image_size = set_image_size(pipeline_class_name, original_config, checkpoint, image_size=image_size)
    unet_config = create_unet_diffusers_config(original_config, image_size=image_size)
    unet_config["in_channels"] = num_in_channels
    unet_config["upcast_attention"] = upcast_attention
@@ -1234,40 +1223,14 @@ def create_diffusers_unet_model_from_ldm(


 def create_diffusers_vae_model_from_ldm(
-    pipeline_class_name,
-    original_config,
-    checkpoint,
-    image_size=None,
-    scaling_factor=None,
-    torch_dtype=None,
-    model_type=None,
+    pipeline_class_name, original_config, checkpoint, image_size=None, scaling_factor=None, torch_dtype=None
 ):
    # import here to avoid circular imports
    from ..models import AutoencoderKL

-    image_size = set_image_size(
-        pipeline_class_name, original_config, checkpoint, image_size=image_size, model_type=model_type
-    )
-    model_type = infer_model_type(original_config, checkpoint, model_type)
+    image_size = set_image_size(pipeline_class_name, original_config, checkpoint, image_size=image_size)

-    if model_type == "Playground":
-        edm_mean = (
-            checkpoint["edm_mean"].to(dtype=torch_dtype).tolist() if torch_dtype else checkpoint["edm_mean"].tolist()
-        )
-        edm_std = (
-            checkpoint["edm_std"].to(dtype=torch_dtype).tolist() if torch_dtype else checkpoint["edm_std"].tolist()
-        )
-    else:
-        edm_mean = None
-        edm_std = None
-
-    vae_config = create_vae_diffusers_config(
-        original_config,
-        image_size=image_size,
-        scaling_factor=scaling_factor,
-        latents_mean=edm_mean,
-        latents_std=edm_std,
-    )
+    vae_config = create_vae_diffusers_config(original_config, image_size=image_size, scaling_factor=scaling_factor)
    diffusers_format_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
    ctx = init_empty_weights if is_accelerate_available() else nullcontext

@@ -1302,7 +1265,7 @@ def create_text_encoders_and_tokenizers_from_ldm(
    local_files_only=False,
    torch_dtype=None,
 ):
-    model_type = infer_model_type(original_config, checkpoint=checkpoint, model_type=model_type)
+    model_type = infer_model_type(original_config, model_type=model_type)

    if model_type == "FrozenOpenCLIPEmbedder":
        config_name = "stabilityai/stable-diffusion-2"
@@ -1369,7 +1332,7 @@ def create_text_encoders_and_tokenizers_from_ldm(
                "text_encoder_2": text_encoder_2,
            }

-    elif model_type in ["SDXL", "Playground"]:
+    elif model_type == "SDXL":
        try:
            config_name = "openai/clip-vit-large-patch14"
            tokenizer = CLIPTokenizer.from_pretrained(config_name, local_files_only=local_files_only)
@@ -1420,7 +1383,7 @@ def create_scheduler_from_ldm(
    model_type=None,
 ):
    scheduler_config = get_default_scheduler_config()
-    model_type = infer_model_type(original_config, checkpoint=checkpoint, model_type=model_type)
+    model_type = infer_model_type(original_config, model_type=model_type)

    global_step = checkpoint["global_step"] if "global_step" in checkpoint else None

@@ -1443,8 +1406,7 @@ def create_scheduler_from_ldm(

    if model_type in ["SDXL", "SDXL-Refiner"]:
        scheduler_type = "euler"
-    elif model_type == "Playground":
-        scheduler_type = "edm_dpm_solver_multistep"
+
    else:
        beta_start = original_config["model"]["params"].get("linear_start", 0.02)
        beta_end = original_config["model"]["params"].get("linear_end", 0.085)
@@ -1476,26 +1438,6 @@ def create_scheduler_from_ldm(
    elif scheduler_type == "ddim":
        scheduler = DDIMScheduler.from_config(scheduler_config)

-    elif scheduler_type == "edm_dpm_solver_multistep":
-        scheduler_config = {
-            "algorithm_type": "dpmsolver++",
-            "dynamic_thresholding_ratio": 0.995,
-            "euler_at_final": False,
-            "final_sigmas_type": "zero",
-            "lower_order_final": True,
-            "num_train_timesteps": 1000,
-            "prediction_type": "epsilon",
-            "rho": 7.0,
-            "sample_max_value": 1.0,
-            "sigma_data": 0.5,
-            "sigma_max": 80.0,
-            "sigma_min": 0.002,
-            "solver_order": 2,
-            "solver_type": "midpoint",
-            "thresholding": False,
-        }
-        scheduler = EDMDPMSolverMultistepScheduler(**scheduler_config)
-
    else:
        raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")

@@ -47,7 +47,6 @@ if is_torch_available():
    _import_structure["unets.unet_kandinsky3"] = ["Kandinsky3UNet"]
    _import_structure["unets.unet_motion_model"] = ["MotionAdapter", "UNetMotionModel"]
    _import_structure["unets.unet_spatio_temporal_condition"] = ["UNetSpatioTemporalConditionModel"]
-    _import_structure["unets.unet_stable_cascade"] = ["StableCascadeUNet"]
    _import_structure["unets.uvit_2d"] = ["UVit2DModel"]
    _import_structure["vq_model"] = ["VQModel"]

@@ -81,7 +80,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            I2VGenXLUNet,
            Kandinsky3UNet,
            MotionAdapter,
-            StableCascadeUNet,
            UNet1DModel,
            UNet2DConditionModel,
            UNet2DModel,
@@ -440,6 +440,7 @@ class TemporalBasicTransformerBlock(nn.Module):

        # Define 3 blocks. Each block has its own normalization layer.
        # 1. Self-Attn
+        self.norm_in = nn.LayerNorm(dim)
        self.ff_in = FeedForward(
            dim,
            dim_out=time_mix_inner_dim,
@@ -97,7 +97,6 @@ class Transformer2DModel(ModelMixin, ConfigMixin):
        norm_eps: float = 1e-5,
        attention_type: str = "default",
        caption_channels: int = None,
-        interpolation_scale: float = None,
    ):
        super().__init__()
        self.use_linear_projection = use_linear_projection
@@ -169,9 +168,8 @@ class Transformer2DModel(ModelMixin, ConfigMixin):
            self.width = sample_size

            self.patch_size = patch_size
-            interpolation_scale = (
-                interpolation_scale if interpolation_scale is not None else max(self.config.sample_size // 64, 1)
-            )
+            interpolation_scale = self.config.sample_size // 64  # => 64 (= 512 pixart) has interpolation scale 1
+            interpolation_scale = max(interpolation_scale, 1)
            self.pos_embed = PatchEmbed(
                height=sample_size,
                width=sample_size,
@@ -10,7 +10,6 @@ if is_torch_available():
    from .unet_kandinsky3 import Kandinsky3UNet
    from .unet_motion_model import MotionAdapter, UNetMotionModel
    from .unet_spatio_temporal_condition import UNetSpatioTemporalConditionModel
-    from .unet_stable_cascade import StableCascadeUNet
    from .uvit_2d import UVit2DModel


@@ -99,9 +99,9 @@ class UNet2DModel(ModelMixin, ConfigMixin):
        time_embedding_type: str = "positional",
        freq_shift: int = 0,
        flip_sin_to_cos: bool = True,
-        down_block_types: Tuple[str, ...] = ("DownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D"),
-        up_block_types: Tuple[str, ...] = ("AttnUpBlock2D", "AttnUpBlock2D", "AttnUpBlock2D", "UpBlock2D"),
-        block_out_channels: Tuple[int, ...] = (224, 448, 672, 896),
+        down_block_types: Tuple[str] = ("DownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D"),
+        up_block_types: Tuple[str] = ("AttnUpBlock2D", "AttnUpBlock2D", "AttnUpBlock2D", "UpBlock2D"),
+        block_out_channels: Tuple[int] = (224, 448, 672, 896),
        layers_per_block: int = 2,
        mid_block_scale_factor: float = 1,
        downsample_padding: int = 1,
@@ -1,609 +0,0 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import math
-from dataclasses import dataclass
-from typing import Optional, Tuple, Union
-
-import numpy as np
-import torch
-import torch.nn as nn
-
-from ...configuration_utils import ConfigMixin, register_to_config
-from ...utils import BaseOutput
-from ..attention_processor import Attention
-from ..modeling_utils import ModelMixin
-
-
-# Copied from diffusers.pipelines.wuerstchen.modeling_wuerstchen_common.WuerstchenLayerNorm with WuerstchenLayerNorm -> SDCascadeLayerNorm
-class SDCascadeLayerNorm(nn.LayerNorm):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-    def forward(self, x):
-        x = x.permute(0, 2, 3, 1)
-        x = super().forward(x)
-        return x.permute(0, 3, 1, 2)
-
-
-class SDCascadeTimestepBlock(nn.Module):
-    def __init__(self, c, c_timestep, conds=[]):
-        super().__init__()
-        linear_cls = nn.Linear
-        self.mapper = linear_cls(c_timestep, c * 2)
-        self.conds = conds
-        for cname in conds:
-            setattr(self, f"mapper_{cname}", linear_cls(c_timestep, c * 2))
-
-    def forward(self, x, t):
-        t = t.chunk(len(self.conds) + 1, dim=1)
-        a, b = self.mapper(t[0])[:, :, None, None].chunk(2, dim=1)
-        for i, c in enumerate(self.conds):
-            ac, bc = getattr(self, f"mapper_{c}")(t[i + 1])[:, :, None, None].chunk(2, dim=1)
-            a, b = a + ac, b + bc
-        return x * (1 + a) + b
-
-
-class SDCascadeResBlock(nn.Module):
-    def __init__(self, c, c_skip=0, kernel_size=3, dropout=0.0):
-        super().__init__()
-        self.depthwise = nn.Conv2d(c, c, kernel_size=kernel_size, padding=kernel_size // 2, groups=c)
-        self.norm = SDCascadeLayerNorm(c, elementwise_affine=False, eps=1e-6)
-        self.channelwise = nn.Sequential(
-            nn.Linear(c + c_skip, c * 4),
-            nn.GELU(),
-            GlobalResponseNorm(c * 4),
-            nn.Dropout(dropout),
-            nn.Linear(c * 4, c),
-        )
-
-    def forward(self, x, x_skip=None):
-        x_res = x
-        x = self.norm(self.depthwise(x))
-        if x_skip is not None:
-            x = torch.cat([x, x_skip], dim=1)
-        x = self.channelwise(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
-        return x + x_res
-
-
-# from https://github.com/facebookresearch/ConvNeXt-V2/blob/3608f67cc1dae164790c5d0aead7bf2d73d9719b/models/utils.py#L105
-class GlobalResponseNorm(nn.Module):
-    def __init__(self, dim):
-        super().__init__()
-        self.gamma = nn.Parameter(torch.zeros(1, 1, 1, dim))
-        self.beta = nn.Parameter(torch.zeros(1, 1, 1, dim))
-
-    def forward(self, x):
-        agg_norm = torch.norm(x, p=2, dim=(1, 2), keepdim=True)
-        stand_div_norm = agg_norm / (agg_norm.mean(dim=-1, keepdim=True) + 1e-6)
-        return self.gamma * (x * stand_div_norm) + self.beta + x
-
-
-class SDCascadeAttnBlock(nn.Module):
-    def __init__(self, c, c_cond, nhead, self_attn=True, dropout=0.0):
-        super().__init__()
-        linear_cls = nn.Linear
-
-        self.self_attn = self_attn
-        self.norm = SDCascadeLayerNorm(c, elementwise_affine=False, eps=1e-6)
-        self.attention = Attention(query_dim=c, heads=nhead, dim_head=c // nhead, dropout=dropout, bias=True)
-        self.kv_mapper = nn.Sequential(nn.SiLU(), linear_cls(c_cond, c))
-
-    def forward(self, x, kv):
-        kv = self.kv_mapper(kv)
-        norm_x = self.norm(x)
-        if self.self_attn:
-            batch_size, channel, _, _ = x.shape
-            kv = torch.cat([norm_x.view(batch_size, channel, -1).transpose(1, 2), kv], dim=1)
-        x = x + self.attention(norm_x, encoder_hidden_states=kv)
-        return x
-
-
-class UpDownBlock2d(nn.Module):
-    def __init__(self, in_channels, out_channels, mode, enabled=True):
-        super().__init__()
-        if mode not in ["up", "down"]:
-            raise ValueError(f"{mode} not supported")
-        interpolation = (
-            nn.Upsample(scale_factor=2 if mode == "up" else 0.5, mode="bilinear", align_corners=True)
-            if enabled
-            else nn.Identity()
-        )
-        mapping = nn.Conv2d(in_channels, out_channels, kernel_size=1)
-        self.blocks = nn.ModuleList([interpolation, mapping] if mode == "up" else [mapping, interpolation])
-
-    def forward(self, x):
-        for block in self.blocks:
-            x = block(x)
-        return x
-
-
-@dataclass
-class StableCascadeUNetOutput(BaseOutput):
-    sample: torch.FloatTensor = None
-
-
-class StableCascadeUNet(ModelMixin, ConfigMixin):
-    _supports_gradient_checkpointing = True
-
-    @register_to_config
-    def __init__(
-        self,
-        in_channels: int = 16,
-        out_channels: int = 16,
-        timestep_ratio_embedding_dim: int = 64,
-        patch_size: int = 1,
-        conditioning_dim: int = 2048,
-        block_out_channels: Tuple[int] = (2048, 2048),
-        num_attention_heads: Tuple[int] = (32, 32),
-        down_num_layers_per_block: Tuple[int] = (8, 24),
-        up_num_layers_per_block: Tuple[int] = (24, 8),
-        down_blocks_repeat_mappers: Optional[Tuple[int]] = (
-            1,
-            1,
-        ),
-        up_blocks_repeat_mappers: Optional[Tuple[int]] = (1, 1),
-        block_types_per_layer: Tuple[Tuple[str]] = (
-            ("SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"),
-            ("SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"),
-        ),
-        clip_text_in_channels: Optional[int] = None,
-        clip_text_pooled_in_channels=1280,
-        clip_image_in_channels: Optional[int] = None,
-        clip_seq=4,
-        effnet_in_channels: Optional[int] = None,
-        pixel_mapper_in_channels: Optional[int] = None,
-        kernel_size=3,
-        dropout: Union[float, Tuple[float]] = (0.1, 0.1),
-        self_attn: Union[bool, Tuple[bool]] = True,
-        timestep_conditioning_type: Tuple[str] = ("sca", "crp"),
-        switch_level: Optional[Tuple[bool]] = None,
-    ):
-        """
-
-        Parameters:
-            in_channels (`int`, defaults to 16):
-                Number of channels in the input sample.
-            out_channels (`int`, defaults to 16):
-                Number of channels in the output sample.
-            timestep_ratio_embedding_dim (`int`, defaults to 64):
-                Dimension of the projected time embedding.
-            patch_size (`int`, defaults to 1):
-                Patch size to use for pixel unshuffling layer
-            conditioning_dim (`int`, defaults to 2048):
-                Dimension of the image and text conditional embedding.
-            block_out_channels (Tuple[int], defaults to (2048, 2048)):
-                Tuple of output channels for each block.
-            num_attention_heads (Tuple[int], defaults to (32, 32)):
-                Number of attention heads in each attention block. Set to -1 to if block types in a layer do not have attention.
-            down_num_layers_per_block (Tuple[int], defaults to [8, 24]):
-                Number of layers in each down block.
-            up_num_layers_per_block (Tuple[int], defaults to [24, 8]):
-                Number of layers in each up block.
-            down_blocks_repeat_mappers (Tuple[int], optional, defaults to [1, 1]):
-                Number of 1x1 Convolutional layers to repeat in each down block.
-            up_blocks_repeat_mappers (Tuple[int], optional, defaults to [1, 1]):
-                Number of 1x1 Convolutional layers to repeat in each up block.
-            block_types_per_layer (Tuple[Tuple[str]], optional,
-                defaults to (
-                    ("SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"),
-                    ("SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock")
-                ):
-                Block types used in each layer of the up/down blocks.
-            clip_text_in_channels (`int`, *optional*, defaults to `None`):
-                Number of input channels for CLIP based text conditioning.
-            clip_text_pooled_in_channels (`int`, *optional*, defaults to 1280):
-                Number of input channels for pooled CLIP text embeddings.
-            clip_image_in_channels (`int`, *optional*):
-                Number of input channels for CLIP based image conditioning.
-            clip_seq (`int`, *optional*, defaults to 4):
-            effnet_in_channels (`int`, *optional*, defaults to `None`):
-                Number of input channels for effnet conditioning.
-            pixel_mapper_in_channels (`int`, defaults to `None`):
-                Number of input channels for pixel mapper conditioning.
-            kernel_size (`int`, *optional*, defaults to 3):
-                Kernel size to use in the block convolutional layers.
-            dropout (Tuple[float], *optional*, defaults to (0.1, 0.1)):
-                Dropout to use per block.
-            self_attn (Union[bool, Tuple[bool]]):
-                Tuple of booleans that determine whether to use self attention in a block or not.
-            timestep_conditioning_type (Tuple[str], defaults to ("sca", "crp")):
-                Timestep conditioning type.
-            switch_level (Optional[Tuple[bool]], *optional*, defaults to `None`):
-                Tuple that indicates whether upsampling or downsampling should be applied in a block
-        """
-
-        super().__init__()
-
-        if len(block_out_channels) != len(down_num_layers_per_block):
-            raise ValueError(
-                f"Number of elements in `down_num_layers_per_block` must match the length of `block_out_channels`: {len(block_out_channels)}"
-            )
-
-        elif len(block_out_channels) != len(up_num_layers_per_block):
-            raise ValueError(
-                f"Number of elements in `up_num_layers_per_block` must match the length of `block_out_channels`: {len(block_out_channels)}"
-            )
-
-        elif len(block_out_channels) != len(down_blocks_repeat_mappers):
-            raise ValueError(
-                f"Number of elements in `down_blocks_repeat_mappers` must match the length of `block_out_channels`: {len(block_out_channels)}"
-            )
-
-        elif len(block_out_channels) != len(up_blocks_repeat_mappers):
-            raise ValueError(
-                f"Number of elements in `up_blocks_repeat_mappers` must match the length of `block_out_channels`: {len(block_out_channels)}"
-            )
-
-        elif len(block_out_channels) != len(block_types_per_layer):
-            raise ValueError(
-                f"Number of elements in `block_types_per_layer` must match the length of `block_out_channels`: {len(block_out_channels)}"
-            )
-
-        if isinstance(dropout, float):
-            dropout = (dropout,) * len(block_out_channels)
-        if isinstance(self_attn, bool):
-            self_attn = (self_attn,) * len(block_out_channels)
-
-        # CONDITIONING
-        if effnet_in_channels is not None:
-            self.effnet_mapper = nn.Sequential(
-                nn.Conv2d(effnet_in_channels, block_out_channels[0] * 4, kernel_size=1),
-                nn.GELU(),
-                nn.Conv2d(block_out_channels[0] * 4, block_out_channels[0], kernel_size=1),
-                SDCascadeLayerNorm(block_out_channels[0], elementwise_affine=False, eps=1e-6),
-            )
-        if pixel_mapper_in_channels is not None:
-            self.pixels_mapper = nn.Sequential(
-                nn.Conv2d(pixel_mapper_in_channels, block_out_channels[0] * 4, kernel_size=1),
-                nn.GELU(),
-                nn.Conv2d(block_out_channels[0] * 4, block_out_channels[0], kernel_size=1),
-                SDCascadeLayerNorm(block_out_channels[0], elementwise_affine=False, eps=1e-6),
-            )
-
-        self.clip_txt_pooled_mapper = nn.Linear(clip_text_pooled_in_channels, conditioning_dim * clip_seq)
-        if clip_text_in_channels is not None:
-            self.clip_txt_mapper = nn.Linear(clip_text_in_channels, conditioning_dim)
-        if clip_image_in_channels is not None:
-            self.clip_img_mapper = nn.Linear(clip_image_in_channels, conditioning_dim * clip_seq)
-        self.clip_norm = nn.LayerNorm(conditioning_dim, elementwise_affine=False, eps=1e-6)
-
-        self.embedding = nn.Sequential(
-            nn.PixelUnshuffle(patch_size),
-            nn.Conv2d(in_channels * (patch_size**2), block_out_channels[0], kernel_size=1),
-            SDCascadeLayerNorm(block_out_channels[0], elementwise_affine=False, eps=1e-6),
-        )
-
-        def get_block(block_type, in_channels, nhead, c_skip=0, dropout=0, self_attn=True):
-            if block_type == "SDCascadeResBlock":
-                return SDCascadeResBlock(in_channels, c_skip, kernel_size=kernel_size, dropout=dropout)
-            elif block_type == "SDCascadeAttnBlock":
-                return SDCascadeAttnBlock(in_channels, conditioning_dim, nhead, self_attn=self_attn, dropout=dropout)
-            elif block_type == "SDCascadeTimestepBlock":
-                return SDCascadeTimestepBlock(
-                    in_channels, timestep_ratio_embedding_dim, conds=timestep_conditioning_type
-                )
-            else:
-                raise ValueError(f"Block type {block_type} not supported")
-
-        # BLOCKS
-        # -- down blocks
-        self.down_blocks = nn.ModuleList()
-        self.down_downscalers = nn.ModuleList()
-        self.down_repeat_mappers = nn.ModuleList()
-        for i in range(len(block_out_channels)):
-            if i > 0:
-                self.down_downscalers.append(
-                    nn.Sequential(
-                        SDCascadeLayerNorm(block_out_channels[i - 1], elementwise_affine=False, eps=1e-6),
-                        UpDownBlock2d(
-                            block_out_channels[i - 1], block_out_channels[i], mode="down", enabled=switch_level[i - 1]
-                        )
-                        if switch_level is not None
-                        else nn.Conv2d(block_out_channels[i - 1], block_out_channels[i], kernel_size=2, stride=2),
-                    )
-                )
-            else:
-                self.down_downscalers.append(nn.Identity())
-
-            down_block = nn.ModuleList()
-            for _ in range(down_num_layers_per_block[i]):
-                for block_type in block_types_per_layer[i]:
-                    block = get_block(
-                        block_type,
-                        block_out_channels[i],
-                        num_attention_heads[i],
-                        dropout=dropout[i],
-                        self_attn=self_attn[i],
-                    )
-                    down_block.append(block)
-            self.down_blocks.append(down_block)
-
-            if down_blocks_repeat_mappers is not None:
-                block_repeat_mappers = nn.ModuleList()
-                for _ in range(down_blocks_repeat_mappers[i] - 1):
-                    block_repeat_mappers.append(nn.Conv2d(block_out_channels[i], block_out_channels[i], kernel_size=1))
-                self.down_repeat_mappers.append(block_repeat_mappers)
-
-        # -- up blocks
-        self.up_blocks = nn.ModuleList()
-        self.up_upscalers = nn.ModuleList()
-        self.up_repeat_mappers = nn.ModuleList()
-        for i in reversed(range(len(block_out_channels))):
-            if i > 0:
-                self.up_upscalers.append(
-                    nn.Sequential(
-                        SDCascadeLayerNorm(block_out_channels[i], elementwise_affine=False, eps=1e-6),
-                        UpDownBlock2d(
-                            block_out_channels[i], block_out_channels[i - 1], mode="up", enabled=switch_level[i - 1]
-                        )
-                        if switch_level is not None
-                        else nn.ConvTranspose2d(
-                            block_out_channels[i], block_out_channels[i - 1], kernel_size=2, stride=2
-                        ),
-                    )
-                )
-            else:
-                self.up_upscalers.append(nn.Identity())
-
-            up_block = nn.ModuleList()
-            for j in range(up_num_layers_per_block[::-1][i]):
-                for k, block_type in enumerate(block_types_per_layer[i]):
-                    c_skip = block_out_channels[i] if i < len(block_out_channels) - 1 and j == k == 0 else 0
-                    block = get_block(
-                        block_type,
-                        block_out_channels[i],
-                        num_attention_heads[i],
-                        c_skip=c_skip,
-                        dropout=dropout[i],
-                        self_attn=self_attn[i],
-                    )
-                    up_block.append(block)
-            self.up_blocks.append(up_block)
-
-            if up_blocks_repeat_mappers is not None:
-                block_repeat_mappers = nn.ModuleList()
-                for _ in range(up_blocks_repeat_mappers[::-1][i] - 1):
-                    block_repeat_mappers.append(nn.Conv2d(block_out_channels[i], block_out_channels[i], kernel_size=1))
-                self.up_repeat_mappers.append(block_repeat_mappers)
-
-        # OUTPUT
-        self.clf = nn.Sequential(
-            SDCascadeLayerNorm(block_out_channels[0], elementwise_affine=False, eps=1e-6),
-            nn.Conv2d(block_out_channels[0], out_channels * (patch_size**2), kernel_size=1),
-            nn.PixelShuffle(patch_size),
-        )
-
-        self.gradient_checkpointing = False
-
-    def _set_gradient_checkpointing(self, value=False):
-        self.gradient_checkpointing = value
-
-    def _init_weights(self, m):
-        if isinstance(m, (nn.Conv2d, nn.Linear)):
-            torch.nn.init.xavier_uniform_(m.weight)
-            if m.bias is not None:
-                nn.init.constant_(m.bias, 0)
-
-        nn.init.normal_(self.clip_txt_pooled_mapper.weight, std=0.02)
-        nn.init.normal_(self.clip_txt_mapper.weight, std=0.02) if hasattr(self, "clip_txt_mapper") else None
-        nn.init.normal_(self.clip_img_mapper.weight, std=0.02) if hasattr(self, "clip_img_mapper") else None
-
-        if hasattr(self, "effnet_mapper"):
-            nn.init.normal_(self.effnet_mapper[0].weight, std=0.02)  # conditionings
-            nn.init.normal_(self.effnet_mapper[2].weight, std=0.02)  # conditionings
-
-        if hasattr(self, "pixels_mapper"):
-            nn.init.normal_(self.pixels_mapper[0].weight, std=0.02)  # conditionings
-            nn.init.normal_(self.pixels_mapper[2].weight, std=0.02)  # conditionings
-
-        torch.nn.init.xavier_uniform_(self.embedding[1].weight, 0.02)  # inputs
-        nn.init.constant_(self.clf[1].weight, 0)  # outputs
-
-        # blocks
-        for level_block in self.down_blocks + self.up_blocks:
-            for block in level_block:
-                if isinstance(block, SDCascadeResBlock):
-                    block.channelwise[-1].weight.data *= np.sqrt(1 / sum(self.config.blocks[0]))
-                elif isinstance(block, SDCascadeTimestepBlock):
-                    nn.init.constant_(block.mapper.weight, 0)
-
-    def get_timestep_ratio_embedding(self, timestep_ratio, max_positions=10000):
-        r = timestep_ratio * max_positions
-        half_dim = self.config.timestep_ratio_embedding_dim // 2
-
-        emb = math.log(max_positions) / (half_dim - 1)
-        emb = torch.arange(half_dim, device=r.device).float().mul(-emb).exp()
-        emb = r[:, None] * emb[None, :]
-        emb = torch.cat([emb.sin(), emb.cos()], dim=1)
-
-        if self.config.timestep_ratio_embedding_dim % 2 == 1:  # zero pad
-            emb = nn.functional.pad(emb, (0, 1), mode="constant")
-
-        return emb.to(dtype=r.dtype)
-
-    def get_clip_embeddings(self, clip_txt_pooled, clip_txt=None, clip_img=None):
-        if len(clip_txt_pooled.shape) == 2:
-            clip_txt_pool = clip_txt_pooled.unsqueeze(1)
-        clip_txt_pool = self.clip_txt_pooled_mapper(clip_txt_pooled).view(
-            clip_txt_pooled.size(0), clip_txt_pooled.size(1) * self.config.clip_seq, -1
-        )
-        if clip_txt is not None and clip_img is not None:
-            clip_txt = self.clip_txt_mapper(clip_txt)
-            if len(clip_img.shape) == 2:
-                clip_img = clip_img.unsqueeze(1)
-            clip_img = self.clip_img_mapper(clip_img).view(
-                clip_img.size(0), clip_img.size(1) * self.config.clip_seq, -1
-            )
-            clip = torch.cat([clip_txt, clip_txt_pool, clip_img], dim=1)
-        else:
-            clip = clip_txt_pool
-        return self.clip_norm(clip)
-
-    def _down_encode(self, x, r_embed, clip):
-        level_outputs = []
-        block_group = zip(self.down_blocks, self.down_downscalers, self.down_repeat_mappers)
-
-        if self.training and self.gradient_checkpointing:
-
-            def create_custom_forward(module):
-                def custom_forward(*inputs):
-                    return module(*inputs)
-
-                return custom_forward
-
-            for down_block, downscaler, repmap in block_group:
-                x = downscaler(x)
-                for i in range(len(repmap) + 1):
-                    for block in down_block:
-                        if isinstance(block, SDCascadeResBlock):
-                            x = torch.utils.checkpoint.checkpoint(create_custom_forward(block), x, use_reentrant=False)
-                        elif isinstance(block, SDCascadeAttnBlock):
-                            x = torch.utils.checkpoint.checkpoint(
-                                create_custom_forward(block), x, clip, use_reentrant=False
-                            )
-                        elif isinstance(block, SDCascadeTimestepBlock):
-                            x = torch.utils.checkpoint.checkpoint(
-                                create_custom_forward(block), x, r_embed, use_reentrant=False
-                            )
-                        else:
-                            x = x = torch.utils.checkpoint.checkpoint(
-                                create_custom_forward(block), use_reentrant=False
-                            )
-                    if i < len(repmap):
-                        x = repmap[i](x)
-                level_outputs.insert(0, x)
-        else:
-            for down_block, downscaler, repmap in block_group:
-                x = downscaler(x)
-                for i in range(len(repmap) + 1):
-                    for block in down_block:
-                        if isinstance(block, SDCascadeResBlock):
-                            x = block(x)
-                        elif isinstance(block, SDCascadeAttnBlock):
-                            x = block(x, clip)
-                        elif isinstance(block, SDCascadeTimestepBlock):
-                            x = block(x, r_embed)
-                        else:
-                            x = block(x)
-                    if i < len(repmap):
-                        x = repmap[i](x)
-                level_outputs.insert(0, x)
-        return level_outputs
-
-    def _up_decode(self, level_outputs, r_embed, clip):
-        x = level_outputs[0]
-        block_group = zip(self.up_blocks, self.up_upscalers, self.up_repeat_mappers)
-
-        if self.training and self.gradient_checkpointing:
-
-            def create_custom_forward(module):
-                def custom_forward(*inputs):
-                    return module(*inputs)
-
-                return custom_forward
-
-            for i, (up_block, upscaler, repmap) in enumerate(block_group):
-                for j in range(len(repmap) + 1):
-                    for k, block in enumerate(up_block):
-                        if isinstance(block, SDCascadeResBlock):
-                            skip = level_outputs[i] if k == 0 and i > 0 else None
-                            if skip is not None and (x.size(-1) != skip.size(-1) or x.size(-2) != skip.size(-2)):
-                                x = torch.nn.functional.interpolate(
-                                    x.float(), skip.shape[-2:], mode="bilinear", align_corners=True
-                                )
-                            x = torch.utils.checkpoint.checkpoint(
-                                create_custom_forward(block), x, skip, use_reentrant=False
-                            )
-                        elif isinstance(block, SDCascadeAttnBlock):
-                            x = torch.utils.checkpoint.checkpoint(
-                                create_custom_forward(block), x, clip, use_reentrant=False
-                            )
-                        elif isinstance(block, SDCascadeTimestepBlock):
-                            x = torch.utils.checkpoint.checkpoint(
-                                create_custom_forward(block), x, r_embed, use_reentrant=False
-                            )
-                        else:
-                            x = torch.utils.checkpoint.checkpoint(create_custom_forward(block), x, use_reentrant=False)
-                    if j < len(repmap):
-                        x = repmap[j](x)
-                x = upscaler(x)
-        else:
-            for i, (up_block, upscaler, repmap) in enumerate(block_group):
-                for j in range(len(repmap) + 1):
-                    for k, block in enumerate(up_block):
-                        if isinstance(block, SDCascadeResBlock):
-                            skip = level_outputs[i] if k == 0 and i > 0 else None
-                            if skip is not None and (x.size(-1) != skip.size(-1) or x.size(-2) != skip.size(-2)):
-                                x = torch.nn.functional.interpolate(
-                                    x.float(), skip.shape[-2:], mode="bilinear", align_corners=True
-                                )
-                            x = block(x, skip)
-                        elif isinstance(block, SDCascadeAttnBlock):
-                            x = block(x, clip)
-                        elif isinstance(block, SDCascadeTimestepBlock):
-                            x = block(x, r_embed)
-                        else:
-                            x = block(x)
-                    if j < len(repmap):
-                        x = repmap[j](x)
-                x = upscaler(x)
-        return x
-
-    def forward(
-        self,
-        sample,
-        timestep_ratio,
-        clip_text_pooled,
-        clip_text=None,
-        clip_img=None,
-        effnet=None,
-        pixels=None,
-        sca=None,
-        crp=None,
-        return_dict=True,
-    ):
-        if pixels is None:
-            pixels = sample.new_zeros(sample.size(0), 3, 8, 8)
-
-        # Process the conditioning embeddings
-        timestep_ratio_embed = self.get_timestep_ratio_embedding(timestep_ratio)
-        for c in self.config.timestep_conditioning_type:
-            if c == "sca":
-                cond = sca
-            elif c == "crp":
-                cond = crp
-            else:
-                cond = None
-            t_cond = cond or torch.zeros_like(timestep_ratio)
-            timestep_ratio_embed = torch.cat([timestep_ratio_embed, self.get_timestep_ratio_embedding(t_cond)], dim=1)
-        clip = self.get_clip_embeddings(clip_txt_pooled=clip_text_pooled, clip_txt=clip_text, clip_img=clip_img)
-
-        # Model Blocks
-        x = self.embedding(sample)
-        if hasattr(self, "effnet_mapper") and effnet is not None:
-            x = x + self.effnet_mapper(
-                nn.functional.interpolate(effnet, size=x.shape[-2:], mode="bilinear", align_corners=True)
-            )
-        if hasattr(self, "pixels_mapper"):
-            x = x + nn.functional.interpolate(
-                self.pixels_mapper(pixels), size=x.shape[-2:], mode="bilinear", align_corners=True
-            )
-        level_outputs = self._down_encode(x, timestep_ratio_embed, clip)
-        x = self._up_decode(level_outputs, timestep_ratio_embed, clip)
-        sample = self.clf(x)
-
-        if not return_dict:
-            return (sample,)
-        return StableCascadeUNetOutput(sample=sample)
@@ -176,11 +176,6 @@ else:
    _import_structure["pixart_alpha"] = ["PixArtAlphaPipeline"]
    _import_structure["semantic_stable_diffusion"] = ["SemanticStableDiffusionPipeline"]
    _import_structure["shap_e"] = ["ShapEImg2ImgPipeline", "ShapEPipeline"]
-    _import_structure["stable_cascade"] = [
-        "StableCascadeCombinedPipeline",
-        "StableCascadeDecoderPipeline",
-        "StableCascadePriorPipeline",
-    ]
    _import_structure["stable_diffusion"].extend(
        [
            "CLIPImageProjection",
@@ -429,11 +424,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
        from .pixart_alpha import PixArtAlphaPipeline
        from .semantic_stable_diffusion import SemanticStableDiffusionPipeline
        from .shap_e import ShapEImg2ImgPipeline, ShapEPipeline
-        from .stable_cascade import (
-            StableCascadeCombinedPipeline,
-            StableCascadeDecoderPipeline,
-            StableCascadePriorPipeline,
-        )
        from .stable_diffusion import (
            CLIPImageProjection,
            StableDiffusionDepth2ImgPipeline,
@@ -400,22 +400,15 @@ class AnimateDiffPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -516,9 +509,9 @@ class AnimateDiffPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents
@@ -478,22 +478,15 @@ class AnimateDiffVideoToVideoPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -596,9 +589,9 @@ class AnimateDiffVideoToVideoPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    def get_timesteps(self, num_inference_steps, timesteps, strength, device):
@@ -343,7 +343,7 @@ class AutoPipelineForText2Image(ConfigMixin):
        pipeline linked to the pipeline class using pattern matching on pipeline class name.

        All the modules the pipeline contains will be used to initialize the new pipeline without reallocating
-        additional memory.
+        additional memoery.

        The pipeline is set in evaluation mode (`model.eval()`) by default.

@@ -616,7 +616,7 @@ class AutoPipelineForImage2Image(ConfigMixin):
        image-to-image pipeline linked to the pipeline class using pattern matching on pipeline class name.

        All the modules the pipeline contains will be used to initialize the new pipeline without reallocating
-        additional memory.
+        additional memoery.

        The pipeline is set in evaluation mode (`model.eval()`) by default.

@@ -892,7 +892,7 @@ class AutoPipelineForInpainting(ConfigMixin):
        pipeline linked to the pipeline class using pattern matching on pipeline class name.

        All the modules the pipeline class contain will be used to initialize the new pipeline without reallocating
-        additional memory.
+        additional memoery.

        The pipeline is set in evaluation mode (`model.eval()`) by default.

@@ -510,22 +510,15 @@ class StableDiffusionControlNetPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -733,9 +726,9 @@ class StableDiffusionControlNetPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    def check_image(self, image, prompt, prompt_embeds):
@@ -503,22 +503,15 @@ class StableDiffusionControlNetImg2ImgPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -720,9 +713,9 @@ class StableDiffusionControlNetImg2ImgPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image
@@ -628,22 +628,15 @@ class StableDiffusionControlNetInpaintPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -878,9 +871,9 @@ class StableDiffusionControlNetInpaintPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image
@@ -537,22 +537,15 @@ class StableDiffusionXLControlNetInpaintPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -824,9 +817,9 @@ class StableDiffusionXLControlNetInpaintPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    def prepare_control_image(
@@ -515,22 +515,15 @@ class StableDiffusionXLControlNetPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -737,9 +730,9 @@ class StableDiffusionXLControlNetPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image
@@ -567,22 +567,15 @@ class StableDiffusionXLControlNetImg2ImgPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -801,9 +794,9 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet_sd_xl.StableDiffusionXLControlNetPipeline.check_image
@@ -453,22 +453,15 @@ class LatentConsistencyModelImg2ImgPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -654,9 +647,9 @@ class LatentConsistencyModelImg2ImgPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    @property
@@ -437,22 +437,15 @@ class LatentConsistencyModelPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -586,9 +579,9 @@ class LatentConsistencyModelPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    @property
@@ -582,9 +582,9 @@ class PIAPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
@@ -619,22 +619,15 @@ class PIAPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -1,508 +0,0 @@
-# coding=utf-8
-# Copyright 2024 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import importlib
-import os
-import re
-import warnings
-from pathlib import Path
-from typing import Any, Dict, List, Optional, Union
-
-import torch
-from huggingface_hub import (
-    model_info,
-)
-from packaging import version
-
-from ..utils import (
-    SAFETENSORS_WEIGHTS_NAME,
-    WEIGHTS_NAME,
-    get_class_from_dynamic_module,
-    is_peft_available,
-    is_transformers_available,
-    logging,
-)
-from ..utils.torch_utils import is_compiled_module
-
-
-if is_transformers_available():
-    import transformers
-    from transformers import PreTrainedModel
-    from transformers.utils import FLAX_WEIGHTS_NAME as TRANSFORMERS_FLAX_WEIGHTS_NAME
-    from transformers.utils import SAFE_WEIGHTS_NAME as TRANSFORMERS_SAFE_WEIGHTS_NAME
-    from transformers.utils import WEIGHTS_NAME as TRANSFORMERS_WEIGHTS_NAME
-from huggingface_hub.utils import validate_hf_hub_args
-
-from ..utils import FLAX_WEIGHTS_NAME, ONNX_EXTERNAL_WEIGHTS_NAME, ONNX_WEIGHTS_NAME
-
-
-INDEX_FILE = "diffusion_pytorch_model.bin"
-CUSTOM_PIPELINE_FILE_NAME = "pipeline.py"
-DUMMY_MODULES_FOLDER = "diffusers.utils"
-TRANSFORMERS_DUMMY_MODULES_FOLDER = "transformers.utils"
-CONNECTED_PIPES_KEYS = ["prior"]
-
-logger = logging.get_logger(__name__)
-
-LOADABLE_CLASSES = {
-    "diffusers": {
-        "ModelMixin": ["save_pretrained", "from_pretrained"],
-        "SchedulerMixin": ["save_pretrained", "from_pretrained"],
-        "DiffusionPipeline": ["save_pretrained", "from_pretrained"],
-        "OnnxRuntimeModel": ["save_pretrained", "from_pretrained"],
-    },
-    "transformers": {
-        "PreTrainedTokenizer": ["save_pretrained", "from_pretrained"],
-        "PreTrainedTokenizerFast": ["save_pretrained", "from_pretrained"],
-        "PreTrainedModel": ["save_pretrained", "from_pretrained"],
-        "FeatureExtractionMixin": ["save_pretrained", "from_pretrained"],
-        "ProcessorMixin": ["save_pretrained", "from_pretrained"],
-        "ImageProcessingMixin": ["save_pretrained", "from_pretrained"],
-    },
-    "onnxruntime.training": {
-        "ORTModule": ["save_pretrained", "from_pretrained"],
-    },
-}
-
-ALL_IMPORTABLE_CLASSES = {}
-for library in LOADABLE_CLASSES:
-    ALL_IMPORTABLE_CLASSES.update(LOADABLE_CLASSES[library])
-
-
-def is_safetensors_compatible(filenames, variant=None, passed_components=None) -> bool:
-    """
-    Checking for safetensors compatibility:
-    - By default, all models are saved with the default pytorch serialization, so we use the list of default pytorch
-      files to know which safetensors files are needed.
-    - The model is safetensors compatible only if there is a matching safetensors file for every default pytorch file.
-
-    Converting default pytorch serialized filenames to safetensors serialized filenames:
-    - For models from the diffusers library, just replace the ".bin" extension with ".safetensors"
-    - For models from the transformers library, the filename changes from "pytorch_model" to "model", and the ".bin"
-      extension is replaced with ".safetensors"
-    """
-    pt_filenames = []
-
-    sf_filenames = set()
-
-    passed_components = passed_components or []
-
-    for filename in filenames:
-        _, extension = os.path.splitext(filename)
-
-        if len(filename.split("/")) == 2 and filename.split("/")[0] in passed_components:
-            continue
-
-        if extension == ".bin":
-            pt_filenames.append(os.path.normpath(filename))
-        elif extension == ".safetensors":
-            sf_filenames.add(os.path.normpath(filename))
-
-    for filename in pt_filenames:
-        #  filename = 'foo/bar/baz.bam' -> path = 'foo/bar', filename = 'baz', extension = '.bam'
-        path, filename = os.path.split(filename)
-        filename, extension = os.path.splitext(filename)
-
-        if filename.startswith("pytorch_model"):
-            filename = filename.replace("pytorch_model", "model")
-        else:
-            filename = filename
-
-        expected_sf_filename = os.path.normpath(os.path.join(path, filename))
-        expected_sf_filename = f"{expected_sf_filename}.safetensors"
-        if expected_sf_filename not in sf_filenames:
-            logger.warning(f"{expected_sf_filename} not found")
-            return False
-
-    return True
-
-
-def variant_compatible_siblings(filenames, variant=None) -> Union[List[os.PathLike], str]:
-    weight_names = [
-        WEIGHTS_NAME,
-        SAFETENSORS_WEIGHTS_NAME,
-        FLAX_WEIGHTS_NAME,
-        ONNX_WEIGHTS_NAME,
-        ONNX_EXTERNAL_WEIGHTS_NAME,
-    ]
-
-    if is_transformers_available():
-        weight_names += [TRANSFORMERS_WEIGHTS_NAME, TRANSFORMERS_SAFE_WEIGHTS_NAME, TRANSFORMERS_FLAX_WEIGHTS_NAME]
-
-    # model_pytorch, diffusion_model_pytorch, ...
-    weight_prefixes = [w.split(".")[0] for w in weight_names]
-    # .bin, .safetensors, ...
-    weight_suffixs = [w.split(".")[-1] for w in weight_names]
-    # -00001-of-00002
-    transformers_index_format = r"\d{5}-of-\d{5}"
-
-    if variant is not None:
-        # `diffusion_pytorch_model.fp16.bin` as well as `model.fp16-00001-of-00002.safetensors`
-        variant_file_re = re.compile(
-            rf"({'|'.join(weight_prefixes)})\.({variant}|{variant}-{transformers_index_format})\.({'|'.join(weight_suffixs)})$"
-        )
-        # `text_encoder/pytorch_model.bin.index.fp16.json`
-        variant_index_re = re.compile(
-            rf"({'|'.join(weight_prefixes)})\.({'|'.join(weight_suffixs)})\.index\.{variant}\.json$"
-        )
-
-    # `diffusion_pytorch_model.bin` as well as `model-00001-of-00002.safetensors`
-    non_variant_file_re = re.compile(
-        rf"({'|'.join(weight_prefixes)})(-{transformers_index_format})?\.({'|'.join(weight_suffixs)})$"
-    )
-    # `text_encoder/pytorch_model.bin.index.json`
-    non_variant_index_re = re.compile(rf"({'|'.join(weight_prefixes)})\.({'|'.join(weight_suffixs)})\.index\.json")
-
-    if variant is not None:
-        variant_weights = {f for f in filenames if variant_file_re.match(f.split("/")[-1]) is not None}
-        variant_indexes = {f for f in filenames if variant_index_re.match(f.split("/")[-1]) is not None}
-        variant_filenames = variant_weights | variant_indexes
-    else:
-        variant_filenames = set()
-
-    non_variant_weights = {f for f in filenames if non_variant_file_re.match(f.split("/")[-1]) is not None}
-    non_variant_indexes = {f for f in filenames if non_variant_index_re.match(f.split("/")[-1]) is not None}
-    non_variant_filenames = non_variant_weights | non_variant_indexes
-
-    # all variant filenames will be used by default
-    usable_filenames = set(variant_filenames)
-
-    def convert_to_variant(filename):
-        if "index" in filename:
-            variant_filename = filename.replace("index", f"index.{variant}")
-        elif re.compile(f"^(.*?){transformers_index_format}").match(filename) is not None:
-            variant_filename = f"{filename.split('-')[0]}.{variant}-{'-'.join(filename.split('-')[1:])}"
-        else:
-            variant_filename = f"{filename.split('.')[0]}.{variant}.{filename.split('.')[1]}"
-        return variant_filename
-
-    for f in non_variant_filenames:
-        variant_filename = convert_to_variant(f)
-        if variant_filename not in usable_filenames:
-            usable_filenames.add(f)
-
-    return usable_filenames, variant_filenames
-
-
-@validate_hf_hub_args
-def warn_deprecated_model_variant(pretrained_model_name_or_path, token, variant, revision, model_filenames):
-    info = model_info(
-        pretrained_model_name_or_path,
-        token=token,
-        revision=None,
-    )
-    filenames = {sibling.rfilename for sibling in info.siblings}
-    comp_model_filenames, _ = variant_compatible_siblings(filenames, variant=revision)
-    comp_model_filenames = [".".join(f.split(".")[:1] + f.split(".")[2:]) for f in comp_model_filenames]
-
-    if set(model_filenames).issubset(set(comp_model_filenames)):
-        warnings.warn(
-            f"You are loading the variant {revision} from {pretrained_model_name_or_path} via `revision='{revision}'` even though you can load it via `variant=`{revision}`. Loading model variants via `revision='{revision}'` is deprecated and will be removed in diffusers v1. Please use `variant='{revision}'` instead.",
-            FutureWarning,
-        )
-    else:
-        warnings.warn(
-            f"You are loading the variant {revision} from {pretrained_model_name_or_path} via `revision='{revision}'`. This behavior is deprecated and will be removed in diffusers v1. One should use `variant='{revision}'` instead. However, it appears that {pretrained_model_name_or_path} currently does not have the required variant filenames in the 'main' branch. \n The Diffusers team and community would be very grateful if you could open an issue: https://github.com/huggingface/diffusers/issues/new with the title '{pretrained_model_name_or_path} is missing {revision} files' so that the correct variant file can be added.",
-            FutureWarning,
-        )
-
-
-def _unwrap_model(model):
-    """Unwraps a model."""
-    if is_compiled_module(model):
-        model = model._orig_mod
-
-    if is_peft_available():
-        from peft import PeftModel
-
-        if isinstance(model, PeftModel):
-            model = model.base_model.model
-
-    return model
-
-
-def maybe_raise_or_warn(
-    library_name, library, class_name, importable_classes, passed_class_obj, name, is_pipeline_module
-):
-    """Simple helper method to raise or warn in case incorrect module has been passed"""
-    if not is_pipeline_module:
-        library = importlib.import_module(library_name)
-        class_obj = getattr(library, class_name)
-        class_candidates = {c: getattr(library, c, None) for c in importable_classes.keys()}
-
-        expected_class_obj = None
-        for class_name, class_candidate in class_candidates.items():
-            if class_candidate is not None and issubclass(class_obj, class_candidate):
-                expected_class_obj = class_candidate
-
-        # Dynamo wraps the original model in a private class.
-        # I didn't find a public API to get the original class.
-        sub_model = passed_class_obj[name]
-        unwrapped_sub_model = _unwrap_model(sub_model)
-        model_cls = unwrapped_sub_model.__class__
-
-        if not issubclass(model_cls, expected_class_obj):
-            raise ValueError(
-                f"{passed_class_obj[name]} is of type: {model_cls}, but should be" f" {expected_class_obj}"
-            )
-    else:
-        logger.warning(
-            f"You have passed a non-standard module {passed_class_obj[name]}. We cannot verify whether it"
-            " has the correct type"
-        )
-
-
-def get_class_obj_and_candidates(
-    library_name, class_name, importable_classes, pipelines, is_pipeline_module, component_name=None, cache_dir=None
-):
-    """Simple helper method to retrieve class object of module as well as potential parent class objects"""
-    component_folder = os.path.join(cache_dir, component_name)
-
-    if is_pipeline_module:
-        pipeline_module = getattr(pipelines, library_name)
-
-        class_obj = getattr(pipeline_module, class_name)
-        class_candidates = {c: class_obj for c in importable_classes.keys()}
-    elif os.path.isfile(os.path.join(component_folder, library_name + ".py")):
-        # load custom component
-        class_obj = get_class_from_dynamic_module(
-            component_folder, module_file=library_name + ".py", class_name=class_name
-        )
-        class_candidates = {c: class_obj for c in importable_classes.keys()}
-    else:
-        # else we just import it from the library.
-        library = importlib.import_module(library_name)
-
-        class_obj = getattr(library, class_name)
-        class_candidates = {c: getattr(library, c, None) for c in importable_classes.keys()}
-
-    return class_obj, class_candidates
-
-
-def _get_pipeline_class(
-    class_obj,
-    config=None,
-    load_connected_pipeline=False,
-    custom_pipeline=None,
-    repo_id=None,
-    hub_revision=None,
-    class_name=None,
-    cache_dir=None,
-    revision=None,
-):
-    if custom_pipeline is not None:
-        if custom_pipeline.endswith(".py"):
-            path = Path(custom_pipeline)
-            # decompose into folder & file
-            file_name = path.name
-            custom_pipeline = path.parent.absolute()
-        elif repo_id is not None:
-            file_name = f"{custom_pipeline}.py"
-            custom_pipeline = repo_id
-        else:
-            file_name = CUSTOM_PIPELINE_FILE_NAME
-
-        if repo_id is not None and hub_revision is not None:
-            # if we load the pipeline code from the Hub
-            # make sure to overwrite the `revision`
-            revision = hub_revision
-
-        return get_class_from_dynamic_module(
-            custom_pipeline,
-            module_file=file_name,
-            class_name=class_name,
-            cache_dir=cache_dir,
-            revision=revision,
-        )
-
-    if class_obj.__name__ != "DiffusionPipeline":
-        return class_obj
-
-    diffusers_module = importlib.import_module(class_obj.__module__.split(".")[0])
-    class_name = class_name or config["_class_name"]
-    if not class_name:
-        raise ValueError(
-            "The class name could not be found in the configuration file. Please make sure to pass the correct `class_name`."
-        )
-
-    class_name = class_name[4:] if class_name.startswith("Flax") else class_name
-
-    pipeline_cls = getattr(diffusers_module, class_name)
-
-    if load_connected_pipeline:
-        from .auto_pipeline import _get_connected_pipeline
-
-        connected_pipeline_cls = _get_connected_pipeline(pipeline_cls)
-        if connected_pipeline_cls is not None:
-            logger.info(
-                f"Loading connected pipeline {connected_pipeline_cls.__name__} instead of {pipeline_cls.__name__} as specified via `load_connected_pipeline=True`"
-            )
-        else:
-            logger.info(f"{pipeline_cls.__name__} has no connected pipeline class. Loading {pipeline_cls.__name__}.")
-
-        pipeline_cls = connected_pipeline_cls or pipeline_cls
-
-    return pipeline_cls
-
-
-def load_sub_model(
-    library_name: str,
-    class_name: str,
-    importable_classes: List[Any],
-    pipelines: Any,
-    is_pipeline_module: bool,
-    pipeline_class: Any,
-    torch_dtype: torch.dtype,
-    provider: Any,
-    sess_options: Any,
-    device_map: Optional[Union[Dict[str, torch.device], str]],
-    max_memory: Optional[Dict[Union[int, str], Union[int, str]]],
-    offload_folder: Optional[Union[str, os.PathLike]],
-    offload_state_dict: bool,
-    model_variants: Dict[str, str],
-    name: str,
-    from_flax: bool,
-    variant: str,
-    low_cpu_mem_usage: bool,
-    cached_folder: Union[str, os.PathLike],
-):
-    """Helper method to load the module `name` from `library_name` and `class_name`"""
-    # retrieve class candidates
-    class_obj, class_candidates = get_class_obj_and_candidates(
-        library_name,
-        class_name,
-        importable_classes,
-        pipelines,
-        is_pipeline_module,
-        component_name=name,
-        cache_dir=cached_folder,
-    )
-
-    load_method_name = None
-    # retrieve load method name
-    for class_name, class_candidate in class_candidates.items():
-        if class_candidate is not None and issubclass(class_obj, class_candidate):
-            load_method_name = importable_classes[class_name][1]
-
-    # if load method name is None, then we have a dummy module -> raise Error
-    if load_method_name is None:
-        none_module = class_obj.__module__
-        is_dummy_path = none_module.startswith(DUMMY_MODULES_FOLDER) or none_module.startswith(
-            TRANSFORMERS_DUMMY_MODULES_FOLDER
-        )
-        if is_dummy_path and "dummy" in none_module:
-            # call class_obj for nice error message of missing requirements
-            class_obj()
-
-        raise ValueError(
-            f"The component {class_obj} of {pipeline_class} cannot be loaded as it does not seem to have"
-            f" any of the loading methods defined in {ALL_IMPORTABLE_CLASSES}."
-        )
-
-    load_method = getattr(class_obj, load_method_name)
-
-    # add kwargs to loading method
-    diffusers_module = importlib.import_module(__name__.split(".")[0])
-    loading_kwargs = {}
-    if issubclass(class_obj, torch.nn.Module):
-        loading_kwargs["torch_dtype"] = torch_dtype
-    if issubclass(class_obj, diffusers_module.OnnxRuntimeModel):
-        loading_kwargs["provider"] = provider
-        loading_kwargs["sess_options"] = sess_options
-
-    is_diffusers_model = issubclass(class_obj, diffusers_module.ModelMixin)
-
-    if is_transformers_available():
-        transformers_version = version.parse(version.parse(transformers.__version__).base_version)
-    else:
-        transformers_version = "N/A"
-
-    is_transformers_model = (
-        is_transformers_available()
-        and issubclass(class_obj, PreTrainedModel)
-        and transformers_version >= version.parse("4.20.0")
-    )
-
-    # When loading a transformers model, if the device_map is None, the weights will be initialized as opposed to diffusers.
-    # To make default loading faster we set the `low_cpu_mem_usage=low_cpu_mem_usage` flag which is `True` by default.
-    # This makes sure that the weights won't be initialized which significantly speeds up loading.
-    if is_diffusers_model or is_transformers_model:
-        loading_kwargs["device_map"] = device_map
-        loading_kwargs["max_memory"] = max_memory
-        loading_kwargs["offload_folder"] = offload_folder
-        loading_kwargs["offload_state_dict"] = offload_state_dict
-        loading_kwargs["variant"] = model_variants.pop(name, None)
-
-        if from_flax:
-            loading_kwargs["from_flax"] = True
-
-        # the following can be deleted once the minimum required `transformers` version
-        # is higher than 4.27
-        if (
-            is_transformers_model
-            and loading_kwargs["variant"] is not None
-            and transformers_version < version.parse("4.27.0")
-        ):
-            raise ImportError(
-                f"When passing `variant='{variant}'`, please make sure to upgrade your `transformers` version to at least 4.27.0.dev0"
-            )
-        elif is_transformers_model and loading_kwargs["variant"] is None:
-            loading_kwargs.pop("variant")
-
-        # if `from_flax` and model is transformer model, can currently not load with `low_cpu_mem_usage`
-        if not (from_flax and is_transformers_model):
-            loading_kwargs["low_cpu_mem_usage"] = low_cpu_mem_usage
-        else:
-            loading_kwargs["low_cpu_mem_usage"] = False
-
-    # check if the module is in a subdirectory
-    if os.path.isdir(os.path.join(cached_folder, name)):
-        loaded_sub_model = load_method(os.path.join(cached_folder, name), **loading_kwargs)
-    else:
-        # else load from the root directory
-        loaded_sub_model = load_method(cached_folder, **loading_kwargs)
-
-    return loaded_sub_model
-
-
-def _fetch_class_library_tuple(module):
-    # import it here to avoid circular import
-    diffusers_module = importlib.import_module(__name__.split(".")[0])
-    pipelines = getattr(diffusers_module, "pipelines")
-
-    # register the config from the original module, not the dynamo compiled one
-    not_compiled_module = _unwrap_model(module)
-    library = not_compiled_module.__module__.split(".")[0]
-
-    # check if the module is a pipeline module
-    module_path_items = not_compiled_module.__module__.split(".")
-    pipeline_dir = module_path_items[-2] if len(module_path_items) > 2 else None
-
-    path = not_compiled_module.__module__.split(".")
-    is_pipeline_module = pipeline_dir in path and hasattr(pipelines, pipeline_dir)
-
-    # if library is not in LOADABLE_CLASSES, then it is a custom module.
-    # Or if it's a pipeline module, then the module is inside the pipeline
-    # folder so we set the library to module name.
-    if is_pipeline_module:
-        library = pipeline_dir
-    elif library not in LOADABLE_CLASSES:
-        library = not_compiled_module.__module__
-
-    # retrieve class_name
-    class_name = not_compiled_module.__class__.__name__
-
-    return (library, class_name)
@@ -19,6 +19,7 @@ import inspect
 import os
 import re
 import sys
+import warnings
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Union
@@ -48,44 +49,72 @@ from ..schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
 from ..utils import (
    CONFIG_NAME,
    DEPRECATED_REVISION_ARGS,
+    SAFETENSORS_WEIGHTS_NAME,
+    WEIGHTS_NAME,
    BaseOutput,
-    PushToHubMixin,
    deprecate,
+    get_class_from_dynamic_module,
    is_accelerate_available,
    is_accelerate_version,
+    is_peft_available,
    is_torch_version,
+    is_transformers_available,
    logging,
    numpy_to_pil,
 )
 from ..utils.hub_utils import load_or_create_model_card, populate_model_card
 from ..utils.torch_utils import is_compiled_module
-from .pipeline_loading_utils import (
-    ALL_IMPORTABLE_CLASSES,
-    CONNECTED_PIPES_KEYS,
-    CUSTOM_PIPELINE_FILE_NAME,
-    LOADABLE_CLASSES,
-    _fetch_class_library_tuple,
-    _get_pipeline_class,
-    _unwrap_model,
-    is_safetensors_compatible,
-    load_sub_model,
-    maybe_raise_or_warn,
-    variant_compatible_siblings,
-    warn_deprecated_model_variant,
-)
+
+
+if is_transformers_available():
+    import transformers
+    from transformers import PreTrainedModel
+    from transformers.utils import FLAX_WEIGHTS_NAME as TRANSFORMERS_FLAX_WEIGHTS_NAME
+    from transformers.utils import SAFE_WEIGHTS_NAME as TRANSFORMERS_SAFE_WEIGHTS_NAME
+    from transformers.utils import WEIGHTS_NAME as TRANSFORMERS_WEIGHTS_NAME
+
+from ..utils import FLAX_WEIGHTS_NAME, ONNX_EXTERNAL_WEIGHTS_NAME, ONNX_WEIGHTS_NAME, PushToHubMixin


 if is_accelerate_available():
    import accelerate


-LIBRARIES = []
-for library in LOADABLE_CLASSES:
-    LIBRARIES.append(library)
+INDEX_FILE = "diffusion_pytorch_model.bin"
+CUSTOM_PIPELINE_FILE_NAME = "pipeline.py"
+DUMMY_MODULES_FOLDER = "diffusers.utils"
+TRANSFORMERS_DUMMY_MODULES_FOLDER = "transformers.utils"
+CONNECTED_PIPES_KEYS = ["prior"]
+

 logger = logging.get_logger(__name__)


+LOADABLE_CLASSES = {
+    "diffusers": {
+        "ModelMixin": ["save_pretrained", "from_pretrained"],
+        "SchedulerMixin": ["save_pretrained", "from_pretrained"],
+        "DiffusionPipeline": ["save_pretrained", "from_pretrained"],
+        "OnnxRuntimeModel": ["save_pretrained", "from_pretrained"],
+    },
+    "transformers": {
+        "PreTrainedTokenizer": ["save_pretrained", "from_pretrained"],
+        "PreTrainedTokenizerFast": ["save_pretrained", "from_pretrained"],
+        "PreTrainedModel": ["save_pretrained", "from_pretrained"],
+        "FeatureExtractionMixin": ["save_pretrained", "from_pretrained"],
+        "ProcessorMixin": ["save_pretrained", "from_pretrained"],
+        "ImageProcessingMixin": ["save_pretrained", "from_pretrained"],
+    },
+    "onnxruntime.training": {
+        "ORTModule": ["save_pretrained", "from_pretrained"],
+    },
+}
+
+ALL_IMPORTABLE_CLASSES = {}
+for library in LOADABLE_CLASSES:
+    ALL_IMPORTABLE_CLASSES.update(LOADABLE_CLASSES[library])
+
+
@dataclass
 class ImagePipelineOutput(BaseOutput):
    """
@@ -113,6 +142,432 @@ class AudioPipelineOutput(BaseOutput):
    audios: np.ndarray


+def is_safetensors_compatible(filenames, variant=None, passed_components=None) -> bool:
+    """
+    Report whether every default PyTorch weight file has a safetensors counterpart.
+
+    The list of `.bin` files found in `filenames` decides which `.safetensors` files are required; the
+    function returns `True` only when each of them is present.
+
+    How a `.bin` name is translated to the expected `.safetensors` name:
+    - a file whose base name starts with "pytorch_model" has "pytorch_model" replaced by "model"
+    - any other file keeps its base name
+    - in both cases the ".bin" extension becomes ".safetensors"
+
+    Files located directly inside a folder listed in `passed_components` (i.e. `<component>/<file>`) are
+    ignored. `variant` is accepted but not used by this function.
+    """
+    skipped_components = passed_components or []
+
+    torch_weight_files = []
+    safetensors_files = set()
+    for candidate in filenames:
+        parts = candidate.split("/")
+        # `<component>/<file>` entries of components handed in by the caller are not needed
+        if len(parts) == 2 and parts[0] in skipped_components:
+            continue
+
+        suffix = os.path.splitext(candidate)[1]
+        if suffix == ".bin":
+            torch_weight_files.append(os.path.normpath(candidate))
+        elif suffix == ".safetensors":
+            safetensors_files.add(os.path.normpath(candidate))
+
+    for torch_file in torch_weight_files:
+        # 'foo/bar/baz.bin' -> folder = 'foo/bar', stem = 'baz'
+        folder, basename = os.path.split(torch_file)
+        stem = os.path.splitext(basename)[0]
+
+        if stem.startswith("pytorch_model"):
+            stem = stem.replace("pytorch_model", "model")
+
+        expected_sf_filename = f"{os.path.normpath(os.path.join(folder, stem))}.safetensors"
+        if expected_sf_filename not in safetensors_files:
+            logger.warning(f"{expected_sf_filename} not found")
+            return False
+
+    return True
+
+
+def variant_compatible_siblings(filenames, variant=None) -> Union[List[os.PathLike], str]:
+    """
+    Pick the weight files (and weight index files) that should be used for `variant`.
+
+    Only the last "/"-separated segment of each entry in `filenames` is matched against the known weight
+    file name patterns. `filenames` is traversed several times, so pass a re-iterable collection.
+
+    Args:
+        filenames: collection of "/"-separated file names.
+        variant: variant tag such as "fp16", or `None`. It is matched literally (regex metacharacters in
+            the tag are escaped).
+
+    Returns:
+        A 2-tuple `(usable_filenames, variant_filenames)` of sets: `variant_filenames` holds the files that
+        carry the variant tag, `usable_filenames` holds those plus every non-variant weight/index file
+        whose variant-named counterpart is not among them. NOTE: the signature's return annotation is left
+        untouched and does not describe this tuple.
+    """
+    weight_names = [
+        WEIGHTS_NAME,
+        SAFETENSORS_WEIGHTS_NAME,
+        FLAX_WEIGHTS_NAME,
+        ONNX_WEIGHTS_NAME,
+        ONNX_EXTERNAL_WEIGHTS_NAME,
+    ]
+
+    if is_transformers_available():
+        weight_names += [TRANSFORMERS_WEIGHTS_NAME, TRANSFORMERS_SAFE_WEIGHTS_NAME, TRANSFORMERS_FLAX_WEIGHTS_NAME]
+
+    # first dot-separated part of every weight name, e.g. the part before ".bin"
+    prefix_alternation = "|".join(w.split(".")[0] for w in weight_names)
+    # last dot-separated part of every weight name: bin, safetensors, ...
+    suffix_alternation = "|".join(w.split(".")[-1] for w in weight_names)
+    # -00001-of-00002
+    transformers_index_format = r"\d{5}-of-\d{5}"
+    sharded_name_re = re.compile(f"^(.*?){transformers_index_format}")
+
+    if variant is not None:
+        # the tag is user supplied: escape it so that e.g. "." only matches a literal dot
+        variant_pattern = re.escape(variant)
+        # `diffusion_pytorch_model.fp16.bin` as well as `model.fp16-00001-of-00002.safetensors`
+        variant_file_re = re.compile(
+            rf"({prefix_alternation})\.({variant_pattern}|{variant_pattern}-{transformers_index_format})\.({suffix_alternation})$"
+        )
+        # `text_encoder/pytorch_model.bin.index.fp16.json`
+        variant_index_re = re.compile(
+            rf"({prefix_alternation})\.({suffix_alternation})\.index\.{variant_pattern}\.json$"
+        )
+
+    # `diffusion_pytorch_model.bin` as well as `model-00001-of-00002.safetensors`
+    non_variant_file_re = re.compile(
+        rf"({prefix_alternation})(-{transformers_index_format})?\.({suffix_alternation})$"
+    )
+    # `text_encoder/pytorch_model.bin.index.json`
+    non_variant_index_re = re.compile(rf"({prefix_alternation})\.({suffix_alternation})\.index\.json")
+
+    if variant is not None:
+        variant_weights = {f for f in filenames if variant_file_re.match(f.split("/")[-1]) is not None}
+        variant_indexes = {f for f in filenames if variant_index_re.match(f.split("/")[-1]) is not None}
+        variant_filenames = variant_weights | variant_indexes
+    else:
+        variant_filenames = set()
+
+    non_variant_weights = {f for f in filenames if non_variant_file_re.match(f.split("/")[-1]) is not None}
+    non_variant_indexes = {f for f in filenames if non_variant_index_re.match(f.split("/")[-1]) is not None}
+    non_variant_filenames = non_variant_weights | non_variant_indexes
+
+    # all variant filenames will be used by default
+    usable_filenames = set(variant_filenames)
+
+    def convert_to_variant(filename):
+        """Build the variant-tagged spelling of a non-variant file name (uses the raw, unescaped tag)."""
+        if "index" in filename:
+            variant_filename = filename.replace("index", f"index.{variant}")
+        elif sharded_name_re.match(filename) is not None:
+            variant_filename = f"{filename.split('-')[0]}.{variant}-{'-'.join(filename.split('-')[1:])}"
+        else:
+            variant_filename = f"{filename.split('.')[0]}.{variant}.{filename.split('.')[1]}"
+        return variant_filename
+
+    # a non-variant file is only kept when its variant-named counterpart is not already selected
+    for f in non_variant_filenames:
+        variant_filename = convert_to_variant(f)
+        if variant_filename not in usable_filenames:
+            usable_filenames.add(f)
+
+    return usable_filenames, variant_filenames
+
+
+@validate_hf_hub_args
+def warn_deprecated_model_variant(pretrained_model_name_or_path, token, variant, revision, model_filenames):
+    """
+    Emit a `FutureWarning` about loading a model variant through `revision` instead of `variant`.
+
+    The file listing is requested via `model_info(..., revision=None)` and `revision` is treated as the
+    variant tag. If every entry of `model_filenames` is found among those files (after dropping the second
+    dot-separated segment of each name), the warning points to `variant=`; otherwise it additionally asks
+    the user to open an issue about the missing variant files. The `variant` argument itself is not used.
+    """
+    repo_info = model_info(
+        pretrained_model_name_or_path,
+        token=token,
+        revision=None,
+    )
+    repo_files = {sibling.rfilename for sibling in repo_info.siblings}
+    tagged_files, _ = variant_compatible_siblings(repo_files, variant=revision)
+
+    # "<first>.<second>.<rest>" -> "<first>.<rest>"
+    untagged_files = set()
+    for tagged in tagged_files:
+        segments = tagged.split(".")
+        untagged_files.add(".".join(segments[:1] + segments[2:]))
+
+    if set(model_filenames).issubset(untagged_files):
+        message = f"You are loading the variant {revision} from {pretrained_model_name_or_path} via `revision='{revision}'` even though you can load it via `variant=`{revision}`. Loading model variants via `revision='{revision}'` is deprecated and will be removed in diffusers v1. Please use `variant='{revision}'` instead."
+    else:
+        message = f"You are loading the variant {revision} from {pretrained_model_name_or_path} via `revision='{revision}'`. This behavior is deprecated and will be removed in diffusers v1. One should use `variant='{revision}'` instead. However, it appears that {pretrained_model_name_or_path} currently does not have the required variant filenames in the 'main' branch. \n The Diffusers team and community would be very grateful if you could open an issue: https://github.com/huggingface/diffusers/issues/new with the title '{pretrained_model_name_or_path} is missing {revision} files' so that the correct variant file can be added."
+
+    warnings.warn(message, FutureWarning)
+
+
+def _unwrap_model(model):
+    """Return the module underneath a compiled-module wrapper and/or a `PeftModel` wrapper."""
+    unwrapped = model._orig_mod if is_compiled_module(model) else model
+
+    if not is_peft_available():
+        return unwrapped
+
+    # imported lazily: peft is an optional dependency
+    from peft import PeftModel
+
+    if isinstance(unwrapped, PeftModel):
+        return unwrapped.base_model.model
+    return unwrapped
+
+
+def maybe_raise_or_warn(
+    library_name, library, class_name, importable_classes, passed_class_obj, name, is_pipeline_module
+):
+    """
+    Check that the component passed under `name` has the class the pipeline expects.
+
+    For pipeline modules no check is possible and only a warning is logged. Otherwise `library_name` is
+    imported, the expected base class is looked up among `importable_classes`, and a `ValueError` is raised
+    when the (unwrapped) passed object is not an instance of a subclass of it. The `library` argument is
+    not read; the module is always re-imported from `library_name`.
+    """
+    if is_pipeline_module:
+        logger.warning(
+            f"You have passed a non-standard module {passed_class_obj[name]}. We cannot verify whether it"
+            " has the correct type"
+        )
+        return
+
+    imported_library = importlib.import_module(library_name)
+    declared_cls = getattr(imported_library, class_name)
+
+    # the last importable base class that `declared_cls` derives from is the expected one
+    expected_class_obj = None
+    for candidate_name in importable_classes:
+        candidate = getattr(imported_library, candidate_name, None)
+        if candidate is not None and issubclass(declared_cls, candidate):
+            expected_class_obj = candidate
+
+    # Dynamo wraps the original model in a private class, so compare against the unwrapped object's class.
+    model_cls = _unwrap_model(passed_class_obj[name]).__class__
+
+    if not issubclass(model_cls, expected_class_obj):
+        raise ValueError(f"{passed_class_obj[name]} is of type: {model_cls}, but should be {expected_class_obj}")
+
+
+def get_class_obj_and_candidates(
+    library_name, class_name, importable_classes, pipelines, is_pipeline_module, component_name=None, cache_dir=None
+):
+    """
+    Retrieve the class object of a component together with its candidate parent classes.
+
+    Args:
+        library_name: attribute name on `pipelines` (pipeline modules), base name of a custom `.py` file
+            inside the component folder, or name of an importable library.
+        class_name: name of the class to fetch.
+        importable_classes: mapping whose keys are the candidate parent class names.
+        pipelines: object exposing the pipeline modules as attributes; only read for pipeline modules.
+        is_pipeline_module: whether the class lives in a pipeline module.
+        component_name: sub-folder of `cache_dir` that may hold a custom component file.
+        cache_dir: folder containing the component sub-folders.
+
+    Returns:
+        `(class_obj, class_candidates)` where `class_candidates` maps every key of `importable_classes` to
+        a class: `class_obj` itself for pipeline modules and custom components, otherwise the attribute of
+        that name on the imported library (`None` when the library has no such attribute).
+    """
+    if is_pipeline_module:
+        pipeline_module = getattr(pipelines, library_name)
+
+        class_obj = getattr(pipeline_module, class_name)
+        return class_obj, {c: class_obj for c in importable_classes}
+
+    # `component_name` and `cache_dir` default to None; only look for a custom component file when both
+    # are given instead of failing inside `os.path.join`.
+    if component_name is not None and cache_dir is not None:
+        component_folder = os.path.join(cache_dir, component_name)
+        if os.path.isfile(os.path.join(component_folder, library_name + ".py")):
+            # load custom component
+            class_obj = get_class_from_dynamic_module(
+                component_folder, module_file=library_name + ".py", class_name=class_name
+            )
+            return class_obj, {c: class_obj for c in importable_classes}
+
+    # else we just import it from the library.
+    library = importlib.import_module(library_name)
+
+    class_obj = getattr(library, class_name)
+    class_candidates = {c: getattr(library, c, None) for c in importable_classes}
+
+    return class_obj, class_candidates
+
+
+def _get_pipeline_class(
+    class_obj,
+    config=None,
+    load_connected_pipeline=False,
+    custom_pipeline=None,
+    repo_id=None,
+    hub_revision=None,
+    class_name=None,
+    cache_dir=None,
+    revision=None,
+):
+    """
+    Resolve the pipeline class to use.
+
+    Resolution order:
+    1. If `custom_pipeline` is given, the class is loaded through `get_class_from_dynamic_module`, either
+       from a local `.py` file, from `<custom_pipeline>.py` in `repo_id`, or from the default custom
+       pipeline file name.
+    2. If `class_obj` is not `DiffusionPipeline` itself, it is returned unchanged.
+    3. Otherwise the class named by `class_name` (falling back to `config["_class_name"]`) is fetched from
+       the top-level package of `class_obj`; a leading "Flax" is stripped from the name first. With
+       `load_connected_pipeline=True` the connected pipeline class is preferred when one exists.
+
+    Raises:
+        ValueError: if no class name is passed and none can be read from `config` (including when
+            `config` is `None` or has no "_class_name" entry).
+    """
+    if custom_pipeline is not None:
+        if custom_pipeline.endswith(".py"):
+            path = Path(custom_pipeline)
+            # decompose into folder & file
+            file_name = path.name
+            custom_pipeline = path.parent.absolute()
+        elif repo_id is not None:
+            file_name = f"{custom_pipeline}.py"
+            custom_pipeline = repo_id
+        else:
+            file_name = CUSTOM_PIPELINE_FILE_NAME
+
+        if repo_id is not None and hub_revision is not None:
+            # if we load the pipeline code from the Hub
+            # make sure to overwrite the `revision`
+            revision = hub_revision
+
+        return get_class_from_dynamic_module(
+            custom_pipeline,
+            module_file=file_name,
+            class_name=class_name,
+            cache_dir=cache_dir,
+            revision=revision,
+        )
+
+    if class_obj != DiffusionPipeline:
+        return class_obj
+
+    diffusers_module = importlib.import_module(class_obj.__module__.split(".")[0])
+
+    # Fall back to the config entry, but let a missing config / missing key reach the explicit
+    # ValueError below instead of surfacing as a TypeError or KeyError.
+    if not class_name and config is not None:
+        try:
+            class_name = config["_class_name"]
+        except KeyError:
+            class_name = None
+    if not class_name:
+        raise ValueError(
+            "The class name could not be found in the configuration file. Please make sure to pass the correct `class_name`."
+        )
+
+    class_name = class_name[4:] if class_name.startswith("Flax") else class_name
+
+    pipeline_cls = getattr(diffusers_module, class_name)
+
+    if load_connected_pipeline:
+        # imported here rather than at module level, like the other local imports in this file
+        from .auto_pipeline import _get_connected_pipeline
+
+        connected_pipeline_cls = _get_connected_pipeline(pipeline_cls)
+        if connected_pipeline_cls is not None:
+            logger.info(
+                f"Loading connected pipeline {connected_pipeline_cls.__name__} instead of {pipeline_cls.__name__} as specified via `load_connected_pipeline=True`"
+            )
+        else:
+            logger.info(f"{pipeline_cls.__name__} has no connected pipeline class. Loading {pipeline_cls.__name__}.")
+
+        pipeline_cls = connected_pipeline_cls or pipeline_cls
+
+    return pipeline_cls
+
+
+def load_sub_model(
+    library_name: str,
+    class_name: str,
+    importable_classes: List[Any],
+    pipelines: Any,
+    is_pipeline_module: bool,
+    pipeline_class: Any,
+    torch_dtype: torch.dtype,
+    provider: Any,
+    sess_options: Any,
+    device_map: Optional[Union[Dict[str, torch.device], str]],
+    max_memory: Optional[Dict[Union[int, str], Union[int, str]]],
+    offload_folder: Optional[Union[str, os.PathLike]],
+    offload_state_dict: bool,
+    model_variants: Dict[str, str],
+    name: str,
+    from_flax: bool,
+    variant: str,
+    low_cpu_mem_usage: bool,
+    cached_folder: Union[str, os.PathLike],
+):
+    """
+    Helper method to load the module `name` from `library_name` and `class_name`.
+
+    The component class is resolved with `get_class_obj_and_candidates`, its loading method is looked up
+    in `importable_classes`, the keyword arguments appropriate for that kind of class are assembled, and
+    the loading method is called on `cached_folder/name` (or on `cached_folder` itself when that
+    sub-folder does not exist).
+
+    Notes:
+        - `importable_classes` is indexed by class name here (`importable_classes[class_name][1]`), i.e.
+          it is used as a mapping from class name to a sequence whose second entry is the load method name.
+        - `model_variants` is mutated: the entry for `name` is popped from it.
+        - `variant` is only used in the text of the `ImportError` below.
+
+    Raises:
+        ValueError: if `class_obj` derives from none of the candidate classes, so no load method is known.
+        ImportError: if a variant is requested for a transformers model while the installed
+            `transformers` version is older than 4.27.0.
+
+    Returns:
+        The object returned by the component's loading method.
+    """
+    # retrieve class candidates
+    class_obj, class_candidates = get_class_obj_and_candidates(
+        library_name,
+        class_name,
+        importable_classes,
+        pipelines,
+        is_pipeline_module,
+        component_name=name,
+        cache_dir=cached_folder,
+    )
+
+    load_method_name = None
+    # retrieve load method name
+    # (the loop does not break, so the last matching candidate decides; it also rebinds `class_name`)
+    for class_name, class_candidate in class_candidates.items():
+        if class_candidate is not None and issubclass(class_obj, class_candidate):
+            load_method_name = importable_classes[class_name][1]
+
+    # if load method name is None, then we have a dummy module -> raise Error
+    if load_method_name is None:
+        none_module = class_obj.__module__
+        is_dummy_path = none_module.startswith(DUMMY_MODULES_FOLDER) or none_module.startswith(
+            TRANSFORMERS_DUMMY_MODULES_FOLDER
+        )
+        if is_dummy_path and "dummy" in none_module:
+            # call class_obj for nice error message of missing requirements
+            class_obj()
+
+        raise ValueError(
+            f"The component {class_obj} of {pipeline_class} cannot be loaded as it does not seem to have"
+            f" any of the loading methods defined in {ALL_IMPORTABLE_CLASSES}."
+        )
+
+    load_method = getattr(class_obj, load_method_name)
+
+    # add kwargs to loading method
+    diffusers_module = importlib.import_module(__name__.split(".")[0])
+    loading_kwargs = {}
+    # `torch_dtype` is only forwarded to torch modules
+    if issubclass(class_obj, torch.nn.Module):
+        loading_kwargs["torch_dtype"] = torch_dtype
+    # `provider` / `sess_options` are only forwarded to ONNX runtime models
+    if issubclass(class_obj, diffusers_module.OnnxRuntimeModel):
+        loading_kwargs["provider"] = provider
+        loading_kwargs["sess_options"] = sess_options
+
+    is_diffusers_model = issubclass(class_obj, diffusers_module.ModelMixin)
+
+    if is_transformers_available():
+        transformers_version = version.parse(version.parse(transformers.__version__).base_version)
+    else:
+        # placeholder only; the comparisons below are short-circuited when transformers is unavailable
+        transformers_version = "N/A"
+
+    is_transformers_model = (
+        is_transformers_available()
+        and issubclass(class_obj, PreTrainedModel)
+        and transformers_version >= version.parse("4.20.0")
+    )
+
+    # When loading a transformers model, if the device_map is None, the weights will be initialized as opposed to diffusers.
+    # To make default loading faster we set the `low_cpu_mem_usage=low_cpu_mem_usage` flag which is `True` by default.
+    # This makes sure that the weights won't be initialized which significantly speeds up loading.
+    if is_diffusers_model or is_transformers_model:
+        loading_kwargs["device_map"] = device_map
+        loading_kwargs["max_memory"] = max_memory
+        loading_kwargs["offload_folder"] = offload_folder
+        loading_kwargs["offload_state_dict"] = offload_state_dict
+        # removes the entry for this component from the caller's `model_variants` dict
+        loading_kwargs["variant"] = model_variants.pop(name, None)
+
+        if from_flax:
+            loading_kwargs["from_flax"] = True
+
+        # the following can be deleted once the minimum required `transformers` version
+        # is higher than 4.27
+        if (
+            is_transformers_model
+            and loading_kwargs["variant"] is not None
+            and transformers_version < version.parse("4.27.0")
+        ):
+            raise ImportError(
+                f"When passing `variant='{variant}'`, please make sure to upgrade your `transformers` version to at least 4.27.0.dev0"
+            )
+        elif is_transformers_model and loading_kwargs["variant"] is None:
+            # for transformers models the `variant` kwarg is not passed at all when no variant is set
+            loading_kwargs.pop("variant")
+
+        # if `from_flax` and model is transformer model, can currently not load with `low_cpu_mem_usage`
+        if not (from_flax and is_transformers_model):
+            loading_kwargs["low_cpu_mem_usage"] = low_cpu_mem_usage
+        else:
+            loading_kwargs["low_cpu_mem_usage"] = False
+
+    # check if the module is in a subdirectory
+    if os.path.isdir(os.path.join(cached_folder, name)):
+        loaded_sub_model = load_method(os.path.join(cached_folder, name), **loading_kwargs)
+    else:
+        # else load from the root directory
+        loaded_sub_model = load_method(cached_folder, **loading_kwargs)
+
+    return loaded_sub_model
+
+
+def _fetch_class_library_tuple(module):
+    """
+    Return the `(library, class_name)` pair describing where `module`'s class comes from.
+
+    `library` is the pipeline folder name for pipeline modules, the top-level package name when that
+    package is listed in `LOADABLE_CLASSES`, and the full dotted module path otherwise (custom module).
+    """
+    # import it here to avoid circular import
+    root_package = importlib.import_module(__name__.split(".")[0])
+    pipelines = root_package.pipelines
+
+    # describe the original module, not the dynamo compiled one
+    target = _unwrap_model(module)
+    dotted_path = target.__module__
+    segments = dotted_path.split(".")
+
+    # the second to last path segment is the pipeline folder if the module lives inside one
+    pipeline_dir = segments[-2] if len(segments) > 2 else None
+
+    if pipeline_dir in segments and hasattr(pipelines, pipeline_dir):
+        # pipeline module: the module is inside the pipeline folder
+        library = pipeline_dir
+    elif segments[0] in LOADABLE_CLASSES:
+        library = segments[0]
+    else:
+        # not in LOADABLE_CLASSES -> custom module
+        library = dotted_path
+
+    return (library, target.__class__.__name__)
+
+
 class DiffusionPipeline(ConfigMixin, PushToHubMixin):
    r"""
    Base class for all pipelines.
@@ -624,33 +1079,6 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
        use_onnx = kwargs.pop("use_onnx", None)
        load_connected_pipeline = kwargs.pop("load_connected_pipeline", False)

-        if low_cpu_mem_usage and not is_accelerate_available():
-            low_cpu_mem_usage = False
-            logger.warning(
-                "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the"
-                " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install"
-                " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip"
-                " install accelerate\n```\n."
-            )
-
-        if device_map is not None and not is_torch_version(">=", "1.9.0"):
-            raise NotImplementedError(
-                "Loading and dispatching requires torch >= 1.9.0. Please either update your PyTorch version or set"
-                " `device_map=None`."
-            )
-
-        if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"):
-            raise NotImplementedError(
-                "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set"
-                " `low_cpu_mem_usage=False`."
-            )
-
-        if low_cpu_mem_usage is False and device_map is not None:
-            raise ValueError(
-                f"You cannot set `low_cpu_mem_usage` to False while using device_map={device_map} for loading and"
-                " dispatching. Please make sure to set `low_cpu_mem_usage=True`."
-            )
-
        # 1. Download the checkpoints and configs
        # use snapshot download here to get it working from from_pretrained
        if not os.path.isdir(pretrained_model_name_or_path):
@@ -783,6 +1211,33 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
                f"Keyword arguments {unused_kwargs} are not expected by {pipeline_class.__name__} and will be ignored."
            )

+        if low_cpu_mem_usage and not is_accelerate_available():
+            low_cpu_mem_usage = False
+            logger.warning(
+                "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the"
+                " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install"
+                " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip"
+                " install accelerate\n```\n."
+            )
+
+        if device_map is not None and not is_torch_version(">=", "1.9.0"):
+            raise NotImplementedError(
+                "Loading and dispatching requires torch >= 1.9.0. Please either update your PyTorch version or set"
+                " `device_map=None`."
+            )
+
+        if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"):
+            raise NotImplementedError(
+                "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set"
+                " `low_cpu_mem_usage=False`."
+            )
+
+        if low_cpu_mem_usage is False and device_map is not None:
+            raise ValueError(
+                f"You cannot set `low_cpu_mem_usage` to False while using device_map={device_map} for loading and"
+                " dispatching. Please make sure to set `low_cpu_mem_usage=True`."
+            )
+
        # import it here to avoid circular import
        from diffusers import pipelines

@@ -133,42 +133,6 @@ ASPECT_RATIO_512_BIN = {
    "4.0": [1024.0, 256.0],
 }

-ASPECT_RATIO_256_BIN = {
-    "0.25": [128.0, 512.0],
-    "0.28": [128.0, 464.0],
-    "0.32": [144.0, 448.0],
-    "0.33": [144.0, 432.0],
-    "0.35": [144.0, 416.0],
-    "0.4": [160.0, 400.0],
-    "0.42": [160.0, 384.0],
-    "0.48": [176.0, 368.0],
-    "0.5": [176.0, 352.0],
-    "0.52": [176.0, 336.0],
-    "0.57": [192.0, 336.0],
-    "0.6": [192.0, 320.0],
-    "0.68": [208.0, 304.0],
-    "0.72": [208.0, 288.0],
-    "0.78": [224.0, 288.0],
-    "0.82": [224.0, 272.0],
-    "0.88": [240.0, 272.0],
-    "0.94": [240.0, 256.0],
-    "1.0": [256.0, 256.0],
-    "1.07": [256.0, 240.0],
-    "1.13": [272.0, 240.0],
-    "1.21": [272.0, 224.0],
-    "1.29": [288.0, 224.0],
-    "1.38": [288.0, 208.0],
-    "1.46": [304.0, 208.0],
-    "1.67": [320.0, 192.0],
-    "1.75": [336.0, 192.0],
-    "2.0": [352.0, 176.0],
-    "2.09": [368.0, 176.0],
-    "2.4": [384.0, 160.0],
-    "2.5": [400.0, 160.0],
-    "3.0": [432.0, 144.0],
-    "4.0": [512.0, 128.0],
-}
-

 # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
 def retrieve_timesteps(
@@ -296,7 +260,6 @@ class PixArtAlphaPipeline(DiffusionPipeline):
        prompt_attention_mask: Optional[torch.FloatTensor] = None,
        negative_prompt_attention_mask: Optional[torch.FloatTensor] = None,
        clean_caption: bool = False,
-        max_sequence_length: int = 120,
        **kwargs,
    ):
        r"""
@@ -321,9 +284,8 @@ class PixArtAlphaPipeline(DiffusionPipeline):
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. For PixArt-Alpha, it should be the embeddings of the ""
                string.
-            clean_caption (`bool`, defaults to `False`):
+            clean_caption (bool, defaults to `False`):
                If `True`, the function will preprocess and clean the provided caption before encoding.
-            max_sequence_length (`int`, defaults to 120): Maximum sequence length to use for the prompt.
        """

        if "mask_feature" in kwargs:
@@ -341,7 +303,7 @@ class PixArtAlphaPipeline(DiffusionPipeline):
            batch_size = prompt_embeds.shape[0]

        # See Section 3.1. of the paper.
-        max_length = max_sequence_length
+        max_length = 120

        if prompt_embeds is None:
            prompt = self._text_preprocessing(prompt, clean_caption=clean_caption)
@@ -726,7 +688,6 @@ class PixArtAlphaPipeline(DiffusionPipeline):
        callback_steps: int = 1,
        clean_caption: bool = True,
        use_resolution_binning: bool = True,
-        max_sequence_length: int = 120,
        **kwargs,
    ) -> Union[ImagePipelineOutput, Tuple]:
        """
@@ -796,7 +757,6 @@ class PixArtAlphaPipeline(DiffusionPipeline):
                If set to `True`, the requested height and width are first mapped to the closest resolutions using
                `ASPECT_RATIO_1024_BIN`. After the produced latents are decoded into images, they are resized back to
                the requested resolution. Useful for generating non-square images.
-            max_sequence_length (`int` defaults to 120): Maximum sequence length to use with the `prompt`.

        Examples:

@@ -812,14 +772,9 @@ class PixArtAlphaPipeline(DiffusionPipeline):
        height = height or self.transformer.config.sample_size * self.vae_scale_factor
        width = width or self.transformer.config.sample_size * self.vae_scale_factor
        if use_resolution_binning:
-            if self.transformer.config.sample_size == 128:
-                aspect_ratio_bin = ASPECT_RATIO_1024_BIN
-            elif self.transformer.config.sample_size == 64:
-                aspect_ratio_bin = ASPECT_RATIO_512_BIN
-            elif self.transformer.config.sample_size == 32:
-                aspect_ratio_bin = ASPECT_RATIO_256_BIN
-            else:
-                raise ValueError("Invalid sample size")
+            aspect_ratio_bin = (
+                ASPECT_RATIO_1024_BIN if self.transformer.config.sample_size == 128 else ASPECT_RATIO_512_BIN
+            )
            orig_height, orig_width = height, width
            height, width = self.classify_height_width_bin(height, width, ratios=aspect_ratio_bin)

@@ -867,7 +822,6 @@ class PixArtAlphaPipeline(DiffusionPipeline):
            prompt_attention_mask=prompt_attention_mask,
            negative_prompt_attention_mask=negative_prompt_attention_mask,
            clean_caption=clean_caption,
-            max_sequence_length=max_sequence_length,
        )
        if do_classifier_free_guidance:
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
@@ -1,50 +0,0 @@
-from typing import TYPE_CHECKING
-
-from ...utils import (
-    DIFFUSERS_SLOW_IMPORT,
-    OptionalDependencyNotAvailable,
-    _LazyModule,
-    get_objects_from_module,
-    is_torch_available,
-    is_transformers_available,
-)
-
-
-_dummy_objects = {}
-_import_structure = {}
-
-try:
-    if not (is_transformers_available() and is_torch_available()):
-        raise OptionalDependencyNotAvailable()
-except OptionalDependencyNotAvailable:
-    from ...utils import dummy_torch_and_transformers_objects
-
-    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
-else:
-    _import_structure["pipeline_stable_cascade"] = ["StableCascadeDecoderPipeline"]
-    _import_structure["pipeline_stable_cascade_combined"] = ["StableCascadeCombinedPipeline"]
-    _import_structure["pipeline_stable_cascade_prior"] = ["StableCascadePriorPipeline"]
-
-
-if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
-    try:
-        if not (is_transformers_available() and is_torch_available()):
-            raise OptionalDependencyNotAvailable()
-    except OptionalDependencyNotAvailable:
-        from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
-    else:
-        from .pipeline_stable_cascade import StableCascadeDecoderPipeline
-        from .pipeline_stable_cascade_combined import StableCascadeCombinedPipeline
-        from .pipeline_stable_cascade_prior import StableCascadePriorPipeline
-else:
-    import sys
-
-    sys.modules[__name__] = _LazyModule(
-        __name__,
-        globals()["__file__"],
-        _import_structure,
-        module_spec=__spec__,
-    )
-
-    for name, value in _dummy_objects.items():
-        setattr(sys.modules[__name__], name, value)
@@ -1,467 +0,0 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Callable, Dict, List, Optional, Union
-
-import torch
-from transformers import CLIPTextModel, CLIPTokenizer
-
-from ...models import StableCascadeUNet
-from ...schedulers import DDPMWuerstchenScheduler
-from ...utils import is_torch_version, logging, replace_example_docstring
-from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
-from ..wuerstchen.modeling_paella_vq_model import PaellaVQModel
-
-
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-
-EXAMPLE_DOC_STRING = """
-    Examples:
-        ```py
-        >>> import torch
-        >>> from diffusers import StableCascadePriorPipeline, StableCascadeDecoderPipeline
-
-        >>> prior_pipe = StableCascadePriorPipeline.from_pretrained(
-        ...     "stabilityai/stable-cascade-prior", torch_dtype=torch.bfloat16
-        ... ).to("cuda")
-        >>> gen_pipe = StableCascadeDecoderPipeline.from_pretrained(
-        ...     "stabilityai/stable-cascade", torch_dtype=torch.float16
-        ... ).to("cuda")
-
-        >>> prompt = "an image of a shiba inu, donning a spacesuit and helmet"
-        >>> prior_output = prior_pipe(prompt)
-        >>> images = gen_pipe(prior_output.image_embeddings, prompt=prompt)
-        ```
-"""
-
-
-class StableCascadeDecoderPipeline(DiffusionPipeline):
-    """
-    Pipeline for generating images from the Stable Cascade model.
-
-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
-    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
-
-    Args:
-        tokenizer (`CLIPTokenizer`):
-            The CLIP tokenizer.
-        text_encoder (`CLIPTextModel`):
-            The CLIP text encoder.
-        decoder ([`StableCascadeUNet`]):
-            The Stable Cascade decoder unet.
-        vqgan ([`PaellaVQModel`]):
-            The VQGAN model.
-        scheduler ([`DDPMWuerstchenScheduler`]):
-            A scheduler to be used in combination with `decoder` to denoise the image latents.
-        latent_dim_scale (float, `optional`, defaults to 10.67):
-            Multiplier to determine the VQ latent space size from the image embeddings. If the image embeddings are
-            height=24 and width=24, the VQ latent shape needs to be height=int(24*10.67)=256 and
-            width=int(24*10.67)=256 in order to match the training conditions.
-    """
-
-    unet_name = "decoder"
-    text_encoder_name = "text_encoder"
-    model_cpu_offload_seq = "text_encoder->decoder->vqgan"
-    _callback_tensor_inputs = [
-        "latents",
-        "prompt_embeds_pooled",
-        "negative_prompt_embeds",
-        "image_embeddings",
-    ]
-
-    def __init__(
-        self,
-        decoder: StableCascadeUNet,
-        tokenizer: CLIPTokenizer,
-        text_encoder: CLIPTextModel,
-        scheduler: DDPMWuerstchenScheduler,
-        vqgan: PaellaVQModel,
-        latent_dim_scale: float = 10.67,
-    ) -> None:
-        super().__init__()
-        self.register_modules(
-            decoder=decoder,
-            tokenizer=tokenizer,
-            text_encoder=text_encoder,
-            scheduler=scheduler,
-            vqgan=vqgan,
-        )
-        self.register_to_config(latent_dim_scale=latent_dim_scale)
-
-    def prepare_latents(self, image_embeddings, num_images_per_prompt, dtype, device, generator, latents, scheduler):
-        batch_size, channels, height, width = image_embeddings.shape
-        latents_shape = (
-            batch_size * num_images_per_prompt,
-            4,
-            int(height * self.config.latent_dim_scale),
-            int(width * self.config.latent_dim_scale),
-        )
-
-        if latents is None:
-            latents = randn_tensor(latents_shape, generator=generator, device=device, dtype=dtype)
-        else:
-            if latents.shape != latents_shape:
-                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
-            latents = latents.to(device)
-
-        latents = latents * scheduler.init_noise_sigma
-        return latents
-
-    def encode_prompt(
-        self,
-        device,
-        batch_size,
-        num_images_per_prompt,
-        do_classifier_free_guidance,
-        prompt=None,
-        negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        prompt_embeds_pooled: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds_pooled: Optional[torch.FloatTensor] = None,
-    ):
-        if prompt_embeds is None:
-            # get prompt text embeddings
-            text_inputs = self.tokenizer(
-                prompt,
-                padding="max_length",
-                max_length=self.tokenizer.model_max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-            text_input_ids = text_inputs.input_ids
-            attention_mask = text_inputs.attention_mask
-
-            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
-
-            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
-                text_input_ids, untruncated_ids
-            ):
-                removed_text = self.tokenizer.batch_decode(
-                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
-                )
-                logger.warning(
-                    "The following part of your input was truncated because CLIP can only handle sequences up to"
-                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-                )
-                text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
-                attention_mask = attention_mask[:, : self.tokenizer.model_max_length]
-
-            text_encoder_output = self.text_encoder(
-                text_input_ids.to(device), attention_mask=attention_mask.to(device), output_hidden_states=True
-            )
-            prompt_embeds = text_encoder_output.hidden_states[-1]
-            if prompt_embeds_pooled is None:
-                prompt_embeds_pooled = text_encoder_output.text_embeds.unsqueeze(1)
-
-        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
-        prompt_embeds_pooled = prompt_embeds_pooled.to(dtype=self.text_encoder.dtype, device=device)
-        prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
-        prompt_embeds_pooled = prompt_embeds_pooled.repeat_interleave(num_images_per_prompt, dim=0)
-
-        if negative_prompt_embeds is None and do_classifier_free_guidance:
-            uncond_tokens: List[str]
-            if negative_prompt is None:
-                uncond_tokens = [""] * batch_size
-            elif type(prompt) is not type(negative_prompt):
-                raise TypeError(
-                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
-                    f" {type(prompt)}."
-                )
-            elif isinstance(negative_prompt, str):
-                uncond_tokens = [negative_prompt]
-            elif batch_size != len(negative_prompt):
-                raise ValueError(
-                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
-                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
-                    " the batch size of `prompt`."
-                )
-            else:
-                uncond_tokens = negative_prompt
-
-            uncond_input = self.tokenizer(
-                uncond_tokens,
-                padding="max_length",
-                max_length=self.tokenizer.model_max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-            negative_prompt_embeds_text_encoder_output = self.text_encoder(
-                uncond_input.input_ids.to(device),
-                attention_mask=uncond_input.attention_mask.to(device),
-                output_hidden_states=True,
-            )
-
-            negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.hidden_states[-1]
-            negative_prompt_embeds_pooled = negative_prompt_embeds_text_encoder_output.text_embeds.unsqueeze(1)
-
-        if do_classifier_free_guidance:
-            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
-            seq_len = negative_prompt_embeds.shape[1]
-            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
-            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
-            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
-
-            seq_len = negative_prompt_embeds_pooled.shape[1]
-            negative_prompt_embeds_pooled = negative_prompt_embeds_pooled.to(
-                dtype=self.text_encoder.dtype, device=device
-            )
-            negative_prompt_embeds_pooled = negative_prompt_embeds_pooled.repeat(1, num_images_per_prompt, 1)
-            negative_prompt_embeds_pooled = negative_prompt_embeds_pooled.view(
-                batch_size * num_images_per_prompt, seq_len, -1
-            )
-            # done duplicates
-
-        return prompt_embeds, prompt_embeds_pooled, negative_prompt_embeds, negative_prompt_embeds_pooled
-
-    def check_inputs(
-        self,
-        prompt,
-        negative_prompt=None,
-        prompt_embeds=None,
-        negative_prompt_embeds=None,
-        callback_on_step_end_tensor_inputs=None,
-    ):
-        if callback_on_step_end_tensor_inputs is not None and not all(
-            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
-        ):
-            raise ValueError(
-                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
-            )
-
-        if prompt is not None and prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
-                " only forward one of the two."
-            )
-        elif prompt is None and prompt_embeds is None:
-            raise ValueError(
-                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
-            )
-        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-        if negative_prompt is not None and negative_prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
-                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
-            )
-
-        if prompt_embeds is not None and negative_prompt_embeds is not None:
-            if prompt_embeds.shape != negative_prompt_embeds.shape:
-                raise ValueError(
-                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
-                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
-                    f" {negative_prompt_embeds.shape}."
-                )
-
-    @property
-    def guidance_scale(self):
-        return self._guidance_scale
-
-    @property
-    def do_classifier_free_guidance(self):
-        return self._guidance_scale > 1
-
-    @property
-    def num_timesteps(self):
-        return self._num_timesteps
-
-    @torch.no_grad()
-    @replace_example_docstring(EXAMPLE_DOC_STRING)
-    def __call__(
-        self,
-        image_embeddings: Union[torch.FloatTensor, List[torch.FloatTensor]],
-        prompt: Union[str, List[str]] = None,
-        num_inference_steps: int = 10,
-        guidance_scale: float = 0.0,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        num_images_per_prompt: int = 1,
-        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
-        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
-    ):
-        """
-        Function invoked when calling the pipeline for generation.
-
-        Args:
-            image_embeddings (`torch.FloatTensor` or `List[torch.FloatTensor]`):
-                Image Embeddings either extracted from an image or generated by a Prior Model.
-            prompt (`str` or `List[str]`):
-                The prompt or prompts to guide the image generation.
-            num_inference_steps (`int`, *optional*, defaults to 10):
-                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-                expense of slower inference.
-            guidance_scale (`float`, *optional*, defaults to 0.0):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting
-                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely
-                linked to the text `prompt`, usually at the expense of lower image quality.
-            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
-                if `guidance_scale` is not greater than `1`).
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
-            num_images_per_prompt (`int`, *optional*, defaults to 1):
-                The number of images to generate per prompt.
-            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
-                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
-                to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
-                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
-                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                tensor will be generated by sampling using the supplied random `generator`.
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generated image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
-                (`np.array`) or `"pt"` (`torch.Tensor`).
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
-            callback_on_step_end (`Callable`, *optional*):
-                A function that is called at the end of each denoising step during the inference. The function is called
-                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
-                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
-                `callback_on_step_end_tensor_inputs`.
-            callback_on_step_end_tensor_inputs (`List`, *optional*):
-                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
-                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
-                `._callback_tensor_inputs` attribute of your pipeline class.
-
-        Examples:
-
-        Returns:
-            [`~pipelines.ImagePipelineOutput`] if `return_dict` is True, otherwise the generated images are returned
-            directly (PIL images, a NumPy array or a tensor depending on `output_type`; the raw latents when
-            `output_type="latent"`).
-        """
-
-        # 0. Define commonly used variables
-        device = self._execution_device
-        dtype = self.decoder.dtype
-        self._guidance_scale = guidance_scale
-        if is_torch_version("<", "2.2.0") and dtype == torch.bfloat16:
-            raise ValueError("`StableCascadeDecoderPipeline` requires torch>=2.2.0 when using `torch.bfloat16` dtype.")
-
-        # 1. Check inputs. Raise error if not correct
-        self.check_inputs(
-            prompt,
-            negative_prompt=negative_prompt,
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
-            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
-        )
-        if isinstance(image_embeddings, list):
-            image_embeddings = torch.cat(image_embeddings, dim=0)
-        batch_size = image_embeddings.shape[0]
-
-        # 2. Encode caption
-        if prompt_embeds is None and negative_prompt_embeds is None:
-            prompt_embeds, _, negative_prompt_embeds, _ = self.encode_prompt(
-                prompt=prompt,
-                device=device,
-                batch_size=batch_size,
-                num_images_per_prompt=num_images_per_prompt,
-                do_classifier_free_guidance=self.do_classifier_free_guidance,
-                negative_prompt=negative_prompt,
-                prompt_embeds=prompt_embeds,
-                negative_prompt_embeds=negative_prompt_embeds,
-            )
-        prompt_embeds_pooled = (
-            torch.cat([prompt_embeds, negative_prompt_embeds]) if self.do_classifier_free_guidance else prompt_embeds
-        )
-        effnet = (
-            torch.cat([image_embeddings, torch.zeros_like(image_embeddings)])
-            if self.do_classifier_free_guidance
-            else image_embeddings
-        )
-
-        self.scheduler.set_timesteps(num_inference_steps, device=device)
-        timesteps = self.scheduler.timesteps
-
-        # 5. Prepare latents
-        latents = self.prepare_latents(
-            image_embeddings, num_images_per_prompt, dtype, device, generator, latents, self.scheduler
-        )
-
-        # 6. Run denoising loop
-        self._num_timesteps = len(timesteps[:-1])
-        for i, t in enumerate(self.progress_bar(timesteps[:-1])):
-            timestep_ratio = t.expand(latents.size(0)).to(dtype)
-
-            # 7. Denoise latents
-            predicted_latents = self.decoder(
-                sample=torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents,
-                timestep_ratio=torch.cat([timestep_ratio] * 2) if self.do_classifier_free_guidance else timestep_ratio,
-                clip_text_pooled=prompt_embeds_pooled,
-                effnet=effnet,
-                return_dict=False,
-            )[0]
-
-            # 8. Check for classifier free guidance and apply it
-            if self.do_classifier_free_guidance:
-                predicted_latents_text, predicted_latents_uncond = predicted_latents.chunk(2)
-                predicted_latents = torch.lerp(predicted_latents_uncond, predicted_latents_text, self.guidance_scale)
-
-            # 9. Renoise latents to next timestep
-            latents = self.scheduler.step(
-                model_output=predicted_latents,
-                timestep=timestep_ratio,
-                sample=latents,
-                generator=generator,
-            ).prev_sample
-
-            if callback_on_step_end is not None:
-                callback_kwargs = {}
-                for k in callback_on_step_end_tensor_inputs:
-                    callback_kwargs[k] = locals()[k]
-                callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
-
-                latents = callback_outputs.pop("latents", latents)
-                prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
-                negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
-
-        if output_type not in ["pt", "np", "pil", "latent"]:
-            raise ValueError(
-                f"Only the output types `pt`, `np`, `pil` and `latent` are supported not output_type={output_type}"
-            )
-
-        if not output_type == "latent":
-            # 10. Scale and decode the image latents with vq-vae
-            latents = self.vqgan.config.scale_factor * latents
-            images = self.vqgan.decode(latents).sample.clamp(0, 1)
-            if output_type == "np":
-                images = images.permute(0, 2, 3, 1).cpu().float().numpy()  # float() as bfloat16-> numpy doesnt work
-            elif output_type == "pil":
-                images = images.permute(0, 2, 3, 1).cpu().float().numpy()  # float() as bfloat16-> numpy doesnt work
-                images = self.numpy_to_pil(images)
-        else:
-            images = latents
-
-        # Offload all models
-        self.maybe_free_model_hooks()
-
-        if not return_dict:
-            return images
-        return ImagePipelineOutput(images)
@@ -1,294 +0,0 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from typing import Callable, Dict, List, Optional, Union
-
-import PIL
-import torch
-from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
-
-from ...models import StableCascadeUNet
-from ...schedulers import DDPMWuerstchenScheduler
-from ...utils import replace_example_docstring
-from ..pipeline_utils import DiffusionPipeline
-from ..wuerstchen.modeling_paella_vq_model import PaellaVQModel
-from .pipeline_stable_cascade import StableCascadeDecoderPipeline
-from .pipeline_stable_cascade_prior import StableCascadePriorPipeline
-
-
-# Usage example substituted into the `Examples:` section of `StableCascadeCombinedPipeline.__call__`.
-TEXT2IMAGE_EXAMPLE_DOC_STRING = """
-    Examples:
-        ```py
-        >>> import torch
-        >>> from diffusers import StableCascadeCombinedPipeline
-
-        >>> pipe = StableCascadeCombinedPipeline.from_pretrained(
-        ...     "stabilityai/stable-cascade-combined", torch_dtype=torch.bfloat16
-        ... ).to("cuda")
-        >>> prompt = "an image of a shiba inu, donning a spacesuit and helmet"
-        >>> images = pipe(prompt=prompt)
-        ```
-"""
-
-
-class StableCascadeCombinedPipeline(DiffusionPipeline):
-    """
-    Combined Pipeline for text-to-image generation using Stable Cascade.
-
-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
-    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
-
-    Args:
-        tokenizer (`CLIPTokenizer`):
-            The decoder tokenizer to be used for text inputs.
-        text_encoder (`CLIPTextModel`):
-            The decoder text encoder to be used for text inputs.
-        decoder (`StableCascadeUNet`):
-            The decoder model to be used for decoder image generation pipeline.
-        scheduler (`DDPMWuerstchenScheduler`):
-            The scheduler to be used for decoder image generation pipeline.
-        vqgan (`PaellaVQModel`):
-            The VQGAN model to be used for decoder image generation pipeline.
-        prior_prior (`StableCascadeUNet`):
-            The prior model to be used for prior pipeline.
-        prior_text_encoder (`CLIPTextModel`):
-            The prior text encoder to be used for text inputs.
-        prior_tokenizer (`CLIPTokenizer`):
-            The prior tokenizer to be used for text inputs.
-        prior_scheduler (`DDPMWuerstchenScheduler`):
-            The scheduler to be used for prior pipeline.
-        prior_feature_extractor ([`~transformers.CLIPImageProcessor`], *optional*):
-            Model that extracts features from generated images to be used as inputs for the `image_encoder`.
-        prior_image_encoder ([`CLIPVisionModelWithProjection`], *optional*):
-            Frozen CLIP image-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
-    """
-
-    _load_connected_pipes = True
-
-    def __init__(
-        self,
-        tokenizer: CLIPTokenizer,
-        text_encoder: CLIPTextModel,
-        decoder: StableCascadeUNet,
-        scheduler: DDPMWuerstchenScheduler,
-        vqgan: PaellaVQModel,
-        prior_prior: StableCascadeUNet,
-        prior_text_encoder: CLIPTextModel,
-        prior_tokenizer: CLIPTokenizer,
-        prior_scheduler: DDPMWuerstchenScheduler,
-        prior_feature_extractor: Optional[CLIPImageProcessor] = None,
-        prior_image_encoder: Optional[CLIPVisionModelWithProjection] = None,
-    ):
-        super().__init__()
-
-        self.register_modules(
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            decoder=decoder,
-            scheduler=scheduler,
-            vqgan=vqgan,
-            prior_text_encoder=prior_text_encoder,
-            prior_tokenizer=prior_tokenizer,
-            prior_prior=prior_prior,
-            prior_scheduler=prior_scheduler,
-            prior_feature_extractor=prior_feature_extractor,
-            prior_image_encoder=prior_image_encoder,
-        )
-        # The two sub-pipelines are built from the same component objects registered above.
-        self.prior_pipe = StableCascadePriorPipeline(
-            prior=prior_prior,
-            text_encoder=prior_text_encoder,
-            tokenizer=prior_tokenizer,
-            scheduler=prior_scheduler,
-            image_encoder=prior_image_encoder,
-            feature_extractor=prior_feature_extractor,
-        )
-        self.decoder_pipe = StableCascadeDecoderPipeline(
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            decoder=decoder,
-            scheduler=scheduler,
-            vqgan=vqgan,
-        )
-
-    def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
-        """Enable xFormers memory-efficient attention on the decoder pipeline (the prior pipeline is not touched)."""
-        self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
-
-    def enable_model_cpu_offload(self, gpu_id=0):
-        r"""
-        Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
-        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
-        method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
-        `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
-        """
-        self.prior_pipe.enable_model_cpu_offload(gpu_id=gpu_id)
-        self.decoder_pipe.enable_model_cpu_offload(gpu_id=gpu_id)
-
-    def enable_sequential_cpu_offload(self, gpu_id=0):
-        r"""
-        Offloads all models (`unet`, `text_encoder`, `vae`, and `safety checker` state dicts) to CPU using 🤗
-        Accelerate, significantly reducing memory usage. Models are moved to a `torch.device('meta')` and loaded on a
-        GPU only when their specific submodule's `forward` method is called. Offloading happens on a submodule basis.
-        Memory savings are higher than using `enable_model_cpu_offload`, but performance is lower.
-        """
-        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
-        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
-
-    def progress_bar(self, iterable=None, total=None):
-        """
-        Forward the call to the `progress_bar` method of both sub-pipelines.
-
-        The values returned by the sub-pipelines are discarded, so this method itself returns `None`.
-        """
-        self.prior_pipe.progress_bar(iterable=iterable, total=total)
-        self.decoder_pipe.progress_bar(iterable=iterable, total=total)
-
-    def set_progress_bar_config(self, **kwargs):
-        """Apply the same progress-bar configuration to the prior and the decoder pipeline."""
-        self.prior_pipe.set_progress_bar_config(**kwargs)
-        self.decoder_pipe.set_progress_bar_config(**kwargs)
-
-    @torch.no_grad()
-    @replace_example_docstring(TEXT2IMAGE_EXAMPLE_DOC_STRING)
-    def __call__(
-        self,
-        prompt: Optional[Union[str, List[str]]] = None,
-        images: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]] = None,
-        height: int = 512,
-        width: int = 512,
-        prior_num_inference_steps: int = 60,
-        prior_timesteps: Optional[List[float]] = None,
-        prior_guidance_scale: float = 4.0,
-        num_inference_steps: int = 12,
-        decoder_timesteps: Optional[List[float]] = None,
-        decoder_guidance_scale: float = 0.0,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        num_images_per_prompt: int = 1,
-        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-        prior_callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
-        prior_callback_on_step_end_tensor_inputs: List[str] = ["latents"],
-        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
-        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
-    ):
-        """
-        Function invoked when calling the pipeline for generation.
-
-        Args:
-            prompt (`str` or `List[str]`):
-                The prompt or prompts to guide the image generation for the prior and decoder.
-            images (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, *optional*):
-                The images to guide the image generation for the prior.
-            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
-                if `guidance_scale` is less than `1`).
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.*
-                prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt`
-                input argument.
-            num_images_per_prompt (`int`, *optional*, defaults to 1):
-                The number of images to generate per prompt.
-            height (`int`, *optional*, defaults to 512):
-                The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to 512):
-                The width in pixels of the generated image.
-            prior_guidance_scale (`float`, *optional*, defaults to 4.0):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `prior_guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting
-                `prior_guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked
-                to the text `prompt`, usually at the expense of lower image quality.
-            prior_num_inference_steps (`int`, *optional*, defaults to 60):
-                The number of prior denoising steps. More denoising steps usually lead to a higher quality image at the
-                expense of slower inference. For more specific timestep spacing, you can pass customized
-                `prior_timesteps`
-            prior_timesteps (`List[float]`, *optional*):
-                Custom timesteps for the prior. Passed to the prior pipeline as its `timesteps` argument.
-            num_inference_steps (`int`, *optional*, defaults to 12):
-                The number of decoder denoising steps. More denoising steps usually lead to a higher quality image at
-                the expense of slower inference.
-            decoder_timesteps (`List[float]`, *optional*):
-                Currently not forwarded to the decoder pipeline by this method.
-            decoder_guidance_scale (`float`, *optional*, defaults to 0.0):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality.
-            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
-                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
-                to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
-                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
-                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                tensor will be generated by sampling using the supplied random `generator`.
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
-                (`np.array`) or `"pt"` (`torch.Tensor`).
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
-            prior_callback_on_step_end (`Callable`, *optional*):
-                A function that calls at the end of each denoising steps during the inference. The function is called
-                with the following arguments: `prior_callback_on_step_end(self: DiffusionPipeline, step: int, timestep:
-                int, callback_kwargs: Dict)`.
-            prior_callback_on_step_end_tensor_inputs (`List`, *optional*):
-                The list of tensor inputs for the `prior_callback_on_step_end` function. The tensors specified in the
-                list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in
-                the `._callback_tensor_inputs` attribute of your pipeline class.
-            callback_on_step_end (`Callable`, *optional*):
-                A function that calls at the end of each denoising steps during the inference. The function is called
-                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
-                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
-                `callback_on_step_end_tensor_inputs`.
-            callback_on_step_end_tensor_inputs (`List`, *optional*):
-                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
-                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
-                `._callback_tensor_inputs` attribute of your pipeline class.
-
-        Examples:
-
-        Returns:
-            [`~pipelines.ImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When returning a tuple,
-            the first element is a list with the generated images.
-        """
-
-        # Stage 1: run the prior. `prompt` / `negative_prompt` are only passed when no embeddings were supplied,
-        # so the prior never receives both forms of the same input.
-        prior_outputs = self.prior_pipe(
-            prompt=prompt if prompt_embeds is None else None,
-            images=images,
-            height=height,
-            width=width,
-            num_inference_steps=prior_num_inference_steps,
-            timesteps=prior_timesteps,
-            guidance_scale=prior_guidance_scale,
-            negative_prompt=negative_prompt if negative_prompt_embeds is None else None,
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
-            num_images_per_prompt=num_images_per_prompt,
-            generator=generator,
-            latents=latents,
-            output_type="pt",
-            return_dict=True,
-            callback_on_step_end=prior_callback_on_step_end,
-            callback_on_step_end_tensor_inputs=prior_callback_on_step_end_tensor_inputs,
-        )
-        image_embeddings = prior_outputs.image_embeddings
-        # Reuse the text embeddings reported by the prior (if any) for the decoder stage.
-        prompt_embeds = prior_outputs.get("prompt_embeds", None)
-        negative_prompt_embeds = prior_outputs.get("negative_prompt_embeds", None)
-
-        # Stage 2: decode the prior's image embeddings into the requested output type.
-        outputs = self.decoder_pipe(
-            image_embeddings=image_embeddings,
-            prompt=prompt if prompt_embeds is None else None,
-            num_inference_steps=num_inference_steps,
-            guidance_scale=decoder_guidance_scale,
-            negative_prompt=negative_prompt if negative_prompt_embeds is None else None,
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
-            generator=generator,
-            output_type=output_type,
-            return_dict=return_dict,
-            callback_on_step_end=callback_on_step_end,
-            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
-        )
-
-        return outputs
@@ -1,614 +0,0 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from dataclasses import dataclass
-from math import ceil
-from typing import Callable, Dict, List, Optional, Union
-
-import numpy as np
-import PIL
-import torch
-from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection
-
-from ...models import StableCascadeUNet
-from ...schedulers import DDPMWuerstchenScheduler
-from ...utils import BaseOutput, logging, replace_example_docstring
-from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline
-
-
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-
-# 30 values: 20 evenly spaced from 1.0 down to 2/3, then 10 more from 2/3 (exclusive) down to 0.0.
-DEFAULT_STAGE_C_TIMESTEPS = list(np.linspace(1.0, 2 / 3, 20)) + list(np.linspace(2 / 3, 0.0, 11))[1:]
-
-# Usage example substituted into the `Examples:` section of `StableCascadePriorPipeline.__call__`.
-EXAMPLE_DOC_STRING = """
-    Examples:
-        ```py
-        >>> import torch
-        >>> from diffusers import StableCascadePriorPipeline
-
-        >>> prior_pipe = StableCascadePriorPipeline.from_pretrained(
-        ...     "stabilityai/stable-cascade-prior", torch_dtype=torch.bfloat16
-        ... ).to("cuda")
-
-        >>> prompt = "an image of a shiba inu, donning a spacesuit and helmet"
-        >>> prior_output = prior_pipe(prompt)
-        ```
-"""
-
-
-@dataclass
-class StableCascadePriorPipelineOutput(BaseOutput):
-    """
-    Output class for StableCascadePriorPipeline.
-
-    Args:
-        image_embeddings (`torch.FloatTensor` or `np.ndarray`):
-            Prior image embeddings for text prompt
-        prompt_embeds (`torch.FloatTensor` or `np.ndarray`):
-            Text embeddings for the prompt.
-        negative_prompt_embeds (`torch.FloatTensor` or `np.ndarray`):
-            Text embeddings for the negative prompt.
-    """
-
-    image_embeddings: Union[torch.FloatTensor, np.ndarray]
-    prompt_embeds: Union[torch.FloatTensor, np.ndarray]
-    negative_prompt_embeds: Union[torch.FloatTensor, np.ndarray]
-
-
-class StableCascadePriorPipeline(DiffusionPipeline):
-    """
-    Pipeline for generating image prior for Stable Cascade.
-
-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
-    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
-
-    Args:
-        prior ([`StableCascadeUNet`]):
-            The Stable Cascade prior to approximate the image embedding from the text and/or image embedding.
-        text_encoder ([`CLIPTextModelWithProjection`]):
-            Frozen text-encoder ([laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)).
-        feature_extractor ([`~transformers.CLIPImageProcessor`]):
-            Model that extracts features from generated images to be used as inputs for the `image_encoder`.
-        image_encoder ([`CLIPVisionModelWithProjection`]):
-            Frozen CLIP image-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
-        tokenizer (`CLIPTokenizer`):
-            Tokenizer of class
-            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
-        scheduler ([`DDPMWuerstchenScheduler`]):
-            A scheduler to be used in combination with `prior` to generate image embedding.
-        resolution_multiple (`float`, *optional*, defaults to 42.67):
-            Divisor applied to the requested image height and width (rounding up) to obtain the height and width of
-            the prior latents.
-    """
-
-    unet_name = "prior"
-    text_encoder_name = "text_encoder"
-    model_cpu_offload_seq = "image_encoder->text_encoder->prior"
-    _optional_components = ["image_encoder", "feature_extractor"]
-    _callback_tensor_inputs = ["latents", "text_encoder_hidden_states", "negative_prompt_embeds"]
-
-    def __init__(
-        self,
-        tokenizer: CLIPTokenizer,
-        text_encoder: CLIPTextModelWithProjection,
-        prior: StableCascadeUNet,
-        scheduler: DDPMWuerstchenScheduler,
-        resolution_multiple: float = 42.67,
-        feature_extractor: Optional[CLIPImageProcessor] = None,
-        image_encoder: Optional[CLIPVisionModelWithProjection] = None,
-    ) -> None:
-        """
-        Register the pipeline components and store `resolution_multiple` in the pipeline config.
-
-        `feature_extractor` and `image_encoder` may be left as `None`; both are listed in `_optional_components`.
-        """
-        super().__init__()
-        self.register_modules(
-            tokenizer=tokenizer,
-            text_encoder=text_encoder,
-            image_encoder=image_encoder,
-            feature_extractor=feature_extractor,
-            prior=prior,
-            scheduler=scheduler,
-        )
-        # Kept in the config so `prepare_latents` can read it back as `self.config.resolution_multiple`.
-        self.register_to_config(resolution_multiple=resolution_multiple)
-
-    def prepare_latents(
-        self, batch_size, height, width, num_images_per_prompt, dtype, device, generator, latents, scheduler
-    ):
-        """
-        Create (or validate) the initial latents for the prior and scale them for `scheduler`.
-
-        The expected shape is `(num_images_per_prompt * batch_size, prior.config.in_channels, ceil(height /
-        resolution_multiple), ceil(width / resolution_multiple))`. When `latents` is `None`, a tensor of that shape
-        is drawn with `randn_tensor`; otherwise the given tensor is checked against that shape and moved to `device`.
-        In both cases the result is multiplied by `scheduler.init_noise_sigma`.
-
-        Raises:
-            ValueError: If `latents` is given with a shape other than the expected one.
-        """
-        multiple = self.config.resolution_multiple
-        expected_shape = (
-            num_images_per_prompt * batch_size,
-            self.prior.config.in_channels,
-            ceil(height / multiple),
-            ceil(width / multiple),
-        )
-
-        if latents is not None:
-            # Caller-supplied latents: only the shape is checked and the device changed; dtype is left as given.
-            if latents.shape != expected_shape:
-                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {expected_shape}")
-            return latents.to(device) * scheduler.init_noise_sigma
-
-        noise = randn_tensor(expected_shape, generator=generator, device=device, dtype=dtype)
-        return noise * scheduler.init_noise_sigma
-
-    def encode_prompt(
-        self,
-        device,
-        batch_size,
-        num_images_per_prompt,
-        do_classifier_free_guidance,
-        prompt=None,
-        negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        prompt_embeds_pooled: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds_pooled: Optional[torch.FloatTensor] = None,
-    ):
-        """
-        Produce the (positive and, under classifier-free guidance, negative) text embeddings for the prior.
-
-        When `prompt_embeds` is `None`, `prompt` is tokenized and run through `self.text_encoder`; the last hidden
-        state becomes `prompt_embeds` and, unless already supplied, `text_embeds` with an extra axis at dim 1 becomes
-        `prompt_embeds_pooled`. The negative embeddings are computed the same way from `negative_prompt` (an empty
-        string per batch item when it is `None`) if `negative_prompt_embeds` is `None` and guidance is enabled.
-        All returned tensors are cast to the text encoder dtype, moved to `device` and repeated
-        `num_images_per_prompt` times along the batch axis.
-
-        Returns:
-            `(prompt_embeds, prompt_embeds_pooled, negative_prompt_embeds, negative_prompt_embeds_pooled)`. The two
-            negative entries are returned as passed in when guidance is disabled.
-
-        Raises:
-            TypeError: If `prompt` and `negative_prompt` are both given but have different types.
-            ValueError: If a list `negative_prompt` does not have `batch_size` entries, or if embeddings are
-                supplied without their pooled counterpart.
-        """
-        if prompt_embeds is None:
-            # get prompt text embeddings
-            text_inputs = self.tokenizer(
-                prompt,
-                padding="max_length",
-                max_length=self.tokenizer.model_max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-            text_input_ids = text_inputs.input_ids
-            attention_mask = text_inputs.attention_mask
-
-            # Tokenize again without truncation only to detect (and report) text that was cut off.
-            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
-
-            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
-                text_input_ids, untruncated_ids
-            ):
-                removed_text = self.tokenizer.batch_decode(
-                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
-                )
-                logger.warning(
-                    "The following part of your input was truncated because CLIP can only handle sequences up to"
-                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-                )
-                text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
-                attention_mask = attention_mask[:, : self.tokenizer.model_max_length]
-
-            text_encoder_output = self.text_encoder(
-                text_input_ids.to(device), attention_mask=attention_mask.to(device), output_hidden_states=True
-            )
-            prompt_embeds = text_encoder_output.hidden_states[-1]
-            if prompt_embeds_pooled is None:
-                prompt_embeds_pooled = text_encoder_output.text_embeds.unsqueeze(1)
-        elif prompt_embeds_pooled is None:
-            # Previously this surfaced as an AttributeError on `None` a few lines below.
-            raise ValueError(
-                "If `prompt_embeds` are provided, `prompt_embeds_pooled` must also be provided. Make sure to generate"
-                " `prompt_embeds_pooled` from the same text encoder that was used to generate `prompt_embeds`."
-            )
-
-        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
-        prompt_embeds_pooled = prompt_embeds_pooled.to(dtype=self.text_encoder.dtype, device=device)
-        prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
-        prompt_embeds_pooled = prompt_embeds_pooled.repeat_interleave(num_images_per_prompt, dim=0)
-
-        if negative_prompt_embeds is None and do_classifier_free_guidance:
-            uncond_tokens: List[str]
-            if negative_prompt is None:
-                uncond_tokens = [""] * batch_size
-            elif prompt is not None and type(prompt) is not type(negative_prompt):
-                # Only comparable when a prompt was given; with `prompt_embeds` input `prompt` is None.
-                raise TypeError(
-                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
-                    f" {type(prompt)}."
-                )
-            elif isinstance(negative_prompt, str):
-                # One copy per batch item so the reshape below always sees `batch_size` rows.
-                uncond_tokens = [negative_prompt] * batch_size
-            elif batch_size != len(negative_prompt):
-                raise ValueError(
-                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
-                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
-                    " the batch size of `prompt`."
-                )
-            else:
-                uncond_tokens = negative_prompt
-
-            uncond_input = self.tokenizer(
-                uncond_tokens,
-                padding="max_length",
-                max_length=self.tokenizer.model_max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-            negative_prompt_embeds_text_encoder_output = self.text_encoder(
-                uncond_input.input_ids.to(device),
-                attention_mask=uncond_input.attention_mask.to(device),
-                output_hidden_states=True,
-            )
-
-            negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.hidden_states[-1]
-            negative_prompt_embeds_pooled = negative_prompt_embeds_text_encoder_output.text_embeds.unsqueeze(1)
-
-        if do_classifier_free_guidance:
-            if negative_prompt_embeds_pooled is None:
-                # Reached only when `negative_prompt_embeds` was passed in without its pooled counterpart.
-                raise ValueError(
-                    "If `negative_prompt_embeds` are provided, `negative_prompt_embeds_pooled` must also be provided."
-                    " Make sure to generate `negative_prompt_embeds_pooled` from the same text encoder that was used"
-                    " to generate `negative_prompt_embeds`."
-                )
-
-            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
-            seq_len = negative_prompt_embeds.shape[1]
-            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
-            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
-            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
-
-            seq_len = negative_prompt_embeds_pooled.shape[1]
-            negative_prompt_embeds_pooled = negative_prompt_embeds_pooled.to(
-                dtype=self.text_encoder.dtype, device=device
-            )
-            negative_prompt_embeds_pooled = negative_prompt_embeds_pooled.repeat(1, num_images_per_prompt, 1)
-            negative_prompt_embeds_pooled = negative_prompt_embeds_pooled.view(
-                batch_size * num_images_per_prompt, seq_len, -1
-            )
-            # done duplicates
-
-        return prompt_embeds, prompt_embeds_pooled, negative_prompt_embeds, negative_prompt_embeds_pooled
-
-    def encode_image(self, images, device, dtype, batch_size, num_images_per_prompt):
-        """
-        Encode conditioning images into image embeddings plus an all-zero negative counterpart.
-
-        Each image is preprocessed with `self.feature_extractor`, moved to `device`/`dtype` and passed through
-        `self.image_encoder`; its `image_embeds` receive a new axis at dim 1. The per-image results are concatenated
-        along dim 1 and the result is tiled `batch_size * num_images_per_prompt` times along dim 0.
-
-        Returns:
-            `(image_embeds, negative_image_embeds)`, where the second tensor is `torch.zeros_like` the first.
-        """
-
-        def _embed_one(img):
-            pixel_values = self.feature_extractor(img, return_tensors="pt").pixel_values
-            pixel_values = pixel_values.to(device=device, dtype=dtype)
-            return self.image_encoder(pixel_values).image_embeds.unsqueeze(1)
-
-        per_image_embeds = [_embed_one(img) for img in images]
-        stacked = torch.cat(per_image_embeds, dim=1)
-
-        image_embeds = stacked.repeat(batch_size * num_images_per_prompt, 1, 1)
-        return image_embeds, torch.zeros_like(image_embeds)
-
-    def check_inputs(
-        self,
-        prompt,
-        images=None,
-        image_embeds=None,
-        negative_prompt=None,
-        prompt_embeds=None,
-        prompt_embeds_pooled=None,
-        negative_prompt_embeds=None,
-        negative_prompt_embeds_pooled=None,
-        callback_on_step_end_tensor_inputs=None,
-    ):
-        """
-        Validate the combination of arguments passed to the pipeline call.
-
-        Checks, in order: callback tensor names are listed in `self._callback_tensor_inputs`; exactly one of
-        `prompt` / `prompt_embeds` is given and `prompt` is a `str` or `list`; `negative_prompt` and
-        `negative_prompt_embeds` are not both given; positive and negative (pooled) embeddings have matching shapes
-        when both are given; `images` and `image_embeds` are not both given; every entry of `images` is a
-        `torch.Tensor` or a `PIL.Image.Image`.
-
-        Raises:
-            ValueError: For any of the argument-combination or shape violations above.
-            TypeError: If an entry of `images` has an unsupported type.
-        """
-        if callback_on_step_end_tensor_inputs is not None:
-            unknown_inputs = [k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]
-            if unknown_inputs:
-                raise ValueError(
-                    f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {unknown_inputs}"
-                )
-
-        if prompt is not None and prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
-                " only forward one of the two."
-            )
-        elif prompt is None and prompt_embeds is None:
-            raise ValueError(
-                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
-            )
-        elif prompt is not None and not isinstance(prompt, (str, list)):
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-        if negative_prompt is not None and negative_prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
-                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
-            )
-
-        if prompt_embeds is not None and negative_prompt_embeds is not None:
-            if prompt_embeds.shape != negative_prompt_embeds.shape:
-                raise ValueError(
-                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
-                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
-                    f" {negative_prompt_embeds.shape}."
-                )
-
-        if prompt_embeds_pooled is not None and negative_prompt_embeds_pooled is not None:
-            if prompt_embeds_pooled.shape != negative_prompt_embeds_pooled.shape:
-                raise ValueError(
-                    "`prompt_embeds_pooled` and `negative_prompt_embeds_pooled` must have the same shape when passed"
-                    f" directly, but got: `prompt_embeds_pooled` {prompt_embeds_pooled.shape} !="
-                    f" `negative_prompt_embeds_pooled` {negative_prompt_embeds_pooled.shape}."
-                )
-
-        if image_embeds is not None and images is not None:
-            raise ValueError(
-                f"Cannot forward both `images`: {images} and `image_embeds`: {image_embeds}. Please make sure to"
-                " only forward one of the two."
-            )
-
-        # `is not None` instead of truthiness: `bool()` of a multi-element tensor raises.
-        if images is not None:
-            # A single tensor / PIL image is validated as a one-element list (a PIL image is not iterable).
-            if isinstance(images, (torch.Tensor, PIL.Image.Image)):
-                images = [images]
-            for i, image in enumerate(images):
-                if not isinstance(image, (torch.Tensor, PIL.Image.Image)):
-                    raise TypeError(
-                        "'images' must contain images of type 'torch.Tensor' or 'PIL.Image.Image', but got"
-                        f" {type(image)} for image number {i}."
-                    )
-
-    @property
-    def guidance_scale(self):
-        return self._guidance_scale
-
-    @property
-    def do_classifier_free_guidance(self):
-        return self._guidance_scale > 1
-
-    @property
-    def num_timesteps(self):
-        return self._num_timesteps
-
-    def get_t_condioning(self, t, alphas_cumprod):
-        s = torch.tensor([0.003])
-        clamp_range = [0, 1]
-        min_var = torch.cos(s / (1 + s) * torch.pi * 0.5) ** 2
-        var = alphas_cumprod[t]
-        var = var.clamp(*clamp_range)
-        s, min_var = s.to(var.device), min_var.to(var.device)
-        ratio = (((var * min_var) ** 0.5).acos() / (torch.pi * 0.5)) * (1 + s) - s
-        return ratio
-
-    @torch.no_grad()
-    @replace_example_docstring(EXAMPLE_DOC_STRING)
-    def __call__(
-        self,
-        prompt: Optional[Union[str, List[str]]] = None,
-        images: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]] = None,
-        height: int = 1024,
-        width: int = 1024,
-        num_inference_steps: int = 20,
-        timesteps: List[float] = None,
-        guidance_scale: float = 4.0,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        prompt_embeds_pooled: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds_pooled: Optional[torch.FloatTensor] = None,
-        image_embeds: Optional[torch.FloatTensor] = None,
-        num_images_per_prompt: Optional[int] = 1,
-        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        output_type: Optional[str] = "pt",
-        return_dict: bool = True,
-        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
-        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
-    ):
-        """
-        Function invoked when calling the pipeline for generation.
-
-        Args:
-            prompt (`str` or `List[str]`):
-                The prompt or prompts to guide the image generation.
-            height (`int`, *optional*, defaults to 1024):
-                The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to 1024):
-                The width in pixels of the generated image.
-            num_inference_steps (`int`, *optional*, defaults to 20):
-                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-                expense of slower inference.
-            guidance_scale (`float`, *optional*, defaults to 4.0):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting
-                `guidance_scale > 1`. Higher guidance scale encourages the model to generate images that are
-                closely linked to the text `prompt`, usually at the expense of lower image quality.
-            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
-                if `guidance_scale` is not greater than `1`).
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            prompt_embeds_pooled (`torch.FloatTensor`, *optional*):
-                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
-                If not provided, pooled text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
-            negative_prompt_embeds_pooled (`torch.FloatTensor`, *optional*):
-                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt` input
-                argument.
-            image_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated image embeddings. Can be used to easily tweak image inputs, *e.g.* prompt weighting.
-                If not provided, image embeddings will be generated from the `images` input argument when it is given.
-            num_images_per_prompt (`int`, *optional*, defaults to 1):
-                The number of images to generate per prompt.
-            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
-                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
-                to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
-                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
-                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                tensor will be generated by sampling using the supplied random `generator`.
-            output_type (`str`, *optional*, defaults to `"pt"`):
-                The output format of the generated image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
-                (`np.array`) or `"pt"` (`torch.Tensor`).
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`StableCascadePriorPipelineOutput`] instead of a plain tuple.
-            callback_on_step_end (`Callable`, *optional*):
-                A function that is called at the end of each denoising step during inference. The function is called
-                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
-                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
-                `callback_on_step_end_tensor_inputs`.
-            callback_on_step_end_tensor_inputs (`List`, *optional*):
-                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
-                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
-                `._callback_tensor_inputs` attribute of your pipeline class.
-
-        Examples:
-
-        Returns:
-            [`StableCascadePriorPipelineOutput`] or `tuple`: a [`StableCascadePriorPipelineOutput`] if
-            `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element holds the
-            generated image embeddings (the final latents).
-        """
-
-        # 0. Define commonly used variables
-        device = self._execution_device
-        dtype = next(self.prior.parameters()).dtype
-        self._guidance_scale = guidance_scale
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            batch_size = prompt_embeds.shape[0]
-
-        # 1. Check inputs. Raise error if not correct
-        self.check_inputs(
-            prompt,
-            images=images,
-            image_embeds=image_embeds,
-            negative_prompt=negative_prompt,
-            prompt_embeds=prompt_embeds,
-            prompt_embeds_pooled=prompt_embeds_pooled,
-            negative_prompt_embeds=negative_prompt_embeds,
-            negative_prompt_embeds_pooled=negative_prompt_embeds_pooled,
-            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
-        )
-
-        # 2. Encode caption + images
-        (
-            prompt_embeds,
-            prompt_embeds_pooled,
-            negative_prompt_embeds,
-            negative_prompt_embeds_pooled,
-        ) = self.encode_prompt(
-            prompt=prompt,
-            device=device,
-            batch_size=batch_size,
-            num_images_per_prompt=num_images_per_prompt,
-            do_classifier_free_guidance=self.do_classifier_free_guidance,
-            negative_prompt=negative_prompt,
-            prompt_embeds=prompt_embeds,
-            prompt_embeds_pooled=prompt_embeds_pooled,
-            negative_prompt_embeds=negative_prompt_embeds,
-            negative_prompt_embeds_pooled=negative_prompt_embeds_pooled,
-        )
-
-        if images is not None:
-            image_embeds_pooled, uncond_image_embeds_pooled = self.encode_image(
-                images=images,
-                device=device,
-                dtype=dtype,
-                batch_size=batch_size,
-                num_images_per_prompt=num_images_per_prompt,
-            )
-        elif image_embeds is not None:
-            image_embeds_pooled = image_embeds.repeat(batch_size * num_images_per_prompt, 1, 1)
-            uncond_image_embeds_pooled = torch.zeros_like(image_embeds_pooled)
-        else:
-            image_embeds_pooled = torch.zeros(
-                batch_size * num_images_per_prompt,
-                1,
-                self.prior.config.clip_image_in_channels,
-                device=device,
-                dtype=dtype,
-            )
-            uncond_image_embeds_pooled = torch.zeros(
-                batch_size * num_images_per_prompt,
-                1,
-                self.prior.config.clip_image_in_channels,
-                device=device,
-                dtype=dtype,
-            )
-
-        if self.do_classifier_free_guidance:
-            image_embeds = torch.cat([image_embeds_pooled, uncond_image_embeds_pooled], dim=0)
-        else:
-            image_embeds = image_embeds_pooled
-
-        # For classifier free guidance, we need to do two forward passes.
-        # Here we concatenate the unconditional and text embeddings into a single batch
-        # to avoid doing two forward passes
-        text_encoder_hidden_states = (
-            torch.cat([prompt_embeds, negative_prompt_embeds]) if negative_prompt_embeds is not None else prompt_embeds
-        )
-        text_encoder_pooled = (
-            torch.cat([prompt_embeds_pooled, negative_prompt_embeds_pooled])
-            if negative_prompt_embeds is not None
-            else prompt_embeds_pooled
-        )
-
-        # 4. Prepare and set timesteps
-        self.scheduler.set_timesteps(num_inference_steps, device=device)
-        timesteps = self.scheduler.timesteps
-
-        # 5. Prepare latents
-        latents = self.prepare_latents(
-            batch_size, height, width, num_images_per_prompt, dtype, device, generator, latents, self.scheduler
-        )
-
-        if isinstance(self.scheduler, DDPMWuerstchenScheduler):
-            timesteps = timesteps[:-1]
-        else:
-            if self.scheduler.config.clip_sample:
-                self.scheduler.config.clip_sample = False  # disample sample clipping
-                logger.warning(" set `clip_sample` to be False")
-        # 6. Run denoising loop
-        if hasattr(self.scheduler, "betas"):
-            alphas = 1.0 - self.scheduler.betas
-            alphas_cumprod = torch.cumprod(alphas, dim=0)
-        else:
-            alphas_cumprod = []
-
-        self._num_timesteps = len(timesteps)
-        for i, t in enumerate(self.progress_bar(timesteps)):
-            if not isinstance(self.scheduler, DDPMWuerstchenScheduler):
-                if len(alphas_cumprod) > 0:
-                    timestep_ratio = self.get_t_condioning(t.long().cpu(), alphas_cumprod)
-                    timestep_ratio = timestep_ratio.expand(latents.size(0)).to(dtype).to(device)
-                else:
-                    timestep_ratio = t.float().div(self.scheduler.timesteps[-1]).expand(latents.size(0)).to(dtype)
-            else:
-                timestep_ratio = t.expand(latents.size(0)).to(dtype)
-            # 7. Denoise image embeddings
-            predicted_image_embedding = self.prior(
-                sample=torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents,
-                timestep_ratio=torch.cat([timestep_ratio] * 2) if self.do_classifier_free_guidance else timestep_ratio,
-                clip_text_pooled=text_encoder_pooled,
-                clip_text=text_encoder_hidden_states,
-                clip_img=image_embeds,
-                return_dict=False,
-            )[0]
-
-            # 8. Check for classifier free guidance and apply it
-            if self.do_classifier_free_guidance:
-                predicted_image_embedding_text, predicted_image_embedding_uncond = predicted_image_embedding.chunk(2)
-                predicted_image_embedding = torch.lerp(
-                    predicted_image_embedding_uncond, predicted_image_embedding_text, self.guidance_scale
-                )
-
-            # 9. Renoise latents to next timestep
-            if not isinstance(self.scheduler, DDPMWuerstchenScheduler):
-                timestep_ratio = t
-            latents = self.scheduler.step(
-                model_output=predicted_image_embedding, timestep=timestep_ratio, sample=latents, generator=generator
-            ).prev_sample
-
-            if callback_on_step_end is not None:
-                callback_kwargs = {}
-                for k in callback_on_step_end_tensor_inputs:
-                    callback_kwargs[k] = locals()[k]
-                callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
-
-                latents = callback_outputs.pop("latents", latents)
-                prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
-                negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
-
-        # Offload all models
-        self.maybe_free_model_hooks()
-
-        if output_type == "np":
-            latents = latents.cpu().float().numpy()  # float() as bfloat16-> numpy doesnt work
-            prompt_embeds = prompt_embeds.cpu().float().numpy()  # float() as bfloat16-> numpy doesnt work
-            negative_prompt_embeds = (
-                negative_prompt_embeds.cpu().float().numpy() if negative_prompt_embeds is not None else None
-            )  # float() as bfloat16-> numpy doesnt work
-
-        if not return_dict:
-            return (latents, prompt_embeds, negative_prompt_embeds)
-
-        return StableCascadePriorPipelineOutput(latents, prompt_embeds, negative_prompt_embeds)
@@ -520,22 +520,15 @@ class StableDiffusionPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -646,9 +639,9 @@ class StableDiffusionPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
@@ -564,22 +564,15 @@ class StableDiffusionImg2ImgPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -692,9 +685,9 @@ class StableDiffusionImg2ImgPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    def get_timesteps(self, num_inference_steps, strength, device):
@@ -636,22 +636,15 @@ class StableDiffusionInpaintPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -774,9 +767,9 @@ class StableDiffusionInpaintPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    def prepare_latents(
@@ -59,66 +59,6 @@ EXAMPLE_DOC_STRING = """
 """


-# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
-def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
-    """
-    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
-    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
-    """
-    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
-    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
-    # rescale the results from guidance (fixes overexposure)
-    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
-    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
-    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
-    return noise_cfg
-
-
-# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
-def retrieve_timesteps(
-    scheduler,
-    num_inference_steps: Optional[int] = None,
-    device: Optional[Union[str, torch.device]] = None,
-    timesteps: Optional[List[int]] = None,
-    **kwargs,
-):
-    """
-    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
-    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
-
-    Args:
-        scheduler (`SchedulerMixin`):
-            The scheduler to get timesteps from.
-        num_inference_steps (`int`):
-            The number of diffusion steps used when generating samples with a pre-trained model. If used,
-            `timesteps` must be `None`.
-        device (`str` or `torch.device`, *optional*):
-            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
-        timesteps (`List[int]`, *optional*):
-                Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
-                timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
-                must be `None`.
-
-    Returns:
-        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
-        second element is the number of inference steps.
-    """
-    if timesteps is not None:
-        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
-        if not accepts_timesteps:
-            raise ValueError(
-                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
-                f" timestep schedules. Please check whether you are using the correct scheduler."
-            )
-        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
-        timesteps = scheduler.timesteps
-        num_inference_steps = len(timesteps)
-    else:
-        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
-        timesteps = scheduler.timesteps
-    return timesteps, num_inference_steps
-
-
@dataclass
 class LDM3DPipelineOutput(BaseOutput):
    """
@@ -185,7 +125,6 @@ class StableDiffusionLDM3DPipeline(
    model_cpu_offload_seq = "text_encoder->unet->vae"
    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
    _exclude_from_cpu_offload = ["safety_checker"]
-    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]

    def __init__(
        self,
@@ -503,22 +442,15 @@ class StableDiffusionLDM3DPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -621,9 +553,9 @@ class StableDiffusionLDM3DPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
@@ -643,66 +575,6 @@ class StableDiffusionLDM3DPipeline(
        latents = latents * self.scheduler.init_noise_sigma
        return latents

-    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
-    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
-        """
-        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
-
-        Args:
-            w (`torch.Tensor`):
-                1-D tensor of guidance-scale values to generate embedding vectors for
-            embedding_dim (`int`, *optional*, defaults to 512):
-                dimension of the embeddings to generate
-            dtype:
-                data type of the generated embeddings
-
-        Returns:
-            `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`
-        """
-        assert len(w.shape) == 1
-        w = w * 1000.0
-
-        half_dim = embedding_dim // 2
-        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
-        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
-        emb = w.to(dtype)[:, None] * emb[None, :]
-        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
-        if embedding_dim % 2 == 1:  # zero pad
-            emb = torch.nn.functional.pad(emb, (0, 1))
-        assert emb.shape == (w.shape[0], embedding_dim)
-        return emb
-
-    @property
-    def guidance_scale(self):
-        return self._guidance_scale
-
-    @property
-    def guidance_rescale(self):
-        return self._guidance_rescale
-
-    @property
-    def clip_skip(self):
-        return self._clip_skip
-
-    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
-    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
-    # corresponds to doing no classifier free guidance.
-    @property
-    def do_classifier_free_guidance(self):
-        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
-
-    @property
-    def cross_attention_kwargs(self):
-        return self._cross_attention_kwargs
-
-    @property
-    def num_timesteps(self):
-        return self._num_timesteps
-
-    @property
-    def interrupt(self):
-        return self._interrupt
-
    @torch.no_grad()
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
@@ -711,7 +583,6 @@ class StableDiffusionLDM3DPipeline(
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 49,
-        timesteps: List[int] = None,
        guidance_scale: float = 5.0,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
@@ -724,12 +595,10 @@ class StableDiffusionLDM3DPipeline(
        ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: int = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
-        guidance_rescale: float = 0.0,
        clip_skip: Optional[int] = None,
-        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
-        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
-        **kwargs,
    ):
        r"""
        The call function to the pipeline for generation.
@@ -780,21 +649,18 @@ class StableDiffusionLDM3DPipeline(
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that is called every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
-            callback_on_step_end (`Callable`, *optional*):
-                A function that calls at the end of each denoising steps during the inference. The function is called
-                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
-                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
-                `callback_on_step_end_tensor_inputs`.
-            callback_on_step_end_tensor_inputs (`List`, *optional*):
-                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
-                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
-                `._callback_tensor_inputs` attribute of your pipeline class.
        Examples:

        Returns:
@@ -804,22 +670,6 @@ class StableDiffusionLDM3DPipeline(
                second element is a list of `bool`s indicating whether the corresponding generated image contains
                "not-safe-for-work" (nsfw) content.
        """
-        callback = kwargs.pop("callback", None)
-        callback_steps = kwargs.pop("callback_steps", None)
-
-        if callback is not None:
-            deprecate(
-                "callback",
-                "1.0.0",
-                "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
-            )
-        if callback_steps is not None:
-            deprecate(
-                "callback_steps",
-                "1.0.0",
-                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
-            )
-
        # 0. Default height and width to unet
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor
@@ -835,15 +685,8 @@ class StableDiffusionLDM3DPipeline(
            negative_prompt_embeds,
            ip_adapter_image,
            ip_adapter_image_embeds,
-            callback_on_step_end_tensor_inputs,
        )

-        self._guidance_scale = guidance_scale
-        self._guidance_rescale = guidance_rescale
-        self._clip_skip = clip_skip
-        self._cross_attention_kwargs = cross_attention_kwargs
-        self._interrupt = False
-
        # 2. Define call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
@@ -853,6 +696,10 @@ class StableDiffusionLDM3DPipeline(
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0

        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
            image_embeds = self.prepare_ip_adapter_image_embeds(
@@ -860,7 +707,7 @@ class StableDiffusionLDM3DPipeline(
                ip_adapter_image_embeds,
                device,
                batch_size * num_images_per_prompt,
-                self.do_classifier_free_guidance,
+                do_classifier_free_guidance,
            )

        # 3. Encode input prompt
@@ -868,7 +715,7 @@ class StableDiffusionLDM3DPipeline(
            prompt,
            device,
            num_images_per_prompt,
-            self.do_classifier_free_guidance,
+            do_classifier_free_guidance,
            negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
@@ -877,11 +724,12 @@ class StableDiffusionLDM3DPipeline(
        # For classifier free guidance, we need to do two forward passes.
        # Here we concatenate the unconditional and text embeddings into a single batch
        # to avoid doing two forward passes
-        if self.do_classifier_free_guidance:
+        if do_classifier_free_guidance:
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])

        # 4. Prepare timesteps
-        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps

        # 5. Prepare latent variables
        num_channels_latents = self.unet.config.in_channels
@@ -902,24 +750,12 @@ class StableDiffusionLDM3DPipeline(
        # 6.1 Add image embeds for IP-Adapter
        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None

-        # 6.2 Optionally get Guidance Scale Embedding
-        timestep_cond = None
-        if self.unet.config.time_cond_proj_dim is not None:
-            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
-            timestep_cond = self.get_guidance_scale_embedding(
-                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
-            ).to(device=device, dtype=latents.dtype)
-
        # 7. Denoising loop
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
-        self._num_timesteps = len(timesteps)
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
-                if self.interrupt:
-                    continue
-
                # expand the latents if we are doing classifier free guidance
-                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                # predict the noise residual
@@ -927,34 +763,19 @@ class StableDiffusionLDM3DPipeline(
                    latent_model_input,
                    t,
                    encoder_hidden_states=prompt_embeds,
-                    timestep_cond=timestep_cond,
                    cross_attention_kwargs=cross_attention_kwargs,
                    added_cond_kwargs=added_cond_kwargs,
                    return_dict=False,
                )[0]

                # perform guidance
-                if self.do_classifier_free_guidance:
+                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

-                if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
-                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
-                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
-
                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]

-                if callback_on_step_end is not None:
-                    callback_kwargs = {}
-                    for k in callback_on_step_end_tensor_inputs:
-                        callback_kwargs[k] = locals()[k]
-                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
-
-                    latents = callback_outputs.pop("latents", latents)
-                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
-                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
-
                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()
@@ -414,22 +414,15 @@ class StableDiffusionPanoramaPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -557,9 +550,9 @@ class StableDiffusionPanoramaPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
@@ -549,22 +549,15 @@ class StableDiffusionXLPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -678,9 +671,9 @@ class StableDiffusionXLPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
@@ -616,9 +616,9 @@ class StableDiffusionXLImg2ImgPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    def get_timesteps(self, num_inference_steps, strength, device, denoising_start=None):
@@ -782,22 +782,15 @@ class StableDiffusionXLImg2ImgPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -486,22 +486,15 @@ class StableDiffusionXLInpaintPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -858,9 +851,9 @@ class StableDiffusionXLInpaintPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    def prepare_latents(
@@ -21,33 +21,16 @@ import PIL.Image
 import torch
 from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection

-from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...image_processor import VaeImageProcessor
 from ...models import AutoencoderKLTemporalDecoder, UNetSpatioTemporalConditionModel
 from ...schedulers import EulerDiscreteScheduler
-from ...utils import BaseOutput, logging, replace_example_docstring
+from ...utils import BaseOutput, logging
 from ...utils.torch_utils import is_compiled_module, randn_tensor
 from ..pipeline_utils import DiffusionPipeline


 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

-EXAMPLE_DOC_STRING = """
-    Examples:
-        ```py
-        >>> from diffusers import StableVideoDiffusionPipeline
-        >>> from diffusers.utils import load_image, export_to_video
-
-        >>> pipe = StableVideoDiffusionPipeline.from_pretrained("stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16")
-        >>> pipe.to("cuda")
-
-        >>> image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd-docstring-example.jpeg")
-        >>> image = image.resize((1024, 576))
-
-        >>> frames = pipe(image, num_frames=25, decode_chunk_size=8).frames[0]
-        >>> export_to_video(frames, "generated.mp4", fps=7)
-        ```
-"""
-

 def _append_dims(x, target_dims):
    """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
@@ -58,7 +41,7 @@ def _append_dims(x, target_dims):


 # Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid
-def tensor2vid(video: torch.Tensor, processor: VaeImageProcessor, output_type: str = "np"):
+def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: str = "np"):
    batch_size, channels, num_frames, height, width = video.shape
    outputs = []
    for batch_idx in range(batch_size):
@@ -82,15 +65,15 @@ def tensor2vid(video: torch.Tensor, processor: VaeImageProcessor, output_type: s
@dataclass
 class StableVideoDiffusionPipelineOutput(BaseOutput):
    r"""
-    Output class for Stable Video Diffusion pipeline.
+    Output class for the Stable Video Diffusion pipeline.

    Args:
-        frames (`[List[List[PIL.Image.Image]]`, `np.ndarray`, `torch.FloatTensor`]):
-            List of denoised PIL images of length `batch_size` or numpy array or torch tensor
-            of shape `(batch_size, num_frames, height, width, num_channels)`.
+        frames (`[List[PIL.Image.Image]`, `np.ndarray`]):
+            List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
+            num_channels)`.
    """

-    frames: Union[List[List[PIL.Image.Image]], np.ndarray, torch.FloatTensor]
+    frames: Union[List[PIL.Image.Image], np.ndarray]


 class StableVideoDiffusionPipeline(DiffusionPipeline):
@@ -136,13 +119,7 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)

-    def _encode_image(
-        self,
-        image: PipelineImageInput,
-        device: Union[str, torch.device],
-        num_videos_per_prompt: int,
-        do_classifier_free_guidance: bool,
-    ) -> torch.FloatTensor:
+    def _encode_image(self, image, device, num_videos_per_prompt, do_classifier_free_guidance):
        dtype = next(self.image_encoder.parameters()).dtype

        if not isinstance(image, torch.Tensor):
@@ -187,9 +164,9 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
    def _encode_vae_image(
        self,
        image: torch.Tensor,
-        device: Union[str, torch.device],
-        num_videos_per_prompt: int,
-        do_classifier_free_guidance: bool,
+        device,
+        num_videos_per_prompt,
+        do_classifier_free_guidance,
    ):
        image = image.to(device=device)
        image_latents = self.vae.encode(image).latent_dist.mode()
@@ -209,13 +186,13 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):

    def _get_add_time_ids(
        self,
-        fps: int,
-        motion_bucket_id: int,
-        noise_aug_strength: float,
-        dtype: torch.dtype,
-        batch_size: int,
-        num_videos_per_prompt: int,
-        do_classifier_free_guidance: bool,
+        fps,
+        motion_bucket_id,
+        noise_aug_strength,
+        dtype,
+        batch_size,
+        num_videos_per_prompt,
+        do_classifier_free_guidance,
    ):
        add_time_ids = [fps, motion_bucket_id, noise_aug_strength]

@@ -235,7 +212,7 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):

        return add_time_ids

-    def decode_latents(self, latents: torch.FloatTensor, num_frames: int, decode_chunk_size: int = 14):
+    def decode_latents(self, latents, num_frames, decode_chunk_size=14):
        # [batch, frames, channels, height, width] -> [batch*frames, channels, height, width]
        latents = latents.flatten(0, 1)

@@ -280,15 +257,15 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):

    def prepare_latents(
        self,
-        batch_size: int,
-        num_frames: int,
-        num_channels_latents: int,
-        height: int,
-        width: int,
-        dtype: torch.dtype,
-        device: Union[str, torch.device],
-        generator: torch.Generator,
-        latents: Optional[torch.FloatTensor] = None,
+        batch_size,
+        num_frames,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
    ):
        shape = (
            batch_size,
@@ -322,7 +299,7 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
    @property
    def do_classifier_free_guidance(self):
        if isinstance(self.guidance_scale, (int, float)):
-            return self.guidance_scale > 1
+            return self.guidance_scale > 1  # keep the comparison: must be a bool, and a scale of exactly 1 means no guidance
        return self.guidance_scale.max() > 1

    @property
@@ -330,7 +307,6 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
        return self._num_timesteps

    @torch.no_grad()
-    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
        self,
        image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor],
@@ -357,16 +333,15 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):

        Args:
            image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
-                Image(s) to guide image generation. If you provide a tensor, the expected value range is between `[0, 1]`.
+                Image or images to guide image generation. If you provide a tensor, the expected value range is between `[0,1]`.
            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The width in pixels of the generated image.
            num_frames (`int`, *optional*):
-                The number of video frames to generate. Defaults to `self.unet.config.num_frames`
-                (14 for `stable-video-diffusion-img2vid` and to 25 for `stable-video-diffusion-img2vid-xt`).
+                The number of video frames to generate. Defaults to 14 for `stable-video-diffusion-img2vid` and to 25 for `stable-video-diffusion-img2vid-xt`
            num_inference_steps (`int`, *optional*, defaults to 25):
-                The number of denoising steps. More denoising steps usually lead to a higher quality video at the
+                The number of denoising steps. More denoising steps usually lead to a higher quality video at the
                expense of slower inference. This parameter is modulated by `strength`.
            min_guidance_scale (`float`, *optional*, defaults to 1.0):
                The minimum guidance scale. Used for the classifier free guidance with first frame.
@@ -376,29 +351,29 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
                Frames per second. The rate at which the generated images shall be exported to a video after generation.
                Note that Stable Diffusion Video's UNet was micro-conditioned on fps-1 during training.
            motion_bucket_id (`int`, *optional*, defaults to 127):
-                Used for conditioning the amount of motion for the generation. The higher the number the more motion
-                will be in the video.
+                The motion bucket ID. Used as conditioning for the generation. The higher the number the more motion will be in the video.
            noise_aug_strength (`float`, *optional*, defaults to 0.02):
                The amount of noise added to the init image, the higher it is the less the video will look like the init image. Increase it for more motion.
            decode_chunk_size (`int`, *optional*):
-                The number of frames to decode at a time. Higher chunk size leads to better temporal consistency at the expense of more memory usage. By default, the decoder decodes all frames at once for maximal
-                quality. For lower memory usage, reduce `decode_chunk_size`.
+                The number of frames to decode at a time. The higher the chunk size, the higher the temporal consistency
+                between frames, but also the higher the memory consumption. By default, the decoder will decode all frames at once
+                for maximal quality. Reduce `decode_chunk_size` to reduce memory usage.
            num_videos_per_prompt (`int`, *optional*, defaults to 1):
-                The number of videos to generate per prompt.
+                The number of videos to generate per input image.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
-                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generated image. Choose between `pil`, `np` or `pt`.
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            callback_on_step_end (`Callable`, *optional*):
-                A function that is called at the end of each denoising step during inference. The function is called
-                with the following arguments:
-                    `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`.
-                `callback_kwargs` will include a list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+                A function that is called at the end of each denoising step during inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
@@ -407,12 +382,26 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.

-        Examples:
-
        Returns:
            [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is returned,
-                otherwise a `tuple` of (`List[List[PIL.Image.Image]]` or `np.ndarray` or `torch.FloatTensor`) is returned.
+                otherwise a `tuple` is returned where the first element is a list of list with the generated frames.
+
+        Examples:
+
+        ```py
+        from diffusers import StableVideoDiffusionPipeline
+        from diffusers.utils import load_image, export_to_video
+
+        pipe = StableVideoDiffusionPipeline.from_pretrained("stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16")
+        pipe.to("cuda")
+
+        image = load_image("https://lh3.googleusercontent.com/y-iFOHfLTwkuQSUegpwDdgKmOjRSTvPxat63dQLB25xkTs4lhIbRUFeNBWZzYf370g=s1200")
+        image = image.resize((1024, 576))
+
+        frames = pipe(image, num_frames=25, decode_chunk_size=8).frames[0]
+        export_to_video(frames, "generated.mp4", fps=7)
+        ```
        """
        # 0. Default height and width to unet
        height = height or self.unet.config.sample_size * self.vae_scale_factor
@@ -440,7 +429,8 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
        # 3. Encode input image
        image_embeddings = self._encode_image(image, device, num_videos_per_prompt, self.do_classifier_free_guidance)

-        # NOTE: Stable Video Diffusion was conditioned on fps - 1, which is why it is reduced here.
+        # NOTE: Stable Video Diffusion was conditioned on fps - 1, which
+        # is why it is reduced here.
        # See: https://github.com/Stability-AI/generative-models/blob/ed0997173f98eaf8f4edf7ba5fe8f15c6b877fd3/scripts/sampling/simple_video_sample.py#L188
        fps = fps - 1

@@ -481,11 +471,11 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
        )
        added_time_ids = added_time_ids.to(device)

-        # 6. Prepare timesteps
+        # 4. Prepare timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps

-        # 7. Prepare latent variables
+        # 5. Prepare latent variables
        num_channels_latents = self.unet.config.in_channels
        latents = self.prepare_latents(
            batch_size * num_videos_per_prompt,
@@ -499,7 +489,7 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
            latents,
        )

-        # 8. Prepare guidance scale
+        # 7. Prepare guidance scale
        guidance_scale = torch.linspace(min_guidance_scale, max_guidance_scale, num_frames).unsqueeze(0)
        guidance_scale = guidance_scale.to(device, latents.dtype)
        guidance_scale = guidance_scale.repeat(batch_size * num_videos_per_prompt, 1)
@@ -507,7 +497,7 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):

        self._guidance_scale = guidance_scale

-        # 9. Denoising loop
+        # 8. Denoising loop
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        self._num_timesteps = len(timesteps)
        with self.progress_bar(total=num_inference_steps) as progress_bar:
@@ -516,7 +506,7 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

-                # Concatenate image_latents over channels dimension
+                # Concatenate image_latents over channels dimension
                latent_model_input = torch.cat([latent_model_input, image_latents], dim=2)

                # predict the noise residual
@@ -563,22 +563,15 @@ class StableDiffusionXLAdapterPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -693,9 +686,9 @@ class StableDiffusionXLAdapterPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
@@ -52,7 +52,7 @@ EXAMPLE_DOC_STRING = """
        >>> pipe.enable_model_cpu_offload()

        >>> prompt = "Spiderman is surfing"
-        >>> video_frames = pipe(prompt).frames[0]
+        >>> video_frames = pipe(prompt).frames
        >>> video_path = export_to_video(video_frames)
        >>> video_path
        ```
@@ -52,7 +52,7 @@ EXAMPLE_DOC_STRING = """
        >>> pipe.to("cuda")

        >>> prompt = "spiderman running in the desert"
-        >>> video_frames = pipe(prompt, num_inference_steps=40, height=320, width=576, num_frames=24).frames[0]
+        >>> video_frames = pipe(prompt, num_inference_steps=40, height=320, width=576, num_frames=24).frames
        >>> # safe low-res video
        >>> video_path = export_to_video(video_frames, output_video_path="./video_576_spiderman.mp4")

@@ -73,7 +73,7 @@ EXAMPLE_DOC_STRING = """
        >>> video = [Image.fromarray(frame).resize((1024, 576)) for frame in video_frames]

        >>> # and denoise it
-        >>> video_frames = pipe(prompt, video=video, strength=0.6).frames[0]
+        >>> video_frames = pipe(prompt, video=video, strength=0.6).frames
        >>> video_path = export_to_video(video_frames, output_video_path="./video_1024_spiderman.mp4")
        >>> video_path
        ```
@@ -1,3 +1,18 @@
+# Copyright (c) 2023 Dominic Rampas MIT License
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import torch
 import torch.nn as nn

@@ -233,7 +233,7 @@ class WuerstchenDiffNeXt(ModelMixin, ConfigMixin):


 class ResBlockStageB(nn.Module):
-    def __init__(self, c, c_skip=0, kernel_size=3, dropout=0.0):
+    def __init__(self, c, c_skip=None, kernel_size=3, dropout=0.0):
        super().__init__()
        self.depthwise = nn.Conv2d(c, c, kernel_size=kernel_size, padding=kernel_size // 2, groups=c)
        self.norm = WuerstchenLayerNorm(c, elementwise_affine=False, eps=1e-6)
@@ -349,11 +349,6 @@ class WuerstchenDecoderPipeline(DiffusionPipeline):
        text_encoder_hidden_states = (
            torch.cat([prompt_embeds, negative_prompt_embeds]) if negative_prompt_embeds is not None else prompt_embeds
        )
-        effnet = (
-            torch.cat([image_embeddings, torch.zeros_like(image_embeddings)])
-            if self.do_classifier_free_guidance
-            else image_embeddings
-        )

        # 3. Determine latent shape of latents
        latent_height = int(image_embeddings.size(2) * self.config.latent_dim_scale)
@@ -376,6 +371,11 @@ class WuerstchenDecoderPipeline(DiffusionPipeline):
        self._num_timesteps = len(timesteps[:-1])
        for i, t in enumerate(self.progress_bar(timesteps[:-1])):
            ratio = t.expand(latents.size(0)).to(dtype)
+            effnet = (
+                torch.cat([image_embeddings, torch.zeros_like(image_embeddings)])
+                if self.do_classifier_free_guidance
+                else image_embeddings
+            )
            # 7. Denoise latents
            predicted_latents = self.decoder(
                torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents,
@@ -423,9 +423,9 @@ class WuerstchenDecoderPipeline(DiffusionPipeline):
            latents = self.vqgan.config.scale_factor * latents
            images = self.vqgan.decode(latents).sample.clamp(0, 1)
            if output_type == "np":
-                images = images.permute(0, 2, 3, 1).cpu().float().numpy()
+                images = images.permute(0, 2, 3, 1).cpu().numpy()
            elif output_type == "pil":
-                images = images.permute(0, 2, 3, 1).cpu().float().numpy()
+                images = images.permute(0, 2, 3, 1).cpu().numpy()
                images = self.numpy_to_pil(images)
        else:
            images = latents
@@ -508,7 +508,7 @@ class WuerstchenPriorPipeline(DiffusionPipeline, LoraLoaderMixin):
        self.maybe_free_model_hooks()

        if output_type == "np":
-            latents = latents.cpu().float().numpy()
+            latents = latents.cpu().numpy()

        if not return_dict:
            return (latents,)
@@ -65,7 +65,6 @@ else:
    _import_structure["scheduling_repaint"] = ["RePaintScheduler"]
    _import_structure["scheduling_sasolver"] = ["SASolverScheduler"]
    _import_structure["scheduling_sde_ve"] = ["ScoreSdeVeScheduler"]
-    _import_structure["scheduling_tcd"] = ["TCDScheduler"]
    _import_structure["scheduling_unclip"] = ["UnCLIPScheduler"]
    _import_structure["scheduling_unipc_multistep"] = ["UniPCMultistepScheduler"]
    _import_structure["scheduling_utils"] = ["KarrasDiffusionSchedulers", "SchedulerMixin"]
@@ -160,7 +159,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
        from .scheduling_repaint import RePaintScheduler
        from .scheduling_sasolver import SASolverScheduler
        from .scheduling_sde_ve import ScoreSdeVeScheduler
-        from .scheduling_tcd import TCDScheduler
        from .scheduling_unclip import UnCLIPScheduler
        from .scheduling_unipc_multistep import UniPCMultistepScheduler
        from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin
--- a/Show More
+++ b/Show More