update

2024-03-01 04:49:56 +00:00
49 changed files with 179 additions and 1835 deletions
@@ -1,7 +1,6 @@
 name: Benchmarking tests

 on:
-  workflow_dispatch:
  schedule:
    - cron: "30 1 1,15 * *" # every 2 weeks on the 1st and the 15th of every month at 1:30 AM

@@ -1,28 +0,0 @@
-name: Check for Broken Links
-
-on:
-  repository_dispatch:
-  workflow_dispatch:
-  schedule:
-    - cron: "0 0 * * *"
-
-jobs:
-  check_for_broken_links:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v3
-
-      - name: Link Checker
-        id: lychee
-        uses: lycheeverse/lychee-action@v1
-        with:
-          args: './**/*.md'
-          fail: true
-
-      - name: Create Issue From File
-        if: env.lychee_exit_code != 0
-        uses: diffusers/create-issue-from-file@v4
-        with:
-          title: Link Checker Report
-          content-filepath: ./lychee/out.md
-          labels: report, automated issue
@@ -1,80 +0,0 @@
-name: Doctests
-
-on:
-  push:
-    branches:
-      - doctest*
-  repository_dispatch:
-  schedule:
-    - cron: "0 0 * * *"
-
-env:
-  HF_HOME: /mnt/cache
-  RUN_SLOW: yes
-  OMP_NUM_THREADS: 16
-  MKL_NUM_THREADS: 16
-
-jobs:
-  run_doctests:
-    runs-on: [single-gpu, nvidia-gpu, a10, ci]
-    container:
-      image: huggingface/diffusers-all-latest-gpu
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-
-    steps:
-      - name: Checkout diffusers
-        uses: actions/checkout@v3
-        with:
-          fetch-depth: 2
-
-      - name: NVIDIA-SMI
-        uses: actions/checkout@v3
-        run: |
-          nvidia-smi
-
-      - name: Install dependencies
-        run: python3 -m pip install -e .[quality,test,training]
-
-      - name: Environment
-        run: |
-          python3 utils/print_env.py
-
-      - name: Get doctest files
-        run: |
-          $(python3 -c 'from utils.tests_fetcher import get_all_doctest_files; to_test = get_all_doctest_files(); to_test = " ".join(to_test); fp = open("doc_tests.txt", "w"); fp.write(to_test); fp.close()')
-
-      - name: Run doctests
-        env:
-          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-        run: |
-          python3 -m pytest -v --make-reports doc_tests_gpu --doctest-modules $(cat doc_tests.txt) -sv --doctest-continue-on-failure --doctest-glob="*.md"
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        continue-on-error: true
-        run: cat reports/doc_tests_gpu/failures_short.txt
-
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v3
-        with:
-          name: doc_tests_gpu_test_reports
-          path: reports/doc_tests_gpu
-
-  send_results:
-    name: Send results to webhook
-    runs-on: ubuntu-22.04
-    if: always()
-    needs: [run_doctests]
-    steps:
-      - uses: actions/checkout@v3
-      - uses: actions/download-artifact@v3
-      - name: Send message to Slack
-        env:
-          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
-          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY_DOCS }}
-          CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY_DOCS }}
-          CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
-        run: |
-          pip install slack_sdk
-          python utils/notification_service_doc_tests.py
@@ -36,7 +36,6 @@ repo-consistency:
 	python utils/check_dummies.py
 	python utils/check_repo.py
 	python utils/check_inits.py
-	python utils/check_doctest_list.py

 # this target runs checks on all files

@@ -68,7 +67,6 @@ fixup: modified_only_fixup extra_style_checks autogenerate_code repo-consistency
 fix-copies:
 	python utils/check_copies.py --fix_and_overwrite
 	python utils/check_dummies.py --fix_and_overwrite
-	python utils/check_doctest_list.py --fix_and_overwrite

 # Run tests for the library

@@ -141,7 +141,6 @@ class LCMLoRATextToImageBenchmark(TextToImageBenchmark):
        super().__init__(args)
        self.pipe.load_lora_weights(self.lora_id)
        self.pipe.fuse_lora()
-        self.pipe.unload_lora_weights()
        self.pipe.scheduler = LCMScheduler.from_config(self.pipe.scheduler.config)

    def get_result_filepath(self, args):
@@ -236,35 +235,6 @@ class InpaintingBenchmark(ImageToImageBenchmark):
        )


-class IPAdapterTextToImageBenchmark(TextToImageBenchmark):
-    url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_neg_embed.png"
-    image = load_image(url)
-
-    def __init__(self, args):
-        pipe = self.pipeline_class.from_pretrained(args.ckpt, torch_dtype=torch.float16).to("cuda")
-        pipe.load_ip_adapter(
-            args.ip_adapter_id[0],
-            subfolder="models" if "sdxl" not in args.ip_adapter_id[1] else "sdxl_models",
-            weight_name=args.ip_adapter_id[1],
-        )
-
-        if args.run_compile:
-            pipe.unet.to(memory_format=torch.channels_last)
-            print("Run torch compile")
-            pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
-
-        pipe.set_progress_bar_config(disable=True)
-        self.pipe = pipe
-
-    def run_inference(self, pipe, args):
-        _ = pipe(
-            prompt=PROMPT,
-            ip_adapter_image=self.image,
-            num_inference_steps=args.num_inference_steps,
-            num_images_per_prompt=args.batch_size,
-        )
-
-
 class ControlNetBenchmark(TextToImageBenchmark):
    pipeline_class = StableDiffusionControlNetPipeline
    aux_network_class = ControlNetModel
@@ -1,32 +0,0 @@
-import argparse
-import sys
-
-
-sys.path.append(".")
-from base_classes import IPAdapterTextToImageBenchmark  # noqa: E402
-
-
-IP_ADAPTER_CKPTS = {
-    "runwayml/stable-diffusion-v1-5": ("h94/IP-Adapter", "ip-adapter_sd15.bin"),
-    "stabilityai/stable-diffusion-xl-base-1.0": ("h94/IP-Adapter", "ip-adapter_sdxl.bin"),
-}
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--ckpt",
-        type=str,
-        default="runwayml/stable-diffusion-v1-5",
-        choices=list(IP_ADAPTER_CKPTS.keys()),
-    )
-    parser.add_argument("--batch_size", type=int, default=1)
-    parser.add_argument("--num_inference_steps", type=int, default=50)
-    parser.add_argument("--model_cpu_offload", action="store_true")
-    parser.add_argument("--run_compile", action="store_true")
-    args = parser.parse_args()
-
-    args.ip_adapter_id = IP_ADAPTER_CKPTS[args.ckpt]
-    benchmark_pipe = IPAdapterTextToImageBenchmark(args)
-    args.ckpt = f"{args.ckpt} (IP-Adapter)"
-    benchmark_pipe.benchmark(args)
@@ -72,7 +72,7 @@ def main():
                command += " --run_compile"
                run_command(command.split())

-        elif file in ["benchmark_sd_inpainting.py", "benchmark_ip_adapters.py"]:
+        elif file == "benchmark_sd_inpainting.py":
            sdxl_ckpt = "stabilityai/stable-diffusion-xl-base-1.0"
            command = f"python {file} --ckpt {sdxl_ckpt}"
            run_command(command.split())
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:12.1.0-runtime-ubuntu20.04
+FROM nvidia/cuda:11.6.2-cudnn8-devel-ubuntu20.04
 LABEL maintainer="Hugging Face"
 LABEL repository="diffusers"

@@ -24,9 +24,9 @@ ENV PATH="/opt/venv/bin:$PATH"
 # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
 RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
    python3 -m uv pip install --no-cache-dir \
-        torch \
-        torchvision \
-        torchaudio \
+        torch==2.1.2 \
+        torchvision==0.16.2 \
+        torchaudio==2.1.2 \
        "onnxruntime-gpu>=1.13.1" \
        --extra-index-url https://download.pytorch.org/whl/cu117 && \
    python3 -m uv pip install --no-cache-dir \
@@ -169,7 +169,7 @@ list_adapters_component_wise

 If you want to compile your model with `torch.compile` make sure to first fuse the LoRA weights into the base model and unload them.

-```diff
+```py
 pipe.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
 pipe.load_lora_weights("CiroN2022/toy-face", weight_name="toy_face_sdxl.safetensors", adapter_name="toy")

@@ -178,16 +178,12 @@ pipe.set_adapters(["pixel", "toy"], adapter_weights=[0.5, 1.0])
 pipe.fuse_lora()
 pipe.unload_lora_weights()

-+ pipe.unet.to(memory_format=torch.channels_last)
-+ pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+pipe = torch.compile(pipe)

 prompt = "toy_face of a hacker with a hoodie, pixel art"
 image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
 ```

-> [!TIP]
-> You can refer to the `torch.compile()` section [here](https://huggingface.co/docs/diffusers/main/en/optimization/torch2.0#torchcompile) and [here](https://huggingface.co/docs/diffusers/main/en/tutorials/fast_diffusion#torchcompile) for more elaborate examples.
-
 ## Fusing adapters into the model

 You can use PEFT to easily fuse/unfuse multiple adapters directly into the model weights (both UNet and text encoder) using the [`~diffusers.loaders.LoraLoaderMixin.fuse_lora`] method, which can lead to a speed-up in inference and lower VRAM usage.
@@ -80,7 +80,8 @@ To do so, just specify `--train_text_encoder_ti` while launching training (for r
 Please keep the following points in mind:

 * SDXL has two text encoders. So, we fine-tune both using LoRA.
-* When not fine-tuning the text encoders, we ALWAYS precompute the text embeddings to save memory.
+* When not fine-tuning the text encoders, we ALWAYS precompute the text embeddings to save memory.
+

 ### 3D icon example

@@ -233,32 +234,6 @@ In ComfyUI we will load a LoRA and a textual embedding at the same time.

 SDXL's VAE is known to suffer from numerical instability issues. This is why we also expose a CLI argument namely `--pretrained_vae_model_name_or_path` that lets you specify the location of a better VAE (such as [this one](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix)).

-### DoRA training 
-The advanced script now supports DoRA training too!
-> Proposed in [DoRA: Weight-Decomposed Low-Rank Adaptation](https://arxiv.org/abs/2402.09353), 
-**DoRA** is very similar to LoRA, except it decomposes the pre-trained weight into two components, **magnitude** and **direction** and employs LoRA for _directional_ updates to efficiently minimize the number of trainable parameters. 
-The authors found that by using DoRA, both the learning capacity and training stability of LoRA are enhanced without any additional overhead during inference. 
-
-> [!NOTE]
-> 💡DoRA training is still _experimental_  
-> and is likely to require different hyperparameter values to perform best compared to a LoRA.
-> Specifically, we've noticed 2 differences to take into account your training: 
-> 1. **LoRA seem to converge faster than DoRA** (so a set of parameters that may lead to overfitting when training a LoRA may be working well for a DoRA)
-> 2. **DoRA quality superior to LoRA especially in lower ranks** the difference in quality of DoRA of rank 8 and LoRA of rank 8 appears to be more significant than when training ranks of 32 or 64 for example.  
-> This is also aligned with some of the quantitative analysis shown in the paper. 
-
-**Usage**
-1. To use DoRA you need to install `peft` from main: 
-```bash
-pip install git+https://github.com/huggingface/peft.git
-```
-2. Enable DoRA training by adding this flag
-```bash
--use_dora
-```
-**Inference** 
-The inference is the same as if you train a regular LoRA 🤗
-

 ### Tips and Tricks
 Check out [these recommended practices](https://huggingface.co/blog/sdxl_lora_advanced_script#additional-good-practices)
@@ -651,16 +651,6 @@ def parse_args(input_args=None):
        default=4,
        help=("The dimension of the LoRA update matrices."),
    )
-    parser.add_argument(
-        "--use_dora",
-        type=bool,
-        action="store_true",
-        default=False,
-        help=(
-            "Wether to train a DoRA as proposed in- DoRA: Weight-Decomposed Low-Rank Adaptation https://arxiv.org/abs/2402.09353. "
-            "Note: to use DoRA you need to install peft from main, `pip install git+https://github.com/huggingface/peft.git`"
-        ),
-    )
    parser.add_argument(
        "--cache_latents",
        action="store_true",
@@ -1229,7 +1219,6 @@ def main(args):
    unet_lora_config = LoraConfig(
        r=args.rank,
        lora_alpha=args.rank,
-        use_dora=args.use_dora,
        init_lora_weights="gaussian",
        target_modules=["to_k", "to_q", "to_v", "to_out.0"],
    )
@@ -1241,7 +1230,6 @@ def main(args):
        text_lora_config = LoraConfig(
            r=args.rank,
            lora_alpha=args.rank,
-            use_dora=args.use_dora,
            init_lora_weights="gaussian",
            target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
        )
@@ -661,16 +661,6 @@ def parse_args(input_args=None):
        default=4,
        help=("The dimension of the LoRA update matrices."),
    )
-    parser.add_argument(
-        "--use_dora",
-        type=bool,
-        action="store_true",
-        default=False,
-        help=(
-            "Wether to train a DoRA as proposed in- DoRA: Weight-Decomposed Low-Rank Adaptation https://arxiv.org/abs/2402.09353. "
-            "Note: to use DoRA you need to install peft from main, `pip install git+https://github.com/huggingface/peft.git`"
-        ),
-    )
    parser.add_argument(
        "--cache_latents",
        action="store_true",
@@ -1333,7 +1323,6 @@ def main(args):
    unet_lora_config = LoraConfig(
        r=args.rank,
        lora_alpha=args.rank,
-        use_dora=args.use_dora,
        init_lora_weights="gaussian",
        target_modules=["to_k", "to_q", "to_v", "to_out.0"],
    )
@@ -1345,7 +1334,6 @@ def main(args):
        text_lora_config = LoraConfig(
            r=args.rank,
            lora_alpha=args.rank,
-            use_dora=args.use_dora,
            init_lora_weights="gaussian",
            target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
        )
@@ -206,40 +206,3 @@ You can explore the results from a couple of our internal experiments by checkin
 ## Running on a free-tier Colab Notebook

 Check out [this notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/SDXL_DreamBooth_LoRA_.ipynb). 
-
-## Conducting EDM-style training
-
-It's now possible to perform EDM-style training as proposed in [Elucidating the Design Space of Diffusion-Based Generative Models](https://arxiv.org/abs/2206.00364). 
-
-For the SDXL model, simple set:
-
-```diff
-+  --do_edm_style_training \
-```
-
-Other SDXL-like models that use the EDM formulation, such as [playgroundai/playground-v2.5-1024px-aesthetic](https://huggingface.co/playgroundai/playground-v2.5-1024px-aesthetic), can also be DreamBooth'd with the script. Below is an example command:
-
-```bash
-accelerate launch train_dreambooth_lora_sdxl.py \
-  --pretrained_model_name_or_path="playgroundai/playground-v2.5-1024px-aesthetic"  \
-  --instance_data_dir="dog" \
-  --output_dir="dog-playground-lora" \
-  --mixed_precision="fp16" \
-  --instance_prompt="a photo of sks dog" \
-  --resolution=1024 \
-  --train_batch_size=1 \
-  --gradient_accumulation_steps=4 \
-  --learning_rate=1e-4 \
-  --use_8bit_adam \
-  --report_to="wandb" \
-  --lr_scheduler="constant" \
-  --lr_warmup_steps=0 \
-  --max_train_steps=500 \
-  --validation_prompt="A photo of sks dog in a bucket" \
-  --validation_epochs=25 \
-  --seed="0" \
-  --push_to_hub
-```
-
-> [!CAUTION]
-> Min-SNR gamma is not supported with the EDM-style training yet. When training with the PlaygroundAI model, it's recommended to not pass any "variant".
@@ -1,99 +0,0 @@
-# coding=utf-8
-# Copyright 2024 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import os
-import sys
-import tempfile
-
-import safetensors
-
-
-sys.path.append("..")
-from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
-
-
-logging.basicConfig(level=logging.DEBUG)
-
-logger = logging.getLogger()
-stream_handler = logging.StreamHandler(sys.stdout)
-logger.addHandler(stream_handler)
-
-
-class DreamBoothLoRASDXLWithEDM(ExamplesTestsAccelerate):
-    def test_dreambooth_lora_sdxl_with_edm(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-                examples/dreambooth/train_dreambooth_lora_sdxl.py
-                --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-xl-pipe
-                --do_edm_style_training
-                --instance_data_dir docs/source/en/imgs
-                --instance_prompt photo
-                --resolution 64
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 2
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                """.split()
-
-            run_command(self._launch_args + test_args)
-            # save_pretrained smoke test
-            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
-
-            # make sure the state_dict has the correct naming in the parameters.
-            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
-            is_lora = all("lora" in k for k in lora_state_dict.keys())
-            self.assertTrue(is_lora)
-
-            # when not training the text encoder, all the parameters in the state dict should start
-            # with `"unet"` in their names.
-            starts_with_unet = all(key.startswith("unet") for key in lora_state_dict.keys())
-            self.assertTrue(starts_with_unet)
-
-    def test_dreambooth_lora_playground(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-                examples/dreambooth/train_dreambooth_lora_sdxl.py
-                --pretrained_model_name_or_path hf-internal-testing/tiny-playground-v2-5-pipe
-                --instance_data_dir docs/source/en/imgs
-                --instance_prompt photo
-                --resolution 64
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 2
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                """.split()
-
-            run_command(self._launch_args + test_args)
-            # save_pretrained smoke test
-            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
-
-            # make sure the state_dict has the correct naming in the parameters.
-            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
-            is_lora = all("lora" in k for k in lora_state_dict.keys())
-            self.assertTrue(is_lora)
-
-            # when not training the text encoder, all the parameters in the state dict should start
-            # with `"unet"` in their names.
-            starts_with_unet = all(key.startswith("unet") for key in lora_state_dict.keys())
-            self.assertTrue(starts_with_unet)
@@ -14,10 +14,8 @@
 # See the License for the specific language governing permissions and

 import argparse
-import contextlib
 import gc
 import itertools
-import json
 import logging
 import math
 import os
@@ -34,7 +32,7 @@ import transformers
 from accelerate import Accelerator
 from accelerate.logging import get_logger
 from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
-from huggingface_hub import create_repo, hf_hub_download, upload_folder
+from huggingface_hub import create_repo, upload_folder
 from huggingface_hub.utils import insecure_hashlib
 from packaging import version
 from peft import LoraConfig, set_peft_model_state_dict
@@ -52,8 +50,6 @@ from diffusers import (
    AutoencoderKL,
    DDPMScheduler,
    DPMSolverMultistepScheduler,
-    EDMEulerScheduler,
-    EulerDiscreteScheduler,
    StableDiffusionXLPipeline,
    UNet2DConditionModel,
 )
@@ -80,20 +76,6 @@ check_min_version("0.27.0.dev0")
 logger = get_logger(__name__)


-def determine_scheduler_type(pretrained_model_name_or_path, revision):
-    model_index_filename = "model_index.json"
-    if os.path.isdir(pretrained_model_name_or_path):
-        model_index = os.path.join(pretrained_model_name_or_path, model_index_filename)
-    else:
-        model_index = hf_hub_download(
-            repo_id=pretrained_model_name_or_path, filename=model_index_filename, revision=revision
-        )
-
-    with open(model_index, "r") as f:
-        scheduler_type = json.load(f)["scheduler"][1]
-    return scheduler_type
-
-
 def save_model_card(
    repo_id: str,
    images=None,
@@ -113,7 +95,7 @@ def save_model_card(
            )

    model_description = f"""
-# {'SDXL' if 'playgroundai' not in base_model else 'Playground'} LoRA DreamBooth - {repo_id}
+# SDXL LoRA DreamBooth - {repo_id}

 <Gallery />

@@ -137,17 +119,11 @@ Weights for this model are available in Safetensors format.

 [Download]({repo_id}/tree/main) them in the Files & versions tab.

-"""
-    if "playgroundai" in args.pretrained_model_name_or_path:
-        model_description += """\n
-## License
-
-Please adhere to the licensing terms as described [here](https://huggingface.co/playgroundai/playground-v2.5-1024px-aesthetic/blob/main/LICENSE.md).
 """
    model_card = load_or_create_model_card(
        repo_id_or_path=repo_id,
        from_training=True,
-        license="openrail++" if "playgroundai" not in base_model else "playground-v2dot5-community",
+        license="openrail++",
        base_model=base_model,
        prompt=instance_prompt,
        model_description=model_description,
@@ -155,17 +131,15 @@ Please adhere to the licensing terms as described [here](https://huggingface.co/
    )
    tags = [
        "text-to-image",
+        "stable-diffusion-xl",
+        "stable-diffusion-xl-diffusers",
        "text-to-image",
        "diffusers",
        "lora",
        "template:sd-lora",
    ]
-    if "playgroundai" in base_model:
-        tags.extend(["playground", "playground-diffusers"])
-    else:
-        tags.extend(["stable-diffusion-xl", "stable-diffusion-xl-diffusers"])
-
    model_card = populate_model_card(model_card, tags=tags)
+
    model_card.save(os.path.join(repo_folder, "README.md"))


@@ -185,29 +159,23 @@ def log_validation(
    # We train on the simplified learning objective. If we were previously predicting a variance, we need the scheduler to ignore it
    scheduler_args = {}

-    if not args.do_edm_style_training:
-        if "variance_type" in pipeline.scheduler.config:
-            variance_type = pipeline.scheduler.config.variance_type
+    if "variance_type" in pipeline.scheduler.config:
+        variance_type = pipeline.scheduler.config.variance_type

-            if variance_type in ["learned", "learned_range"]:
-                variance_type = "fixed_small"
+        if variance_type in ["learned", "learned_range"]:
+            variance_type = "fixed_small"

-            scheduler_args["variance_type"] = variance_type
+        scheduler_args["variance_type"] = variance_type

-        pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, **scheduler_args)
+    pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, **scheduler_args)

    pipeline = pipeline.to(accelerator.device)
    pipeline.set_progress_bar_config(disable=True)

    # run inference
    generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
-    # Currently the context determination is a bit hand-wavy. We can improve it in the future if there's a better
-    # way to condition it. Reference: https://github.com/huggingface/diffusers/pull/7126#issuecomment-1968523051
-    inference_ctx = (
-        contextlib.nullcontext() if "playgroundai" in args.pretrained_model_name_or_path else torch.cuda.amp.autocast()
-    )

-    with inference_ctx:
+    with torch.cuda.amp.autocast():
        images = [pipeline(**pipeline_args, generator=generator).images[0] for _ in range(args.num_validation_images)]

    for tracker in accelerator.trackers:
@@ -366,12 +334,6 @@ def parse_args(input_args=None):
            " `args.validation_prompt` multiple times: `args.num_validation_images`."
        ),
    )
-    parser.add_argument(
-        "--do_edm_style_training",
-        default=False,
-        action="store_true",
-        help="Flag to conduct training using the EDM formulation as introduced in https://arxiv.org/abs/2206.00364.",
-    )
    parser.add_argument(
        "--with_prior_preservation",
        default=False,
@@ -943,9 +905,6 @@ def main(args):
            " Please use `huggingface-cli login` to authenticate with the Hub."
        )

-    if args.do_edm_style_training and args.snr_gamma is not None:
-        raise ValueError("Min-SNR formulation is not supported when conducting EDM-style training.")
-
    logging_dir = Path(args.output_dir, args.logging_dir)

    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
@@ -1059,19 +1018,7 @@ def main(args):
    )

    # Load scheduler and models
-    scheduler_type = determine_scheduler_type(args.pretrained_model_name_or_path, args.revision)
-    if "EDM" in scheduler_type:
-        args.do_edm_style_training = True
-        noise_scheduler = EDMEulerScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
-        logger.info("Performing EDM-style training!")
-    elif args.do_edm_style_training:
-        noise_scheduler = EulerDiscreteScheduler.from_pretrained(
-            args.pretrained_model_name_or_path, subfolder="scheduler"
-        )
-        logger.info("Performing EDM-style training!")
-    else:
-        noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
-
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
    text_encoder_one = text_encoder_cls_one.from_pretrained(
        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
    )
@@ -1089,12 +1036,6 @@ def main(args):
        revision=args.revision,
        variant=args.variant,
    )
-    latents_mean = latents_std = None
-    if hasattr(vae.config, "latents_mean") and vae.config.latents_mean is not None:
-        latents_mean = torch.tensor(vae.config.latents_mean).view(1, 4, 1, 1)
-    if hasattr(vae.config, "latents_std") and vae.config.latents_std is not None:
-        latents_std = torch.tensor(vae.config.latents_std).view(1, 4, 1, 1)
-
    unet = UNet2DConditionModel.from_pretrained(
        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
    )
@@ -1237,7 +1178,7 @@ def main(args):
            _set_state_dict_into_text_encoder(lora_state_dict, prefix="text_encoder.", text_encoder=text_encoder_one_)

            _set_state_dict_into_text_encoder(
-                lora_state_dict, prefix="text_encoder_2.", text_encoder=text_encoder_two_
+                lora_state_dict, prefix="text_encoder_2.", text_encoder=text_encoder_one_
            )

        # Make sure the trainable params are in float32. This is again needed since the base models
@@ -1492,12 +1433,7 @@ def main(args):
    # We need to initialize the trackers we use, and also store our configuration.
    # The trackers initializes automatically on the main process.
    if accelerator.is_main_process:
-        tracker_name = (
-            "dreambooth-lora-sd-xl"
-            if "playgroundai" not in args.pretrained_model_name_or_path
-            else "dreambooth-lora-playground"
-        )
-        accelerator.init_trackers(tracker_name, config=vars(args))
+        accelerator.init_trackers("dreambooth-lora-sd-xl", config=vars(args))

    # Train!
    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
@@ -1549,18 +1485,6 @@ def main(args):
        disable=not accelerator.is_local_main_process,
    )

-    def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
-        sigmas = noise_scheduler.sigmas.to(device=accelerator.device, dtype=dtype)
-        schedule_timesteps = noise_scheduler.timesteps.to(accelerator.device)
-        timesteps = timesteps.to(accelerator.device)
-
-        step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
-
-        sigma = sigmas[step_indices].flatten()
-        while len(sigma.shape) < n_dim:
-            sigma = sigma.unsqueeze(-1)
-        return sigma
-
    for epoch in range(first_epoch, args.num_train_epochs):
        unet.train()
        if args.train_text_encoder:
@@ -1588,46 +1512,22 @@ def main(args):

                # Convert images to latent space
                model_input = vae.encode(pixel_values).latent_dist.sample()
-
-                if latents_mean is None and latents_std is None:
-                    model_input = model_input * vae.config.scaling_factor
-                    if args.pretrained_vae_model_name_or_path is None:
-                        model_input = model_input.to(weight_dtype)
-                else:
-                    latents_mean = latents_mean.to(device=model_input.device, dtype=model_input.dtype)
-                    latents_std = latents_std.to(device=model_input.device, dtype=model_input.dtype)
-                    model_input = (model_input - latents_mean) * vae.config.scaling_factor / latents_std
-                    model_input = model_input.to(dtype=weight_dtype)
+                model_input = model_input * vae.config.scaling_factor
+                if args.pretrained_vae_model_name_or_path is None:
+                    model_input = model_input.to(weight_dtype)

                # Sample noise that we'll add to the latents
                noise = torch.randn_like(model_input)
                bsz = model_input.shape[0]
-
                # Sample a random timestep for each image
-                if not args.do_edm_style_training:
-                    timesteps = torch.randint(
-                        0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device
-                    )
-                    timesteps = timesteps.long()
-                else:
-                    # in EDM formulation, the model is conditioned on the pre-conditioned noise levels
-                    # instead of discrete timesteps, so here we sample indices to get the noise levels
-                    # from `scheduler.timesteps`
-                    indices = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,))
-                    timesteps = noise_scheduler.timesteps[indices].to(device=model_input.device)
+                timesteps = torch.randint(
+                    0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device
+                )
+                timesteps = timesteps.long()

                # Add noise to the model input according to the noise magnitude at each timestep
                # (this is the forward diffusion process)
                noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps)
-                # For EDM-style training, we first obtain the sigmas based on the continuous timesteps.
-                # We then precondition the final model inputs based on these sigmas instead of the timesteps.
-                # Follow: Section 5 of https://arxiv.org/abs/2206.00364.
-                if args.do_edm_style_training:
-                    sigmas = get_sigmas(timesteps, len(noisy_model_input.shape), noisy_model_input.dtype)
-                    if "EDM" in scheduler_type:
-                        inp_noisy_latents = noise_scheduler.precondition_inputs(noisy_model_input, sigmas)
-                    else:
-                        inp_noisy_latents = noisy_model_input / ((sigmas**2 + 1) ** 0.5)

                # time ids
                add_time_ids = torch.cat(
@@ -1651,7 +1551,7 @@ def main(args):
                    }
                    prompt_embeds_input = prompt_embeds.repeat(elems_to_repeat_text_embeds, 1, 1)
                    model_pred = unet(
-                        inp_noisy_latents if args.do_edm_style_training else noisy_model_input,
+                        noisy_model_input,
                        timesteps,
                        prompt_embeds_input,
                        added_cond_kwargs=unet_added_conditions,
@@ -1670,43 +1570,18 @@ def main(args):
                    )
                    prompt_embeds_input = prompt_embeds.repeat(elems_to_repeat_text_embeds, 1, 1)
                    model_pred = unet(
-                        inp_noisy_latents if args.do_edm_style_training else noisy_model_input,
+                        noisy_model_input,
                        timesteps,
                        prompt_embeds_input,
                        added_cond_kwargs=unet_added_conditions,
                        return_dict=False,
                    )[0]

-                weighting = None
-                if args.do_edm_style_training:
-                    # Similar to the input preconditioning, the model predictions are also preconditioned
-                    # on noised model inputs (before preconditioning) and the sigmas.
-                    # Follow: Section 5 of https://arxiv.org/abs/2206.00364.
-                    if "EDM" in scheduler_type:
-                        model_pred = noise_scheduler.precondition_outputs(noisy_model_input, model_pred, sigmas)
-                    else:
-                        if noise_scheduler.config.prediction_type == "epsilon":
-                            model_pred = model_pred * (-sigmas) + noisy_model_input
-                        elif noise_scheduler.config.prediction_type == "v_prediction":
-                            model_pred = model_pred * (-sigmas / (sigmas**2 + 1) ** 0.5) + (
-                                noisy_model_input / (sigmas**2 + 1)
-                            )
-                    # We are not doing weighting here because it tends result in numerical problems.
-                    # See: https://github.com/huggingface/diffusers/pull/7126#issuecomment-1968523051
-                    # There might be other alternatives for weighting as well:
-                    # https://github.com/huggingface/diffusers/pull/7126#discussion_r1505404686
-                    if "EDM" not in scheduler_type:
-                        weighting = (sigmas**-2.0).float()
-
                # Get the target for loss depending on the prediction type
                if noise_scheduler.config.prediction_type == "epsilon":
-                    target = model_input if args.do_edm_style_training else noise
+                    target = noise
                elif noise_scheduler.config.prediction_type == "v_prediction":
-                    target = (
-                        model_input
-                        if args.do_edm_style_training
-                        else noise_scheduler.get_velocity(model_input, noise, timesteps)
-                    )
+                    target = noise_scheduler.get_velocity(model_input, noise, timesteps)
                else:
                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")

@@ -1716,28 +1591,10 @@ def main(args):
                    target, target_prior = torch.chunk(target, 2, dim=0)

                    # Compute prior loss
-                    if weighting is not None:
-                        prior_loss = torch.mean(
-                            (weighting.float() * (model_pred_prior.float() - target_prior.float()) ** 2).reshape(
-                                target_prior.shape[0], -1
-                            ),
-                            1,
-                        )
-                        prior_loss = prior_loss.mean()
-                    else:
-                        prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean")
+                    prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean")

                if args.snr_gamma is None:
-                    if weighting is not None:
-                        loss = torch.mean(
-                            (weighting.float() * (model_pred.float() - target.float()) ** 2).reshape(
-                                target.shape[0], -1
-                            ),
-                            1,
-                        )
-                        loss = loss.mean()
-                    else:
-                        loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
                else:
                    # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
                    # Since we predict the noise instead of x_0, the original formulation is slightly changed.
@@ -1839,6 +1696,7 @@ def main(args):
                    variant=args.variant,
                    torch_dtype=weight_dtype,
                )
+
                pipeline_args = {"prompt": args.validation_prompt}

                images = log_validation(
@@ -35,7 +35,7 @@ import transformers
 from accelerate import Accelerator
 from accelerate.logging import get_logger
 from accelerate.utils import ProjectConfiguration, set_seed
-from datasets import concatenate_datasets, load_dataset
+from datasets import load_dataset
 from huggingface_hub import create_repo, upload_folder
 from packaging import version
 from torchvision import transforms
@@ -895,20 +895,14 @@ def main(args):
        # fingerprint used by the cache for the other processes to load the result
        # details: https://github.com/huggingface/diffusers/pull/4038#discussion_r1266078401
        new_fingerprint = Hasher.hash(args)
-        new_fingerprint_for_vae = Hasher.hash(vae_path)
-        train_dataset_with_embeddings = train_dataset.map(
-            compute_embeddings_fn, batched=True, new_fingerprint=new_fingerprint
-        )
-        train_dataset_with_vae = train_dataset.map(
+        new_fingerprint_for_vae = Hasher.hash("vae")
+        train_dataset = train_dataset.map(compute_embeddings_fn, batched=True, new_fingerprint=new_fingerprint)
+        train_dataset = train_dataset.map(
            compute_vae_encodings_fn,
            batched=True,
            batch_size=args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps,
            new_fingerprint=new_fingerprint_for_vae,
        )
-        precomputed_dataset = concatenate_datasets(
-            [train_dataset_with_embeddings, train_dataset_with_vae.remove_columns(["image", "text"])], axis=1
-        )
-        precomputed_dataset = precomputed_dataset.with_transform(preprocess_train)

    del text_encoders, tokenizers, vae
    gc.collect()
@@ -931,7 +925,7 @@ def main(args):

    # DataLoaders creation:
    train_dataloader = torch.utils.data.DataLoader(
-        precomputed_dataset,
+        train_dataset,
        shuffle=True,
        collate_fn=collate_fn,
        batch_size=args.train_batch_size,
@@ -982,7 +976,7 @@ def main(args):
    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
-    logger.info(f"  Num examples = {len(precomputed_dataset)}")
+    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
@@ -2,6 +2,7 @@ accelerate>=0.16.0
 torchvision
 transformers>=4.25.1
 wandb
+huggingface-cli
 bitsandbytes
 deepspeed
 peft>=0.6.0
@@ -25,7 +25,3 @@ skip-magic-trailing-comma = false

 # Like Black, automatically detect the appropriate line ending.
 line-ending = "auto"
-
-[tool.pytest.ini_options]
-doctest_optionflags="NUMBER NORMALIZE_WHITESPACE ELLIPSIS"
-doctest_glob="**/*.md"
@@ -9,11 +9,11 @@ from diffusers import AutoencoderKL, DPMSolverMultistepScheduler, PixArtAlphaPip

 ckpt_id = "PixArt-alpha/PixArt-alpha"
 # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/scripts/inference.py#L125
-interpolation_scale = {256: 0.5, 512: 1, 1024: 2}
+interpolation_scale = {512: 1, 1024: 2}


 def main(args):
-    all_state_dict = torch.load(args.orig_ckpt_path, map_location="cpu")
+    all_state_dict = torch.load(args.orig_ckpt_path)
    state_dict = all_state_dict.pop("state_dict")
    converted_state_dict = {}

@@ -22,6 +22,7 @@ def main(args):
    converted_state_dict["pos_embed.proj.bias"] = state_dict.pop("x_embedder.proj.bias")

    # Caption projection.
+    converted_state_dict["caption_projection.y_embedding"] = state_dict.pop("y_embedder.y_embedding")
    converted_state_dict["caption_projection.linear_1.weight"] = state_dict.pop("y_embedder.y_proj.fc1.weight")
    converted_state_dict["caption_projection.linear_1.bias"] = state_dict.pop("y_embedder.y_proj.fc1.bias")
    converted_state_dict["caption_projection.linear_2.weight"] = state_dict.pop("y_embedder.y_proj.fc2.weight")
@@ -154,7 +155,6 @@ def main(args):

    assert transformer.pos_embed.pos_embed is not None
    state_dict.pop("pos_embed")
-    state_dict.pop("y_embedder.y_embedding")
    assert len(state_dict) == 0, f"State dict is not empty, {state_dict.keys()}"

    num_model_params = sum(p.numel() for p in transformer.parameters())
@@ -187,7 +187,7 @@ if __name__ == "__main__":
        "--image_size",
        default=1024,
        type=int,
-        choices=[256, 512, 1024],
+        choices=[512, 1024],
        required=False,
        help="Image size of pretrained model, either 512 or 1024.",
    )
@@ -1,185 +0,0 @@
-import doctest
-import inspect
-import os
-import re
-from typing import Iterable
-
-from .utils import is_pytest_available
-
-
-if is_pytest_available():
-    from _pytest.doctest import (
-        Module,
-        _get_checker,
-        _get_continue_on_failure,
-        _get_runner,
-        _is_mocked,
-        _patch_unwrap_mock_aware,
-        get_optionflags,
-        import_path,
-    )
-    from _pytest.outcomes import skip
-    from pytest import DoctestItem
-else:
-    Module = object
-    DoctestItem = object
-
-"""
-The following contains utils to run the documentation tests without having to overwrite any files.
-
-The `preprocess_string` function adds `# doctest: +IGNORE_RESULT` markers on the fly anywhere a `load_dataset` call is
-made as a print would otherwise fail the corresonding line.
-
-To skip cuda tests, make sure to call `SKIP_CUDA_DOCTEST=1 pytest --doctest-modules <path_to_files_to_test>
-"""
-
-
-def preprocess_string(string, skip_cuda_tests):
-    """Prepare a docstring or a `.md` file to be run by doctest.
-
-    The argument `string` would be the whole file content if it is a `.md` file. For a python file, it would be one of
-    its docstring. In each case, it may contain multiple python code examples. If `skip_cuda_tests` is `True` and a
-    cuda stuff is detective (with a heuristic), this method will return an empty string so no doctest will be run for
-    `string`.
-    """
-    codeblock_pattern = r"(```(?:python|py)\s*\n\s*>>> )((?:(?!```)[^])*?```)"
-    codeblocks = re.split(re.compile(codeblock_pattern, flags=re.MULTILINE | re.DOTALL), string)
-    is_cuda_found = False
-    for i, codeblock in enumerate(codeblocks):
-        if "load_dataset(" in codeblock and "# doctest: +IGNORE_RESULT" not in codeblock:
-            codeblocks[i] = re.sub(r"(>>> .*load_dataset\(.*)", r"\1 # doctest: +IGNORE_RESULT", codeblock)
-        if (
-            (">>>" in codeblock or "..." in codeblock)
-            and re.search(r"cuda|to\(0\)|device=0", codeblock)
-            and skip_cuda_tests
-        ):
-            is_cuda_found = True
-            break
-
-    modified_string = ""
-    if not is_cuda_found:
-        modified_string = "".join(codeblocks)
-
-    return modified_string
-
-
-class HfDocTestParser(doctest.DocTestParser):
-    """
-    Overwrites the DocTestParser from doctest to properly parse the codeblocks that are formatted with black. This
-    means that there are no extra lines at the end of our snippets. The `# doctest: +IGNORE_RESULT` marker is also
-    added anywhere a `load_dataset` call is made as a print would otherwise fail the corresponding line.
-
-    Tests involving cuda are skipped base on a naive pattern that should be updated if it is not enough.
-    """
-
-    # This regular expression is used to find doctest examples in a
-    # string.  It defines three groups: `source` is the source code
-    # (including leading indentation and prompts); `indent` is the
-    # indentation of the first (PS1) line of the source code; and
-    # `want` is the expected output (including leading indentation).
-    # fmt: off
-    _EXAMPLE_RE = re.compile(r'''
-        # Source consists of a PS1 line followed by zero or more PS2 lines.
-        (?P<source>
-            (?:^(?P<indent> [ ]*) >>>    .*)    # PS1 line
-            (?:\n           [ ]*  \.\.\. .*)*)  # PS2 lines
-        \n?
-        # Want consists of any non-blank lines that do not start with PS1.
-        (?P<want> (?:(?![ ]*$)    # Not a blank line
-             (?![ ]*>>>)          # Not a line starting with PS1
-             # !!!!!!!!!!! HF Specific !!!!!!!!!!!
-             (?:(?!```).)*        # Match any character except '`' until a '```' is found (this is specific to HF because black removes the last line)
-             # !!!!!!!!!!! HF Specific !!!!!!!!!!!
-             (?:\n|$)  # Match a new line or end of string
-          )*)
-        ''', re.MULTILINE | re.VERBOSE
-    )
-    # fmt: on
-
-    # !!!!!!!!!!! HF Specific !!!!!!!!!!!
-    skip_cuda_tests: bool = bool(os.environ.get("SKIP_CUDA_DOCTEST", False))
-    # !!!!!!!!!!! HF Specific !!!!!!!!!!!
-
-    def parse(self, string, name="<string>"):
-        """
-        Overwrites the `parse` method to incorporate a skip for CUDA tests, and remove logs and dataset prints before
-        calling `super().parse`
-        """
-        string = preprocess_string(string, self.skip_cuda_tests)
-        return super().parse(string, name)
-
-
-class HfDoctestModule(Module):
-    """
-    Overwrites the `DoctestModule` of the pytest package to make sure the HFDocTestParser is used when discovering
-    tests.
-    """
-
-    def collect(self) -> Iterable["DoctestItem"]:
-        class MockAwareDocTestFinder(doctest.DocTestFinder):
-            """A hackish doctest finder that overrides stdlib internals to fix a stdlib bug.
-
-            https://github.com/pytest-dev/pytest/issues/3456 https://bugs.python.org/issue25532
-            """
-
-            def _find_lineno(self, obj, source_lines):
-                """Doctest code does not take into account `@property`, this
-                is a hackish way to fix it. https://bugs.python.org/issue17446
-
-                Wrapped Doctests will need to be unwrapped so the correct line number is returned. This will be
-                reported upstream. #8796
-                """
-                if isinstance(obj, property):
-                    obj = getattr(obj, "fget", obj)
-
-                if hasattr(obj, "__wrapped__"):
-                    # Get the main obj in case of it being wrapped
-                    obj = inspect.unwrap(obj)
-
-                # Type ignored because this is a private function.
-                return super()._find_lineno(  # type:ignore[misc]
-                    obj,
-                    source_lines,
-                )
-
-            def _find(self, tests, obj, name, module, source_lines, globs, seen) -> None:
-                if _is_mocked(obj):
-                    return
-                with _patch_unwrap_mock_aware():
-                    # Type ignored because this is a private function.
-                    super()._find(  # type:ignore[misc]
-                        tests, obj, name, module, source_lines, globs, seen
-                    )
-
-        if self.path.name == "conftest.py":
-            module = self.config.pluginmanager._importconftest(
-                self.path,
-                self.config.getoption("importmode"),
-                rootpath=self.config.rootpath,
-            )
-        else:
-            try:
-                module = import_path(
-                    self.path,
-                    root=self.config.rootpath,
-                    mode=self.config.getoption("importmode"),
-                )
-            except ImportError:
-                if self.config.getvalue("doctest_ignore_import_errors"):
-                    skip("unable to import module %r" % self.path)
-                else:
-                    raise
-
-        # !!!!!!!!!!! HF Specific !!!!!!!!!!!
-        finder = MockAwareDocTestFinder(parser=HfDocTestParser())
-        # !!!!!!!!!!! HF Specific !!!!!!!!!!!
-        optionflags = get_optionflags(self)
-        runner = _get_runner(
-            verbose=False,
-            optionflags=optionflags,
-            checker=_get_checker(),
-            continue_on_failure=_get_continue_on_failure(self.config),
-        )
-        for test in finder.find(module, module.__name__):
-            if test.examples:  # skip empty doctests and cuda
-                yield DoctestItem.from_parent(self, name=test.name, runner=runner, dtest=test)
@@ -97,7 +97,6 @@ class Transformer2DModel(ModelMixin, ConfigMixin):
        norm_eps: float = 1e-5,
        attention_type: str = "default",
        caption_channels: int = None,
-        interpolation_scale: float = None,
    ):
        super().__init__()
        self.use_linear_projection = use_linear_projection
@@ -169,9 +168,8 @@ class Transformer2DModel(ModelMixin, ConfigMixin):
            self.width = sample_size

            self.patch_size = patch_size
-            interpolation_scale = (
-                interpolation_scale if interpolation_scale is not None else max(self.config.sample_size // 64, 1)
-            )
+            interpolation_scale = self.config.sample_size // 64  # => 64 (= 512 pixart) has interpolation scale 1
+            interpolation_scale = max(interpolation_scale, 1)
            self.pos_embed = PatchEmbed(
                height=sample_size,
                width=sample_size,
@@ -400,22 +400,15 @@ class AnimateDiffPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -516,9 +509,9 @@ class AnimateDiffPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents
@@ -478,22 +478,15 @@ class AnimateDiffVideoToVideoPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -596,9 +589,9 @@ class AnimateDiffVideoToVideoPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    def get_timesteps(self, num_inference_steps, timesteps, strength, device):
@@ -510,22 +510,15 @@ class StableDiffusionControlNetPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -733,9 +726,9 @@ class StableDiffusionControlNetPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    def check_image(self, image, prompt, prompt_embeds):
@@ -503,22 +503,15 @@ class StableDiffusionControlNetImg2ImgPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -720,9 +713,9 @@ class StableDiffusionControlNetImg2ImgPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image
@@ -628,22 +628,15 @@ class StableDiffusionControlNetInpaintPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -878,9 +871,9 @@ class StableDiffusionControlNetInpaintPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image
@@ -537,22 +537,15 @@ class StableDiffusionXLControlNetInpaintPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -824,9 +817,9 @@ class StableDiffusionXLControlNetInpaintPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    def prepare_control_image(
@@ -515,22 +515,15 @@ class StableDiffusionXLControlNetPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -737,9 +730,9 @@ class StableDiffusionXLControlNetPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image
@@ -567,22 +567,15 @@ class StableDiffusionXLControlNetImg2ImgPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -801,9 +794,9 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet_sd_xl.StableDiffusionXLControlNetPipeline.check_image
@@ -453,22 +453,15 @@ class LatentConsistencyModelImg2ImgPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -654,9 +647,9 @@ class LatentConsistencyModelImg2ImgPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    @property
@@ -437,22 +437,15 @@ class LatentConsistencyModelPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -586,9 +579,9 @@ class LatentConsistencyModelPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    @property
@@ -582,9 +582,9 @@ class PIAPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
@@ -619,22 +619,15 @@ class PIAPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -133,42 +133,6 @@ ASPECT_RATIO_512_BIN = {
    "4.0": [1024.0, 256.0],
 }

-ASPECT_RATIO_256_BIN = {
-    "0.25": [128.0, 512.0],
-    "0.28": [128.0, 464.0],
-    "0.32": [144.0, 448.0],
-    "0.33": [144.0, 432.0],
-    "0.35": [144.0, 416.0],
-    "0.4": [160.0, 400.0],
-    "0.42": [160.0, 384.0],
-    "0.48": [176.0, 368.0],
-    "0.5": [176.0, 352.0],
-    "0.52": [176.0, 336.0],
-    "0.57": [192.0, 336.0],
-    "0.6": [192.0, 320.0],
-    "0.68": [208.0, 304.0],
-    "0.72": [208.0, 288.0],
-    "0.78": [224.0, 288.0],
-    "0.82": [224.0, 272.0],
-    "0.88": [240.0, 272.0],
-    "0.94": [240.0, 256.0],
-    "1.0": [256.0, 256.0],
-    "1.07": [256.0, 240.0],
-    "1.13": [272.0, 240.0],
-    "1.21": [272.0, 224.0],
-    "1.29": [288.0, 224.0],
-    "1.38": [288.0, 208.0],
-    "1.46": [304.0, 208.0],
-    "1.67": [320.0, 192.0],
-    "1.75": [336.0, 192.0],
-    "2.0": [352.0, 176.0],
-    "2.09": [368.0, 176.0],
-    "2.4": [384.0, 160.0],
-    "2.5": [400.0, 160.0],
-    "3.0": [432.0, 144.0],
-    "4.0": [512.0, 128.0],
-}
-

 # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
 def retrieve_timesteps(
@@ -296,7 +260,6 @@ class PixArtAlphaPipeline(DiffusionPipeline):
        prompt_attention_mask: Optional[torch.FloatTensor] = None,
        negative_prompt_attention_mask: Optional[torch.FloatTensor] = None,
        clean_caption: bool = False,
-        max_sequence_length: int = 120,
        **kwargs,
    ):
        r"""
@@ -321,9 +284,8 @@ class PixArtAlphaPipeline(DiffusionPipeline):
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. For PixArt-Alpha, it's should be the embeddings of the ""
                string.
-            clean_caption (`bool`, defaults to `False`):
+            clean_caption (bool, defaults to `False`):
                If `True`, the function will preprocess and clean the provided caption before encoding.
-            max_sequence_length (`int`, defaults to 120): Maximum sequence length to use for the prompt.
        """

        if "mask_feature" in kwargs:
@@ -341,7 +303,7 @@ class PixArtAlphaPipeline(DiffusionPipeline):
            batch_size = prompt_embeds.shape[0]

        # See Section 3.1. of the paper.
-        max_length = max_sequence_length
+        max_length = 120

        if prompt_embeds is None:
            prompt = self._text_preprocessing(prompt, clean_caption=clean_caption)
@@ -726,7 +688,6 @@ class PixArtAlphaPipeline(DiffusionPipeline):
        callback_steps: int = 1,
        clean_caption: bool = True,
        use_resolution_binning: bool = True,
-        max_sequence_length: int = 120,
        **kwargs,
    ) -> Union[ImagePipelineOutput, Tuple]:
        """
@@ -796,7 +757,6 @@ class PixArtAlphaPipeline(DiffusionPipeline):
                If set to `True`, the requested height and width are first mapped to the closest resolutions using
                `ASPECT_RATIO_1024_BIN`. After the produced latents are decoded into images, they are resized back to
                the requested resolution. Useful for generating non-square images.
-            max_sequence_length (`int` defaults to 120): Maximum sequence length to use with the `prompt`.

        Examples:

@@ -812,14 +772,9 @@ class PixArtAlphaPipeline(DiffusionPipeline):
        height = height or self.transformer.config.sample_size * self.vae_scale_factor
        width = width or self.transformer.config.sample_size * self.vae_scale_factor
        if use_resolution_binning:
-            if self.transformer.config.sample_size == 128:
-                aspect_ratio_bin = ASPECT_RATIO_1024_BIN
-            elif self.transformer.config.sample_size == 64:
-                aspect_ratio_bin = ASPECT_RATIO_512_BIN
-            elif self.transformer.config.sample_size == 32:
-                aspect_ratio_bin = ASPECT_RATIO_256_BIN
-            else:
-                raise ValueError("Invalid sample size")
+            aspect_ratio_bin = (
+                ASPECT_RATIO_1024_BIN if self.transformer.config.sample_size == 128 else ASPECT_RATIO_512_BIN
+            )
            orig_height, orig_width = height, width
            height, width = self.classify_height_width_bin(height, width, ratios=aspect_ratio_bin)

@@ -867,7 +822,6 @@ class PixArtAlphaPipeline(DiffusionPipeline):
            prompt_attention_mask=prompt_attention_mask,
            negative_prompt_attention_mask=negative_prompt_attention_mask,
            clean_caption=clean_caption,
-            max_sequence_length=max_sequence_length,
        )
        if do_classifier_free_guidance:
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
@@ -520,22 +520,15 @@ class StableDiffusionPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -646,9 +639,9 @@ class StableDiffusionPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
@@ -564,22 +564,15 @@ class StableDiffusionImg2ImgPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -692,9 +685,9 @@ class StableDiffusionImg2ImgPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    def get_timesteps(self, num_inference_steps, strength, device):
@@ -636,22 +636,15 @@ class StableDiffusionInpaintPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -774,9 +767,9 @@ class StableDiffusionInpaintPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    def prepare_latents(
@@ -59,66 +59,6 @@ EXAMPLE_DOC_STRING = """
 """


-# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
-def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
-    """
-    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
-    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
-    """
-    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
-    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
-    # rescale the results from guidance (fixes overexposure)
-    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
-    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
-    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
-    return noise_cfg
-
-
-# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
-def retrieve_timesteps(
-    scheduler,
-    num_inference_steps: Optional[int] = None,
-    device: Optional[Union[str, torch.device]] = None,
-    timesteps: Optional[List[int]] = None,
-    **kwargs,
-):
-    """
-    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
-    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
-
-    Args:
-        scheduler (`SchedulerMixin`):
-            The scheduler to get timesteps from.
-        num_inference_steps (`int`):
-            The number of diffusion steps used when generating samples with a pre-trained model. If used,
-            `timesteps` must be `None`.
-        device (`str` or `torch.device`, *optional*):
-            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
-        timesteps (`List[int]`, *optional*):
-                Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
-                timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
-                must be `None`.
-
-    Returns:
-        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
-        second element is the number of inference steps.
-    """
-    if timesteps is not None:
-        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
-        if not accepts_timesteps:
-            raise ValueError(
-                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
-                f" timestep schedules. Please check whether you are using the correct scheduler."
-            )
-        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
-        timesteps = scheduler.timesteps
-        num_inference_steps = len(timesteps)
-    else:
-        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
-        timesteps = scheduler.timesteps
-    return timesteps, num_inference_steps
-
-
@dataclass
 class LDM3DPipelineOutput(BaseOutput):
    """
@@ -185,7 +125,6 @@ class StableDiffusionLDM3DPipeline(
    model_cpu_offload_seq = "text_encoder->unet->vae"
    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
    _exclude_from_cpu_offload = ["safety_checker"]
-    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]

    def __init__(
        self,
@@ -503,22 +442,15 @@ class StableDiffusionLDM3DPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -621,9 +553,9 @@ class StableDiffusionLDM3DPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
@@ -643,66 +575,6 @@ class StableDiffusionLDM3DPipeline(
        latents = latents * self.scheduler.init_noise_sigma
        return latents

-    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
-    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
-        """
-        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
-
-        Args:
-            timesteps (`torch.Tensor`):
-                generate embedding vectors at these timesteps
-            embedding_dim (`int`, *optional*, defaults to 512):
-                dimension of the embeddings to generate
-            dtype:
-                data type of the generated embeddings
-
-        Returns:
-            `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
-        """
-        assert len(w.shape) == 1
-        w = w * 1000.0
-
-        half_dim = embedding_dim // 2
-        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
-        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
-        emb = w.to(dtype)[:, None] * emb[None, :]
-        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
-        if embedding_dim % 2 == 1:  # zero pad
-            emb = torch.nn.functional.pad(emb, (0, 1))
-        assert emb.shape == (w.shape[0], embedding_dim)
-        return emb
-
-    @property
-    def guidance_scale(self):
-        return self._guidance_scale
-
-    @property
-    def guidance_rescale(self):
-        return self._guidance_rescale
-
-    @property
-    def clip_skip(self):
-        return self._clip_skip
-
-    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
-    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
-    # corresponds to doing no classifier free guidance.
-    @property
-    def do_classifier_free_guidance(self):
-        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
-
-    @property
-    def cross_attention_kwargs(self):
-        return self._cross_attention_kwargs
-
-    @property
-    def num_timesteps(self):
-        return self._num_timesteps
-
-    @property
-    def interrupt(self):
-        return self._interrupt
-
    @torch.no_grad()
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
@@ -711,7 +583,6 @@ class StableDiffusionLDM3DPipeline(
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 49,
-        timesteps: List[int] = None,
        guidance_scale: float = 5.0,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
@@ -724,12 +595,10 @@ class StableDiffusionLDM3DPipeline(
        ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: int = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
-        guidance_rescale: float = 0.0,
        clip_skip: Optional[int] = None,
-        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
-        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
-        **kwargs,
    ):
        r"""
        The call function to the pipeline for generation.
@@ -780,21 +649,18 @@ class StableDiffusionLDM3DPipeline(
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
-            callback_on_step_end (`Callable`, *optional*):
-                A function that calls at the end of each denoising steps during the inference. The function is called
-                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
-                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
-                `callback_on_step_end_tensor_inputs`.
-            callback_on_step_end_tensor_inputs (`List`, *optional*):
-                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
-                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
-                `._callback_tensor_inputs` attribute of your pipeline class.
        Examples:

        Returns:
@@ -804,22 +670,6 @@ class StableDiffusionLDM3DPipeline(
                second element is a list of `bool`s indicating whether the corresponding generated image contains
                "not-safe-for-work" (nsfw) content.
        """
-        callback = kwargs.pop("callback", None)
-        callback_steps = kwargs.pop("callback_steps", None)
-
-        if callback is not None:
-            deprecate(
-                "callback",
-                "1.0.0",
-                "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
-            )
-        if callback_steps is not None:
-            deprecate(
-                "callback_steps",
-                "1.0.0",
-                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
-            )
-
        # 0. Default height and width to unet
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor
@@ -835,15 +685,8 @@ class StableDiffusionLDM3DPipeline(
            negative_prompt_embeds,
            ip_adapter_image,
            ip_adapter_image_embeds,
-            callback_on_step_end_tensor_inputs,
        )

-        self._guidance_scale = guidance_scale
-        self._guidance_rescale = guidance_rescale
-        self._clip_skip = clip_skip
-        self._cross_attention_kwargs = cross_attention_kwargs
-        self._interrupt = False
-
        # 2. Define call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
@@ -853,6 +696,10 @@ class StableDiffusionLDM3DPipeline(
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0

        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
            image_embeds = self.prepare_ip_adapter_image_embeds(
@@ -860,7 +707,7 @@ class StableDiffusionLDM3DPipeline(
                ip_adapter_image_embeds,
                device,
                batch_size * num_images_per_prompt,
-                self.do_classifier_free_guidance,
+                do_classifier_free_guidance,
            )

        # 3. Encode input prompt
@@ -868,7 +715,7 @@ class StableDiffusionLDM3DPipeline(
            prompt,
            device,
            num_images_per_prompt,
-            self.do_classifier_free_guidance,
+            do_classifier_free_guidance,
            negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
@@ -877,11 +724,12 @@ class StableDiffusionLDM3DPipeline(
        # For classifier free guidance, we need to do two forward passes.
        # Here we concatenate the unconditional and text embeddings into a single batch
        # to avoid doing two forward passes
-        if self.do_classifier_free_guidance:
+        if do_classifier_free_guidance:
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])

        # 4. Prepare timesteps
-        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps

        # 5. Prepare latent variables
        num_channels_latents = self.unet.config.in_channels
@@ -902,24 +750,12 @@ class StableDiffusionLDM3DPipeline(
        # 6.1 Add image embeds for IP-Adapter
        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None

-        # 6.2 Optionally get Guidance Scale Embedding
-        timestep_cond = None
-        if self.unet.config.time_cond_proj_dim is not None:
-            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
-            timestep_cond = self.get_guidance_scale_embedding(
-                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
-            ).to(device=device, dtype=latents.dtype)
-
        # 7. Denoising loop
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
-        self._num_timesteps = len(timesteps)
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
-                if self.interrupt:
-                    continue
-
                # expand the latents if we are doing classifier free guidance
-                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                # predict the noise residual
@@ -927,34 +763,19 @@ class StableDiffusionLDM3DPipeline(
                    latent_model_input,
                    t,
                    encoder_hidden_states=prompt_embeds,
-                    timestep_cond=timestep_cond,
                    cross_attention_kwargs=cross_attention_kwargs,
                    added_cond_kwargs=added_cond_kwargs,
                    return_dict=False,
                )[0]

                # perform guidance
-                if self.do_classifier_free_guidance:
+                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

-                if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
-                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
-                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
-
                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]

-                if callback_on_step_end is not None:
-                    callback_kwargs = {}
-                    for k in callback_on_step_end_tensor_inputs:
-                        callback_kwargs[k] = locals()[k]
-                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
-
-                    latents = callback_outputs.pop("latents", latents)
-                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
-                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
-
                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()
@@ -414,22 +414,15 @@ class StableDiffusionPanoramaPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -557,9 +550,9 @@ class StableDiffusionPanoramaPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
@@ -549,22 +549,15 @@ class StableDiffusionXLPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -678,9 +671,9 @@ class StableDiffusionXLPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
@@ -616,9 +616,9 @@ class StableDiffusionXLImg2ImgPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    def get_timesteps(self, num_inference_steps, strength, device, denoising_start=None):
@@ -782,22 +782,15 @@ class StableDiffusionXLImg2ImgPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -486,22 +486,15 @@ class StableDiffusionXLInpaintPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -858,9 +851,9 @@ class StableDiffusionXLInpaintPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    def prepare_latents(
@@ -563,22 +563,15 @@ class StableDiffusionXLAdapterPipeline(

                image_embeds.append(single_image_embeds)
        else:
-            repeat_dims = [1]
            image_embeds = []
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1)
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1)
                image_embeds.append(single_image_embeds)

        return image_embeds
@@ -693,9 +686,9 @@ class StableDiffusionXLAdapterPipeline(
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+            elif ip_adapter_image_embeds[0].ndim != 3:
                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                    f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
@@ -69,7 +69,6 @@ from .import_utils import (
    is_note_seq_available,
    is_onnx_available,
    is_peft_available,
-    is_pytest_available,
    is_scipy_available,
    is_tensorboard_available,
    is_torch_available,
@@ -278,13 +278,6 @@ try:
 except importlib_metadata.PackageNotFoundError:
    _peft_available = False

-_pytest_available = importlib.util.find_spec("pytest") is not None
-try:
-    _pytest_version = importlib_metadata.version("pytest")
-    logger.debug(f"Successfully imported pytest version {_pytest_version}")
-except importlib_metadata.PackageNotFoundError:
-    _pytest_available = False
-
 _torchvision_available = importlib.util.find_spec("torchvision") is not None
 try:
    _torchvision_version = importlib_metadata.version("torchvision")
@@ -381,10 +374,6 @@ def is_peft_available():
    return _peft_available


-def is_pytest_available():
-    return _pytest_available
-
-
 def is_torchvision_available():
    return _torchvision_available

@@ -779,7 +779,7 @@ class CaptureLogger:
    >>> logger = logging.get_logger("diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.py")
    >>> with CaptureLogger(logger) as cl:
    ...     logger.info(msg)
-    >>> assert cl.out, msg + \n
+    >>> assert cl.out, msg + "\n"
    ```
    """

@@ -1,85 +0,0 @@
-# coding=utf-8
-# Copyright 2024 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-This script is responsible for cleaning the list of doctests by making sure the entries all exist and are in
-alphabetical order.
-
-Usage (from the root of the repo):
-
-Check that the doctest list is properly sorted and all files exist (used in `make repo-consistency`):
-
-```bash
-python utils/check_doctest_list.py
-```
-
-Auto-sort the doctest list if it is not properly sorted (used in `make fix-copies`):
-
-```bash
-python utils/check_doctest_list.py --fix_and_overwrite
-```
-"""
-import argparse
-import os
-
-
-# All paths are set with the intent you should run this script from the root of the repo with the command
-# python utils/check_doctest_list.py
-REPO_PATH = "."
-DOCTEST_FILE_PATHS = ["not_doctested.txt"]
-
-
-def clean_doctest_list(doctest_file: str, overwrite: bool = False):
-    """
-    Cleans the doctest in a given file.
-
-    Args:
-        doctest_file (`str`):
-            The path to the doctest file to check or clean.
-        overwrite (`bool`, *optional*, defaults to `False`):
-            Whether or not to fix problems. If `False`, will error when the file is not clean.
-    """
-    non_existent_paths = []
-    all_paths = []
-    with open(doctest_file, "r", encoding="utf-8") as f:
-        for line in f:
-            line = line.strip().split(" ")[0]
-            path = os.path.join(REPO_PATH, line)
-            if not (os.path.isfile(path) or os.path.isdir(path)):
-                non_existent_paths.append(line)
-            all_paths.append(line)
-
-    if len(non_existent_paths) > 0:
-        non_existent_paths = "\n".join([f"- {f}" for f in non_existent_paths])
-        raise ValueError(f"`{doctest_file}` contains non-existent paths:\n{non_existent_paths}")
-
-    sorted_paths = sorted(all_paths)
-    if all_paths != sorted_paths:
-        if not overwrite:
-            raise ValueError(
-                f"Files in `{doctest_file}` are not in alphabetical order, run `make fix-copies` to fix "
-                "this automatically."
-            )
-        with open(doctest_file, "w", encoding="utf-8") as f:
-            f.write("\n".join(sorted_paths) + "\n")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--fix_and_overwrite", action="store_true", help="Whether to fix inconsistencies.")
-    args = parser.parse_args()
-
-    for doctest_file in DOCTEST_FILE_PATHS:
-        doctest_file = os.path.join(REPO_PATH, "utils", doctest_file)
-        clean_doctest_list(doctest_file, args.fix_and_overwrite)
@@ -1,100 +0,0 @@
-docs/source/en/training/create_dataset.md
-docs/source/en/training/wuerstchen.md
-docs/source/en/training/adapt_a_model.md
-docs/source/en/training/text2image.md
-docs/source/en/training/custom_diffusion.md
-docs/source/en/training/sdxl.md
-docs/source/en/training/unconditional_training.md
-docs/source/en/training/overview.md
-docs/source/en/training/t2i_adapters.md
-docs/source/en/training/lcm_distill.md
-docs/source/en/training/instructpix2pix.md
-docs/source/en/training/kandinsky.md
-docs/source/en/training/lora.md
-docs/source/en/training/controlnet.md
-docs/source/en/training/dreambooth.md
-docs/source/en/training/ddpo.md
-docs/source/en/training/text_inversion.md
-docs/source/en/training/distributed_inference.md
-docs/source/en/optimization/torch2.0.md
-docs/source/en/optimization/coreml.md
-docs/source/en/optimization/tome.md
-docs/source/en/optimization/xformers.md
-docs/source/en/optimization/deepcache.md
-docs/source/en/optimization/fp16.md
-docs/source/en/optimization/memory.md
-docs/source/en/optimization/habana.md
-docs/source/en/optimization/open_vino.md
-docs/source/en/optimization/mps.md
-docs/source/en/optimization/opt_overview.md
-docs/source/en/optimization/onnx.md
-docs/source/en/tutorials/basic_training.md
-docs/source/ko/index.md
-docs/source/ko/quicktour.md
-docs/source/ko/in_translation.md
-docs/source/ko/installation.md
-docs/source/ko/stable_diffusion.md
-docs/source/ko/training/create_dataset.md
-docs/source/ko/training/wuerstchen.md
-docs/source/ko/training/adapt_a_model.md
-docs/source/ko/training/text2image.md
-docs/source/ko/training/custom_diffusion.md
-docs/source/ko/training/sdxl.md
-docs/source/ko/training/unconditional_training.md
-docs/source/ko/training/overview.md
-docs/source/ko/training/t2i_adapters.md
-docs/source/ko/training/lcm_distill.md
-docs/source/ko/training/instructpix2pix.md
-docs/source/ko/training/kandinsky.md
-docs/source/ko/training/lora.md
-docs/source/ko/training/controlnet.md
-docs/source/ko/training/dreambooth.md
-docs/source/ko/training/ddpo.md
-docs/source/ko/training/text_inversion.md
-docs/source/ko/training/distributed_inference.md
-docs/source/ko/optimization/torch2.0.md
-docs/source/ko/optimization/coreml.md
-docs/source/ko/optimization/tome.md
-docs/source/ko/optimization/xformers.md
-docs/source/ko/optimization/deepcache.md
-docs/source/ko/optimization/fp16.md
-docs/source/ko/optimization/memory.md
-docs/source/ko/optimization/habana.md
-docs/source/ko/optimization/open_vino.md
-docs/source/ko/optimization/mps.md
-docs/source/ko/optimization/opt_overview.md
-docs/source/ko/optimization/onnx.md
-docs/source/ko/api/pipelines/stable_diffusion/stable_diffusion_xl.md
-docs/source/ko/tutorials/basic_training.md
-docs/source/ko/using-diffusers/loading.md
-docs/source/ko/using-diffusers/unconditional_image_generation.md
-docs/source/ko/using-diffusers/depth2img.md
-docs/source/ko/using-diffusers/control_brightness.md
-docs/source/ko/using-diffusers/contribute_pipeline.md
-docs/source/ko/using-diffusers/img2img.md
-docs/source/ko/using-diffusers/weighted_prompts.md
-docs/source/ko/using-diffusers/schedulers.md
-docs/source/ko/using-diffusers/custom_pipeline_examples.md
-docs/source/ko/using-diffusers/using_safetensors.md
-docs/source/ko/using-diffusers/reproducibility.md
-docs/source/ko/using-diffusers/inpaint.md
-docs/source/ko/using-diffusers/conditional_image_generation.md
-docs/source/ko/using-diffusers/controlling_generation.md
-docs/source/ko/using-diffusers/reusing_seeds.md
-docs/source/ko/using-diffusers/textual_inversion_inference.md
-docs/source/ko/using-diffusers/loading_overview.md
-docs/source/ko/using-diffusers/custom_pipeline_overview.md
-docs/source/ko/using-diffusers/other-formats.md
-docs/source/ko/using-diffusers/stable_diffusion_jax_how_to.md
-docs/source/ko/using-diffusers/pipeline_overview.md
-docs/source/ko/using-diffusers/write_own_pipeline.md
-docs/source/pt/index.md
-docs/source/pt/quicktour.md
-docs/source/pt/in_translation.md
-docs/source/pt/installation.md
-docs/source/pt/stable_diffusion.md
-docs/source/ja/index.md
-docs/source/ja/quicktour.md
-docs/source/ja/in_translation.md
-docs/source/ja/installation.md
-docs/source/ja/stable_diffusion.md
@@ -1,401 +0,0 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import collections
-import json
-import math
-import os
-import re
-import time
-from fnmatch import fnmatch
-from typing import Dict, List
-
-import requests
-from slack_sdk import WebClient
-
-
-client = WebClient(token=os.environ["CI_SLACK_BOT_TOKEN"])
-
-
-def handle_test_results(test_results):
-    expressions = test_results.split(" ")
-
-    failed = 0
-    success = 0
-
-    # When the output is short enough, the output is surrounded by = signs: "== OUTPUT =="
-    # When it is too long, those signs are not present.
-    time_spent = expressions[-2] if "=" in expressions[-1] else expressions[-1]
-
-    for i, expression in enumerate(expressions):
-        if "failed" in expression:
-            failed += int(expressions[i - 1])
-        if "passed" in expression:
-            success += int(expressions[i - 1])
-
-    return failed, success, time_spent
-
-
-def extract_first_line_failure(failures_short_lines):
-    failures = {}
-    file = None
-    in_error = False
-    for line in failures_short_lines.split("\n"):
-        if re.search(r"_ \[doctest\]", line):
-            in_error = True
-            file = line.split(" ")[2]
-        elif in_error and not line.split(" ")[0].isdigit():
-            failures[file] = line
-            in_error = False
-
-    return failures
-
-
-class Message:
-    def __init__(self, title: str, doc_test_results: Dict):
-        self.title = title
-
-        self._time_spent = doc_test_results["time_spent"].split(",")[0]
-        self.n_success = doc_test_results["success"]
-        self.n_failures = doc_test_results["failures"]
-        self.n_tests = self.n_success + self.n_failures
-
-        # Failures and success of the modeling tests
-        self.doc_test_results = doc_test_results
-
-    @property
-    def time(self) -> str:
-        time_spent = [self._time_spent]
-        total_secs = 0
-
-        for time in time_spent:
-            time_parts = time.split(":")
-
-            # Time can be formatted as xx:xx:xx, as .xx, or as x.xx if the time spent was less than a minute.
-            if len(time_parts) == 1:
-                time_parts = [0, 0, time_parts[0]]
-
-            hours, minutes, seconds = int(time_parts[0]), int(time_parts[1]), float(time_parts[2])
-            total_secs += hours * 3600 + minutes * 60 + seconds
-
-        hours, minutes, seconds = total_secs // 3600, (total_secs % 3600) // 60, total_secs % 60
-        return f"{int(hours)}h{int(minutes)}m{int(seconds)}s"
-
-    @property
-    def header(self) -> Dict:
-        return {"type": "header", "text": {"type": "plain_text", "text": self.title}}
-
-    @property
-    def no_failures(self) -> Dict:
-        return {
-            "type": "section",
-            "text": {
-                "type": "plain_text",
-                "text": f"🌞 There were no failures: all {self.n_tests} tests passed. The suite ran in {self.time}.",
-                "emoji": True,
-            },
-            "accessory": {
-                "type": "button",
-                "text": {"type": "plain_text", "text": "Check Action results", "emoji": True},
-                "url": f"https://github.com/huggingface/transformers/actions/runs/{os.environ['GITHUB_RUN_ID']}",
-            },
-        }
-
-    @property
-    def failures(self) -> Dict:
-        return {
-            "type": "section",
-            "text": {
-                "type": "plain_text",
-                "text": (
-                    f"There were {self.n_failures} failures, out of {self.n_tests} tests.\nThe suite ran in"
-                    f" {self.time}."
-                ),
-                "emoji": True,
-            },
-            "accessory": {
-                "type": "button",
-                "text": {"type": "plain_text", "text": "Check Action results", "emoji": True},
-                "url": f"https://github.com/huggingface/transformers/actions/runs/{os.environ['GITHUB_RUN_ID']}",
-            },
-        }
-
-    @property
-    def category_failures(self) -> List[Dict]:
-        failure_blocks = []
-
-        MAX_ERROR_TEXT = 3000 - len("The following examples had failures:\n\n\n\n") - len("[Truncated]\n")
-        line_length = 40
-        category_failures = {k: v["failed"] for k, v in doc_test_results.items() if isinstance(v, dict)}
-
-        def single_category_failures(category, failures):
-            text = ""
-            if len(failures) == 0:
-                return ""
-            text += f"*{category} failures*:".ljust(line_length // 2).rjust(line_length // 2) + "\n"
-
-            for idx, failure in enumerate(failures):
-                new_text = text + f"`{failure}`\n"
-                if len(new_text) > MAX_ERROR_TEXT:
-                    text = text + "[Truncated]\n"
-                    break
-                text = new_text
-
-            return text
-
-        for category, failures in category_failures.items():
-            report = single_category_failures(category, failures)
-            if len(report) == 0:
-                continue
-            block = {
-                "type": "section",
-                "text": {
-                    "type": "mrkdwn",
-                    "text": f"The following examples had failures:\n\n\n{report}\n",
-                },
-            }
-            failure_blocks.append(block)
-
-        return failure_blocks
-
-    @property
-    def payload(self) -> str:
-        blocks = [self.header]
-
-        if self.n_failures > 0:
-            blocks.append(self.failures)
-
-        if self.n_failures > 0:
-            blocks.extend(self.category_failures)
-
-        if self.n_failures == 0:
-            blocks.append(self.no_failures)
-
-        return json.dumps(blocks)
-
-    @staticmethod
-    def error_out():
-        payload = [
-            {
-                "type": "section",
-                "text": {
-                    "type": "plain_text",
-                    "text": "There was an issue running the tests.",
-                },
-                "accessory": {
-                    "type": "button",
-                    "text": {"type": "plain_text", "text": "Check Action results", "emoji": True},
-                    "url": f"https://github.com/huggingface/transformers/actions/runs/{os.environ['GITHUB_RUN_ID']}",
-                },
-            }
-        ]
-
-        print("Sending the following payload")
-        print(json.dumps({"blocks": json.loads(payload)}))
-
-        client.chat_postMessage(
-            channel=os.environ["CI_SLACK_CHANNEL_ID_DAILY"],
-            text="There was an issue running the tests.",
-            blocks=payload,
-        )
-
-    def post(self):
-        print("Sending the following payload")
-        print(json.dumps({"blocks": json.loads(self.payload)}))
-
-        text = f"{self.n_failures} failures out of {self.n_tests} tests," if self.n_failures else "All tests passed."
-
-        self.thread_ts = client.chat_postMessage(
-            channel=os.environ["CI_SLACK_CHANNEL_ID_DAILY"],
-            blocks=self.payload,
-            text=text,
-        )
-
-    def get_reply_blocks(self, job_name, job_link, failures, text):
-        # `text` must be less than 3001 characters in Slack SDK
-        # keep some room for adding "[Truncated]" when necessary
-        MAX_ERROR_TEXT = 3000 - len("[Truncated]")
-
-        failure_text = ""
-        for key, value in failures.items():
-            new_text = failure_text + f"*{key}*\n_{value}_\n\n"
-            if len(new_text) > MAX_ERROR_TEXT:
-                # `failure_text` here has length <= 3000
-                failure_text = failure_text + "[Truncated]"
-                break
-            # `failure_text` here has length <= MAX_ERROR_TEXT
-            failure_text = new_text
-
-        title = job_name
-        content = {"type": "section", "text": {"type": "mrkdwn", "text": text}}
-
-        if job_link is not None:
-            content["accessory"] = {
-                "type": "button",
-                "text": {"type": "plain_text", "text": "GitHub Action job", "emoji": True},
-                "url": job_link,
-            }
-
-        return [
-            {"type": "header", "text": {"type": "plain_text", "text": title.upper(), "emoji": True}},
-            content,
-            {"type": "section", "text": {"type": "mrkdwn", "text": failure_text}},
-        ]
-
-    def post_reply(self):
-        if self.thread_ts is None:
-            raise ValueError("Can only post reply if a post has been made.")
-
-        job_link = self.doc_test_results.pop("job_link")
-        self.doc_test_results.pop("failures")
-        self.doc_test_results.pop("success")
-        self.doc_test_results.pop("time_spent")
-
-        sorted_dict = sorted(self.doc_test_results.items(), key=lambda t: t[0])
-        for job, job_result in sorted_dict:
-            if len(job_result["failures"]):
-                text = f"*Num failures* :{len(job_result['failed'])} \n"
-                failures = job_result["failures"]
-                blocks = self.get_reply_blocks(job, job_link, failures, text=text)
-
-                print("Sending the following reply")
-                print(json.dumps({"blocks": blocks}))
-
-                client.chat_postMessage(
-                    channel=os.environ["CI_SLACK_CHANNEL_ID_DAILY"],
-                    text=f"Results for {job}",
-                    blocks=blocks,
-                    thread_ts=self.thread_ts["ts"],
-                )
-
-                time.sleep(1)
-
-
-def get_job_links():
-    run_id = os.environ["GITHUB_RUN_ID"]
-    url = f"https://api.github.com/repos/huggingface/transformers/actions/runs/{run_id}/jobs?per_page=100"
-    result = requests.get(url).json()
-    jobs = {}
-
-    try:
-        jobs.update({job["name"]: job["html_url"] for job in result["jobs"]})
-        pages_to_iterate_over = math.ceil((result["total_count"] - 100) / 100)
-
-        for i in range(pages_to_iterate_over):
-            result = requests.get(url + f"&page={i + 2}").json()
-            jobs.update({job["name"]: job["html_url"] for job in result["jobs"]})
-
-        return jobs
-    except Exception as e:
-        print("Unknown error, could not fetch links.", e)
-
-    return {}
-
-
-def retrieve_artifact(name: str):
-    _artifact = {}
-
-    if os.path.exists(name):
-        files = os.listdir(name)
-        for file in files:
-            try:
-                with open(os.path.join(name, file), encoding="utf-8") as f:
-                    _artifact[file.split(".")[0]] = f.read()
-            except UnicodeDecodeError as e:
-                raise ValueError(f"Could not open {os.path.join(name, file)}.") from e
-
-    return _artifact
-
-
-def retrieve_available_artifacts():
-    class Artifact:
-        def __init__(self, name: str):
-            self.name = name
-            self.paths = []
-
-        def __str__(self):
-            return self.name
-
-        def add_path(self, path: str):
-            self.paths.append({"name": self.name, "path": path})
-
-    _available_artifacts: Dict[str, Artifact] = {}
-
-    directories = filter(os.path.isdir, os.listdir())
-    for directory in directories:
-        artifact_name = directory
-        if artifact_name not in _available_artifacts:
-            _available_artifacts[artifact_name] = Artifact(artifact_name)
-
-            _available_artifacts[artifact_name].add_path(directory)
-
-    return _available_artifacts
-
-
-if __name__ == "__main__":
-    github_actions_job_links = get_job_links()
-    available_artifacts = retrieve_available_artifacts()
-
-    docs = collections.OrderedDict(
-        [
-            ("*.py", "API Examples"),
-            ("*.md", "MD Examples"),
-        ]
-    )
-
-    # This dict will contain all the information relative to each doc test category:
-    # - failed: list of failed tests
-    # - failures: dict in the format 'test': 'error_message'
-    doc_test_results = {
-        v: {
-            "failed": [],
-            "failures": {},
-        }
-        for v in docs.values()
-    }
-
-    # Link to the GitHub Action job
-    doc_test_results["job_link"] = github_actions_job_links.get("run_doctests")
-
-    artifact_path = available_artifacts["doc_tests_gpu_test_reports"].paths[0]
-    artifact = retrieve_artifact(artifact_path["name"])
-    if "stats" in artifact:
-        failed, success, time_spent = handle_test_results(artifact["stats"])
-        doc_test_results["failures"] = failed
-        doc_test_results["success"] = success
-        doc_test_results["time_spent"] = time_spent[1:-1] + ", "
-
-        all_failures = extract_first_line_failure(artifact["failures_short"])
-        for line in artifact["summary_short"].split("\n"):
-            if re.search("FAILED", line):
-                line = line.replace("FAILED ", "")
-                line = line.split()[0].replace("\n", "")
-
-                if "::" in line:
-                    file_path, test = line.split("::")
-                else:
-                    file_path, test = line, line
-
-                for file_regex in docs.keys():
-                    if fnmatch(file_path, file_regex):
-                        category = docs[file_regex]
-                        doc_test_results[category]["failed"].append(test)
-
-                        failure = all_failures[test] if test in all_failures else "N/A"
-                        doc_test_results[category]["failures"][test] = failure
-                        break
-
-    message = Message("🤗 Results of the doc tests.", doc_test_results)
-    message.post()
-    message.post_reply()
@@ -39,7 +39,7 @@ def main():
    open_issues = repo.get_issues(state="open")

    for issue in open_issues:
-        labels = [label.name.lower() for label in issue.get_labels()]
+        labels = [label.name for label in issue.get_labels()]
        if "stale" in labels:
            comments = sorted(issue.get_comments(), key=lambda i: i.created_at, reverse=True)
            last_comment = comments[0] if len(comments) > 0 else None
@@ -50,7 +50,7 @@ def main():
        elif (
            (dt.now(timezone.utc) - issue.updated_at).days > 23
            and (dt.now(timezone.utc) - issue.created_at).days >= 30
-            and not any(label in LABELS_TO_EXEMPT for label in labels)
+            and not any(label.lower() in LABELS_TO_EXEMPT for label in labels)
        ):
            # Post a Stalebot notification after 23 days of inactivity.
            issue.create_comment(