up

[Tests] Fix ControlNet Single File tests (#7315 )
2024-03-18 11:47:47 +01:00 · 2024-03-18 11:34:17 +01:00 · 2024-03-18 11:28:59 +05:30 · 2024-03-18 10:52:20 +05:30 · 2024-03-16 07:18:11 +05:30 · 2024-03-16 07:11:28 +05:30
160 changed files with 5617 additions and 430 deletions
@@ -12,6 +12,7 @@ env:
  PYTEST_TIMEOUT: 600
  RUN_SLOW: yes
  RUN_NIGHTLY: yes
+  SLACK_API_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

 jobs:
  run_nightly_tests:
@@ -64,6 +65,7 @@ jobs:
          python -m uv pip install -e [quality,test]
          python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers
          python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate
+          python -m uv pip install pytest-reportlog

      - name: Environment
        run: |
@@ -78,7 +80,8 @@ jobs:
          python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
            -s -v -k "not Flax and not Onnx" \
            --make-reports=tests_${{ matrix.config.report }} \
-            tests/
+            --report-log=${{ matrix.config.report }}.log \
+            tests/ 

      - name: Run nightly Flax TPU tests
        if: ${{ matrix.config.framework == 'flax' }}
@@ -89,6 +92,7 @@ jobs:
          python -m pytest -n 0 \
            -s -v -k "Flax" \
            --make-reports=tests_${{ matrix.config.report }} \
+            --report-log=${{ matrix.config.report }}.log \
            tests/

      - name: Run nightly ONNXRuntime CUDA tests
@@ -100,6 +104,7 @@ jobs:
          python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
            -s -v -k "Onnx" \
            --make-reports=tests_${{ matrix.config.report }} \
+            --report-log=${{ matrix.config.report }}.log \ 
            tests/

      - name: Failure short reports
@@ -112,6 +117,12 @@ jobs:
        with:
          name: ${{ matrix.config.report }}_test_reports
          path: reports
+      
+      - name: Generate Report and Notify Channel
+        if: always()
+        run: |
+          pip install slack_sdk tabulate
+          python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY

  run_nightly_tests_apple_m1:
    name: Nightly PyTorch MPS tests on MacOS
@@ -140,6 +151,7 @@ jobs:
          ${CONDA_RUN} python -m uv pip install -e [quality,test]
          ${CONDA_RUN} python -m uv pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
          ${CONDA_RUN} python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate
+          ${CONDA_RUN} python -m uv pip install pytest-reportlog

      - name: Environment
        shell: arch -arch arm64 bash {0}
@@ -152,7 +164,9 @@ jobs:
          HF_HOME: /System/Volumes/Data/mnt/cache
          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
        run: |
-          ${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps tests/
+          ${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps \
+            --report-log=tests_torch_mps.log \
+            tests/

      - name: Failure short reports
        if: ${{ failure() }}
@@ -164,3 +178,9 @@ jobs:
        with:
          name: torch_mps_test_reports
          path: reports
+
+      - name: Generate Report and Notify Channel
+        if: always()
+        run: |
+          pip install slack_sdk tabulate
+          python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
@@ -0,0 +1,23 @@
+name: Notify Slack about a release
+
+on:
+  workflow_dispatch:
+  release:
+    types: [published]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v3
+    
+    - name: Setup Python
+      uses: actions/setup-python@v4
+      with:
+        python-version: '3.8'
+    
+    - name: Notify Slack about the release
+      env:
+        SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
+      run: pip install requests && python utils/notify_slack_about_release.py
@@ -0,0 +1,81 @@
+# Adapted from https://blog.deepjyoti30.dev/pypi-release-github-action
+
+name: PyPI release
+
+on:
+  workflow_dispatch:
+  push:
+    tags:
+      - "*"
+
+jobs:
+  find-and-checkout-latest-branch:
+    runs-on: ubuntu-latest
+    outputs:
+      latest_branch: ${{ steps.set_latest_branch.outputs.latest_branch }}
+    steps:
+      - name: Checkout Repo
+        uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.8'
+
+      - name: Fetch latest branch
+        id: fetch_latest_branch
+        run: |
+          pip install -U requests packaging
+          LATEST_BRANCH=$(python utils/fetch_latest_release_branch.py)
+          echo "Latest branch: $LATEST_BRANCH"
+          echo "latest_branch=$LATEST_BRANCH" >> $GITHUB_ENV
+          
+      - name: Set latest branch output
+        id: set_latest_branch
+        # Publish the value exported to GITHUB_ENV by the previous step as a step output.
+        # The `::set-output` workflow command is deprecated; append to $GITHUB_OUTPUT instead.
+        run: echo "latest_branch=${{ env.latest_branch }}" >> "$GITHUB_OUTPUT"
+
+  release:
+    needs: find-and-checkout-latest-branch
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout Repo
+        uses: actions/checkout@v3
+        with:
+          ref: ${{ needs.find-and-checkout-latest-branch.outputs.latest_branch }}
+          
+      - name: Setup Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.8"
+      
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -U setuptools wheel twine
+          pip install -U torch --index-url https://download.pytorch.org/whl/cpu
+          pip install -U transformers
+      
+      - name: Build the dist files
+        run: python setup.py bdist_wheel && python setup.py sdist
+      
+      - name: Publish to the test PyPI
+        env:
+          TWINE_USERNAME: ${{ secrets.TEST_PYPI_USERNAME }}
+          TWINE_PASSWORD: ${{ secrets.TEST_PYPI_PASSWORD }}
+        run: twine upload dist/* -r pypitest --repository-url=https://test.pypi.org/legacy/    
+
+      - name: Test installing diffusers and importing
+        run: |
+          pip install diffusers && pip uninstall diffusers -y
+          pip install -i https://testpypi.python.org/pypi diffusers
+          python -c "from diffusers import __version__; print(__version__)"
+          python -c "from diffusers import DiffusionPipeline; pipe = DiffusionPipeline.from_pretrained('fusing/unet-ldm-dummy-update'); pipe()"
+          python -c "from diffusers import DiffusionPipeline; pipe = DiffusionPipeline.from_pretrained('hf-internal-testing/tiny-stable-diffusion-pipe', safety_checker=None); pipe('ah suh du')"
+          python -c "from diffusers import *"
+
+      - name: Publish to PyPI
+        env:
+          TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
+          TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
+        run: twine upload dist/* -r pypi
@@ -77,7 +77,7 @@ Please refer to the [How to use Stable Diffusion in Apple Silicon](https://huggi

 ## Quickstart

-Generating outputs is super easy with 🤗 Diffusers. To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 19000+ checkpoints):
+Generating outputs is super easy with 🤗 Diffusers. To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 22000+ checkpoints):

 ```python
 from diffusers import DiffusionPipeline
@@ -219,7 +219,7 @@ Also, say 👋 in our public Discord channel <a href="https://discord.gg/G7tWnz9
 - https://github.com/deep-floyd/IF
 - https://github.com/bentoml/BentoML
 - https://github.com/bmaltais/kohya_ss
- +8000 other amazing GitHub repositories 💪
+- +9000 other amazing GitHub repositories 💪

 Thank you for using us ❤️.

@@ -104,6 +104,8 @@
      title: Latent Consistency Model-LoRA
    - local: using-diffusers/inference_with_lcm
      title: Latent Consistency Model
+    - local: using-diffusers/inference_with_tcd_lora
+      title: Trajectory Consistency Distillation-LoRA
    - local: using-diffusers/svd
      title: Stable Video Diffusion
    title: Specific pipeline examples
@@ -304,6 +306,8 @@
      title: Latent Consistency Models
    - local: api/pipelines/latent_diffusion
      title: Latent Diffusion
+    - local: api/pipelines/ledits_pp
+      title: LEDITS++
    - local: api/pipelines/panorama
      title: MultiDiffusion
    - local: api/pipelines/musicldm
@@ -396,6 +400,10 @@
      title: DPMSolverSDEScheduler
    - local: api/schedulers/singlestep_dpm_solver
      title: DPMSolverSinglestepScheduler
+    - local: api/schedulers/edm_multistep_dpm_solver
+      title: EDMDPMSolverMultistepScheduler
+    - local: api/schedulers/edm_euler
+      title: EDMEulerScheduler
    - local: api/schedulers/euler_ancestral
      title: EulerAncestralDiscreteScheduler
    - local: api/schedulers/euler
@@ -0,0 +1,54 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# LEDITS++
+
+LEDITS++ was proposed in [LEDITS++: Limitless Image Editing using Text-to-Image Models](https://huggingface.co/papers/2311.16711) by Manuel Brack, Felix Friedrich, Katharina Kornmeier, Linoy Tsaban, Patrick Schramowski, Kristian Kersting, Apolinário Passos.
+
+The abstract from the paper is:
+
+*Text-to-image diffusion models have recently received increasing interest for their astonishing ability to produce high-fidelity images from solely text inputs. Subsequent research efforts aim to exploit and apply their capabilities to real image editing. However, existing image-to-image methods are often inefficient, imprecise, and of limited versatility. They either require time-consuming fine-tuning, deviate unnecessarily strongly from the input image, and/or lack support for multiple, simultaneous edits. To address these issues, we introduce LEDITS++, an efficient yet versatile and precise textual image manipulation technique. LEDITS++'s novel inversion approach requires no tuning nor optimization and produces high-fidelity results with a few diffusion steps. Second, our methodology supports multiple simultaneous edits and is architecture-agnostic. Third, we use a novel implicit masking technique that limits changes to relevant image regions. We propose the novel TEdBench++ benchmark as part of our exhaustive evaluation. Our results demonstrate the capabilities of LEDITS++ and its improvements over previous methods. The project page is available at https://leditsplusplus-project.static.hf.space .*
+
+<Tip>
+
+You can find additional information about LEDITS++ on the [project page](https://leditsplusplus-project.static.hf.space/index.html) and try it out in a [demo](https://huggingface.co/spaces/editing-images/leditsplusplus).
+
+</Tip>
+
+<Tip warning={true}>
+Due to some backward compatibility issues with the current diffusers implementation of [`~schedulers.DPMSolverMultistepScheduler`], this implementation of LEDITS++ can no longer guarantee perfect inversion.
+This issue is unlikely to have any noticeable effects on applied use-cases. However, we provide an alternative implementation that guarantees perfect inversion in a dedicated [GitHub repo](https://github.com/ml-research/ledits_pp). 
+</Tip>
+
+We provide two distinct pipelines based on different pre-trained models. 
+
+## LEditsPPPipelineStableDiffusion
+[[autodoc]] pipelines.ledits_pp.LEditsPPPipelineStableDiffusion
+	- all
+	- __call__
+	- invert
+
+## LEditsPPPipelineStableDiffusionXL
+[[autodoc]] pipelines.ledits_pp.LEditsPPPipelineStableDiffusionXL
+	- all
+	- __call__
+	- invert
+
+
+
+## LEditsPPDiffusionPipelineOutput
+[[autodoc]] pipelines.ledits_pp.pipeline_output.LEditsPPDiffusionPipelineOutput
+	- all
+
+## LEditsPPInversionPipelineOutput
+[[autodoc]] pipelines.ledits_pp.pipeline_output.LEditsPPInversionPipelineOutput
+	- all
@@ -57,6 +57,7 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
 | [Latent Consistency Models](latent_consistency_models) | text2image |
 | [Latent Diffusion](latent_diffusion) | text2image, super-resolution |
 | [LDM3D](stable_diffusion/ldm3d_diffusion) | text2image, text-to-3D, text-to-pano, upscaling |
+| [LEDITS++](ledits_pp) | image editing |
 | [MultiDiffusion](panorama) | text2image |
 | [MusicLDM](musicldm) | text2audio |
 | [Paint by Example](paint_by_example) | inpainting |
@@ -30,6 +30,6 @@ Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers)
 	- all
 	- __call__

-## StableDiffusionSafePipelineOutput
+## SemanticStableDiffusionPipelineOutput
 [[autodoc]] pipelines.semantic_stable_diffusion.pipeline_output.SemanticStableDiffusionPipelineOutput
 	- all
@@ -172,3 +172,41 @@ inpaint = StableDiffusionInpaintPipeline(**text2img.components)

 # now you can use text2img(...), img2img(...), inpaint(...) just like the call methods of each respective pipeline
 ```
+
+### Create web demos using `gradio`
+
+The Stable Diffusion pipelines are automatically supported in [Gradio](https://github.com/gradio-app/gradio/), a library that makes creating beautiful and user-friendly machine learning apps on the web a breeze. First, make sure you have Gradio installed:
+
+```
+pip install -U gradio
+```
+
+Then, create a web demo around any Stable Diffusion-based pipeline. For example, you can create an image generation pipeline in a single line of code with Gradio's [`Interface.from_pipeline`](https://www.gradio.app/docs/interface#interface-from-pipeline) function:
+
+```py
+from diffusers import StableDiffusionPipeline
+import gradio as gr
+
+pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+
+gr.Interface.from_pipeline(pipe).launch()
+```
+
+which opens an intuitive drag-and-drop interface in your browser:
+
+![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/gradio-panda.png)
+
+Similarly, you could create a demo for an image-to-image pipeline with:
+
+```py
+from diffusers import StableDiffusionImg2ImgPipeline
+import gradio as gr
+
+
+pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
+
+gr.Interface.from_pipeline(pipe).launch()
+```
+
+By default, the web demo runs on a local server. If you'd like to share it with others, you can generate a temporary public
+link by setting `share=True` in `launch()`. Or, you can host your demo on [Hugging Face Spaces](https://huggingface.co/spaces) for a permanent link.
@@ -0,0 +1,22 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# EDMEulerScheduler
+
+The Karras formulation of the Euler scheduler (Algorithm 2) from the [Elucidating the Design Space of Diffusion-Based Generative Models](https://huggingface.co/papers/2206.00364) paper by Karras et al. This is a fast scheduler which can often generate good outputs in 20-30 steps. The scheduler is based on the original [k-diffusion](https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L51) implementation by [Katherine Crowson](https://github.com/crowsonkb/).
+
+
+## EDMEulerScheduler
+[[autodoc]] EDMEulerScheduler
+
+## EDMEulerSchedulerOutput
+[[autodoc]] schedulers.scheduling_edm_euler.EDMEulerSchedulerOutput
@@ -0,0 +1,24 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# EDMDPMSolverMultistepScheduler
+
+`EDMDPMSolverMultistepScheduler` is a [Karras formulation](https://huggingface.co/papers/2206.00364) of `DPMSolverMultistep`, a multistep scheduler from [DPM-Solver: A Fast ODE Solver for Diffusion Probabilistic Model Sampling in Around 10 Steps](https://huggingface.co/papers/2206.00927) and [DPM-Solver++: Fast Solver for Guided Sampling of Diffusion Probabilistic Models](https://huggingface.co/papers/2211.01095) by Cheng Lu, Yuhao Zhou, Fan Bao, Jianfei Chen, Chongxuan Li, and Jun Zhu.
+
+DPMSolver (and the improved version DPMSolver++) is a fast dedicated high-order solver for diffusion ODEs with convergence order guarantee. Empirically, DPMSolver sampling with only 20 steps can generate high-quality
+samples, and it can generate quite good samples even in 10 steps.
+
+## EDMDPMSolverMultistepScheduler
+[[autodoc]] EDMDPMSolverMultistepScheduler
+
+## SchedulerOutput
+[[autodoc]] schedulers.scheduling_utils.SchedulerOutput
@@ -0,0 +1,438 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+[[open-in-colab]]
+
+# Trajectory Consistency Distillation-LoRA
+
+Trajectory Consistency Distillation (TCD) enables a model to generate higher quality and more detailed images with fewer steps. Moreover, owing to the effective error mitigation during the distillation process, TCD demonstrates superior performance even under conditions of large inference steps.
+
+The major advantages of TCD are:
+
+- Better than Teacher: TCD demonstrates superior generative quality at both small and large inference steps and exceeds the performance of [DPM-Solver++(2S)](../../api/schedulers/multistep_dpm_solver) with Stable Diffusion XL (SDXL). There is no additional discriminator or LPIPS supervision included during TCD training.
+
+- Flexible Inference Steps: The inference steps for TCD sampling can be freely adjusted without adversely affecting the image quality.
+
+- Freely change detail level: During inference, the level of detail in the image can be adjusted with a single hyperparameter, *gamma*.
+
+> [!TIP]
+> For more technical details of TCD, please refer to the [paper](https://arxiv.org/abs/2402.19159) or official [project page](https://mhh0318.github.io/tcd/).
+
+For large models like SDXL, TCD is trained with [LoRA](https://huggingface.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora) to reduce memory usage. This is also useful because you can reuse LoRAs between different finetuned models, as long as they share the same base model, without further training.
+
+
+
+This guide will show you how to perform inference with TCD-LoRAs for a variety of tasks like text-to-image and inpainting, as well as how you can easily combine TCD-LoRAs with other adapters. Choose one of the supported base models and its corresponding TCD-LoRA checkpoint from the table below to get started.
+
+| Base model                                                                                      | TCD-LoRA checkpoint                                            |
+|-------------------------------------------------------------------------------------------------|----------------------------------------------------------------|
+| [stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5)                  | [TCD-SD15](https://huggingface.co/h1t/TCD-SD15-LoRA)           |
+| [stable-diffusion-2-1-base](https://huggingface.co/stabilityai/stable-diffusion-2-1-base)       | [TCD-SD21-base](https://huggingface.co/h1t/TCD-SD21-base-LoRA) |
+| [stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) | [TCD-SDXL](https://huggingface.co/h1t/TCD-SDXL-LoRA)           |
+
+
+Make sure you have [PEFT](https://github.com/huggingface/peft) installed for better LoRA support.
+
+```bash
+pip install -U peft
+```
+
+## General tasks
+
+In this guide, let's use the [`StableDiffusionXLPipeline`] and the [`TCDScheduler`]. Use the [`~StableDiffusionPipeline.load_lora_weights`] method to load the SDXL-compatible TCD-LoRA weights.
+
+A few tips to keep in mind for TCD-LoRA inference are to:
+
+- Keep the `num_inference_steps` between 4 and 50
+- Set `eta` (used to control stochasticity at each step) between 0 and 1. You should use a higher `eta` when increasing the number of inference steps, but the downside is that a larger `eta` in [`TCDScheduler`] leads to blurrier images. A value of 0.3 is recommended to produce good results.
+
+<hfoptions id="tasks">
+<hfoption id="text-to-image">
+
+```python
+import torch
+from diffusers import StableDiffusionXLPipeline, TCDScheduler
+
+device = "cuda"
+base_model_id = "stabilityai/stable-diffusion-xl-base-1.0"
+tcd_lora_id = "h1t/TCD-SDXL-LoRA"
+
+pipe = StableDiffusionXLPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16, variant="fp16").to(device)
+pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
+
+pipe.load_lora_weights(tcd_lora_id)
+pipe.fuse_lora()
+
+prompt = "Painting of the orange cat Otto von Garfield, Count of Bismarck-Schönhausen, Duke of Lauenburg, Minister-President of Prussia. Depicted wearing a Prussian Pickelhaube and eating his favorite meal - lasagna."
+
+image = pipe(
+    prompt=prompt,
+    num_inference_steps=4,
+    guidance_scale=0,
+    eta=0.3, 
+    generator=torch.Generator(device=device).manual_seed(0),
+).images[0]
+```
+
+![](https://github.com/jabir-zheng/TCD/raw/main/assets/demo_image.png)
+
+</hfoption>
+
+<hfoption id="inpainting">
+
+```python
+import torch
+from diffusers import AutoPipelineForInpainting, TCDScheduler
+from diffusers.utils import load_image, make_image_grid
+
+device = "cuda"
+base_model_id = "diffusers/stable-diffusion-xl-1.0-inpainting-0.1"
+tcd_lora_id = "h1t/TCD-SDXL-LoRA"
+
+pipe = AutoPipelineForInpainting.from_pretrained(base_model_id, torch_dtype=torch.float16, variant="fp16").to(device)
+pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
+
+pipe.load_lora_weights(tcd_lora_id)
+pipe.fuse_lora()
+
+img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+
+init_image = load_image(img_url).resize((1024, 1024))
+mask_image = load_image(mask_url).resize((1024, 1024))
+
+prompt = "a tiger sitting on a park bench"
+
+image = pipe(
+  prompt=prompt,
+  image=init_image,
+  mask_image=mask_image,
+  num_inference_steps=8,
+  guidance_scale=0,
+  eta=0.3,
+  strength=0.99,  # make sure to use `strength` below 1.0
+  generator=torch.Generator(device=device).manual_seed(0),
+).images[0]
+
+grid_image = make_image_grid([init_image, mask_image, image], rows=1, cols=3)
+```
+
+![](https://github.com/jabir-zheng/TCD/raw/main/assets/inpainting_tcd.png)
+
+
+</hfoption>
+</hfoptions>
+
+## Community models
+
+TCD-LoRA also works with many community finetuned models and plugins. For example, load the [animagine-xl-3.0](https://huggingface.co/cagliostrolab/animagine-xl-3.0) checkpoint which is a community finetuned version of SDXL for generating anime images.
+
+```python
+import torch
+from diffusers import StableDiffusionXLPipeline, TCDScheduler
+
+device = "cuda"
+base_model_id = "cagliostrolab/animagine-xl-3.0"
+tcd_lora_id = "h1t/TCD-SDXL-LoRA"
+
+pipe = StableDiffusionXLPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16, variant="fp16").to(device)
+pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
+
+pipe.load_lora_weights(tcd_lora_id)
+pipe.fuse_lora()
+
+prompt = "A man, clad in a meticulously tailored military uniform, stands with unwavering resolve. The uniform boasts intricate details, and his eyes gleam with determination. Strands of vibrant, windswept hair peek out from beneath the brim of his cap."
+
+image = pipe(
+    prompt=prompt,
+    num_inference_steps=8,
+    guidance_scale=0,
+    eta=0.3, 
+    generator=torch.Generator(device=device).manual_seed(0),
+).images[0]
+```
+
+![](https://github.com/jabir-zheng/TCD/raw/main/assets/animagine_xl.png)
+
+TCD-LoRA also supports other LoRAs trained on different styles. For example, let's load the [TheLastBen/Papercut_SDXL](https://huggingface.co/TheLastBen/Papercut_SDXL) LoRA and fuse it with the TCD-LoRA with the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] method. 
+
+> [!TIP]
+> Check out the [Merge LoRAs](merge_loras) guide to learn more about efficient merging methods.
+
+```python
+import torch
+from diffusers import StableDiffusionXLPipeline, TCDScheduler
+
+device = "cuda"
+base_model_id = "stabilityai/stable-diffusion-xl-base-1.0"
+tcd_lora_id = "h1t/TCD-SDXL-LoRA"
+styled_lora_id = "TheLastBen/Papercut_SDXL"
+
+pipe = StableDiffusionXLPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16, variant="fp16").to(device)
+pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
+
+pipe.load_lora_weights(tcd_lora_id, adapter_name="tcd")
+pipe.load_lora_weights(styled_lora_id, adapter_name="style")
+pipe.set_adapters(["tcd", "style"], adapter_weights=[1.0, 1.0])
+
+prompt = "papercut of a winter mountain, snow"
+
+image = pipe(
+    prompt=prompt,
+    num_inference_steps=4,
+    guidance_scale=0,
+    eta=0.3, 
+    generator=torch.Generator(device=device).manual_seed(0),
+).images[0]
+```
+
+![](https://github.com/jabir-zheng/TCD/raw/main/assets/styled_lora.png)
+
+
+## Adapters
+
+TCD-LoRA is very versatile, and it can be combined with other adapter types like ControlNets, IP-Adapter, and AnimateDiff.
+
+<hfoptions id="adapters">
+<hfoption id="ControlNet">
+
+### Depth ControlNet
+
+```python
+import torch
+import numpy as np
+from PIL import Image
+from transformers import DPTFeatureExtractor, DPTForDepthEstimation
+from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline, TCDScheduler
+from diffusers.utils import load_image, make_image_grid
+
+device = "cuda"
+depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to(device)
+feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-hybrid-midas")
+
+def get_depth_map(image):
+    image = feature_extractor(images=image, return_tensors="pt").pixel_values.to(device)
+    with torch.no_grad(), torch.autocast(device):
+        depth_map = depth_estimator(image).predicted_depth
+
+    depth_map = torch.nn.functional.interpolate(
+        depth_map.unsqueeze(1),
+        size=(1024, 1024),
+        mode="bicubic",
+        align_corners=False,
+    )
+    depth_min = torch.amin(depth_map, dim=[1, 2, 3], keepdim=True)
+    depth_max = torch.amax(depth_map, dim=[1, 2, 3], keepdim=True)
+    depth_map = (depth_map - depth_min) / (depth_max - depth_min)
+    image = torch.cat([depth_map] * 3, dim=1)
+
+    image = image.permute(0, 2, 3, 1).cpu().numpy()[0]
+    image = Image.fromarray((image * 255.0).clip(0, 255).astype(np.uint8))
+    return image
+
+base_model_id = "stabilityai/stable-diffusion-xl-base-1.0"
+controlnet_id = "diffusers/controlnet-depth-sdxl-1.0"
+tcd_lora_id = "h1t/TCD-SDXL-LoRA"
+
+controlnet = ControlNetModel.from_pretrained(
+    controlnet_id,
+    torch_dtype=torch.float16,
+    variant="fp16",
+).to(device)
+pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
+    base_model_id,
+    controlnet=controlnet,
+    torch_dtype=torch.float16,
+    variant="fp16",
+).to(device)
+pipe.enable_model_cpu_offload()
+
+pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
+
+pipe.load_lora_weights(tcd_lora_id)
+pipe.fuse_lora()
+
+prompt = "stormtrooper lecture, photorealistic"
+
+image = load_image("https://huggingface.co/lllyasviel/sd-controlnet-depth/resolve/main/images/stormtrooper.png")
+depth_image = get_depth_map(image)
+
+controlnet_conditioning_scale = 0.5  # recommended for good generalization
+
+image = pipe(
+    prompt, 
+    image=depth_image, 
+    num_inference_steps=4, 
+    guidance_scale=0,
+    eta=0.3,
+    controlnet_conditioning_scale=controlnet_conditioning_scale,
+    generator=torch.Generator(device=device).manual_seed(0),
+).images[0]
+
+grid_image = make_image_grid([depth_image, image], rows=1, cols=2)
+```
+
+![](https://github.com/jabir-zheng/TCD/raw/main/assets/controlnet_depth_tcd.png)
+
+### Canny ControlNet
+```python
+import torch
+from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline, TCDScheduler
+from diffusers.utils import load_image, make_image_grid
+
+device = "cuda"
+base_model_id = "stabilityai/stable-diffusion-xl-base-1.0"
+controlnet_id = "diffusers/controlnet-canny-sdxl-1.0"
+tcd_lora_id = "h1t/TCD-SDXL-LoRA"
+
+controlnet = ControlNetModel.from_pretrained(
+    controlnet_id,
+    torch_dtype=torch.float16,
+    variant="fp16",
+).to(device)
+pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
+    base_model_id,
+    controlnet=controlnet,
+    torch_dtype=torch.float16,
+    variant="fp16",
+).to(device)
+pipe.enable_model_cpu_offload()
+
+pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
+
+pipe.load_lora_weights(tcd_lora_id)
+pipe.fuse_lora()
+
+prompt = "ultrarealistic shot of a furry blue bird"
+
+canny_image = load_image("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png")
+
+controlnet_conditioning_scale = 0.5  # recommended for good generalization
+
+image = pipe(
+    prompt, 
+    image=canny_image, 
+    num_inference_steps=4, 
+    guidance_scale=0,
+    eta=0.3,
+    controlnet_conditioning_scale=controlnet_conditioning_scale,
+    generator=torch.Generator(device=device).manual_seed(0),
+).images[0]
+
+grid_image = make_image_grid([canny_image, image], rows=1, cols=2)
+```
+![](https://github.com/jabir-zheng/TCD/raw/main/assets/controlnet_canny_tcd.png)
+
+<Tip>
+The inference parameters in this example might not work for all examples, so we recommend you to try different values for `num_inference_steps`, `guidance_scale`, `controlnet_conditioning_scale` and `cross_attention_kwargs` parameters and choose the best one. 
+</Tip>
+
+</hfoption>
+<hfoption id="IP-Adapter">
+
+This example shows how to use the TCD-LoRA with the [IP-Adapter](https://github.com/tencent-ailab/IP-Adapter/tree/main) and SDXL.
+
+```python
+import torch
+from diffusers import StableDiffusionXLPipeline, TCDScheduler
+from diffusers.utils import load_image, make_image_grid
+
+from ip_adapter import IPAdapterXL
+
+device = "cuda"
+base_model_path = "stabilityai/stable-diffusion-xl-base-1.0"
+image_encoder_path = "sdxl_models/image_encoder"
+ip_ckpt = "sdxl_models/ip-adapter_sdxl.bin"
+tcd_lora_id = "h1t/TCD-SDXL-LoRA"
+
+pipe = StableDiffusionXLPipeline.from_pretrained(
+    base_model_path, 
+    torch_dtype=torch.float16, 
+    variant="fp16"
+)
+pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
+
+pipe.load_lora_weights(tcd_lora_id)
+pipe.fuse_lora()
+
+ip_model = IPAdapterXL(pipe, image_encoder_path, ip_ckpt, device)
+
+ref_image = load_image("https://raw.githubusercontent.com/tencent-ailab/IP-Adapter/main/assets/images/woman.png").resize((512, 512))
+
+prompt = "best quality, high quality, wearing sunglasses"
+
+image = ip_model.generate(
+    pil_image=ref_image, 
+    prompt=prompt,
+    scale=0.5,
+    num_samples=1, 
+    num_inference_steps=4, 
+    guidance_scale=0,
+    eta=0.3, 
+    seed=0,
+)[0]
+
+grid_image = make_image_grid([ref_image, image], rows=1, cols=2)
+```
+
+![](https://github.com/jabir-zheng/TCD/raw/main/assets/ip_adapter.png)
+
+
+
+</hfoption>
+<hfoption id="AnimateDiff">
+
+[`AnimateDiff`] allows animating images using Stable Diffusion models. TCD-LoRA can substantially accelerate the process without degrading image quality. The quality of animation with TCD-LoRA and AnimateDiff has a more lucid outcome.
+
+```python
+import torch
+from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler, TCDScheduler
+from diffusers.utils import export_to_gif
+
+adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5")
+pipe = AnimateDiffPipeline.from_pretrained(
+    "frankjoshua/toonyou_beta6",
+    motion_adapter=adapter,
+).to("cuda")
+
+# set TCDScheduler
+pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
+
+# load TCD LoRA
+pipe.load_lora_weights("h1t/TCD-SD15-LoRA", adapter_name="tcd")
+pipe.load_lora_weights("guoyww/animatediff-motion-lora-zoom-in", weight_name="diffusion_pytorch_model.safetensors", adapter_name="motion-lora")
+
+pipe.set_adapters(["tcd", "motion-lora"], adapter_weights=[1.0, 1.2])
+
+prompt = "best quality, masterpiece, 1girl, looking at viewer, blurry background, upper body, contemporary, dress"
+generator = torch.manual_seed(0)
+frames = pipe(
+    prompt=prompt,
+    num_inference_steps=5,
+    guidance_scale=0,
+    cross_attention_kwargs={"scale": 1},
+    num_frames=24,
+    eta=0.3,
+    generator=generator
+).frames[0]
+export_to_gif(frames, "animation.gif")
+```
+
+![](https://github.com/jabir-zheng/TCD/raw/main/assets/animation_example.gif)
+
+</hfoption>
+</hfoptions>
@@ -259,6 +259,50 @@ pip install git+https://github.com/huggingface/peft.git
 **Inference** 
 The inference is the same as if you train a regular LoRA 🤗

+## Conducting EDM-style training
+
+It's now possible to perform EDM-style training as proposed in [Elucidating the Design Space of Diffusion-Based Generative Models](https://arxiv.org/abs/2206.00364). 
+
+To enable it, simply set:
+
+```diff
+  --do_edm_style_training \
+```
+
+Other SDXL-like models that use the EDM formulation, such as [playgroundai/playground-v2.5-1024px-aesthetic](https://huggingface.co/playgroundai/playground-v2.5-1024px-aesthetic), can also be DreamBooth'd with the script. Below is an example command:
+
+```bash
+accelerate launch train_dreambooth_lora_sdxl_advanced.py \
+  --pretrained_model_name_or_path="playgroundai/playground-v2.5-1024px-aesthetic"  \
+  --dataset_name="linoyts/3d_icon" \
+  --instance_prompt="3d icon in the style of TOK" \
+  --validation_prompt="a TOK icon of an astronaut riding a horse, in the style of TOK" \
+  --output_dir="3d-icon-SDXL-LoRA" \
+  --do_edm_style_training \
+  --caption_column="prompt" \
+  --mixed_precision="bf16" \
+  --resolution=1024 \
+  --train_batch_size=3 \
+  --repeats=1 \
+  --report_to="wandb"\
+  --gradient_accumulation_steps=1 \
+  --gradient_checkpointing \
+  --learning_rate=1.0 \
+  --text_encoder_lr=1.0 \
+  --optimizer="prodigy"\
+  --train_text_encoder_ti\
+  --train_text_encoder_ti_frac=0.5\
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --rank=8 \
+  --max_train_steps=1000 \
+  --checkpointing_steps=2000 \
+  --seed="0" \
+  --push_to_hub
+```
+
+> [!CAUTION]
+> Min-SNR gamma is not supported with the EDM-style training yet. When training with the PlaygroundAI model, it's recommended to not pass any "variant".

 ### Tips and Tricks
 Check out [these recommended practices](https://huggingface.co/blog/sdxl_lora_advanced_script#additional-good-practices)
@@ -70,7 +70,7 @@ from diffusers.utils.import_utils import is_xformers_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)

@@ -14,9 +14,11 @@
 # See the License for the specific language governing permissions and

 import argparse
+import contextlib
 import gc
 import hashlib
 import itertools
+import json
 import logging
 import math
 import os
@@ -37,7 +39,7 @@ import transformers
 from accelerate import Accelerator
 from accelerate.logging import get_logger
 from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
-from huggingface_hub import create_repo, upload_folder
+from huggingface_hub import create_repo, hf_hub_download, upload_folder
 from packaging import version
 from peft import LoraConfig, set_peft_model_state_dict
 from peft.utils import get_peft_model_state_dict
@@ -55,6 +57,8 @@ from diffusers import (
    AutoencoderKL,
    DDPMScheduler,
    DPMSolverMultistepScheduler,
+    EDMEulerScheduler,
+    EulerDiscreteScheduler,
    StableDiffusionXLPipeline,
    UNet2DConditionModel,
 )
@@ -74,11 +78,25 @@ from diffusers.utils.torch_utils import is_compiled_module


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)


+def determine_scheduler_type(pretrained_model_name_or_path, revision):
+    """Read the scheduler entry from a pipeline's `model_index.json`.
+
+    Args:
+        pretrained_model_name_or_path: Either a local directory that contains `model_index.json`,
+            or a Hub repo id from which that file is downloaded.
+        revision: Revision forwarded to `hf_hub_download`; only used when the path is not a
+            local directory.
+
+    Returns:
+        The second element of the `"scheduler"` entry of `model_index.json` (presumably the
+        scheduler class name -- callers test it for the substring `"EDM"`; confirm against the
+        model index format).
+    """
+    model_index_filename = "model_index.json"
+    if os.path.isdir(pretrained_model_name_or_path):
+        # Local directory: read the index file directly from inside it.
+        model_index = os.path.join(pretrained_model_name_or_path, model_index_filename)
+    else:
+        # Not a local directory: treat the argument as a Hub repo id and fetch only the index file.
+        model_index = hf_hub_download(
+            repo_id=pretrained_model_name_or_path, filename=model_index_filename, revision=revision
+        )
+
+    with open(model_index, "r") as f:
+        scheduler_type = json.load(f)["scheduler"][1]
+    return scheduler_type
+
+
 def save_model_card(
    repo_id: str,
    use_dora: bool,
@@ -370,6 +388,11 @@ def parse_args(input_args=None):
            " `args.validation_prompt` multiple times: `args.num_validation_images`."
        ),
    )
+    parser.add_argument(
+        "--do_edm_style_training",
+        action="store_true",
+        help="Flag to conduct training using the EDM formulation as introduced in https://arxiv.org/abs/2206.00364.",
+    )
    parser.add_argument(
        "--with_prior_preservation",
        default=False,
@@ -1117,6 +1140,8 @@ def main(args):
            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
            " Please use `huggingface-cli login` to authenticate with the Hub."
        )
+    if args.do_edm_style_training and args.snr_gamma is not None:
+        raise ValueError("Min-SNR formulation is not supported when conducting EDM-style training.")

    logging_dir = Path(args.output_dir, args.logging_dir)

@@ -1234,7 +1259,19 @@ def main(args):
    )

    # Load scheduler and models
-    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    scheduler_type = determine_scheduler_type(args.pretrained_model_name_or_path, args.revision)
+    if "EDM" in scheduler_type:
+        args.do_edm_style_training = True
+        noise_scheduler = EDMEulerScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+        logger.info("Performing EDM-style training!")
+    elif args.do_edm_style_training:
+        noise_scheduler = EulerDiscreteScheduler.from_pretrained(
+            args.pretrained_model_name_or_path, subfolder="scheduler"
+        )
+        logger.info("Performing EDM-style training!")
+    else:
+        noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+
    text_encoder_one = text_encoder_cls_one.from_pretrained(
        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
    )
@@ -1252,7 +1289,12 @@ def main(args):
        revision=args.revision,
        variant=args.variant,
    )
-    vae_scaling_factor = vae.config.scaling_factor
+    latents_mean = latents_std = None
+    if hasattr(vae.config, "latents_mean") and vae.config.latents_mean is not None:
+        latents_mean = torch.tensor(vae.config.latents_mean).view(1, 4, 1, 1)
+    if hasattr(vae.config, "latents_std") and vae.config.latents_std is not None:
+        latents_std = torch.tensor(vae.config.latents_std).view(1, 4, 1, 1)
+
    unet = UNet2DConditionModel.from_pretrained(
        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
    )
@@ -1790,6 +1832,19 @@ def main(args):
        disable=not accelerator.is_local_main_process,
    )

+    def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
+        """Look up the scheduler sigma for each of the given timesteps.
+
+        Each entry of `timesteps` is matched by equality against `noise_scheduler.timesteps`, and
+        the sigma at the matching position of `noise_scheduler.sigmas` is selected. The result is
+        flattened and then given trailing singleton dimensions until it has at least `n_dim`
+        dimensions (presumably so it broadcasts against the noisy latents -- confirm at the call site).
+
+        Args:
+            timesteps: Tensor of timestep values. Each value must match exactly one entry of
+                `noise_scheduler.timesteps`, because `.nonzero().item()` fails otherwise.
+            n_dim: Minimum number of dimensions of the returned tensor.
+            dtype: dtype the scheduler sigmas are cast to.
+
+        Returns:
+            Tensor of sigmas on `accelerator.device`, one per entry of `timesteps`.
+        """
+        # TODO: revisit other sampling algorithms
+        sigmas = noise_scheduler.sigmas.to(device=accelerator.device, dtype=dtype)
+        schedule_timesteps = noise_scheduler.timesteps.to(accelerator.device)
+        timesteps = timesteps.to(accelerator.device)
+
+        # Position of every requested timestep inside the scheduler's timestep table.
+        step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
+
+        sigma = sigmas[step_indices].flatten()
+        # Append trailing singleton dimensions until the tensor has `n_dim` dimensions.
+        while len(sigma.shape) < n_dim:
+            sigma = sigma.unsqueeze(-1)
+        return sigma
+
    if args.train_text_encoder:
        num_train_epochs_text_encoder = int(args.train_text_encoder_frac * args.num_train_epochs)
    elif args.train_text_encoder_ti:  # args.train_text_encoder_ti
@@ -1841,9 +1896,15 @@ def main(args):
                    pixel_values = batch["pixel_values"].to(dtype=vae.dtype)
                    model_input = vae.encode(pixel_values).latent_dist.sample()

-                model_input = model_input * vae_scaling_factor
-                if args.pretrained_vae_model_name_or_path is None:
-                    model_input = model_input.to(weight_dtype)
+                if latents_mean is None and latents_std is None:
+                    model_input = model_input * vae.config.scaling_factor
+                    if args.pretrained_vae_model_name_or_path is None:
+                        model_input = model_input.to(weight_dtype)
+                else:
+                    latents_mean = latents_mean.to(device=model_input.device, dtype=model_input.dtype)
+                    latents_std = latents_std.to(device=model_input.device, dtype=model_input.dtype)
+                    model_input = (model_input - latents_mean) * vae.config.scaling_factor / latents_std
+                    model_input = model_input.to(dtype=weight_dtype)

                # Sample noise that we'll add to the latents
                noise = torch.randn_like(model_input)
@@ -1854,15 +1915,32 @@ def main(args):
                    )

                bsz = model_input.shape[0]
+
                # Sample a random timestep for each image
-                timesteps = torch.randint(
-                    0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device
-                )
-                timesteps = timesteps.long()
+                if not args.do_edm_style_training:
+                    timesteps = torch.randint(
+                        0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device
+                    )
+                    timesteps = timesteps.long()
+                else:
+                    # in EDM formulation, the model is conditioned on the pre-conditioned noise levels
+                    # instead of discrete timesteps, so here we sample indices to get the noise levels
+                    # from `scheduler.timesteps`
+                    indices = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,))
+                    timesteps = noise_scheduler.timesteps[indices].to(device=model_input.device)

                # Add noise to the model input according to the noise magnitude at each timestep
                # (this is the forward diffusion process)
                noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps)
+                # For EDM-style training, we first obtain the sigmas based on the continuous timesteps.
+                # We then precondition the final model inputs based on these sigmas instead of the timesteps.
+                # Follow: Section 5 of https://arxiv.org/abs/2206.00364.
+                if args.do_edm_style_training:
+                    sigmas = get_sigmas(timesteps, len(noisy_model_input.shape), noisy_model_input.dtype)
+                    if "EDM" in scheduler_type:
+                        inp_noisy_latents = noise_scheduler.precondition_inputs(noisy_model_input, sigmas)
+                    else:
+                        inp_noisy_latents = noisy_model_input / ((sigmas**2 + 1) ** 0.5)

                # time ids
                add_time_ids = torch.cat(
@@ -1888,7 +1966,7 @@ def main(args):
                    }
                    prompt_embeds_input = prompt_embeds.repeat(elems_to_repeat_text_embeds, 1, 1)
                    model_pred = unet(
-                        noisy_model_input,
+                        inp_noisy_latents if args.do_edm_style_training else noisy_model_input,
                        timesteps,
                        prompt_embeds_input,
                        added_cond_kwargs=unet_added_conditions,
@@ -1906,14 +1984,42 @@ def main(args):
                    )
                    prompt_embeds_input = prompt_embeds.repeat(elems_to_repeat_text_embeds, 1, 1)
                    model_pred = unet(
-                        noisy_model_input, timesteps, prompt_embeds_input, added_cond_kwargs=unet_added_conditions
+                        inp_noisy_latents if args.do_edm_style_training else noisy_model_input,
+                        timesteps,
+                        prompt_embeds_input,
+                        added_cond_kwargs=unet_added_conditions,
                    ).sample

+                weighting = None
+                if args.do_edm_style_training:
+                    # Similar to the input preconditioning, the model predictions are also preconditioned
+                    # on noised model inputs (before preconditioning) and the sigmas.
+                    # Follow: Section 5 of https://arxiv.org/abs/2206.00364.
+                    if "EDM" in scheduler_type:
+                        model_pred = noise_scheduler.precondition_outputs(noisy_model_input, model_pred, sigmas)
+                    else:
+                        if noise_scheduler.config.prediction_type == "epsilon":
+                            model_pred = model_pred * (-sigmas) + noisy_model_input
+                        elif noise_scheduler.config.prediction_type == "v_prediction":
+                            model_pred = model_pred * (-sigmas / (sigmas**2 + 1) ** 0.5) + (
+                                noisy_model_input / (sigmas**2 + 1)
+                            )
+                    # We are not doing weighting here because it tends to result in numerical problems.
+                    # See: https://github.com/huggingface/diffusers/pull/7126#issuecomment-1968523051
+                    # There might be other alternatives for weighting as well:
+                    # https://github.com/huggingface/diffusers/pull/7126#discussion_r1505404686
+                    if "EDM" not in scheduler_type:
+                        weighting = (sigmas**-2.0).float()
+
                # Get the target for loss depending on the prediction type
                if noise_scheduler.config.prediction_type == "epsilon":
-                    target = noise
+                    target = model_input if args.do_edm_style_training else noise
                elif noise_scheduler.config.prediction_type == "v_prediction":
-                    target = noise_scheduler.get_velocity(model_input, noise, timesteps)
+                    target = (
+                        model_input
+                        if args.do_edm_style_training
+                        else noise_scheduler.get_velocity(model_input, noise, timesteps)
+                    )
                else:
                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")

@@ -1923,10 +2029,28 @@ def main(args):
                    target, target_prior = torch.chunk(target, 2, dim=0)

                    # Compute prior loss
-                    prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean")
+                    if weighting is not None:
+                        prior_loss = torch.mean(
+                            (weighting.float() * (model_pred_prior.float() - target_prior.float()) ** 2).reshape(
+                                target_prior.shape[0], -1
+                            ),
+                            1,
+                        )
+                        prior_loss = prior_loss.mean()
+                    else:
+                        prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean")

                if args.snr_gamma is None:
-                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+                    if weighting is not None:
+                        loss = torch.mean(
+                            (weighting.float() * (model_pred.float() - target.float()) ** 2).reshape(
+                                target.shape[0], -1
+                            ),
+                            1,
+                        )
+                        loss = loss.mean()
+                    else:
+                        loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
                else:
                    # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
                    # Since we predict the noise instead of x_0, the original formulation is slightly changed.
@@ -2049,17 +2173,18 @@ def main(args):
                # We train on the simplified learning objective. If we were previously predicting a variance, we need the scheduler to ignore it
                scheduler_args = {}

-                if "variance_type" in pipeline.scheduler.config:
-                    variance_type = pipeline.scheduler.config.variance_type
+                if not args.do_edm_style_training:
+                    if "variance_type" in pipeline.scheduler.config:
+                        variance_type = pipeline.scheduler.config.variance_type

-                    if variance_type in ["learned", "learned_range"]:
-                        variance_type = "fixed_small"
+                        if variance_type in ["learned", "learned_range"]:
+                            variance_type = "fixed_small"

-                    scheduler_args["variance_type"] = variance_type
+                        scheduler_args["variance_type"] = variance_type

-                pipeline.scheduler = DPMSolverMultistepScheduler.from_config(
-                    pipeline.scheduler.config, **scheduler_args
-                )
+                    pipeline.scheduler = DPMSolverMultistepScheduler.from_config(
+                        pipeline.scheduler.config, **scheduler_args
+                    )

                pipeline = pipeline.to(accelerator.device)
                pipeline.set_progress_bar_config(disable=True)
@@ -2067,8 +2192,13 @@ def main(args):
                # run inference
                generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
                pipeline_args = {"prompt": args.validation_prompt}
+                inference_ctx = (
+                    contextlib.nullcontext()
+                    if "playground" in args.pretrained_model_name_or_path
+                    else torch.cuda.amp.autocast()
+                )

-                with torch.cuda.amp.autocast():
+                with inference_ctx:
                    images = [
                        pipeline(**pipeline_args, generator=generator).images[0]
                        for _ in range(args.num_validation_images)
@@ -2144,15 +2274,18 @@ def main(args):
            # We train on the simplified learning objective. If we were previously predicting a variance, we need the scheduler to ignore it
            scheduler_args = {}

-            if "variance_type" in pipeline.scheduler.config:
-                variance_type = pipeline.scheduler.config.variance_type
+            if not args.do_edm_style_training:
+                if "variance_type" in pipeline.scheduler.config:
+                    variance_type = pipeline.scheduler.config.variance_type

-                if variance_type in ["learned", "learned_range"]:
-                    variance_type = "fixed_small"
+                    if variance_type in ["learned", "learned_range"]:
+                        variance_type = "fixed_small"

-                scheduler_args["variance_type"] = variance_type
+                    scheduler_args["variance_type"] = variance_type

-            pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, **scheduler_args)
+                pipeline.scheduler = DPMSolverMultistepScheduler.from_config(
+                    pipeline.scheduler.config, **scheduler_args
+                )

            # load attention processors
            pipeline.load_lora_weights(args.output_dir)
@@ -513,9 +513,7 @@ class LCMSchedulerWithTimestamp(SchedulerMixin, ConfigMixin):
            there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
            otherwise it uses the alpha value at step 0.
        steps_offset (`int`, defaults to 0):
-            An offset added to the inference steps. You can use a combination of `offset=1` and
-            `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable
-            Diffusion.
+            An offset added to the inference steps, as required by some model families.
        prediction_type (`str`, defaults to `epsilon`, *optional*):
            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
@@ -418,9 +418,7 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
            there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
            otherwise it uses the alpha value at step 0.
        steps_offset (`int`, defaults to 0):
-            An offset added to the inference steps. You can use a combination of `offset=1` and
-            `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable
-            Diffusion.
+            An offset added to the inference steps, as required by some model families.
        prediction_type (`str`, defaults to `epsilon`, *optional*):
            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
@@ -40,7 +40,7 @@ from diffusers.utils import BaseOutput, check_min_version


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")


 class MarigoldDepthOutput(BaseOutput):
@@ -171,9 +171,7 @@ class UFOGenScheduler(SchedulerMixin, ConfigMixin):
            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
        steps_offset (`int`, defaults to 0):
-            An offset added to the inference steps. You can use a combination of `offset=1` and
-            `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable
-            Diffusion.
+            An offset added to the inference steps, as required by some model families.
        rescale_betas_zero_snr (`bool`, defaults to `False`):
            Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
            dark samples instead of limiting it to samples with medium brightness. Loosely related to
@@ -72,7 +72,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)

@@ -65,7 +65,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)

@@ -78,7 +78,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)

@@ -71,7 +71,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)

@@ -77,7 +77,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)

@@ -60,7 +60,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)

@@ -60,7 +60,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = logging.getLogger(__name__)

@@ -61,7 +61,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)

@@ -63,7 +63,7 @@ from diffusers.utils.import_utils import is_xformers_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)

@@ -1178,7 +1178,7 @@ def main(args):
                        grads_text_encoder = text_encoder.get_input_embeddings().weight.grad
                    # Get the index for tokens that we want to zero the grads for
                    index_grads_to_zero = torch.arange(len(tokenizer)) != modifier_token_id[0]
-                    for i in range(len(modifier_token_id[1:])):
+                    for i in range(1, len(modifier_token_id)):
                        index_grads_to_zero = index_grads_to_zero & (
                            torch.arange(len(tokenizer)) != modifier_token_id[i]
                        )
@@ -63,7 +63,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)

@@ -35,7 +35,7 @@ from diffusers.utils import check_min_version


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 # Cache compiled models across invocations of this script.
 cc.initialize_cache(os.path.expanduser("~/.cache/jax/compilation_cache"))
@@ -70,7 +70,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)

@@ -75,7 +75,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)

@@ -53,7 +53,7 @@ from diffusers.utils.torch_utils import is_compiled_module


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -59,7 +59,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -52,7 +52,7 @@ if is_wandb_available():


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -46,7 +46,7 @@ from diffusers.utils import check_min_version, is_wandb_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -46,7 +46,7 @@ from diffusers.utils import check_min_version, is_wandb_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -51,7 +51,7 @@ if is_wandb_available():


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -637,7 +637,7 @@ def main(args):
                    generator=generator,
                    batch_size=args.eval_batch_size,
                    num_inference_steps=args.ddpm_num_inference_steps,
-                    output_type="numpy",
+                    output_type="np",
                ).images

                if args.use_ema:
@@ -60,7 +60,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)

@@ -56,7 +56,7 @@ if is_wandb_available():


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -49,7 +49,7 @@ from diffusers.utils import check_min_version


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = logging.getLogger(__name__)

@@ -52,7 +52,7 @@ from diffusers.utils.torch_utils import is_compiled_module


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -64,7 +64,7 @@ from diffusers.utils.torch_utils import is_compiled_module


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)

@@ -425,6 +425,11 @@ def parse_args(input_args=None):
        default=4,
        help=("The dimension of the LoRA update matrices."),
    )
+    parser.add_argument(
+        "--debug_loss",
+        action="store_true",
+        help="debug loss for each image, if filenames are awailable in the dataset",
+    )

    if input_args is not None:
        args = parser.parse_args(input_args)
@@ -603,6 +608,7 @@ def main(args):
    # Move unet, vae and text_encoder to device and cast to weight_dtype
    # The VAE is in float32 to avoid NaN losses.
    unet.to(accelerator.device, dtype=weight_dtype)
+
    if args.pretrained_vae_model_name_or_path is None:
        vae.to(accelerator.device, dtype=torch.float32)
    else:
@@ -890,13 +896,17 @@ def main(args):
        tokens_one, tokens_two = tokenize_captions(examples)
        examples["input_ids_one"] = tokens_one
        examples["input_ids_two"] = tokens_two
+        if args.debug_loss:
+            fnames = [os.path.basename(image.filename) for image in examples[image_column] if image.filename]
+            if fnames:
+                examples["filenames"] = fnames
        return examples

    with accelerator.main_process_first():
        if args.max_train_samples is not None:
            dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
        # Set the training transforms
-        train_dataset = dataset["train"].with_transform(preprocess_train)
+        train_dataset = dataset["train"].with_transform(preprocess_train, output_all_columns=True)

    def collate_fn(examples):
        pixel_values = torch.stack([example["pixel_values"] for example in examples])
@@ -905,7 +915,7 @@ def main(args):
        crop_top_lefts = [example["crop_top_lefts"] for example in examples]
        input_ids_one = torch.stack([example["input_ids_one"] for example in examples])
        input_ids_two = torch.stack([example["input_ids_two"] for example in examples])
-        return {
+        result = {
            "pixel_values": pixel_values,
            "input_ids_one": input_ids_one,
            "input_ids_two": input_ids_two,
@@ -913,6 +923,11 @@ def main(args):
            "crop_top_lefts": crop_top_lefts,
        }

+        filenames = [example["filenames"] for example in examples if "filenames" in example]
+        if filenames:
+            result["filenames"] = filenames
+        return result
+
    # DataLoaders creation:
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
@@ -1105,7 +1120,9 @@ def main(args):
                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
                    loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights
                    loss = loss.mean()
-
+                if args.debug_loss and "filenames" in batch:
+                    for fname in batch["filenames"]:
+                        accelerator.log({"loss_for_" + fname: loss}, step=global_step)
                # Gather the losses across all processes for logging (if we use distributed training).
                avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
                train_loss += avg_loss.item() / args.gradient_accumulation_steps
@@ -54,7 +54,7 @@ from diffusers.utils.torch_utils import is_compiled_module


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)

@@ -80,7 +80,7 @@ else:


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)

@@ -56,7 +56,7 @@ else:
 # ------------------------------------------------------------------------------

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = logging.getLogger(__name__)

@@ -76,7 +76,7 @@ else:


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)

@@ -29,7 +29,7 @@ from diffusers.utils.import_utils import is_xformers_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -648,7 +648,7 @@ def main(args):
                    generator=generator,
                    batch_size=args.eval_batch_size,
                    num_inference_steps=args.ddpm_num_inference_steps,
-                    output_type="numpy",
+                    output_type="np",
                ).images

                if args.use_ema:
@@ -50,7 +50,7 @@ if is_wandb_available():


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -51,7 +51,7 @@ if is_wandb_available():


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -0,0 +1,139 @@
+import argparse
+import json
+import os
+from datetime import date
+from pathlib import Path
+
+from slack_sdk import WebClient
+from tabulate import tabulate
+
+
+# Upper bound on the report text; `main` truncates anything longer before posting.
+# NOTE(review): Slack documents a 3000-character cap for section text - confirm the figure below.
+MAX_LEN_MESSAGE = 2900  # slack endpoint has a limit of 3001 characters
+
+# Command-line interface: the only option is the target Slack channel,
+# given without the leading "#" (`main` prepends it).
+parser = argparse.ArgumentParser()
+parser.add_argument("--slack_channel_name", default="diffusers-ci-nightly")
+
+
+def main(slack_channel_name=None):
+    """Summarise pytest report logs and post the result to a Slack channel.
+
+    Every ``*.log`` file in the current working directory is read as JSON lines.
+    A record counts as a failed test when it has a non-empty ``nodeid``, a
+    ``duration`` and ``outcome == "failed"``. Each log file is deleted once it
+    has been read. The summary is printed and then sent with
+    ``WebClient.chat_postMessage`` using the token in ``SLACK_API_TOKEN``.
+
+    Args:
+        slack_channel_name: Channel name without the leading ``#``.
+
+    Raises:
+        KeyError: If there are failures to report and ``GITHUB_RUN_ID`` is not set.
+        json.JSONDecodeError: If a non-blank log line is not valid JSON.
+    """
+    # Materialise the listing once; sorting keeps the report order stable between runs.
+    log_files = sorted(Path().glob("*.log"))
+    # No logs at all means the test step produced nothing - that must not be reported as success.
+    no_logs_found = len(log_files) == 0
+
+    # One entry per log: (log name, number of failures, failed records, log had no records).
+    group_info = []
+    total_num_failed = 0
+
+    for log in log_files:
+        section_failed = []
+        num_records = 0
+        with open(log) as f:
+            for raw_line in f:
+                if not raw_line.strip():
+                    # Tolerate stray blank lines instead of crashing in json.loads.
+                    continue
+                record = json.loads(raw_line)
+                num_records += 1
+                test = record.get("nodeid", "")
+                if test == "" or record.get("duration", None) is None:
+                    # Not a timed test record - nothing to tally.
+                    continue
+                if record.get("outcome", "") == "failed":
+                    duration = f'{record["duration"]:.4f}'
+                    section_failed.append([test, duration, log.name.split("_")[0]])
+        total_num_failed += len(section_failed)
+        group_info.append((str(log), len(section_failed), section_failed, num_records == 0))
+        os.remove(log)
+
+    any_empty_file = any(is_empty for _, _, _, is_empty in group_info)
+    if no_logs_found:
+        text = "Something went wrong: no test report logs were found - please check GH action results."
+    elif any_empty_file:
+        text = "Something went wrong there is at least one empty file - please check GH action results."
+    else:
+        text = "🌞 There were no failures!"
+    no_error_payload = {
+        "type": "section",
+        "text": {
+            "type": "plain_text",
+            "text": text,
+            "emoji": True,
+        },
+    }
+
+    message = ""
+    payload = [
+        {
+            "type": "header",
+            "text": {
+                "type": "plain_text",
+                "text": "🤗 Results of the Diffusers scheduled nightly tests.",
+            },
+        },
+    ]
+    if total_num_failed > 0:
+        for name, num_failed, failed_tests, is_empty in group_info:
+            if num_failed > 0:
+                noun = "test" if num_failed == 1 else "tests"
+                message += f"*{name}: {num_failed} failed {noun}*\n"
+                # A pytest node id is "file::Class::test"; each part becomes one table column.
+                failed_table = tabulate(
+                    [test[0].split("::") for test in failed_tests],
+                    headers=["Test Location", "Test Case", "Test Name"],
+                    showindex="always",
+                    tablefmt="grid",
+                    maxcolwidths=[12, 12, 12],
+                )
+                message += "\n```\n" + failed_table + "\n```"
+
+            if is_empty:
+                message += f"\n*{name}: Warning! Empty file - please check the GitHub action job *\n"
+        print(f"### {message}")
+    else:
+        payload.append(no_error_payload)
+
+    if len(message) > MAX_LEN_MESSAGE:
+        print(f"Truncating long message from {len(message)} to {MAX_LEN_MESSAGE}")
+        message = message[:MAX_LEN_MESSAGE] + "..."
+        # The cut may land inside a code fence; close it so the remaining blocks render normally.
+        if message.count("```") % 2 == 1:
+            message += "\n```"
+
+    if len(message) != 0:
+        md_report = {
+            "type": "section",
+            "text": {"type": "mrkdwn", "text": message},
+        }
+        payload.append(md_report)
+        action_button = {
+            "type": "section",
+            "text": {"type": "mrkdwn", "text": "*For more details:*"},
+            "accessory": {
+                "type": "button",
+                "text": {"type": "plain_text", "text": "Check Action results", "emoji": True},
+                "url": f"https://github.com/huggingface/diffusers/actions/runs/{os.environ['GITHUB_RUN_ID']}",
+            },
+        }
+        payload.append(action_button)
+
+    date_report = {
+        "type": "context",
+        "elements": [
+            {
+                "type": "plain_text",
+                "text": f"Nightly test results for {date.today()}",
+            },
+        ],
+    }
+    payload.append(date_report)
+
+    print(payload)
+
+    client = WebClient(token=os.environ.get("SLACK_API_TOKEN"))
+    client.chat_postMessage(channel=f"#{slack_channel_name}", text=message, blocks=payload)
+
+
+# Script entry point: parse the CLI arguments and post the report to the chosen channel.
+if __name__ == "__main__":
+    args = parser.parse_args()
+    main(args.slack_channel_name)
@@ -249,7 +249,7 @@ version_range_max = max(sys.version_info[1], 10) + 1

 setup(
    name="diffusers",
-    version="0.27.0.dev0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
+    version="0.28.0.dev0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
    description="State-of-the-art diffusion in PyTorch and JAX.",
    long_description=open("README.md", "r", encoding="utf-8").read(),
    long_description_content_type="text/markdown",
@@ -1,4 +1,4 @@
-__version__ = "0.27.0.dev0"
+__version__ = "0.28.0.dev0"

 from typing import TYPE_CHECKING

@@ -253,6 +253,8 @@ else:
            "LatentConsistencyModelImg2ImgPipeline",
            "LatentConsistencyModelPipeline",
            "LDMTextToImagePipeline",
+            "LEditsPPPipelineStableDiffusion",
+            "LEditsPPPipelineStableDiffusionXL",
            "MusicLDMPipeline",
            "PaintByExamplePipeline",
            "PIAPipeline",
@@ -623,6 +625,8 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            LatentConsistencyModelImg2ImgPipeline,
            LatentConsistencyModelPipeline,
            LDMTextToImagePipeline,
+            LEditsPPPipelineStableDiffusion,
+            LEditsPPPipelineStableDiffusionXL,
            MusicLDMPipeline,
            PaintByExamplePipeline,
            PIAPipeline,
@@ -293,7 +293,7 @@ class BasicTransformerBlock(nn.Module):
    ) -> torch.FloatTensor:
        if cross_attention_kwargs is not None:
            if cross_attention_kwargs.get("scale", None) is not None:
-                logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored.")
+                logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")

        # Notice that normalization is always applied before the real computation in the following blocks.
        # 0. Self-Attention
@@ -767,7 +767,18 @@ class AttnProcessor:
        query = attn.to_q(hidden_states)

        if encoder_hidden_states is None:
-            encoder_hidden_states = hidden_states
+            # encoder_hidden_states = hidden_states
+            batch, seq, dim = hidden_states.shape
+            height = width = seq**0.5
+            # reshape to (batch, height, width, dim)
+            encoder_hidden_states = hidden_states.view(batch, height, width, dim)
+            # reshape to (batch, dim, height, width)
+            encoder_hidden_states = encoder_hidden_states.permute(0, 3, 1, 2)
+            encoder_hidden_states = torch.nn.functional.avg_pool2d(hidden_states, kernel_size=4)
+            # reshape to (batch, dim, seq)
+            encoder_hidden_states = encoder_hidden_states.view(batch, dim, -1)
+            # reshape to (batch, seq, dim)
+            encoder_hidden_states = encoder_hidden_states.permute(0, 2, 1)
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

@@ -1259,7 +1270,18 @@ class AttnProcessor2_0:
        query = attn.to_q(hidden_states)

        if encoder_hidden_states is None:
-            encoder_hidden_states = hidden_states
+            # encoder_hidden_states = hidden_states
+            batch, seq, dim = hidden_states.shape
+            height = width = seq**0.5
+            # reshape to (batch, height, width, dim)
+            encoder_hidden_states = hidden_states.view(batch, height, width, dim)
+            # reshape to (batch, dim, height, width)
+            encoder_hidden_states = encoder_hidden_states.permute(0, 3, 1, 2)
+            encoder_hidden_states = torch.nn.functional.avg_pool2d(hidden_states, kernel_size=4)
+            # reshape to (batch, dim, seq)
+            encoder_hidden_states = encoder_hidden_states.view(batch, dim, -1)
+            # reshape to (batch, seq, dim)
+            encoder_hidden_states = encoder_hidden_states.permute(0, 2, 1)
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

@@ -308,7 +308,7 @@ class Transformer2DModel(ModelMixin, ConfigMixin):
        """
        if cross_attention_kwargs is not None:
            if cross_attention_kwargs.get("scale", None) is not None:
-                logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored.")
+                logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
        # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
        #   we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
        #   we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
@@ -846,7 +846,7 @@ class UNetMidBlock2DCrossAttn(nn.Module):
    ) -> torch.FloatTensor:
        if cross_attention_kwargs is not None:
            if cross_attention_kwargs.get("scale", None) is not None:
-                logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored.")
+                logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")

        hidden_states = self.resnets[0](hidden_states, temb)
        for attn, resnet in zip(self.attentions, self.resnets[1:]):
@@ -986,7 +986,7 @@ class UNetMidBlock2DSimpleCrossAttn(nn.Module):
    ) -> torch.FloatTensor:
        cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
        if cross_attention_kwargs.get("scale", None) is not None:
-            logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored.")
+            logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")

        if attention_mask is None:
            # if encoder_hidden_states is defined: we are doing cross-attn, so we should use cross-attn mask.
@@ -1116,7 +1116,7 @@ class AttnDownBlock2D(nn.Module):
    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]:
        cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
        if cross_attention_kwargs.get("scale", None) is not None:
-            logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored.")
+            logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")

        output_states = ()

@@ -1241,7 +1241,7 @@ class CrossAttnDownBlock2D(nn.Module):
    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]:
        if cross_attention_kwargs is not None:
            if cross_attention_kwargs.get("scale", None) is not None:
-                logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored.")
+                logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")

        output_states = ()

@@ -1986,7 +1986,7 @@ class SimpleCrossAttnDownBlock2D(nn.Module):
    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]:
        cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
        if cross_attention_kwargs.get("scale", None) is not None:
-            logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored.")
+            logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")

        output_states = ()

@@ -2201,7 +2201,7 @@ class KCrossAttnDownBlock2D(nn.Module):
    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]:
        cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
        if cross_attention_kwargs.get("scale", None) is not None:
-            logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored.")
+            logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")

        output_states = ()

@@ -2483,7 +2483,7 @@ class CrossAttnUpBlock2D(nn.Module):
    ) -> torch.FloatTensor:
        if cross_attention_kwargs is not None:
            if cross_attention_kwargs.get("scale", None) is not None:
-                logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored.")
+                logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")

        is_freeu_enabled = (
            getattr(self, "s1", None)
@@ -3312,7 +3312,7 @@ class SimpleCrossAttnUpBlock2D(nn.Module):
    ) -> torch.FloatTensor:
        cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
        if cross_attention_kwargs.get("scale", None) is not None:
-            logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored.")
+            logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")

        if attention_mask is None:
            # if encoder_hidden_states is defined: we are doing cross-attn, so we should use cross-attn mask.
@@ -3694,7 +3694,7 @@ class KAttentionBlock(nn.Module):
    ) -> torch.FloatTensor:
        cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
        if cross_attention_kwargs.get("scale", None) is not None:
-            logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored.")
+            logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")

        # 1. Self-Attention
        if self.add_self_attention:
@@ -80,7 +80,7 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin,
        in_channels (`int`, *optional*, defaults to 4): Number of channels in the input sample.
        out_channels (`int`, *optional*, defaults to 4): Number of channels in the output.
        center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
-        flip_sin_to_cos (`bool`, *optional*, defaults to `False`):
+        flip_sin_to_cos (`bool`, *optional*, defaults to `True`):
            Whether to flip the sin to cos in the time embedding.
        freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
@@ -109,7 +109,7 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin,
            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
            [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
            [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
-       reverse_transformer_layers_per_block : (`Tuple[Tuple]`, *optional*, defaults to None):
+        reverse_transformer_layers_per_block : (`Tuple[Tuple]`, *optional*, defaults to None):
            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`], in the upsampling
            blocks of the U-Net. Only relevant if `transformer_layers_per_block` is of type `Tuple[Tuple]` and for
            [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
@@ -147,9 +147,9 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin,
            The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`.
        time_cond_proj_dim (`int`, *optional*, defaults to `None`):
            The dimension of `cond_proj` layer in the timestep embedding.
-        conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer. conv_out_kernel (`int`,
-        *optional*, default to `3`): The kernel size of `conv_out` layer. projection_class_embeddings_input_dim (`int`,
-        *optional*): The dimension of the `class_labels` input when
+        conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer.
+        conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer.
+        projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when
            `class_embed_type="projection"`. Required when `class_embed_type="projection"`.
        class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time
            embeddings with the class embeddings.
@@ -1183,7 +1183,7 @@ class CrossAttnDownBlockMotion(nn.Module):
    ):
        if cross_attention_kwargs is not None:
            if cross_attention_kwargs.get("scale", None) is not None:
-                logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored.")
+                logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")

        output_states = ()

@@ -1367,7 +1367,7 @@ class CrossAttnUpBlockMotion(nn.Module):
    ) -> torch.FloatTensor:
        if cross_attention_kwargs is not None:
            if cross_attention_kwargs.get("scale", None) is not None:
-                logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored.")
+                logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")

        is_freeu_enabled = (
            getattr(self, "s1", None)
@@ -1707,7 +1707,7 @@ class UNetMidBlockCrossAttnMotion(nn.Module):
    ) -> torch.FloatTensor:
        if cross_attention_kwargs is not None:
            if cross_attention_kwargs.get("scale", None) is not None:
-                logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored.")
+                logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")

        hidden_states = self.resnets[0](hidden_states, temb)

@@ -23,6 +23,7 @@ _import_structure = {
    "controlnet_xs": [],
    "deprecated": [],
    "latent_diffusion": [],
+    "ledits_pp": [],
    "stable_diffusion": [],
    "stable_diffusion_xl": [],
 }
@@ -171,6 +172,12 @@ else:
        "LatentConsistencyModelPipeline",
    ]
    _import_structure["latent_diffusion"].extend(["LDMTextToImagePipeline"])
+    _import_structure["ledits_pp"].extend(
+        [
+            "LEditsPPPipelineStableDiffusion",
+            "LEditsPPPipelineStableDiffusionXL",
+        ]
+    )
    _import_structure["musicldm"] = ["MusicLDMPipeline"]
    _import_structure["paint_by_example"] = ["PaintByExamplePipeline"]
    _import_structure["pia"] = ["PIAPipeline"]
@@ -424,6 +431,12 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            LatentConsistencyModelPipeline,
        )
        from .latent_diffusion import LDMTextToImagePipeline
+        from .ledits_pp import (
+            LEditsPPDiffusionPipelineOutput,
+            LEditsPPInversionPipelineOutput,
+            LEditsPPPipelineStableDiffusion,
+            LEditsPPPipelineStableDiffusionXL,
+        )
        from .musicldm import MusicLDMPipeline
        from .paint_by_example import PaintByExamplePipeline
        from .pia import PIAPipeline
@@ -127,7 +127,7 @@ class AmusedImg2ImgPipeline(DiffusionPipeline):
                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
                essentially ignores `image`.
-            num_inference_steps (`int`, *optional*, defaults to 16):
+            num_inference_steps (`int`, *optional*, defaults to 12):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 10.0):
@@ -191,7 +191,7 @@ class AmusedImg2ImgPipeline(DiffusionPipeline):
            negative_prompt_embeds is None and negative_encoder_hidden_states is not None
        ):
            raise ValueError(
-                "pass either both `negatve_prompt_embeds` and `negative_encoder_hidden_states` or neither"
+                "pass either both `negative_prompt_embeds` and `negative_encoder_hidden_states` or neither"
            )

        if (prompt is None and prompt_embeds is None) or (prompt is not None and prompt_embeds is not None):
@@ -824,20 +824,22 @@ class StableDiffusionControlNetPipeline(
        return latents

    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
-    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.FloatTensor:
        """
        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

        Args:
-            timesteps (`torch.Tensor`):
-                generate embedding vectors at these timesteps
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
            embedding_dim (`int`, *optional*, defaults to 512):
-                dimension of the embeddings to generate
-            dtype:
-                data type of the generated embeddings
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.

        Returns:
-            `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+            `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
        """
        assert len(w.shape) == 1
        w = w * 1000.0
@@ -869,20 +869,22 @@ class StableDiffusionXLControlNetPipeline(
            self.vae.decoder.mid_block.to(dtype)

    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
-    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.FloatTensor:
        """
        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

        Args:
-            timesteps (`torch.Tensor`):
-                generate embedding vectors at these timesteps
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
            embedding_dim (`int`, *optional*, defaults to 512):
-                dimension of the embeddings to generate
-            dtype:
-                data type of the generated embeddings
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.

        Returns:
-            `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+            `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
        """
        assert len(w.shape) == 1
        w = w * 1000.0
@@ -133,7 +133,7 @@ class SpectrogramDiffusionPipeline(DiffusionPipeline):
        generator: Optional[torch.Generator] = None,
        num_inference_steps: int = 100,
        return_dict: bool = True,
-        output_type: str = "numpy",
+        output_type: str = "np",
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: int = 1,
    ) -> Union[AudioPipelineOutput, Tuple]:
@@ -157,7 +157,7 @@ class SpectrogramDiffusionPipeline(DiffusionPipeline):
                expense of slower inference.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.AudioPipelineOutput`] instead of a plain tuple.
-            output_type (`str`, *optional*, defaults to `"numpy"`):
+            output_type (`str`, *optional*, defaults to `"np"`):
                The output format of the generated audio.
            callback (`Callable`, *optional*):
                A function that calls every `callback_steps` steps during inference. The function is called with the
@@ -249,16 +249,16 @@ class SpectrogramDiffusionPipeline(DiffusionPipeline):

            logger.info("Generated segment", i)

-        if output_type == "numpy" and not is_onnx_available():
+        if output_type == "np" and not is_onnx_available():
            raise ValueError(
                "Cannot return output in 'np' format if ONNX is not available. Make sure to have ONNX installed or set 'output_type' to 'mel'."
            )
-        elif output_type == "numpy" and self.melgan is None:
+        elif output_type == "np" and self.melgan is None:
            raise ValueError(
                "Cannot return output in 'np' format if melgan component is not defined. Make sure to define `self.melgan` or set 'output_type' to 'mel'."
            )

-        if output_type == "numpy":
+        if output_type == "np":
            output = self.melgan(input_features=full_pred_mel.astype(np.float32))
        else:
            output = full_pred_mel
@@ -2004,7 +2004,7 @@ class CrossAttnUpBlockFlat(nn.Module):
    ) -> torch.FloatTensor:
        if cross_attention_kwargs is not None:
            if cross_attention_kwargs.get("scale", None) is not None:
-                logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored.")
+                logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")

        is_freeu_enabled = (
            getattr(self, "s1", None)
@@ -2338,7 +2338,7 @@ class UNetMidBlockFlatCrossAttn(nn.Module):
    ) -> torch.FloatTensor:
        if cross_attention_kwargs is not None:
            if cross_attention_kwargs.get("scale", None) is not None:
-                logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored.")
+                logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")

        hidden_states = self.resnets[0](hidden_states, temb)
        for attn, resnet in zip(self.attentions, self.resnets[1:]):
@@ -2479,7 +2479,7 @@ class UNetMidBlockFlatSimpleCrossAttn(nn.Module):
    ) -> torch.FloatTensor:
        cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
        if cross_attention_kwargs.get("scale", None) is not None:
-            logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored.")
+            logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")

        if attention_mask is None:
            # if encoder_hidden_states is defined: we are doing cross-attn, so we should use cross-attn mask.
@@ -548,20 +548,22 @@ class LatentConsistencyModelImg2ImgPipeline(
        return latents

    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
-    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.FloatTensor:
        """
        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

        Args:
-            timesteps (`torch.Tensor`):
-                generate embedding vectors at these timesteps
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
            embedding_dim (`int`, *optional*, defaults to 512):
-                dimension of the embeddings to generate
-            dtype:
-                data type of the generated embeddings
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.

        Returns:
-            `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+            `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
        """
        assert len(w.shape) == 1
        w = w * 1000.0
@@ -490,20 +490,22 @@ class LatentConsistencyModelPipeline(
        latents = latents * self.scheduler.init_noise_sigma
        return latents

-    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.FloatTensor:
        """
        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

        Args:
-            timesteps (`torch.Tensor`):
-                generate embedding vectors at these timesteps
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
            embedding_dim (`int`, *optional*, defaults to 512):
-                dimension of the embeddings to generate
-            dtype:
-                data type of the generated embeddings
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.

        Returns:
-            `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+            `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
        """
        assert len(w.shape) == 1
        w = w * 1000.0
@@ -0,0 +1,55 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_leditspp_stable_diffusion"] = ["LEditsPPPipelineStableDiffusion"]
+    _import_structure["pipeline_leditspp_stable_diffusion_xl"] = ["LEditsPPPipelineStableDiffusionXL"]
+
+    _import_structure["pipeline_output"] = ["LEditsPPDiffusionPipelineOutput", "LEditsPPInversionPipelineOutput"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_leditspp_stable_diffusion import (
+            LEditsPPDiffusionPipelineOutput,
+            LEditsPPInversionPipelineOutput,
+            LEditsPPPipelineStableDiffusion,
+        )
+        from .pipeline_leditspp_stable_diffusion_xl import LEditsPPPipelineStableDiffusionXL
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
@@ -0,0 +1,43 @@
+from dataclasses import dataclass
+from typing import List, Optional, Union
+
+import numpy as np
+import PIL.Image
+
+from ...utils import BaseOutput
+
+
+@dataclass
+class LEditsPPDiffusionPipelineOutput(BaseOutput):
+    """
+    Output class for LEdits++ Diffusion pipelines.
+
+    Args:
+        images (`List[PIL.Image.Image]` or `np.ndarray`):
+            List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
+            num_channels)`.
+        nsfw_content_detected (`List[bool]`):
+            List indicating whether the corresponding generated image contains “not-safe-for-work” (nsfw) content or
+            `None` if safety checking could not be performed.
+    """
+
+    images: Union[List[PIL.Image.Image], np.ndarray]
+    nsfw_content_detected: Optional[List[bool]]
+
+
+@dataclass
+class LEditsPPInversionPipelineOutput(BaseOutput):
+    """
+    Output class for the inversion step of LEdits++ Diffusion pipelines.
+
+    Args:
+        images (`List[PIL.Image.Image]` or `np.ndarray`):
+            List of the cropped and resized input images as PIL images of length `batch_size` or NumPy array of shape
+            `(batch_size, height, width, num_channels)`.
+        vae_reconstruction_images (`List[PIL.Image.Image]` or `np.ndarray`):
+            List of VAE reconstructions of all input images as PIL images of length `batch_size` or NumPy array of shape
+            `(batch_size, height, width, num_channels)`.
+    """
+
+    images: Union[List[PIL.Image.Image], np.ndarray]
+    vae_reconstruction_images: Union[List[PIL.Image.Image], np.ndarray]
@@ -669,20 +669,22 @@ class StableDiffusionPipeline(
        return latents

    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
-    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.FloatTensor:
        """
        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

        Args:
-            timesteps (`torch.Tensor`):
-                generate embedding vectors at these timesteps
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
            embedding_dim (`int`, *optional*, defaults to 512):
-                dimension of the embeddings to generate
-            dtype:
-                data type of the generated embeddings
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.

        Returns:
-            `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+            `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
        """
        assert len(w.shape) == 1
        w = w * 1000.0
@@ -767,20 +767,22 @@ class StableDiffusionImg2ImgPipeline(
        return latents

    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
-    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.FloatTensor:
        """
        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

        Args:
-            timesteps (`torch.Tensor`):
-                generate embedding vectors at these timesteps
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
            embedding_dim (`int`, *optional*, defaults to 512):
-                dimension of the embeddings to generate
-            dtype:
-                data type of the generated embeddings
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.

        Returns:
-            `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+            `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
        """
        assert len(w.shape) == 1
        w = w * 1000.0
@@ -909,20 +909,22 @@ class StableDiffusionInpaintPipeline(
        return timesteps, num_inference_steps - t_start

    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
-    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.FloatTensor:
        """
        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

        Args:
-            timesteps (`torch.Tensor`):
-                generate embedding vectors at these timesteps
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
            embedding_dim (`int`, *optional*, defaults to 512):
-                dimension of the embeddings to generate
-            dtype:
-                data type of the generated embeddings
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.

        Returns:
-            `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+            `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
        """
        assert len(w.shape) == 1
        w = w * 1000.0
@@ -1304,7 +1304,7 @@ class StableDiffusionDiffEditPipeline(
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: int = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
-        clip_ckip: int = None,
+        clip_skip: int = None,
    ):
        r"""
        The call function to the pipeline for generation.
@@ -1426,7 +1426,7 @@ class StableDiffusionDiffEditPipeline(
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            lora_scale=text_encoder_lora_scale,
-            clip_skip=clip_ckip,
+            clip_skip=clip_skip,
        )
        # For classifier free guidance, we need to do two forward passes.
        # Here we concatenate the unconditional and text embeddings into a single batch
@@ -644,20 +644,22 @@ class StableDiffusionLDM3DPipeline(
        return latents

    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
-    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.FloatTensor:
        """
        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

        Args:
-            timesteps (`torch.Tensor`):
-                generate embedding vectors at these timesteps
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
            embedding_dim (`int`, *optional*, defaults to 512):
-                dimension of the embeddings to generate
-            dtype:
-                data type of the generated embeddings
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.

        Returns:
-            `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+            `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
        """
        assert len(w.shape) == 1
        w = w * 1000.0
@@ -632,7 +632,7 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, StableDiffusionMixin, Textua
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0
        # and `sag_scale` is` `s` of equation (16)
-        # of the self-attentnion guidance paper: https://arxiv.org/pdf/2210.00939.pdf
+        # of the self-attention guidance paper: https://arxiv.org/pdf/2210.00939.pdf
        # `sag_scale = 0` means no self-attention guidance
        do_self_attention_guidance = sag_scale > 0.0

@@ -667,7 +667,7 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, StableDiffusionMixin, Textua

        if timesteps.dtype not in [torch.int16, torch.int32, torch.int64]:
            raise ValueError(
-                f"{self.__class__.__name__} does not support using a scheduler of type {self.scheduler.__class__.__name__}. Please make sure to use one of 'DDIMScheduler, PNDMScheduler, DDPMScheduler, DEISMultistepScheduler, UniPCMultistepScheduler, DPMSolverMultistepScheduler, DPMSolverSinlgestepScheduler'."
+                f"{self.__class__.__name__} does not support using a scheduler of type {self.scheduler.__class__.__name__}. Please make sure to use one of 'DDIMScheduler, PNDMScheduler, DDPMScheduler, DEISMultistepScheduler, UniPCMultistepScheduler, DPMSolverMultistepScheduler, DPMSolverSinglestepScheduler'."
            )

        # 5. Prepare latent variables
@@ -723,7 +723,7 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, StableDiffusionMixin, Textua
                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

-                    # perform self-attention guidance with the stored self-attentnion map
+                    # perform self-attention guidance with the stored self-attention map
                    if do_self_attention_guidance:
                        # classifier-free guidance produces two chunks of attention map
                        # and we only use unconditional one according to equation (25)
@@ -740,20 +740,22 @@ class StableDiffusionXLPipeline(
            self.vae.decoder.mid_block.to(dtype)

    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
-    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.FloatTensor:
        """
        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

        Args:
-            timesteps (`torch.Tensor`):
-                generate embedding vectors at these timesteps
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
            embedding_dim (`int`, *optional*, defaults to 512):
-                dimension of the embeddings to generate
-            dtype:
-                data type of the generated embeddings
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.

        Returns:
-            `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+            `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
        """
        assert len(w.shape) == 1
        w = w * 1000.0
@@ -874,20 +874,22 @@ class StableDiffusionXLImg2ImgPipeline(
            self.vae.decoder.mid_block.to(dtype)

    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
-    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.FloatTensor:
        """
        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

        Args:
-            timesteps (`torch.Tensor`):
-                generate embedding vectors at these timesteps
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
            embedding_dim (`int`, *optional*, defaults to 512):
-                dimension of the embeddings to generate
-            dtype:
-                data type of the generated embeddings
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.

        Returns:
-            `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+            `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
        """
        assert len(w.shape) == 1
        w = w * 1000.0
@@ -1110,20 +1110,22 @@ class StableDiffusionXLInpaintPipeline(
            self.vae.decoder.mid_block.to(dtype)

    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
-    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.FloatTensor:
        """
        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

        Args:
-            timesteps (`torch.Tensor`):
-                generate embedding vectors at these timesteps
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
            embedding_dim (`int`, *optional*, defaults to 512):
-                dimension of the embeddings to generate
-            dtype:
-                data type of the generated embeddings
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.

        Returns:
-            `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+            `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
        """
        assert len(w.shape) == 1
        w = w * 1000.0
@@ -613,20 +613,22 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
        return height, width

    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
-    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.FloatTensor:
        """
        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

        Args:
-            timesteps (`torch.Tensor`):
-                generate embedding vectors at these timesteps
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
            embedding_dim (`int`, *optional*, defaults to 512):
-                dimension of the embeddings to generate
-            dtype:
-                data type of the generated embeddings
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.

        Returns:
-            `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+            `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
        """
        assert len(w.shape) == 1
        w = w * 1000.0
@@ -784,20 +784,22 @@ class StableDiffusionXLAdapterPipeline(
        return height, width

    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
-    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.FloatTensor:
        """
        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

        Args:
-            timesteps (`torch.Tensor`):
-                generate embedding vectors at these timesteps
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
            embedding_dim (`int`, *optional*, defaults to 512):
-                dimension of the embeddings to generate
-            dtype:
-                data type of the generated embeddings
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.

        Returns:
-            `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+            `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
        """
        assert len(w.shape) == 1
        w = w * 1000.0
@@ -575,8 +575,8 @@ class TextToVideoZeroPipeline(DiffusionPipeline, StableDiffusionMixin, TextualIn
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`.
-            output_type (`str`, *optional*, defaults to `"numpy"`):
-                The output format of the generated video. Choose between `"latent"` and `"numpy"`.
+            output_type (`str`, *optional*, defaults to `"np"`):
+                The output format of the generated video. Choose between `"latent"` and `"np"`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a
                [`~pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.TextToVideoPipelineOutput`] instead of
@@ -157,9 +157,7 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
            there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
            otherwise it uses the alpha value at step 0.
        steps_offset (`int`, defaults to 0):
-            An offset added to the inference steps. You can use a combination of `offset=1` and
-            `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable
-            Diffusion.
+            An offset added to the inference steps, as required by some model families.
        prediction_type (`str`, defaults to `epsilon`, *optional*):
            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
@@ -93,9 +93,7 @@ class FlaxDDIMScheduler(FlaxSchedulerMixin, ConfigMixin):
            step there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
            otherwise it uses the value of alpha at step 0.
        steps_offset (`int`, default `0`):
-            an offset added to the inference steps. You can use a combination of `offset=1` and
-            `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in
-            stable diffusion.
+            An offset added to the inference steps, as required by some model families.
        prediction_type (`str`, default `epsilon`):
            indicates whether the model predicts the noise (epsilon), or the samples. One of `epsilon`, `sample`.
            `v-prediction` is not supported for this scheduler.
@@ -155,9 +155,7 @@ class DDIMInverseScheduler(SchedulerMixin, ConfigMixin):
            there is no previous alpha. When this option is `True` the previous alpha product is fixed to 0, otherwise
            it uses the alpha value at step `num_train_timesteps - 1`.
        steps_offset (`int`, defaults to 0):
-            An offset added to the inference steps. You can use a combination of `offset=1` and
-            `set_alpha_to_one=False` to make the last step use `num_train_timesteps - 1` for the previous alpha
-            product.
+            An offset added to the inference steps, as required by some model families.
        prediction_type (`str`, defaults to `epsilon`, *optional*):
            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
@@ -159,9 +159,7 @@ class DDIMParallelScheduler(SchedulerMixin, ConfigMixin):
            step there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
            otherwise it uses the value of alpha at step 0.
        steps_offset (`int`, default `0`):
-            an offset added to the inference steps. You can use a combination of `offset=1` and
-            `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in
-            stable diffusion.
+            An offset added to the inference steps, as required by some model families.
        prediction_type (`str`, default `epsilon`, optional):
            prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
            process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4
@@ -167,9 +167,7 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
        steps_offset (`int`, defaults to 0):
-            An offset added to the inference steps. You can use a combination of `offset=1` and
-            `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable
-            Diffusion.
+            An offset added to the inference steps, as required by some model families.
        rescale_betas_zero_snr (`bool`, defaults to `False`):
            Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
            dark samples instead of limiting it to samples with medium brightness. Loosely related to
@@ -173,9 +173,7 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin):
            The way the timesteps should be scaled. Refer to Table 2. of [Common Diffusion Noise Schedules and Sample
            Steps are Flawed](https://arxiv.org/abs/2305.08891) for more information.
        steps_offset (`int`, default `0`):
-            an offset added to the inference steps. You can use a combination of `offset=1` and
-            `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in
-            stable diffusion.
+            An offset added to the inference steps, as required by some model families.
        rescale_betas_zero_snr (`bool`, defaults to `False`):
            Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
            dark samples instead of limiting it to samples with medium brightness. Loosely related to
@@ -115,9 +115,7 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin):
            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
        steps_offset (`int`, defaults to 0):
-            An offset added to the inference steps. You can use a combination of `offset=1` and
-            `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable
-            Diffusion.
+            An offset added to the inference steps, as required by some model families.
    """

    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
@@ -178,9 +178,7 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
        steps_offset (`int`, defaults to 0):
-            An offset added to the inference steps. You can use a combination of `offset=1` and
-            `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable
-            Diffusion.
+            An offset added to the inference steps, as required by some model families.
        rescale_betas_zero_snr (`bool`, defaults to `False`):
            Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
            dark samples instead of limiting it to samples with medium brightness. Loosely related to
@@ -899,6 +897,7 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
        timestep: int,
        sample: torch.FloatTensor,
        generator=None,
+        variance_noise: Optional[torch.FloatTensor] = None,
        return_dict: bool = True,
    ) -> Union[SchedulerOutput, Tuple]:
        """
@@ -914,6 +913,9 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
                A current instance of a sample created by the diffusion process.
            generator (`torch.Generator`, *optional*):
                A random number generator.
+            variance_noise (`torch.FloatTensor`, *optional*):
+                Alternative to generating noise with `generator` by directly providing the noise for the variance
+                itself. Useful for methods such as [`LEdits++`].
            return_dict (`bool`):
                Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`.

@@ -948,11 +950,12 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):

        # Upcast to avoid precision issues when computing prev_sample
        sample = sample.to(torch.float32)
-
-        if self.config.algorithm_type in ["sde-dpmsolver", "sde-dpmsolver++"]:
+        if self.config.algorithm_type in ["sde-dpmsolver", "sde-dpmsolver++"] and variance_noise is None:
            noise = randn_tensor(
                model_output.shape, generator=generator, device=model_output.device, dtype=torch.float32
            )
+        elif self.config.algorithm_type in ["sde-dpmsolver", "sde-dpmsolver++"]:
+            noise = variance_noise.to(device=model_output.device, dtype=torch.float32)
        else:
            noise = None

@@ -134,9 +134,7 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
        steps_offset (`int`, defaults to 0):
-            An offset added to the inference steps. You can use a combination of `offset=1` and
-            `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable
-            Diffusion.
+            An offset added to the inference steps, as required by some model families.
    """

    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
@@ -792,6 +790,7 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
        timestep: int,
        sample: torch.FloatTensor,
        generator=None,
+        variance_noise: Optional[torch.FloatTensor] = None,
        return_dict: bool = True,
    ) -> Union[SchedulerOutput, Tuple]:
        """
@@ -807,6 +806,9 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
                A current instance of a sample created by the diffusion process.
            generator (`torch.Generator`, *optional*):
                A random number generator.
+            variance_noise (`torch.FloatTensor`, *optional*):
+                Alternative to generating noise with `generator` by directly providing the noise for the variance
+                itself. Useful for methods such as [`CycleDiffusion`].
            return_dict (`bool`):
                Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`.

@@ -837,10 +839,12 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
            self.model_outputs[i] = self.model_outputs[i + 1]
        self.model_outputs[-1] = model_output

-        if self.config.algorithm_type in ["sde-dpmsolver", "sde-dpmsolver++"]:
+        if self.config.algorithm_type in ["sde-dpmsolver", "sde-dpmsolver++"] and variance_noise is None:
            noise = randn_tensor(
                model_output.shape, generator=generator, device=model_output.device, dtype=model_output.dtype
            )
+        elif self.config.algorithm_type in ["sde-dpmsolver", "sde-dpmsolver++"]:
+            noise = variance_noise
        else:
            noise = None

@@ -153,9 +153,7 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin):
            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
        steps_offset (`int`, defaults to 0):
-            An offset added to the inference steps. You can use a combination of `offset=1` and
-            `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable
-            Diffusion.
+            An offset added to the inference steps, as required by some model families.
    """

    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
@@ -223,6 +223,8 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
        """
        steps = num_inference_steps
        order = self.config.solver_order
+        if order > 3:
+            raise ValueError("Order > 3 is not supported by this scheduler")
        if self.config.lower_order_final:
            if order == 3:
                if steps % 3 == 0:
@@ -156,9 +156,7 @@ class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
        steps_offset (`int`, defaults to 0):
-            An offset added to the inference steps. You can use a combination of `offset=1` and
-            `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable
-            Diffusion.
+            An offset added to the inference steps, as required by some model families.
        rescale_betas_zero_snr (`bool`, defaults to `False`):
            Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
            dark samples instead of limiting it to samples with medium brightness. Loosely related to
@@ -162,9 +162,7 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
        steps_offset (`int`, defaults to 0):
-            An offset added to the inference steps. You can use a combination of `offset=1` and
-            `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable
-            Diffusion.
+            An offset added to the inference steps, as required by some model families.
        rescale_betas_zero_snr (`bool`, defaults to `False`):
            Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
            dark samples instead of limiting it to samples with medium brightness. Loosely related to
@@ -101,9 +101,7 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
        steps_offset (`int`, defaults to 0):
-            An offset added to the inference steps. You can use a combination of `offset=1` and
-            `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable
-            Diffusion.
+            An offset added to the inference steps, as required by some model families.
    """

    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
@@ -99,9 +99,7 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
        steps_offset (`int`, defaults to 0):
-            An offset added to the inference steps. You can use a combination of `offset=1` and
-            `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable
-            Diffusion.
+            An offset added to the inference steps, as required by some model families.
    """

    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
@@ -98,9 +98,7 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin):
            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
        steps_offset (`int`, defaults to 0):
-            An offset added to the inference steps. You can use a combination of `offset=1` and
-            `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable
-            Diffusion.
+            An offset added to the inference steps, as required by some model families.
    """

    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
patil-suraj	ea238e821b	up	2024-03-18 11:47:47 +01:00
patil-suraj	b6d1d670fc	up	2024-03-18 11:34:17 +01:00
Dhruv Nair	4330a747d4	[Tests] Fix ControlNet Single File tests (#7315 ) * update * update --------- Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>	2024-03-18 11:28:59 +05:30
Sayak Paul	76de6a09fb	post-release v0.27.0 (#7329 ) * post-release * quality	2024-03-18 10:52:20 +05:30
Sayak Paul	25caf24ef9	Fix release workflow deps (#7339 ) * pop scale from the top-level unet instead of getting it. * improve readability. * fix: pypi workflow deps * revert	2024-03-16 07:18:11 +05:30
Abubakar Abid	8db3c9bc9f	Adds docs for `gradio.Interface.from_pipeline()` (#7346 ) * gradio docs * Update docs/source/en/api/pipelines/stable_diffusion/overview.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * changes * changes * changes * Update docs/source/en/api/pipelines/stable_diffusion/overview.md --------- Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>	2024-03-16 07:11:28 +05:30
Sayak Paul	e0e9f81971	add: torch to the pypi step. (#7328 )	2024-03-15 12:28:12 +05:30
M. Tolga Cangöz	5d848ec07c	[`Tests`] Update a deprecated parameter in test files and fix several typos (#7277 ) * Add properties and `IPAdapterTesterMixin` tests for `StableDiffusionPanoramaPipeline` * Fix variable name typo and update comments * Update deprecated `output_type="numpy"` to "np" in test files * Discard changes to src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py * Update test_stable_diffusion_panorama.py * Update numbers in README.md * Update get_guidance_scale_embedding method to use timesteps instead of w * Update number of checkpoints in README.md * Add type hints and fix var name * Fix PyTorch's convention for inplace functions * Fix a typo * Revert "Fix PyTorch's convention for inplace functions" This reverts commit `74350cf65b`. * Fix typos * Indent * Refactor get_guidance_scale_embedding method in LEditsPPPipelineStableDiffusionXL class	2024-03-14 12:17:35 -07:00
Dhruv Nair	4974b84564	Update Cascade Tests (#7324 ) * update * update * update	2024-03-14 20:51:22 +05:30
Linoy Tsaban	83062fb872	[Advanced DreamBooth LoRA SDXL] Support EDM-style training (follow up of #7126 ) (#7182 ) * add edm style training * style * finish adding edm training feature * import fix * fix latents mean * minor adjustments * add edm to readme * style * fix autocast and scheduler config issues when using edm * style --------- Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>	2024-03-14 18:40:14 +05:30
Suraj Patil	b6d7e31d10	add edm schedulers in doc (#7319 ) * add edm schedulers in doc * add in toctree * address reviewe comments	2024-03-14 11:52:25 +01:00
Anatoly Belikov	53e9aacc10	log loss per image (#7278 ) * log loss per image * add commandline param for per image loss logging * style * debug-loss -> debug_loss --------- Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>	2024-03-14 11:41:43 +05:30
Dhruv Nair	41424466e3	[Tests] Fix incorrect constant in VAE scaling test. (#7301 ) update	2024-03-14 10:24:01 +05:30
Sayak Paul	95de1981c9	add: pytest log installation (#7313 )	2024-03-14 10:01:16 +05:30
Kenneth Gerald Hamilton	0b45b58867	update get_order_list if statement (#7309 ) * update get_order_list if statement * revery	2024-03-13 18:29:42 -10:00
Beinsezii	d3986f18be	Change step_offset scheduler docstrings (#7128 ) * Change step_offset scheduler docstrings * Mention it may be needed by some models * More docstrings These ones failed literal S&R because I performed it case-sensitive which is fun. --------- Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>	2024-03-13 15:12:00 -10:00
Alexander Bonnet	ee6a3a993d	Fix typos in `UNet2DConditionModel` documentation (#7291 ) * fix typo in UNet2DConditionModel documentation * Fix indentation that may fix doc rendering * Fix squished doc lines	2024-03-13 09:31:29 -07:00
Michael	b300517305	Add Intro page of TCD (#7259 ) * add tcd intro * resolve repos * Apply suggestions from code review Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * revise NFEs related * change inpainting location --------- Co-authored-by: Sayak Paul <spsayakpaul@gmail.com> Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>	2024-03-13 09:21:51 -07:00
jnhuang	ac07b6dc6a	Fix Wrong Text-encoder Grad Setting in Custom_Diffusion Training (#7302 ) fix index in set textencoder grad Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>	2024-03-13 20:22:44 +05:30
Sayak Paul	46ab56a468	add: support for notifying maintainers about the nightly test status (#7117 ) * add: support for notifying maintainers about the nightly test status * add: a tempoerary workflow for validation. * cancel in progress. * runs-on * clean up * add: peft dep * change device. * multiple edits. * remove temp workflow.	2024-03-13 16:48:11 +05:30
Sayak Paul	038ff70023	[PyPI publishing] feat: automate the process of pypi publication to some extent. (#7270 ) * feat: automate the process of pypi publication to some extent. * utility to fetch the latest release branch * correct package name.	2024-03-13 16:27:59 +05:30
Manuel Brack	00eca4b887	[Pipeline] Add LEDITS++ pipelines (#6074 ) * Setup LEdits++ file structure * Fix import * LEditsPP Stable Diffusion pipeline * Include variable image aspect ratios * Implement LEDITS++ for SDXL * clean up LEditsPPPipelineStableDiffusion * Adjust inversion output * Added docu, more cleanup for LEditsPPPipelineStableDiffusion * clean up LEditsPPPipelineStableDiffusionXL * Update documentation * Fix documentation import * Add skeleton IF implementation * Fix documentation typo * Add LEDTIS docu to toctree * Add missing title * Finalize SD documentation * Finalize SD-XL documentation * Fix code style and quality * Fix typo * Fix return types * added LEditsPPPipelineIF; minor changes for LEditsPPPipelineStableDiffusion and LEditsPPPipelineStableDiffusionXL * Fix copy reference * add documentation for IF * Add first tests * Fix batching for SD-XL * Fix text encoding and perfect reconstruction for SD-XL * Add tests for SD-XL, minor changes * move user_mask to correct device, use cross_attention_kwargs also for inversion * Example docstring * Fix attention resolution for non-square images * Refactoring for PR review * Safely remove ledits_utils.py * Style fixes * Replace assertions with ValueError * Remove LEditsPPPipelineIF * Remove unecessary input checks * Refactoring of CrossAttnProcessor * Revert unecessary changes to scheduler * Remove first progress-bar in inversion * Refactor scheduler usage and reset * Use imageprocessor instead of custom logic * Fix scheduler init warning * Fix error when running the pipeline in fp16 * Update documentation wrt perfect inversion * Update tests * Fix code quality and copy consistency * Update LEditsPP import * Remove enable/disable methods that are now in StableDiffusionMixin * Change import in docs * Revert import structure change * Fix ledits imports --------- Co-authored-by: Katharina Kornmeier <katharina.kornmeier@stud.tu-darmstadt.de>	2024-03-13 12:43:47 +02:00
Dhruv Nair	30132aba30	Update Stable Cascade Conversion Scripts (#7271 ) * update * update * update * update * update * update * update * update * update --------- Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>	2024-03-13 12:35:44 +05:30