update

add: space for calculating memory usage. (#7414)
2024-03-22 10:44:25 +00:00 · 2024-03-22 10:39:51 +00:00 · 2024-03-22 08:43:21 +05:30 · 2024-03-21 10:05:07 -10:00 · 2024-03-21 09:22:06 -07:00 · 2024-03-21 08:33:02 +05:30
309 changed files with 10366 additions and 1385 deletions
@@ -12,6 +12,7 @@ env:
  PYTEST_TIMEOUT: 600
  RUN_SLOW: yes
  RUN_NIGHTLY: yes
+  SLACK_API_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

 jobs:
  run_nightly_tests:
@@ -64,6 +65,7 @@ jobs:
          python -m uv pip install -e [quality,test]
          python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers
          python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate
+          python -m uv pip install pytest-reportlog

      - name: Environment
        run: |
@@ -78,7 +80,8 @@ jobs:
          python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
            -s -v -k "not Flax and not Onnx" \
            --make-reports=tests_${{ matrix.config.report }} \
-            tests/
+            --report-log=${{ matrix.config.report }}.log \
+            tests/ 

      - name: Run nightly Flax TPU tests
        if: ${{ matrix.config.framework == 'flax' }}
@@ -89,6 +92,7 @@ jobs:
          python -m pytest -n 0 \
            -s -v -k "Flax" \
            --make-reports=tests_${{ matrix.config.report }} \
+            --report-log=${{ matrix.config.report }}.log \
            tests/

      - name: Run nightly ONNXRuntime CUDA tests
@@ -100,6 +104,7 @@ jobs:
          python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
            -s -v -k "Onnx" \
            --make-reports=tests_${{ matrix.config.report }} \
+            --report-log=${{ matrix.config.report }}.log \
            tests/

      - name: Failure short reports
@@ -112,6 +117,12 @@ jobs:
        with:
          name: ${{ matrix.config.report }}_test_reports
          path: reports
+      
+      - name: Generate Report and Notify Channel
+        if: always()
+        run: |
+          pip install slack_sdk tabulate
+          python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY

  run_nightly_tests_apple_m1:
    name: Nightly PyTorch MPS tests on MacOS
@@ -140,6 +151,7 @@ jobs:
          ${CONDA_RUN} python -m uv pip install -e [quality,test]
          ${CONDA_RUN} python -m uv pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
          ${CONDA_RUN} python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate
+          ${CONDA_RUN} python -m uv pip install pytest-reportlog

      - name: Environment
        shell: arch -arch arm64 bash {0}
@@ -152,7 +164,9 @@ jobs:
          HF_HOME: /System/Volumes/Data/mnt/cache
          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
        run: |
-          ${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps tests/
+          ${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps \
+            --report-log=tests_torch_mps.log \
+            tests/

      - name: Failure short reports
        if: ${{ failure() }}
@@ -164,3 +178,9 @@ jobs:
        with:
          name: torch_mps_test_reports
          path: reports
+
+      - name: Generate Report and Notify Channel
+        if: always()
+        run: |
+          pip install slack_sdk tabulate
+          python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
@@ -0,0 +1,23 @@
+name: Notify Slack about a release
+
+on:
+  workflow_dispatch:
+  release:
+    types: [published]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v3
+    
+    - name: Setup Python
+      uses: actions/setup-python@v4
+      with:
+        python-version: '3.8'
+    
+    - name: Notify Slack about the release
+      env:
+        SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
+      run: pip install requests && python utils/notify_slack_about_release.py
@@ -105,4 +105,4 @@ jobs:
        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
          -s -v \
          --make-reports=tests_${{ matrix.config.report }} \
-          tests/lora/test_lora_layers_peft.py
+          tests/lora/
@@ -21,10 +21,7 @@ env:
 jobs:
  setup_torch_cuda_pipeline_matrix:
    name: Setup Torch Pipelines CUDA Slow Tests Matrix
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
-    container:
-      image: diffusers/diffusers-pytorch-cpu # this is a CPU image, but we need it to fetch the matrix
-      options: --shm-size "16gb" --ipc host
+    runs-on: ubuntu-latest
    outputs:
      pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
    steps:
@@ -32,24 +29,20 @@ jobs:
        uses: actions/checkout@v3
        with:
          fetch-depth: 2
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.8"
      - name: Install dependencies
        run: |
-          apt-get update && apt-get install libsndfile1-dev libgl1 -y
-          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-          python -m uv pip install -e [quality,test]
-          python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
-
-      - name: Environment
-        run: |
-          python utils/print_env.py
-
+          pip install -e .
+          pip install huggingface_hub
      - name: Fetch Pipeline Matrix
        id: fetch_pipeline_matrix
        run: |
          matrix=$(python utils/fetch_torch_cuda_pipeline_test_matrix.py)
          echo $matrix
          echo "pipeline_test_matrix=$matrix" >> $GITHUB_OUTPUT
-
      - name: Pipeline Tests Artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
@@ -0,0 +1,81 @@
+# Adapted from https://blog.deepjyoti30.dev/pypi-release-github-action
+
+name: PyPI release
+
+on:
+  workflow_dispatch:
+  push:
+    tags:
+      - "*"
+
+jobs:
+  find-and-checkout-latest-branch:
+    runs-on: ubuntu-latest
+    outputs:
+      latest_branch: ${{ steps.set_latest_branch.outputs.latest_branch }}
+    steps:
+      - name: Checkout Repo
+        uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.8'
+
+      - name: Fetch latest branch
+        id: fetch_latest_branch
+        run: |
+          pip install -U requests packaging
+          LATEST_BRANCH=$(python utils/fetch_latest_release_branch.py)
+          echo "Latest branch: $LATEST_BRANCH"
+          echo "latest_branch=$LATEST_BRANCH" >> $GITHUB_ENV
+          
+      - name: Set latest branch output
+        id: set_latest_branch
+        run: echo "::set-output name=latest_branch::${{ env.latest_branch }}"
+
+  release:
+    needs: find-and-checkout-latest-branch
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout Repo
+        uses: actions/checkout@v3
+        with:
+          ref: ${{ needs.find-and-checkout-latest-branch.outputs.latest_branch }}
+          
+      - name: Setup Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.8"
+      
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -U setuptools wheel twine
+          pip install -U torch --index-url https://download.pytorch.org/whl/cpu
+          pip install -U transformers
+      
+      - name: Build the dist files
+        run: python setup.py bdist_wheel && python setup.py sdist
+      
+      - name: Publish to the test PyPI
+        env:
+          TWINE_USERNAME: ${{ secrets.TEST_PYPI_USERNAME }}
+          TWINE_PASSWORD: ${{ secrets.TEST_PYPI_PASSWORD }}
+        run: twine upload dist/* -r pypitest --repository-url=https://test.pypi.org/legacy/    
+
+      - name: Test installing diffusers and importing
+        run: |
+          pip install diffusers && pip uninstall diffusers -y
+          pip install -i https://testpypi.python.org/pypi diffusers
+          python -c "from diffusers import __version__; print(__version__)"
+          python -c "from diffusers import DiffusionPipeline; pipe = DiffusionPipeline.from_pretrained('fusing/unet-ldm-dummy-update'); pipe()"
+          python -c "from diffusers import DiffusionPipeline; pipe = DiffusionPipeline.from_pretrained('hf-internal-testing/tiny-stable-diffusion-pipe', safety_checker=None); pipe('ah suh du')"
+          python -c "from diffusers import *"
+
+      - name: Publish to PyPI
+        env:
+          TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
+          TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
+        run: twine upload dist/* -r pypi
@@ -19,6 +19,16 @@ authors:
    family-names: Rasul
  - given-names: Mishig
    family-names: Davaadorj
+  - given-names: Dhruv
+    family-names: Nair
+  - given-names: Sayak
+    family-names: Paul
+  - given-names: Steven
+    family-names: Liu
+  - given-names: William
+    family-names: Berman
+  - given-names: Yiyi
+    family-names: Xu
  - given-names: Thomas
    family-names: Wolf
 repository-code: 'https://github.com/huggingface/diffusers'
@@ -77,7 +77,7 @@ Please refer to the [How to use Stable Diffusion in Apple Silicon](https://huggi

 ## Quickstart

-Generating outputs is super easy with 🤗 Diffusers. To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 19000+ checkpoints):
+Generating outputs is super easy with 🤗 Diffusers. To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 22000+ checkpoints):

 ```python
 from diffusers import DiffusionPipeline
@@ -219,7 +219,7 @@ Also, say 👋 in our public Discord channel <a href="https://discord.gg/G7tWnz9
 - https://github.com/deep-floyd/IF
 - https://github.com/bentoml/BentoML
 - https://github.com/bmaltais/kohya_ss
- +8000 other amazing GitHub repositories 💪
+- +9000 other amazing GitHub repositories 💪

 Thank you for using us ❤️.

@@ -238,7 +238,7 @@ We also want to thank @heejkoo for the very helpful overview of papers, code and

 ```bibtex
@misc{von-platen-etal-2022-diffusers,
-  author = {Patrick von Platen and Suraj Patil and Anton Lozhkov and Pedro Cuenca and Nathan Lambert and Kashif Rasul and Mishig Davaadorj and Thomas Wolf},
+  author = {Patrick von Platen and Suraj Patil and Anton Lozhkov and Pedro Cuenca and Nathan Lambert and Kashif Rasul and Mishig Davaadorj and Dhruv Nair and Sayak Paul and William Berman and Yiyi Xu and Steven Liu and Thomas Wolf},
  title = {Diffusers: State-of-the-art diffusion models},
  year = {2022},
  publisher = {GitHub},
@@ -104,6 +104,8 @@
      title: Latent Consistency Model-LoRA
    - local: using-diffusers/inference_with_lcm
      title: Latent Consistency Model
+    - local: using-diffusers/inference_with_tcd_lora
+      title: Trajectory Consistency Distillation-LoRA
    - local: using-diffusers/svd
      title: Stable Video Diffusion
    title: Specific pipeline examples
@@ -304,6 +306,8 @@
      title: Latent Consistency Models
    - local: api/pipelines/latent_diffusion
      title: Latent Diffusion
+    - local: api/pipelines/ledits_pp
+      title: LEDITS++
    - local: api/pipelines/panorama
      title: MultiDiffusion
    - local: api/pipelines/musicldm
@@ -396,6 +400,10 @@
      title: DPMSolverSDEScheduler
    - local: api/schedulers/singlestep_dpm_solver
      title: DPMSolverSinglestepScheduler
+    - local: api/schedulers/edm_multistep_dpm_solver
+      title: EDMDPMSolverMultistepScheduler
+    - local: api/schedulers/edm_euler
+      title: EDMEulerScheduler
    - local: api/schedulers/euler_ancestral
      title: EulerAncestralDiscreteScheduler
    - local: api/schedulers/euler
@@ -408,6 +408,29 @@ Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers)

 </Tip>

+<table>
+    <tr>
+      <th align=center>Without FreeInit enabled</th>
+      <th align=center>With FreeInit enabled</th>
+    </tr>
+    <tr>
+        <td align=center>
+          panda playing a guitar
+          <br />
+          <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-no-freeinit.gif"
+              alt="panda playing a guitar"
+              style="width: 300px;" />
+        </td>
+        <td align=center>
+          panda playing a guitar
+          <br/>
+          <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-freeinit.gif"
+              alt="panda playing a guitar"
+              style="width: 300px;" />
+        </td>
+    </tr>
+</table>
+
 ## Using AnimateLCM

 [AnimateLCM](https://animatelcm.github.io/) is a motion module checkpoint and an [LCM LoRA](https://huggingface.co/docs/diffusers/using-diffusers/inference_with_lcm_lora) that have been created using a consistency learning strategy that decouples the distillation of the image generation priors and the motion generation priors.
@@ -0,0 +1,54 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# LEDITS++
+
+LEDITS++ was proposed in [LEDITS++: Limitless Image Editing using Text-to-Image Models](https://huggingface.co/papers/2311.16711) by Manuel Brack, Felix Friedrich, Katharina Kornmeier, Linoy Tsaban, Patrick Schramowski, Kristian Kersting, Apolinário Passos.
+
+The abstract from the paper is:
+
+*Text-to-image diffusion models have recently received increasing interest for their astonishing ability to produce high-fidelity images from solely text inputs. Subsequent research efforts aim to exploit and apply their capabilities to real image editing. However, existing image-to-image methods are often inefficient, imprecise, and of limited versatility. They either require time-consuming fine-tuning, deviate unnecessarily strongly from the input image, and/or lack support for multiple, simultaneous edits. To address these issues, we introduce LEDITS++, an efficient yet versatile and precise textual image manipulation technique. LEDITS++'s novel inversion approach requires no tuning nor optimization and produces high-fidelity results with a few diffusion steps. Second, our methodology supports multiple simultaneous edits and is architecture-agnostic. Third, we use a novel implicit masking technique that limits changes to relevant image regions. We propose the novel TEdBench++ benchmark as part of our exhaustive evaluation. Our results demonstrate the capabilities of LEDITS++ and its improvements over previous methods. The project page is available at https://leditsplusplus-project.static.hf.space .*
+
+<Tip>
+
+You can find additional information about LEDITS++ on the [project page](https://leditsplusplus-project.static.hf.space/index.html) and try it out in a [demo](https://huggingface.co/spaces/editing-images/leditsplusplus).
+
+</Tip>
+
+<Tip warning={true}>
+Due to some backward compatibility issues with the current diffusers implementation of [`~schedulers.DPMSolverMultistepScheduler`], this implementation of LEDITS++ can no longer guarantee perfect inversion.
+This issue is unlikely to have any noticeable effects on applied use-cases. However, we provide an alternative implementation that guarantees perfect inversion in a dedicated [GitHub repo](https://github.com/ml-research/ledits_pp). 
+</Tip>
+
+We provide two distinct pipelines based on different pre-trained models. 
+
+## LEditsPPPipelineStableDiffusion
+[[autodoc]] pipelines.ledits_pp.LEditsPPPipelineStableDiffusion
+	- all
+	- __call__
+	- invert
+
+## LEditsPPPipelineStableDiffusionXL
+[[autodoc]] pipelines.ledits_pp.LEditsPPPipelineStableDiffusionXL
+	- all
+	- __call__
+	- invert
+
+
+
+## LEditsPPDiffusionPipelineOutput
+[[autodoc]] pipelines.ledits_pp.pipeline_output.LEditsPPDiffusionPipelineOutput
+	- all
+
+## LEditsPPInversionPipelineOutput
+[[autodoc]] pipelines.ledits_pp.pipeline_output.LEditsPPInversionPipelineOutput
+	- all
@@ -57,6 +57,7 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
 | [Latent Consistency Models](latent_consistency_models) | text2image |
 | [Latent Diffusion](latent_diffusion) | text2image, super-resolution |
 | [LDM3D](stable_diffusion/ldm3d_diffusion) | text2image, text-to-3D, text-to-pano, upscaling |
+| [LEDITS++](ledits_pp) | image editing |
 | [MultiDiffusion](panorama) | text2image |
 | [MusicLDM](musicldm) | text2audio |
 | [Paint by Example](paint_by_example) | inpainting |
@@ -30,6 +30,6 @@ Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers)
 	- all
 	- __call__

-## StableDiffusionSafePipelineOutput
+## SemanticStableDiffusionPipelineOutput
 [[autodoc]] pipelines.semantic_stable_diffusion.pipeline_output.SemanticStableDiffusionPipelineOutput
 	- all
@@ -12,13 +12,13 @@ specific language governing permissions and limitations under the License.

 # Stable Cascade

-This model is built upon the [Würstchen](https://openreview.net/forum?id=gU58d5QeGv) architecture and its main 
-difference to other models like Stable Diffusion is that it is working at a much smaller latent space. Why is this 
-important? The smaller the latent space, the **faster** you can run inference and the **cheaper** the training becomes. 
-How small is the latent space? Stable Diffusion uses a compression factor of 8, resulting in a 1024x1024 image being 
-encoded to 128x128. Stable Cascade achieves a compression factor of 42, meaning that it is possible to encode a 
-1024x1024 image to 24x24, while maintaining crisp reconstructions. The text-conditional model is then trained in the 
-highly compressed latent space. Previous versions of this architecture, achieved a 16x cost reduction over Stable 
+This model is built upon the [Würstchen](https://openreview.net/forum?id=gU58d5QeGv) architecture and its main
+difference to other models like Stable Diffusion is that it is working at a much smaller latent space. Why is this
+important? The smaller the latent space, the **faster** you can run inference and the **cheaper** the training becomes.
+How small is the latent space? Stable Diffusion uses a compression factor of 8, resulting in a 1024x1024 image being
+encoded to 128x128. Stable Cascade achieves a compression factor of 42, meaning that it is possible to encode a
+1024x1024 image to 24x24, while maintaining crisp reconstructions. The text-conditional model is then trained in the
+highly compressed latent space. Previous versions of this architecture achieved a 16x cost reduction over Stable
 Diffusion 1.5.

 Therefore, this kind of model is well suited for usages where efficiency is important. Furthermore, all known extensions
@@ -30,13 +30,154 @@ The original codebase can be found at [Stability-AI/StableCascade](https://githu
 Stable Cascade consists of three models: Stage A, Stage B and Stage C, representing a cascade to generate images,
 hence the name "Stable Cascade".

-Stage A & B are used to compress images, similar to what the job of the VAE is in Stable Diffusion. 
-However, with this setup, a much higher compression of images can be achieved. While the Stable Diffusion models use a 
-spatial compression factor of 8, encoding an image with resolution of 1024 x 1024 to 128 x 128, Stable Cascade achieves 
-a compression factor of 42. This encodes a 1024 x 1024 image to 24 x 24, while being able to accurately decode the 
-image. This comes with the great benefit of cheaper training and inference. Furthermore, Stage C is responsible 
+Stage A & B are used to compress images, similar to what the job of the VAE is in Stable Diffusion.
+However, with this setup, a much higher compression of images can be achieved. While the Stable Diffusion models use a
+spatial compression factor of 8, encoding an image with resolution of 1024 x 1024 to 128 x 128, Stable Cascade achieves
+a compression factor of 42. This encodes a 1024 x 1024 image to 24 x 24, while being able to accurately decode the
+image. This comes with the great benefit of cheaper training and inference. Furthermore, Stage C is responsible
 for generating the small 24 x 24 latents given a text prompt.

+The Stage C model operates on the small 24 x 24 latents and denoises the latents conditioned on text prompts. The model is also the largest component in the Cascade pipeline and is meant to be used with the `StableCascadePriorPipeline`.
+
+The Stage B and Stage A models are used with the `StableCascadeDecoderPipeline` and are responsible for generating the final image given the small 24 x 24 latents.
+
+<Tip warning={true}>
+
+There are some restrictions on data types that can be used with the Stable Cascade models. The official checkpoints for the `StableCascadePriorPipeline` do not support the `torch.float16` data type. Please use `torch.bfloat16` instead.
+
+In order to use the `torch.bfloat16` data type with the `StableCascadeDecoderPipeline` you need to have PyTorch 2.2.0 or higher installed. This also means that using the `StableCascadeCombinedPipeline` with `torch.bfloat16` requires PyTorch 2.2.0 or higher, since it calls the `StableCascadeDecoderPipeline` internally.
+
+If it is not possible to install PyTorch 2.2.0 or higher in your environment, the `StableCascadeDecoderPipeline` can be used on its own with the `torch.float16` data type. You can download the full precision or `bf16` variant weights for the pipeline and cast the weights to `torch.float16`.
+
+</Tip>
+
+## Usage example
+
+```python
+import torch
+from diffusers import StableCascadeDecoderPipeline, StableCascadePriorPipeline
+
+prompt = "an image of a shiba inu, donning a spacesuit and helmet"
+negative_prompt = ""
+
+prior = StableCascadePriorPipeline.from_pretrained("stabilityai/stable-cascade-prior", variant="bf16", torch_dtype=torch.bfloat16)
+decoder = StableCascadeDecoderPipeline.from_pretrained("stabilityai/stable-cascade", variant="bf16", torch_dtype=torch.float16)
+
+prior.enable_model_cpu_offload()
+prior_output = prior(
+    prompt=prompt,
+    height=1024,
+    width=1024,
+    negative_prompt=negative_prompt,
+    guidance_scale=4.0,
+    num_images_per_prompt=1,
+    num_inference_steps=20
+)
+
+decoder.enable_model_cpu_offload()
+decoder_output = decoder(
+    image_embeddings=prior_output.image_embeddings.to(torch.float16),
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    guidance_scale=0.0,
+    output_type="pil",
+    num_inference_steps=10
+).images[0]
+decoder_output.save("cascade.png")
+```
+
+## Using the Lite Versions of the Stage B and Stage C models
+
+```python
+import torch
+from diffusers import (
+    StableCascadeDecoderPipeline,
+    StableCascadePriorPipeline,
+    StableCascadeUNet,
+)
+
+prompt = "an image of a shiba inu, donning a spacesuit and helmet"
+negative_prompt = ""
+
+prior_unet = StableCascadeUNet.from_pretrained("stabilityai/stable-cascade-prior", subfolder="prior_lite")
+decoder_unet = StableCascadeUNet.from_pretrained("stabilityai/stable-cascade", subfolder="decoder_lite")
+
+prior = StableCascadePriorPipeline.from_pretrained("stabilityai/stable-cascade-prior", prior=prior_unet)
+decoder = StableCascadeDecoderPipeline.from_pretrained("stabilityai/stable-cascade", decoder=decoder_unet)
+
+prior.enable_model_cpu_offload()
+prior_output = prior(
+    prompt=prompt,
+    height=1024,
+    width=1024,
+    negative_prompt=negative_prompt,
+    guidance_scale=4.0,
+    num_images_per_prompt=1,
+    num_inference_steps=20
+)
+
+decoder.enable_model_cpu_offload()
+decoder_output = decoder(
+    image_embeddings=prior_output.image_embeddings,
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    guidance_scale=0.0,
+    output_type="pil",
+    num_inference_steps=10
+).images[0]
+decoder_output.save("cascade.png")
+```
+
+## Loading original checkpoints with `from_single_file`
+
+Loading the original format checkpoints is supported via the `from_single_file` method of `StableCascadeUNet`.
+
+```python
+import torch
+from diffusers import (
+    StableCascadeDecoderPipeline,
+    StableCascadePriorPipeline,
+    StableCascadeUNet,
+)
+
+prompt = "an image of a shiba inu, donning a spacesuit and helmet"
+negative_prompt = ""
+
+prior_unet = StableCascadeUNet.from_single_file(
+    "https://huggingface.co/stabilityai/stable-cascade/resolve/main/stage_c_bf16.safetensors",
+    torch_dtype=torch.bfloat16
+)
+decoder_unet = StableCascadeUNet.from_single_file(
+    "https://huggingface.co/stabilityai/stable-cascade/blob/main/stage_b_bf16.safetensors",
+    torch_dtype=torch.bfloat16
+)
+
+prior = StableCascadePriorPipeline.from_pretrained("stabilityai/stable-cascade-prior", prior=prior_unet, torch_dtype=torch.bfloat16)
+decoder = StableCascadeDecoderPipeline.from_pretrained("stabilityai/stable-cascade", decoder=decoder_unet, torch_dtype=torch.bfloat16)
+
+prior.enable_model_cpu_offload()
+prior_output = prior(
+    prompt=prompt,
+    height=1024,
+    width=1024,
+    negative_prompt=negative_prompt,
+    guidance_scale=4.0,
+    num_images_per_prompt=1,
+    num_inference_steps=20
+)
+
+decoder.enable_model_cpu_offload()
+decoder_output = decoder(
+    image_embeddings=prior_output.image_embeddings,
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    guidance_scale=0.0,
+    output_type="pil",
+    num_inference_steps=10
+).images[0]
+decoder_output.save("cascade-single-file.png")
+```
+
 ## Uses

 ### Direct Use
@@ -53,7 +194,7 @@ Excluded uses are described below.

 ### Out-of-Scope Use

-The model was not trained to be factual or true representations of people or events, 
+The model was not trained to be factual or true representations of people or events,
 and therefore using the model to generate such content is out-of-scope for the abilities of this model.
 The model should not be used in any way that violates Stability AI's [Acceptable Use Policy](https://stability.ai/use-policy).

@@ -172,3 +172,41 @@ inpaint = StableDiffusionInpaintPipeline(**text2img.components)

 # now you can use text2img(...), img2img(...), inpaint(...) just like the call methods of each respective pipeline
 ```
+
+### Create web demos using `gradio`
+
+The Stable Diffusion pipelines are automatically supported in [Gradio](https://github.com/gradio-app/gradio/), a library that makes creating beautiful and user-friendly machine learning apps on the web a breeze. First, make sure you have Gradio installed:
+
+```
+pip install -U gradio
+```
+
+Then, create a web demo around any Stable Diffusion-based pipeline. For example, you can create an image generation pipeline in a single line of code with Gradio's [`Interface.from_pipeline`](https://www.gradio.app/docs/interface#interface-from-pipeline) function:
+
+```py
+from diffusers import StableDiffusionPipeline
+import gradio as gr
+
+pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+
+gr.Interface.from_pipeline(pipe).launch()
+```
+
+which opens an intuitive drag-and-drop interface in your browser:
+
+![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/gradio-panda.png)
+
+Similarly, you could create a demo for an image-to-image pipeline with:
+
+```py
+from diffusers import StableDiffusionImg2ImgPipeline
+import gradio as gr
+
+
+pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
+
+gr.Interface.from_pipeline(pipe).launch()
+```
+
+By default, the web demo runs on a local server. If you'd like to share it with others, you can generate a temporary public
+link by setting `share=True` in `launch()`. Or, you can host your demo on [Hugging Face Spaces](https://huggingface.co/spaces) for a permanent link.
@@ -0,0 +1,22 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# EDMEulerScheduler
+
+The Karras formulation of the Euler scheduler (Algorithm 2) from the [Elucidating the Design Space of Diffusion-Based Generative Models](https://huggingface.co/papers/2206.00364) paper by Karras et al. This is a fast scheduler which can often generate good outputs in 20-30 steps. The scheduler is based on the original [k-diffusion](https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L51) implementation by [Katherine Crowson](https://github.com/crowsonkb/).
+
+
+## EDMEulerScheduler
+[[autodoc]] EDMEulerScheduler
+
+## EDMEulerSchedulerOutput
+[[autodoc]] schedulers.scheduling_edm_euler.EDMEulerSchedulerOutput
@@ -0,0 +1,24 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# EDMDPMSolverMultistepScheduler
+
+`EDMDPMSolverMultistepScheduler` is a [Karras formulation](https://huggingface.co/papers/2206.00364) of `DPMSolverMultistep`, a multistep scheduler from [DPM-Solver: A Fast ODE Solver for Diffusion Probabilistic Model Sampling in Around 10 Steps](https://huggingface.co/papers/2206.00927) and [DPM-Solver++: Fast Solver for Guided Sampling of Diffusion Probabilistic Models](https://huggingface.co/papers/2211.01095) by Cheng Lu, Yuhao Zhou, Fan Bao, Jianfei Chen, Chongxuan Li, and Jun Zhu.
+
+DPMSolver (and the improved version DPMSolver++) is a fast dedicated high-order solver for diffusion ODEs with convergence order guarantee. Empirically, DPMSolver sampling with only 20 steps can generate high-quality
+samples, and it can generate quite good samples even in 10 steps.
+
+## EDMDPMSolverMultistepScheduler
+[[autodoc]] EDMDPMSolverMultistepScheduler
+
+## SchedulerOutput
+[[autodoc]] schedulers.scheduling_utils.SchedulerOutput
@@ -45,7 +45,7 @@ Make sure to include the token `toy_face` in the prompt and then you can perform
 ```python
 prompt = "toy_face of a hacker with a hoodie"

-lora_scale= 0.9
+lora_scale = 0.9
 image = pipe(
    prompt, num_inference_steps=30, cross_attention_kwargs={"scale": lora_scale}, generator=torch.manual_seed(0)
 ).images[0]
@@ -114,7 +114,7 @@ To return to only using one adapter, use the [`~diffusers.loaders.UNet2DConditio
 pipe.set_adapters("toy")

 prompt = "toy_face of a hacker with a hoodie"
-lora_scale= 0.9
+lora_scale = 0.9
 image = pipe(
    prompt, num_inference_steps=30, cross_attention_kwargs={"scale": lora_scale}, generator=torch.manual_seed(0)
 ).images[0]
@@ -127,11 +127,12 @@ Or to disable all adapters entirely, use the [`~diffusers.loaders.UNet2DConditio
 pipe.disable_lora()

 prompt = "toy_face of a hacker with a hoodie"
-lora_scale= 0.9
 image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
 image
 ```

+![no-lora](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peft_integration/diffusers_peft_lora_inference_20_1.png)
+
 ## Manage active adapters

 You have attached multiple adapters in this tutorial, and if you're feeling a bit lost on what adapters have been attached to the pipeline's components, use the [`~diffusers.loaders.LoraLoaderMixin.get_active_adapters`] method to check the list of active adapters:
@@ -239,5 +239,7 @@ pipeline.to("cuda")
 prompt = "柴犬、カラフルアート"

 image = pipeline(prompt=prompt).images[0]
+```

-```
+> [!TIP]
+> When using `trust_remote_code=True`, it is also strongly encouraged to pass a commit hash as a `revision` to make sure the author of the models did not update the code with some malicious new lines (unless you fully trust the authors of the models).
@@ -0,0 +1,438 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+[[open-in-colab]]
+
+# Trajectory Consistency Distillation-LoRA
+
+Trajectory Consistency Distillation (TCD) enables a model to generate higher quality and more detailed images with fewer steps. Moreover, owing to the effective error mitigation during the distillation process, TCD demonstrates superior performance even with a large number of inference steps.
+
+The major advantages of TCD are:
+
+- Better than Teacher: TCD demonstrates superior generative quality at both small and large inference steps and exceeds the performance of [DPM-Solver++(2S)](../../api/schedulers/multistep_dpm_solver) with Stable Diffusion XL (SDXL). There is no additional discriminator or LPIPS supervision included during TCD training.
+
+- Flexible Inference Steps: The inference steps for TCD sampling can be freely adjusted without adversely affecting the image quality.
+
+- Freely change detail level: During inference, the level of detail in the image can be adjusted with a single hyperparameter, *gamma*.
+
+> [!TIP]
+> For more technical details of TCD, please refer to the [paper](https://arxiv.org/abs/2402.19159) or official [project page](https://mhh0318.github.io/tcd/).
+
+For large models like SDXL, TCD is trained with [LoRA](https://huggingface.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora) to reduce memory usage. This is also useful because you can reuse LoRAs between different finetuned models, as long as they share the same base model, without further training.
+
+
+
+This guide will show you how to perform inference with TCD-LoRAs for a variety of tasks like text-to-image and inpainting, as well as how you can easily combine TCD-LoRAs with other adapters. Choose one of the supported base models and its corresponding TCD-LoRA checkpoint from the table below to get started.
+
+| Base model                                                                                      | TCD-LoRA checkpoint                                            |
+|-------------------------------------------------------------------------------------------------|----------------------------------------------------------------|
+| [stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5)                  | [TCD-SD15](https://huggingface.co/h1t/TCD-SD15-LoRA)           |
+| [stable-diffusion-2-1-base](https://huggingface.co/stabilityai/stable-diffusion-2-1-base)       | [TCD-SD21-base](https://huggingface.co/h1t/TCD-SD21-base-LoRA) |
+| [stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) | [TCD-SDXL](https://huggingface.co/h1t/TCD-SDXL-LoRA)           |
+
+
+Make sure you have [PEFT](https://github.com/huggingface/peft) installed for better LoRA support.
+
+```bash
+pip install -U peft
+```
+
+## General tasks
+
+In this guide, let's use the [`StableDiffusionXLPipeline`] and the [`TCDScheduler`]. Use the [`~StableDiffusionPipeline.load_lora_weights`] method to load the SDXL-compatible TCD-LoRA weights.
+
+A few tips to keep in mind for TCD-LoRA inference are to:
+
+- Keep the `num_inference_steps` between 4 and 50
+- Set `eta` (used to control stochasticity at each step) between 0 and 1. You should use a higher `eta` when increasing the number of inference steps, but the downside is that a larger `eta` in [`TCDScheduler`] leads to blurrier images. A value of 0.3 is recommended to produce good results.
+
+<hfoptions id="tasks">
+<hfoption id="text-to-image">
+
+```python
+import torch
+from diffusers import StableDiffusionXLPipeline, TCDScheduler
+
+device = "cuda"
+base_model_id = "stabilityai/stable-diffusion-xl-base-1.0"
+tcd_lora_id = "h1t/TCD-SDXL-LoRA"
+
+pipe = StableDiffusionXLPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16, variant="fp16").to(device)
+pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
+
+pipe.load_lora_weights(tcd_lora_id)
+pipe.fuse_lora()
+
+prompt = "Painting of the orange cat Otto von Garfield, Count of Bismarck-Schönhausen, Duke of Lauenburg, Minister-President of Prussia. Depicted wearing a Prussian Pickelhaube and eating his favorite meal - lasagna."
+
+image = pipe(
+    prompt=prompt,
+    num_inference_steps=4,
+    guidance_scale=0,
+    eta=0.3, 
+    generator=torch.Generator(device=device).manual_seed(0),
+).images[0]
+```
+
+![](https://github.com/jabir-zheng/TCD/raw/main/assets/demo_image.png)
+
+</hfoption>
+
+<hfoption id="inpainting">
+
+```python
+import torch
+from diffusers import AutoPipelineForInpainting, TCDScheduler
+from diffusers.utils import load_image, make_image_grid
+
+device = "cuda"
+base_model_id = "diffusers/stable-diffusion-xl-1.0-inpainting-0.1"
+tcd_lora_id = "h1t/TCD-SDXL-LoRA"
+
+pipe = AutoPipelineForInpainting.from_pretrained(base_model_id, torch_dtype=torch.float16, variant="fp16").to(device)
+pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
+
+pipe.load_lora_weights(tcd_lora_id)
+pipe.fuse_lora()
+
+img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+
+init_image = load_image(img_url).resize((1024, 1024))
+mask_image = load_image(mask_url).resize((1024, 1024))
+
+prompt = "a tiger sitting on a park bench"
+
+image = pipe(
+  prompt=prompt,
+  image=init_image,
+  mask_image=mask_image,
+  num_inference_steps=8,
+  guidance_scale=0,
+  eta=0.3,
+  strength=0.99,  # make sure to use `strength` below 1.0
+  generator=torch.Generator(device=device).manual_seed(0),
+).images[0]
+
+grid_image = make_image_grid([init_image, mask_image, image], rows=1, cols=3)
+```
+
+![](https://github.com/jabir-zheng/TCD/raw/main/assets/inpainting_tcd.png)
+
+
+</hfoption>
+</hfoptions>
+
+## Community models
+
+TCD-LoRA also works with many community finetuned models and plugins. For example, load the [animagine-xl-3.0](https://huggingface.co/cagliostrolab/animagine-xl-3.0) checkpoint which is a community finetuned version of SDXL for generating anime images.
+
+```python
+import torch
+from diffusers import StableDiffusionXLPipeline, TCDScheduler
+
+device = "cuda"
+base_model_id = "cagliostrolab/animagine-xl-3.0"
+tcd_lora_id = "h1t/TCD-SDXL-LoRA"
+
+pipe = StableDiffusionXLPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16, variant="fp16").to(device)
+pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
+
+pipe.load_lora_weights(tcd_lora_id)
+pipe.fuse_lora()
+
+prompt = "A man, clad in a meticulously tailored military uniform, stands with unwavering resolve. The uniform boasts intricate details, and his eyes gleam with determination. Strands of vibrant, windswept hair peek out from beneath the brim of his cap."
+
+image = pipe(
+    prompt=prompt,
+    num_inference_steps=8,
+    guidance_scale=0,
+    eta=0.3, 
+    generator=torch.Generator(device=device).manual_seed(0),
+).images[0]
+```
+
+![](https://github.com/jabir-zheng/TCD/raw/main/assets/animagine_xl.png)
+
+TCD-LoRA also supports other LoRAs trained on different styles. For example, let's load the [TheLastBen/Papercut_SDXL](https://huggingface.co/TheLastBen/Papercut_SDXL) LoRA and combine it with the TCD-LoRA using the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] method.
+
+> [!TIP]
+> Check out the [Merge LoRAs](merge_loras) guide to learn more about efficient merging methods.
+
+```python
+import torch
+from diffusers import StableDiffusionXLPipeline
+from diffusers import TCDScheduler
+
+device = "cuda"
+base_model_id = "stabilityai/stable-diffusion-xl-base-1.0"
+tcd_lora_id = "h1t/TCD-SDXL-LoRA"
+styled_lora_id = "TheLastBen/Papercut_SDXL"
+
+pipe = StableDiffusionXLPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16, variant="fp16").to(device)
+pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
+
+pipe.load_lora_weights(tcd_lora_id, adapter_name="tcd")
+pipe.load_lora_weights(styled_lora_id, adapter_name="style")
+pipe.set_adapters(["tcd", "style"], adapter_weights=[1.0, 1.0])
+
+prompt = "papercut of a winter mountain, snow"
+
+image = pipe(
+    prompt=prompt,
+    num_inference_steps=4,
+    guidance_scale=0,
+    eta=0.3, 
+    generator=torch.Generator(device=device).manual_seed(0),
+).images[0]
+```
+
+![](https://github.com/jabir-zheng/TCD/raw/main/assets/styled_lora.png)
+
+
+## Adapters
+
+TCD-LoRA is very versatile, and it can be combined with other adapter types like ControlNets, IP-Adapter, and AnimateDiff.
+
+<hfoptions id="adapters">
+<hfoption id="ControlNet">
+
+### Depth ControlNet
+
+```python
+import torch
+import numpy as np
+from PIL import Image
+from transformers import DPTFeatureExtractor, DPTForDepthEstimation
+from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline
+from diffusers.utils import load_image, make_image_grid
+from diffusers import TCDScheduler
+
+device = "cuda"
+depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to(device)
+feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-hybrid-midas")
+
+def get_depth_map(image):
+    image = feature_extractor(images=image, return_tensors="pt").pixel_values.to(device)
+    with torch.no_grad(), torch.autocast(device):
+        depth_map = depth_estimator(image).predicted_depth
+
+    depth_map = torch.nn.functional.interpolate(
+        depth_map.unsqueeze(1),
+        size=(1024, 1024),
+        mode="bicubic",
+        align_corners=False,
+    )
+    depth_min = torch.amin(depth_map, dim=[1, 2, 3], keepdim=True)
+    depth_max = torch.amax(depth_map, dim=[1, 2, 3], keepdim=True)
+    depth_map = (depth_map - depth_min) / (depth_max - depth_min)
+    image = torch.cat([depth_map] * 3, dim=1)
+
+    image = image.permute(0, 2, 3, 1).cpu().numpy()[0]
+    image = Image.fromarray((image * 255.0).clip(0, 255).astype(np.uint8))
+    return image
+
+base_model_id = "stabilityai/stable-diffusion-xl-base-1.0"
+controlnet_id = "diffusers/controlnet-depth-sdxl-1.0"
+tcd_lora_id = "h1t/TCD-SDXL-LoRA"
+
+controlnet = ControlNetModel.from_pretrained(
+    controlnet_id,
+    torch_dtype=torch.float16,
+    variant="fp16",
+).to(device)
+pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
+    base_model_id,
+    controlnet=controlnet,
+    torch_dtype=torch.float16,
+    variant="fp16",
+).to(device)
+pipe.enable_model_cpu_offload()
+
+pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
+
+pipe.load_lora_weights(tcd_lora_id)
+pipe.fuse_lora()
+
+prompt = "stormtrooper lecture, photorealistic"
+
+image = load_image("https://huggingface.co/lllyasviel/sd-controlnet-depth/resolve/main/images/stormtrooper.png")
+depth_image = get_depth_map(image)
+
+controlnet_conditioning_scale = 0.5  # recommended for good generalization
+
+image = pipe(
+    prompt, 
+    image=depth_image, 
+    num_inference_steps=4, 
+    guidance_scale=0,
+    eta=0.3,
+    controlnet_conditioning_scale=controlnet_conditioning_scale,
+    generator=torch.Generator(device=device).manual_seed(0),
+).images[0]
+
+grid_image = make_image_grid([depth_image, image], rows=1, cols=2)
+```
+
+![](https://github.com/jabir-zheng/TCD/raw/main/assets/controlnet_depth_tcd.png)
+
+### Canny ControlNet
+```python
+import torch
+from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline
+from diffusers.utils import load_image, make_image_grid
+from diffusers import TCDScheduler
+
+device = "cuda"
+base_model_id = "stabilityai/stable-diffusion-xl-base-1.0"
+controlnet_id = "diffusers/controlnet-canny-sdxl-1.0"
+tcd_lora_id = "h1t/TCD-SDXL-LoRA"
+
+controlnet = ControlNetModel.from_pretrained(
+    controlnet_id,
+    torch_dtype=torch.float16,
+    variant="fp16",
+).to(device)
+pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
+    base_model_id,
+    controlnet=controlnet,
+    torch_dtype=torch.float16,
+    variant="fp16",
+).to(device)
+pipe.enable_model_cpu_offload()
+
+pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
+
+pipe.load_lora_weights(tcd_lora_id)
+pipe.fuse_lora()
+
+prompt = "ultrarealistic shot of a furry blue bird"
+
+canny_image = load_image("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png")
+
+controlnet_conditioning_scale = 0.5  # recommended for good generalization
+
+image = pipe(
+    prompt, 
+    image=canny_image, 
+    num_inference_steps=4, 
+    guidance_scale=0,
+    eta=0.3,
+    controlnet_conditioning_scale=controlnet_conditioning_scale,
+    generator=torch.Generator(device=device).manual_seed(0),
+).images[0]
+
+grid_image = make_image_grid([canny_image, image], rows=1, cols=2)
+```
+![](https://github.com/jabir-zheng/TCD/raw/main/assets/controlnet_canny_tcd.png)
+
+<Tip>
+The inference parameters in this example might not work for all examples, so we recommend trying different values for the `num_inference_steps`, `guidance_scale`, `controlnet_conditioning_scale`, and `cross_attention_kwargs` parameters and choosing the best combination.
+</Tip>
+
+</hfoption>
+<hfoption id="IP-Adapter">
+
+This example shows how to use the TCD-LoRA with the [IP-Adapter](https://github.com/tencent-ailab/IP-Adapter/tree/main) and SDXL.
+
+```python
+import torch
+from diffusers import StableDiffusionXLPipeline
+from diffusers.utils import load_image, make_image_grid
+
+from ip_adapter import IPAdapterXL
+from diffusers import TCDScheduler
+
+device = "cuda"
+base_model_path = "stabilityai/stable-diffusion-xl-base-1.0"
+image_encoder_path = "sdxl_models/image_encoder"
+ip_ckpt = "sdxl_models/ip-adapter_sdxl.bin"
+tcd_lora_id = "h1t/TCD-SDXL-LoRA"
+
+pipe = StableDiffusionXLPipeline.from_pretrained(
+    base_model_path, 
+    torch_dtype=torch.float16, 
+    variant="fp16"
+)
+pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
+
+pipe.load_lora_weights(tcd_lora_id)
+pipe.fuse_lora()
+
+ip_model = IPAdapterXL(pipe, image_encoder_path, ip_ckpt, device)
+
+ref_image = load_image("https://raw.githubusercontent.com/tencent-ailab/IP-Adapter/main/assets/images/woman.png").resize((512, 512))
+
+prompt = "best quality, high quality, wearing sunglasses"
+
+image = ip_model.generate(
+    pil_image=ref_image, 
+    prompt=prompt,
+    scale=0.5,
+    num_samples=1, 
+    num_inference_steps=4, 
+    guidance_scale=0,
+    eta=0.3, 
+    seed=0,
+)[0]
+
+grid_image = make_image_grid([ref_image, image], rows=1, cols=2)
+```
+
+![](https://github.com/jabir-zheng/TCD/raw/main/assets/ip_adapter.png)
+
+
+
+</hfoption>
+<hfoption id="AnimateDiff">
+
+[`AnimateDiffPipeline`] allows you to animate images with Stable Diffusion models. TCD-LoRA can substantially accelerate the process without degrading image quality, and animations generated with TCD-LoRA and AnimateDiff come out clearer.
+
+```python
+import torch
+from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler
+from diffusers import TCDScheduler
+from diffusers.utils import export_to_gif
+
+adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5")
+pipe = AnimateDiffPipeline.from_pretrained(
+    "frankjoshua/toonyou_beta6",
+    motion_adapter=adapter,
+).to("cuda")
+
+# set TCDScheduler
+pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
+
+# load TCD LoRA
+pipe.load_lora_weights("h1t/TCD-SD15-LoRA", adapter_name="tcd")
+pipe.load_lora_weights("guoyww/animatediff-motion-lora-zoom-in", weight_name="diffusion_pytorch_model.safetensors", adapter_name="motion-lora")
+
+pipe.set_adapters(["tcd", "motion-lora"], adapter_weights=[1.0, 1.2])
+
+prompt = "best quality, masterpiece, 1girl, looking at viewer, blurry background, upper body, contemporary, dress"
+generator = torch.manual_seed(0)
+frames = pipe(
+    prompt=prompt,
+    num_inference_steps=5,
+    guidance_scale=0,
+    cross_attention_kwargs={"scale": 1},
+    num_frames=24,
+    eta=0.3,
+    generator=generator
+).frames[0]
+export_to_gif(frames, "animation.gif")
+```
+
+![](https://github.com/jabir-zheng/TCD/raw/main/assets/animation_example.gif)
+
+</hfoption>
+</hfoptions>
@@ -60,6 +60,23 @@ repo_id = "runwayml/stable-diffusion-v1-5"
 pipe = StableDiffusionImg2ImgPipeline.from_pretrained(repo_id)
 ```

+You can use the Space below to gauge the memory requirements of a pipeline before you load it, without downloading the pipeline checkpoints:
+
+<div class="block dark:hidden">
+	<iframe 
+        src="https://diffusers-compute-pipeline-size.hf.space?__theme=light"
+        width="850"
+        height="1600"
+    ></iframe>
+</div>
+<div class="hidden dark:block">
+    <iframe 
+        src="https://diffusers-compute-pipeline-size.hf.space?__theme=dark"
+        width="850"
+        height="1600"
+    ></iframe>
+</div>
+
 ### Local pipeline

 To load a diffusion pipeline locally, use [`git-lfs`](https://git-lfs.github.com/) to manually download the checkpoint (in this case, [`runwayml/stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5)) to your local disk. This creates a local folder, `./stable-diffusion-v1-5`, on your disk:
@@ -21,7 +21,7 @@ This guide will show you how to use SVD to generate short videos from images.
 Before you begin, make sure you have the following libraries installed:

 ```py
-!pip install -q -U diffusers transformers accelerate 
+!pip install -q -U diffusers transformers accelerate
 ```

 The are two variants of this model, [SVD](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid) and [SVD-XT](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt). The SVD checkpoint is trained to generate 14 frames and the SVD-XT checkpoint is further finetuned to generate 25 frames.
@@ -86,7 +86,7 @@ Video generation is very memory intensive because you're essentially generating
 + frames = pipe(image, decode_chunk_size=2, generator=generator, num_frames=25).frames[0]
 ```

-Using all these tricks togethere should lower the memory requirement to less than 8GB VRAM.
+Using all these tricks together should lower the memory requirement to less than 8GB VRAM.

 ## Micro-conditioning

@@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.

 # 메모리와 속도

-메모리 또는 속도에 대해 🤗 Diffusers *추론*을 최적화하기 위한 몇 가지 기술과 아이디어를 제시합니다. 
+메모리 또는 속도에 대해 🤗 Diffusers *추론*을 최적화하기 위한 몇 가지 기술과 아이디어를 제시합니다.
 일반적으로, memory-efficient attention을 위해 [xFormers](https://github.com/facebookresearch/xformers) 사용을 추천하기 때문에, 추천하는 [설치 방법](xformers)을 보고 설치해 보세요.

 다음 설정이 성능과 메모리에 미치는 영향에 대해 설명합니다.
@@ -27,7 +27,7 @@ specific language governing permissions and limitations under the License.
 | memory-efficient attention | 2.63s  | x3.61   |

 <em>
-   NVIDIA TITAN RTX에서 50 DDIM 스텝의 "a photo of an astronaut riding a horse on mars" 프롬프트로 512x512 크기의 단일 이미지를 생성하였습니다. 
+   NVIDIA TITAN RTX에서 50 DDIM 스텝의 "a photo of an astronaut riding a horse on mars" 프롬프트로 512x512 크기의 단일 이미지를 생성하였습니다.
 </em>

 ## cuDNN auto-tuner 활성화하기
@@ -44,11 +44,11 @@ torch.backends.cudnn.benchmark = True

 ### fp32 대신 tf32 사용하기  (Ampere 및 이후 CUDA 장치들에서)

-Ampere 및 이후 CUDA 장치에서 행렬곱 및 컨볼루션은 TensorFloat32(TF32) 모드를 사용하여 더 빠르지만 약간 덜 정확할 수 있습니다. 
-기본적으로 PyTorch는 컨볼루션에 대해 TF32 모드를 활성화하지만 행렬 곱셈은 활성화하지 않습니다. 
-네트워크에 완전한 float32 정밀도가 필요한 경우가 아니면 행렬 곱셈에 대해서도 이 설정을 활성화하는 것이 좋습니다. 
-이는 일반적으로 무시할 수 있는 수치의 정확도 손실이 있지만, 계산 속도를 크게 높일 수 있습니다. 
-그것에 대해 [여기](https://huggingface.co/docs/transformers/v4.18.0/en/performance#tf32)서 더 읽을 수 있습니다. 
+Ampere 및 이후 CUDA 장치에서 행렬곱 및 컨볼루션은 TensorFloat32(TF32) 모드를 사용하여 더 빠르지만 약간 덜 정확할 수 있습니다.
+기본적으로 PyTorch는 컨볼루션에 대해 TF32 모드를 활성화하지만 행렬 곱셈은 활성화하지 않습니다.
+네트워크에 완전한 float32 정밀도가 필요한 경우가 아니면 행렬 곱셈에 대해서도 이 설정을 활성화하는 것이 좋습니다.
+이는 일반적으로 무시할 수 있는 수치의 정확도 손실이 있지만, 계산 속도를 크게 높일 수 있습니다.
+그것에 대해 [여기](https://huggingface.co/docs/transformers/v4.18.0/en/performance#tf32)서 더 읽을 수 있습니다.
 추론하기 전에 다음을 추가하기만 하면 됩니다:

 ```python
@@ -59,13 +59,13 @@ torch.backends.cuda.matmul.allow_tf32 = True

 ## 반정밀도 가중치

-더 많은 GPU 메모리를 절약하고 더 빠른 속도를 얻기 위해 모델 가중치를 반정밀도(half precision)로 직접 불러오고 실행할 수 있습니다. 
+더 많은 GPU 메모리를 절약하고 더 빠른 속도를 얻기 위해 모델 가중치를 반정밀도(half precision)로 직접 불러오고 실행할 수 있습니다.
 여기에는 `fp16`이라는 브랜치에 저장된 float16 버전의 가중치를 불러오고, 그 때 `float16` 유형을 사용하도록 PyTorch에 지시하는 작업이 포함됩니다.

 ```Python
 pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
-    
+
    torch_dtype=torch.float16,
 )
 pipe = pipe.to("cuda")
@@ -75,7 +75,7 @@ image = pipe(prompt).images[0]
 ```

 <Tip warning={true}>
-  어떤 파이프라인에서도 [`torch.autocast`](https://pytorch.org/docs/stable/amp.html#torch.autocast) 를 사용하는 것은 검은색 이미지를 생성할 수 있고, 순수한 float16 정밀도를 사용하는 것보다 항상 느리기 때문에 사용하지 않는 것이 좋습니다. 
+  어떤 파이프라인에서도 [`torch.autocast`](https://pytorch.org/docs/stable/amp.html#torch.autocast) 를 사용하는 것은 검은색 이미지를 생성할 수 있고, 순수한 float16 정밀도를 사용하는 것보다 항상 느리기 때문에 사용하지 않는 것이 좋습니다.
 </Tip>

 ## 추가 메모리 절약을 위한 슬라이스 어텐션
@@ -95,7 +95,7 @@ from diffusers import StableDiffusionPipeline

 pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
-    
+
    torch_dtype=torch.float16,
 )
 pipe = pipe.to("cuda")
@@ -122,7 +122,7 @@ from diffusers import StableDiffusionPipeline

 pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
-    
+
    torch_dtype=torch.float16,
 )
 pipe = pipe.to("cuda")
@@ -148,7 +148,7 @@ from diffusers import StableDiffusionPipeline

 pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
-    
+
    torch_dtype=torch.float16,
 )

@@ -165,7 +165,7 @@ image = pipe(prompt).images[0]
 또 다른 최적화 방법인 <a href="#model_offloading">모델 오프로딩</a>을 사용하는 것을 고려하십시오. 이는 훨씬 빠르지만 메모리 절약이 크지는 않습니다.
 </Tip>

-또한 ttention slicing과 연결해서 최소 메모리(< 2GB)로도 동작할 수 있습니다. 
+또한 attention slicing과 연결해서 최소 메모리(< 2GB)로도 동작할 수 있습니다.


 ```Python
@@ -174,7 +174,7 @@ from diffusers import StableDiffusionPipeline

 pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
-    
+
    torch_dtype=torch.float16,
 )

@@ -204,7 +204,7 @@ import torch
 from diffusers import StableDiffusionPipeline

 pipe = StableDiffusionPipeline.from_pretrained(
-    "runwayml/stable-diffusion-v1-5",  
+    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
 )

@@ -355,7 +355,7 @@ unet_traced = torch.jit.load("unet_traced.pt")
 class TracedUNet(torch.nn.Module):
    def __init__(self):
        super().__init__()
-        self.in_channels = pipe.unet.in_channels
+        self.in_channels = pipe.unet.config.in_channels
        self.device = pipe.unet.device

    def forward(self, latent_model_input, t, encoder_hidden_states):
@@ -387,7 +387,7 @@ with torch.inference_mode():
 | A100-SXM4-40GB    	| 18.6it/s            	| 29.it/s                        	|
 | A100-SXM-80GB    	| 18.7it/s            	| 29.5it/s                        	|

-이를 활용하려면 다음을 만족해야 합니다: 
+이를 활용하려면 다음을 만족해야 합니다:
 - PyTorch > 1.12
 - Cuda 사용 가능
 - [xformers 라이브러리를 설치함](xformers)
@@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License.

 [[open-in-colab]]

-🧨 Diffusers는 사용자 친화적이며 유연한 도구 상자로, 사용사례에 맞게 diffusion 시스템을 구축 할 수 있도록 설계되었습니다. 이 도구 상자의 핵심은 모델과 스케줄러입니다. [`DiffusionPipeline`]은 편의를 위해 이러한 구성 요소를 번들로 제공하지만, 파이프라인을 분리하고 모델과 스케줄러를 개별적으로 사용해 새로운 diffusion 시스템을 만들 수도 있습니다. 
+🧨 Diffusers는 사용자 친화적이며 유연한 도구 상자로, 사용사례에 맞게 diffusion 시스템을 구축 할 수 있도록 설계되었습니다. 이 도구 상자의 핵심은 모델과 스케줄러입니다. [`DiffusionPipeline`]은 편의를 위해 이러한 구성 요소를 번들로 제공하지만, 파이프라인을 분리하고 모델과 스케줄러를 개별적으로 사용해 새로운 diffusion 시스템을 만들 수도 있습니다.

 이 튜토리얼에서는 기본 파이프라인부터 시작해 Stable Diffusion 파이프라인까지 진행하며 모델과 스케줄러를 사용해 추론을 위한 diffusion 시스템을 조립하는 방법을 배웁니다.

@@ -36,7 +36,7 @@ specific language governing permissions and limitations under the License.

 정말 쉽습니다. 그런데 파이프라인은 어떻게 이렇게 할 수 있었을까요? 파이프라인을 세분화하여 내부에서 어떤 일이 일어나고 있는지 살펴보겠습니다.

-위 예시에서 파이프라인에는 [`UNet2DModel`] 모델과 [`DDPMScheduler`]가 포함되어 있습니다. 파이프라인은 원하는 출력 크기의 랜덤 노이즈를 받아 모델을 여러번 통과시켜 이미지의 노이즈를 제거합니다. 각 timestep에서 모델은 *noise residual*을 예측하고 스케줄러는 이를 사용하여 노이즈가 적은 이미지를 예측합니다. 파이프라인은 지정된 추론 스텝수에 도달할 때까지 이 과정을 반복합니다. 
+위 예시에서 파이프라인에는 [`UNet2DModel`] 모델과 [`DDPMScheduler`]가 포함되어 있습니다. 파이프라인은 원하는 출력 크기의 랜덤 노이즈를 받아 모델을 여러번 통과시켜 이미지의 노이즈를 제거합니다. 각 timestep에서 모델은 *noise residual*을 예측하고 스케줄러는 이를 사용하여 노이즈가 적은 이미지를 예측합니다. 파이프라인은 지정된 추론 스텝수에 도달할 때까지 이 과정을 반복합니다.

 모델과 스케줄러를 별도로 사용하여 파이프라인을 다시 생성하기 위해 자체적인 노이즈 제거 프로세스를 작성해 보겠습니다.

@@ -210,7 +210,7 @@ Stable Diffusion 은 text-to-image *latent diffusion* 모델입니다. latent di

 ```py
 >>> latents = torch.randn(
-...     (batch_size, unet.in_channels, height // 8, width // 8),
+...     (batch_size, unet.config.in_channels, height // 8, width // 8),
 ...     generator=generator,
 ...     device=torch_device,
 ... )
@@ -259,6 +259,50 @@ pip install git+https://github.com/huggingface/peft.git
 **Inference** 
 The inference is the same as if you train a regular LoRA 🤗

+## Conducting EDM-style training
+
+It's now possible to perform EDM-style training as proposed in [Elucidating the Design Space of Diffusion-Based Generative Models](https://arxiv.org/abs/2206.00364). 
+
+Simply set:
+
+```diff
+  --do_edm_style_training \
+```
+
+Other SDXL-like models that use the EDM formulation, such as [playgroundai/playground-v2.5-1024px-aesthetic](https://huggingface.co/playgroundai/playground-v2.5-1024px-aesthetic), can also be DreamBooth'd with the script. Below is an example command:
+
+```bash
+accelerate launch train_dreambooth_lora_sdxl_advanced.py \
+  --pretrained_model_name_or_path="playgroundai/playground-v2.5-1024px-aesthetic"  \
+  --dataset_name="linoyts/3d_icon" \
+  --instance_prompt="3d icon in the style of TOK" \
+  --validation_prompt="a TOK icon of an astronaut riding a horse, in the style of TOK" \
+  --output_dir="3d-icon-SDXL-LoRA" \
+  --do_edm_style_training \
+  --caption_column="prompt" \
+  --mixed_precision="bf16" \
+  --resolution=1024 \
+  --train_batch_size=3 \
+  --repeats=1 \
+  --report_to="wandb"\
+  --gradient_accumulation_steps=1 \
+  --gradient_checkpointing \
+  --learning_rate=1.0 \
+  --text_encoder_lr=1.0 \
+  --optimizer="prodigy"\
+  --train_text_encoder_ti\
+  --train_text_encoder_ti_frac=0.5\
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --rank=8 \
+  --max_train_steps=1000 \
+  --checkpointing_steps=2000 \
+  --seed="0" \
+  --push_to_hub
+```
+
+> [!CAUTION]
+> Min-SNR gamma is not supported with the EDM-style training yet. When training with the PlaygroundAI model, it's recommended to not pass any "variant".

 ### Tips and Tricks
 Check out [these recommended practices](https://huggingface.co/blog/sdxl_lora_advanced_script#additional-good-practices)
@@ -70,7 +70,7 @@ from diffusers.utils.import_utils import is_xformers_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)

@@ -1215,7 +1215,7 @@ def main(args):

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
-                logger.warn(
+                logger.warning(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, "
                    "please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
@@ -1366,14 +1366,14 @@ def main(args):

    # Optimizer creation
    if not (args.optimizer.lower() == "prodigy" or args.optimizer.lower() == "adamw"):
-        logger.warn(
+        logger.warning(
            f"Unsupported choice of optimizer: {args.optimizer}.Supported optimizers include [adamW, prodigy]."
            "Defaulting to adamW"
        )
        args.optimizer = "adamw"

    if args.use_8bit_adam and not args.optimizer.lower() == "adamw":
-        logger.warn(
+        logger.warning(
            f"use_8bit_adam is ignored when optimizer is not set to 'AdamW'. Optimizer was "
            f"set to {args.optimizer.lower()}"
        )
@@ -1407,11 +1407,11 @@ def main(args):
        optimizer_class = prodigyopt.Prodigy

        if args.learning_rate <= 0.1:
-            logger.warn(
+            logger.warning(
                "Learning rate is too low. When using prodigy, it's generally better to set learning rate around 1.0"
            )
        if args.train_text_encoder and args.text_encoder_lr:
-            logger.warn(
+            logger.warning(
                f"Learning rates were provided both for the unet and the text encoder- e.g. text_encoder_lr:"
                f" {args.text_encoder_lr} and learning_rate: {args.learning_rate}. "
                f"When using prodigy only learning_rate is used as the initial learning rate."
@@ -14,9 +14,11 @@
 # See the License for the specific language governing permissions and

 import argparse
+import contextlib
 import gc
 import hashlib
 import itertools
+import json
 import logging
 import math
 import os
@@ -37,7 +39,7 @@ import transformers
 from accelerate import Accelerator
 from accelerate.logging import get_logger
 from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
-from huggingface_hub import create_repo, upload_folder
+from huggingface_hub import create_repo, hf_hub_download, upload_folder
 from packaging import version
 from peft import LoraConfig, set_peft_model_state_dict
 from peft.utils import get_peft_model_state_dict
@@ -55,6 +57,8 @@ from diffusers import (
    AutoencoderKL,
    DDPMScheduler,
    DPMSolverMultistepScheduler,
+    EDMEulerScheduler,
+    EulerDiscreteScheduler,
    StableDiffusionXLPipeline,
    UNet2DConditionModel,
 )
@@ -74,11 +78,25 @@ from diffusers.utils.torch_utils import is_compiled_module


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)


+def determine_scheduler_type(pretrained_model_name_or_path, revision):
+    model_index_filename = "model_index.json"
+    if os.path.isdir(pretrained_model_name_or_path):
+        model_index = os.path.join(pretrained_model_name_or_path, model_index_filename)
+    else:
+        model_index = hf_hub_download(
+            repo_id=pretrained_model_name_or_path, filename=model_index_filename, revision=revision
+        )
+
+    with open(model_index, "r") as f:
+        scheduler_type = json.load(f)["scheduler"][1]
+    return scheduler_type
+
+
 def save_model_card(
    repo_id: str,
    use_dora: bool,
@@ -370,6 +388,11 @@ def parse_args(input_args=None):
            " `args.validation_prompt` multiple times: `args.num_validation_images`."
        ),
    )
+    parser.add_argument(
+        "--do_edm_style_training",
+        action="store_true",
+        help="Flag to conduct training using the EDM formulation as introduced in https://arxiv.org/abs/2206.00364.",
+    )
    parser.add_argument(
        "--with_prior_preservation",
        default=False,
@@ -1117,6 +1140,8 @@ def main(args):
            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
            " Please use `huggingface-cli login` to authenticate with the Hub."
        )
+    if args.do_edm_style_training and args.snr_gamma is not None:
+        raise ValueError("Min-SNR formulation is not supported when conducting EDM-style training.")

    logging_dir = Path(args.output_dir, args.logging_dir)

@@ -1234,7 +1259,19 @@ def main(args):
    )

    # Load scheduler and models
-    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    scheduler_type = determine_scheduler_type(args.pretrained_model_name_or_path, args.revision)
+    if "EDM" in scheduler_type:
+        args.do_edm_style_training = True
+        noise_scheduler = EDMEulerScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+        logger.info("Performing EDM-style training!")
+    elif args.do_edm_style_training:
+        noise_scheduler = EulerDiscreteScheduler.from_pretrained(
+            args.pretrained_model_name_or_path, subfolder="scheduler"
+        )
+        logger.info("Performing EDM-style training!")
+    else:
+        noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+
    text_encoder_one = text_encoder_cls_one.from_pretrained(
        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
    )
@@ -1252,7 +1289,12 @@ def main(args):
        revision=args.revision,
        variant=args.variant,
    )
-    vae_scaling_factor = vae.config.scaling_factor
+    latents_mean = latents_std = None
+    if hasattr(vae.config, "latents_mean") and vae.config.latents_mean is not None:
+        latents_mean = torch.tensor(vae.config.latents_mean).view(1, 4, 1, 1)
+    if hasattr(vae.config, "latents_std") and vae.config.latents_std is not None:
+        latents_std = torch.tensor(vae.config.latents_std).view(1, 4, 1, 1)
+
    unet = UNet2DConditionModel.from_pretrained(
        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
    )
@@ -1317,7 +1359,7 @@ def main(args):

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
-                logger.warn(
+                logger.warning(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, "
                    "please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
@@ -1522,14 +1564,14 @@ def main(args):

    # Optimizer creation
    if not (args.optimizer.lower() == "prodigy" or args.optimizer.lower() == "adamw"):
-        logger.warn(
+        logger.warning(
            f"Unsupported choice of optimizer: {args.optimizer}.Supported optimizers include [adamW, prodigy]."
            "Defaulting to adamW"
        )
        args.optimizer = "adamw"

    if args.use_8bit_adam and not args.optimizer.lower() == "adamw":
-        logger.warn(
+        logger.warning(
            f"use_8bit_adam is ignored when optimizer is not set to 'AdamW'. Optimizer was "
            f"set to {args.optimizer.lower()}"
        )
@@ -1563,11 +1605,11 @@ def main(args):
        optimizer_class = prodigyopt.Prodigy

        if args.learning_rate <= 0.1:
-            logger.warn(
+            logger.warning(
                "Learning rate is too low. When using prodigy, it's generally better to set learning rate around 1.0"
            )
        if args.train_text_encoder and args.text_encoder_lr:
-            logger.warn(
+            logger.warning(
                f"Learning rates were provided both for the unet and the text encoder- e.g. text_encoder_lr:"
                f" {args.text_encoder_lr} and learning_rate: {args.learning_rate}. "
                f"When using prodigy only learning_rate is used as the initial learning rate."
@@ -1790,6 +1832,19 @@ def main(args):
        disable=not accelerator.is_local_main_process,
    )

+    def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
+        """Return the scheduler sigma for each entry of `timesteps`.
+
+        Each timestep is matched against `noise_scheduler.timesteps` and the sigma
+        at the same index of `noise_scheduler.sigmas` is selected. The result is
+        flattened and then given trailing singleton dimensions until it has
+        `n_dim` dimensions.
+
+        Args:
+            timesteps: Tensor of timestep values taken from `noise_scheduler.timesteps`.
+            n_dim: Number of dimensions the returned tensor should have.
+            dtype: dtype the scheduler sigmas are cast to before indexing.
+        """
+        # TODO: revisit other sampling algorithms
+        sigmas = noise_scheduler.sigmas.to(device=accelerator.device, dtype=dtype)
+        schedule_timesteps = noise_scheduler.timesteps.to(accelerator.device)
+        timesteps = timesteps.to(accelerator.device)
+
+        # `.item()` only succeeds when a timestep matches exactly one schedule entry,
+        # so every value in `timesteps` must occur once in `schedule_timesteps`.
+        step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
+
+        sigma = sigmas[step_indices].flatten()
+        # Append singleton dims until the tensor has `n_dim` dimensions.
+        while len(sigma.shape) < n_dim:
+            sigma = sigma.unsqueeze(-1)
+        return sigma
+
    if args.train_text_encoder:
        num_train_epochs_text_encoder = int(args.train_text_encoder_frac * args.num_train_epochs)
    elif args.train_text_encoder_ti:  # args.train_text_encoder_ti
@@ -1841,9 +1896,15 @@ def main(args):
                    pixel_values = batch["pixel_values"].to(dtype=vae.dtype)
                    model_input = vae.encode(pixel_values).latent_dist.sample()

-                model_input = model_input * vae_scaling_factor
-                if args.pretrained_vae_model_name_or_path is None:
-                    model_input = model_input.to(weight_dtype)
+                if latents_mean is None and latents_std is None:
+                    model_input = model_input * vae.config.scaling_factor
+                    if args.pretrained_vae_model_name_or_path is None:
+                        model_input = model_input.to(weight_dtype)
+                else:
+                    latents_mean = latents_mean.to(device=model_input.device, dtype=model_input.dtype)
+                    latents_std = latents_std.to(device=model_input.device, dtype=model_input.dtype)
+                    model_input = (model_input - latents_mean) * vae.config.scaling_factor / latents_std
+                    model_input = model_input.to(dtype=weight_dtype)

                # Sample noise that we'll add to the latents
                noise = torch.randn_like(model_input)
@@ -1854,15 +1915,32 @@ def main(args):
                    )

                bsz = model_input.shape[0]
+
                # Sample a random timestep for each image
-                timesteps = torch.randint(
-                    0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device
-                )
-                timesteps = timesteps.long()
+                if not args.do_edm_style_training:
+                    timesteps = torch.randint(
+                        0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device
+                    )
+                    timesteps = timesteps.long()
+                else:
+                    # in EDM formulation, the model is conditioned on the pre-conditioned noise levels
+                    # instead of discrete timesteps, so here we sample indices to get the noise levels
+                    # from `scheduler.timesteps`
+                    indices = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,))
+                    timesteps = noise_scheduler.timesteps[indices].to(device=model_input.device)

                # Add noise to the model input according to the noise magnitude at each timestep
                # (this is the forward diffusion process)
                noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps)
+                # For EDM-style training, we first obtain the sigmas based on the continuous timesteps.
+                # We then precondition the final model inputs based on these sigmas instead of the timesteps.
+                # Follow: Section 5 of https://arxiv.org/abs/2206.00364.
+                if args.do_edm_style_training:
+                    sigmas = get_sigmas(timesteps, len(noisy_model_input.shape), noisy_model_input.dtype)
+                    if "EDM" in scheduler_type:
+                        inp_noisy_latents = noise_scheduler.precondition_inputs(noisy_model_input, sigmas)
+                    else:
+                        inp_noisy_latents = noisy_model_input / ((sigmas**2 + 1) ** 0.5)

                # time ids
                add_time_ids = torch.cat(
@@ -1888,7 +1966,7 @@ def main(args):
                    }
                    prompt_embeds_input = prompt_embeds.repeat(elems_to_repeat_text_embeds, 1, 1)
                    model_pred = unet(
-                        noisy_model_input,
+                        inp_noisy_latents if args.do_edm_style_training else noisy_model_input,
                        timesteps,
                        prompt_embeds_input,
                        added_cond_kwargs=unet_added_conditions,
@@ -1906,14 +1984,42 @@ def main(args):
                    )
                    prompt_embeds_input = prompt_embeds.repeat(elems_to_repeat_text_embeds, 1, 1)
                    model_pred = unet(
-                        noisy_model_input, timesteps, prompt_embeds_input, added_cond_kwargs=unet_added_conditions
+                        inp_noisy_latents if args.do_edm_style_training else noisy_model_input,
+                        timesteps,
+                        prompt_embeds_input,
+                        added_cond_kwargs=unet_added_conditions,
                    ).sample

+                weighting = None
+                if args.do_edm_style_training:
+                    # Similar to the input preconditioning, the model predictions are also preconditioned
+                    # on noised model inputs (before preconditioning) and the sigmas.
+                    # Follow: Section 5 of https://arxiv.org/abs/2206.00364.
+                    if "EDM" in scheduler_type:
+                        model_pred = noise_scheduler.precondition_outputs(noisy_model_input, model_pred, sigmas)
+                    else:
+                        if noise_scheduler.config.prediction_type == "epsilon":
+                            model_pred = model_pred * (-sigmas) + noisy_model_input
+                        elif noise_scheduler.config.prediction_type == "v_prediction":
+                            model_pred = model_pred * (-sigmas / (sigmas**2 + 1) ** 0.5) + (
+                                noisy_model_input / (sigmas**2 + 1)
+                            )
+                    # We are not doing weighting here because it tends to result in numerical problems.
+                    # See: https://github.com/huggingface/diffusers/pull/7126#issuecomment-1968523051
+                    # There might be other alternatives for weighting as well:
+                    # https://github.com/huggingface/diffusers/pull/7126#discussion_r1505404686
+                    if "EDM" not in scheduler_type:
+                        weighting = (sigmas**-2.0).float()
+
                # Get the target for loss depending on the prediction type
                if noise_scheduler.config.prediction_type == "epsilon":
-                    target = noise
+                    target = model_input if args.do_edm_style_training else noise
                elif noise_scheduler.config.prediction_type == "v_prediction":
-                    target = noise_scheduler.get_velocity(model_input, noise, timesteps)
+                    target = (
+                        model_input
+                        if args.do_edm_style_training
+                        else noise_scheduler.get_velocity(model_input, noise, timesteps)
+                    )
                else:
                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")

@@ -1923,10 +2029,28 @@ def main(args):
                    target, target_prior = torch.chunk(target, 2, dim=0)

                    # Compute prior loss
-                    prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean")
+                    if weighting is not None:
+                        prior_loss = torch.mean(
+                            (weighting.float() * (model_pred_prior.float() - target_prior.float()) ** 2).reshape(
+                                target_prior.shape[0], -1
+                            ),
+                            1,
+                        )
+                        prior_loss = prior_loss.mean()
+                    else:
+                        prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean")

                if args.snr_gamma is None:
-                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+                    if weighting is not None:
+                        loss = torch.mean(
+                            (weighting.float() * (model_pred.float() - target.float()) ** 2).reshape(
+                                target.shape[0], -1
+                            ),
+                            1,
+                        )
+                        loss = loss.mean()
+                    else:
+                        loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
                else:
                    # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
                    # Since we predict the noise instead of x_0, the original formulation is slightly changed.
@@ -2049,17 +2173,18 @@ def main(args):
                # We train on the simplified learning objective. If we were previously predicting a variance, we need the scheduler to ignore it
                scheduler_args = {}

-                if "variance_type" in pipeline.scheduler.config:
-                    variance_type = pipeline.scheduler.config.variance_type
+                if not args.do_edm_style_training:
+                    if "variance_type" in pipeline.scheduler.config:
+                        variance_type = pipeline.scheduler.config.variance_type

-                    if variance_type in ["learned", "learned_range"]:
-                        variance_type = "fixed_small"
+                        if variance_type in ["learned", "learned_range"]:
+                            variance_type = "fixed_small"

-                    scheduler_args["variance_type"] = variance_type
+                        scheduler_args["variance_type"] = variance_type

-                pipeline.scheduler = DPMSolverMultistepScheduler.from_config(
-                    pipeline.scheduler.config, **scheduler_args
-                )
+                    pipeline.scheduler = DPMSolverMultistepScheduler.from_config(
+                        pipeline.scheduler.config, **scheduler_args
+                    )

                pipeline = pipeline.to(accelerator.device)
                pipeline.set_progress_bar_config(disable=True)
@@ -2067,8 +2192,13 @@ def main(args):
                # run inference
                generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
                pipeline_args = {"prompt": args.validation_prompt}
+                inference_ctx = (
+                    contextlib.nullcontext()
+                    if "playground" in args.pretrained_model_name_or_path
+                    else torch.cuda.amp.autocast()
+                )

-                with torch.cuda.amp.autocast():
+                with inference_ctx:
                    images = [
                        pipeline(**pipeline_args, generator=generator).images[0]
                        for _ in range(args.num_validation_images)
@@ -2144,15 +2274,18 @@ def main(args):
            # We train on the simplified learning objective. If we were previously predicting a variance, we need the scheduler to ignore it
            scheduler_args = {}

-            if "variance_type" in pipeline.scheduler.config:
-                variance_type = pipeline.scheduler.config.variance_type
+            if not args.do_edm_style_training:
+                if "variance_type" in pipeline.scheduler.config:
+                    variance_type = pipeline.scheduler.config.variance_type

-                if variance_type in ["learned", "learned_range"]:
-                    variance_type = "fixed_small"
+                    if variance_type in ["learned", "learned_range"]:
+                        variance_type = "fixed_small"

-                scheduler_args["variance_type"] = variance_type
+                    scheduler_args["variance_type"] = variance_type

-            pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, **scheduler_args)
+                pipeline.scheduler = DPMSolverMultistepScheduler.from_config(
+                    pipeline.scheduler.config, **scheduler_args
+                )

            # load attention processors
            pipeline.load_lora_weights(args.output_dir)
@@ -1,10 +1,12 @@
-# Community Examples
+# Community Pipeline Examples

 > **For more information about community pipelines, please have a look at [this issue](https://github.com/huggingface/diffusers/issues/841).**

-**Community** examples consist of both inference and training examples that have been added by the community.
-Please have a look at the following table to get an overview of all community examples. Click on the **Code Example** to get a copy-and-paste ready code example that you can try out.
-If a community doesn't work as expected, please open an issue and ping the author on it.
+**Community pipeline** examples consist of pipelines that have been added by the community.
+Please have a look at the following tables to get an overview of all community examples. Click on the **Code Example** to get a copy-and-paste ready code example that you can try out.
+If a community pipeline doesn't work as expected, please open an issue and ping the author on it.
+
+Please also check out our [Community Scripts](https://github.com/huggingface/diffusers/blob/main/examples/community/README_community_scripts.md) examples for tips and tricks that you can use with diffusers without having to run a community pipeline.

 | Example                                                                                                                               | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              | Code Example                                                                              | Colab                                                                                                                                                                                                              |                                                        Author |
 |:--------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------:|
@@ -1887,7 +1889,7 @@ In the above code, the `prompt2` is appended to the `prompt`, which is more than

 For more results, checkout [PR #6114](https://github.com/huggingface/diffusers/pull/6114).

-## Example Images Mixing (with CoCa)
+### Example Images Mixing (with CoCa)
 ```python
 import requests
 from io import BytesIO
@@ -2934,7 +2936,7 @@ pipe(prompt =prompt, rp_args = rp_args)

 The Pipeline supports `compel` syntax. Input prompts using the `compel` structure will be automatically applied and processed.

-## Diffusion Posterior Sampling Pipeline
+### Diffusion Posterior Sampling Pipeline
 * Reference paper
    ```
    @article{chung2022diffusion,
@@ -0,0 +1,232 @@
+# Community Scripts
+
+**Community scripts** consist of inference examples using Diffusers pipelines that have been added by the community. 
+Please have a look at the following table to get an overview of all community examples. Click on the **Code Example** to get a copy-and-paste code example that you can try out.
+If a community script doesn't work as expected, please open an issue and ping the author on it.
+
+| Example                                                                                                                               | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              | Code Example                                                                              | Colab                                                                                                                                                                                                              |                                                        Author |
+|:--------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------:|
+| Using IP-Adapter with negative noise                                                                                                  | Using negative noise with IP-adapter to better control the generation (see the [original post](https://github.com/huggingface/diffusers/discussions/7167) on the forum for more details)                                                                                                                                                                                                                                                    | [IP-Adapter Negative Noise](#ip-adapter-negative-noise)                                   | | [Álvaro Somoza](https://github.com/asomoza)|
+| Asymmetric tiling                                                                                                  | Configure seamless image tiling independently for the X and Y axes                                                                                                                                                                                                     | [Asymmetric Tiling](#asymmetric-tiling)                                    | | [alexisrolland](https://github.com/alexisrolland)|
+
+
+## Example usages
+
+### IP Adapter Negative Noise
+
+Diffusers pipelines are fully integrated with IP-Adapter, which allows you to prompt the diffusion model with an image. However, it does not support negative image prompts (there is no `negative_ip_adapter_image` argument) the same way it supports negative text prompts. When you pass an `ip_adapter_image`, it will create a zero-filled tensor as a negative image. This script shows you how to create a negative noise from `ip_adapter_image` and use it to significantly improve the generation quality while preserving the composition of images.
+
+[cubiq](https://github.com/cubiq) initially developed this feature in his [repository](https://github.com/cubiq/ComfyUI_IPAdapter_plus). The community script was contributed by [asomoza](https://github.com/asomoza). You can find more details about this experimentation in [this discussion](https://github.com/huggingface/diffusers/discussions/7167).
+
+IP-Adapter without negative noise
+|source|result|
+|---|---|
+|![20240229150812](https://github.com/huggingface/diffusers/assets/5442875/901d8bd8-7a59-4fe7-bda1-a0e0d6c7dffd)|![20240229163923_normal](https://github.com/huggingface/diffusers/assets/5442875/3432e25a-ece6-45f4-a3f4-fca354f40b5b)|
+
+IP-Adapter with negative noise
+|source|result|
+|---|---|
+|![20240229150812](https://github.com/huggingface/diffusers/assets/5442875/901d8bd8-7a59-4fe7-bda1-a0e0d6c7dffd)|![20240229163923](https://github.com/huggingface/diffusers/assets/5442875/736fd15a-36ba-40c0-a7d8-6ec1ac26f788)|
+
+```python
+import torch
+
+from diffusers import AutoencoderKL, DPMSolverMultistepScheduler, StableDiffusionXLPipeline
+from diffusers.models import ImageProjection
+from diffusers.utils import load_image
+
+
+def encode_image(
+    image_encoder,
+    feature_extractor,
+    image,
+    device,
+    num_images_per_prompt,
+    output_hidden_states=None,
+    negative_image=None,
+):
+    """Encode an image prompt and its negative counterpart with `image_encoder`.
+
+    When `output_hidden_states` is truthy, both results are the second-to-last
+    hidden state of the encoder; the negative is computed from `negative_image`
+    if given, otherwise from an all-zero image. When it is falsy, the positive
+    result is `image_embeds` and the negative is a zero tensor of the same
+    shape (`negative_image` is not used on that path).
+
+    Returns:
+        A `(positive, negative)` tuple, each repeated `num_images_per_prompt`
+        times along dim 0.
+    """
+    # Run in the same dtype as the encoder's weights.
+    dtype = next(image_encoder.parameters()).dtype
+
+    # Non-tensor inputs are preprocessed into pixel values first.
+    if not isinstance(image, torch.Tensor):
+        image = feature_extractor(image, return_tensors="pt").pixel_values
+
+    image = image.to(device=device, dtype=dtype)
+    if output_hidden_states:
+        image_enc_hidden_states = image_encoder(image, output_hidden_states=True).hidden_states[-2]
+        image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+
+        if negative_image is None:
+            # Default negative: encode a zero-filled image of the same shape.
+            uncond_image_enc_hidden_states = image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+        else:
+            # Custom negative: preprocess and encode it exactly like the positive image.
+            if not isinstance(negative_image, torch.Tensor):
+                negative_image = feature_extractor(negative_image, return_tensors="pt").pixel_values
+            negative_image = negative_image.to(device=device, dtype=dtype)
+            uncond_image_enc_hidden_states = image_encoder(negative_image, output_hidden_states=True).hidden_states[-2]
+
+        uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+        return image_enc_hidden_states, uncond_image_enc_hidden_states
+    else:
+        image_embeds = image_encoder(image).image_embeds
+        image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+        # The negative embedding is all zeros here; `negative_image` is ignored.
+        uncond_image_embeds = torch.zeros_like(image_embeds)
+
+        return image_embeds, uncond_image_embeds
+
+
+@torch.no_grad()
+def prepare_ip_adapter_image_embeds(
+    unet,
+    image_encoder,
+    feature_extractor,
+    ip_adapter_image,
+    do_classifier_free_guidance,
+    device,
+    num_images_per_prompt,
+    ip_adapter_negative_image=None,
+):
+    """Build one image-embedding tensor per IP-Adapter loaded in `unet`.
+
+    Each entry of `ip_adapter_image` is paired with one projection layer from
+    `unet.encoder_hid_proj.image_projection_layers` and encoded with
+    `encode_image`. The same `ip_adapter_negative_image` is passed for every
+    adapter. With classifier-free guidance enabled, the negative embeddings are
+    concatenated in front of the positive ones.
+
+    Returns:
+        A list of embedding tensors, one per IP-Adapter.
+
+    Raises:
+        ValueError: If the number of images differs from the number of
+            image projection layers.
+    """
+    # Accept a single image as shorthand for a one-element list.
+    if not isinstance(ip_adapter_image, list):
+        ip_adapter_image = [ip_adapter_image]
+
+    if len(ip_adapter_image) != len(unet.encoder_hid_proj.image_projection_layers):
+        raise ValueError(
+            f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+        )
+
+    image_embeds = []
+    for single_ip_adapter_image, image_proj_layer in zip(
+        ip_adapter_image, unet.encoder_hid_proj.image_projection_layers
+    ):
+        # Hidden states are requested for every projection layer that is not a plain `ImageProjection`.
+        output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+        single_image_embeds, single_negative_image_embeds = encode_image(
+            image_encoder,
+            feature_extractor,
+            single_ip_adapter_image,
+            device,
+            1,
+            output_hidden_state,
+            negative_image=ip_adapter_negative_image,
+        )
+        # Encoding is done once (repeat count 1); copies are stacked along a new leading dim instead.
+        single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
+        single_negative_image_embeds = torch.stack([single_negative_image_embeds] * num_images_per_prompt, dim=0)
+
+        if do_classifier_free_guidance:
+            # Negative embeddings come first, followed by the positive ones.
+            single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
+            single_image_embeds = single_image_embeds.to(device)
+
+        image_embeds.append(single_image_embeds)
+
+    return image_embeds
+
+
+vae = AutoencoderKL.from_pretrained(
+    "madebyollin/sdxl-vae-fp16-fix",
+    torch_dtype=torch.float16,
+).to("cuda")
+
+pipeline = StableDiffusionXLPipeline.from_pretrained(
+    "RunDiffusion/Juggernaut-XL-v9",
+    torch_dtype=torch.float16,
+    vae=vae,
+    variant="fp16",
+).to("cuda")
+
+pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
+pipeline.scheduler.config.use_karras_sigmas = True
+
+pipeline.load_ip_adapter(
+    "h94/IP-Adapter",
+    subfolder="sdxl_models",
+    weight_name="ip-adapter-plus_sdxl_vit-h.safetensors",
+    image_encoder_folder="models/image_encoder",
+)
+pipeline.set_ip_adapter_scale(0.7)
+
+ip_image = load_image("source.png")
+negative_ip_image = load_image("noise.png")
+
+image_embeds = prepare_ip_adapter_image_embeds(
+    unet=pipeline.unet,
+    image_encoder=pipeline.image_encoder,
+    feature_extractor=pipeline.feature_extractor,
+    ip_adapter_image=[[ip_image]],
+    do_classifier_free_guidance=True,
+    device="cuda",
+    num_images_per_prompt=1,
+    ip_adapter_negative_image=negative_ip_image,
+)
+
+
+prompt = "cinematic photo of a cyborg in the city, 4k, high quality, intricate, highly detailed"
+negative_prompt = "blurry, smooth, plastic"
+
+image = pipeline(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    ip_adapter_image_embeds=image_embeds,
+    guidance_scale=6.0,
+    num_inference_steps=25,
+    generator=torch.Generator(device="cpu").manual_seed(1556265306),
+).images[0]
+
+image.save("result.png")
+```
+
+### Asymmetric Tiling
+Stable Diffusion is not trained to generate seamless textures. However, you can use this simple script to add tiling to your generation. This script is contributed by [alexisrolland](https://github.com/alexisrolland). See more details in [this issue](https://github.com/huggingface/diffusers/issues/556).
+
+
+|Generated|Tiled|
+|---|---|
+|![20240313003235_573631814](https://github.com/huggingface/diffusers/assets/5442875/eca174fb-06a4-464e-a3a7-00dbb024543e)|![wall](https://github.com/huggingface/diffusers/assets/5442875/b4aa774b-2a6a-4316-a8eb-8f30b5f4d024)|
+
+
+```py
+import torch
+from typing import Optional
+from diffusers import StableDiffusionPipeline
+from diffusers.models.lora import LoRACompatibleConv
+
+def seamless_tiling(pipeline, x_axis, y_axis):
+    """Patch every `Conv2d` in the pipeline so padding can wrap around per axis.
+
+    The `_conv_forward` of each `torch.nn.Conv2d` found in the pipeline's VAE,
+    text encoder and UNet is replaced in place. Padding is applied manually,
+    first along X and then along Y, using `'circular'` mode for an axis whose
+    flag is truthy and `'constant'` mode otherwise. Calling this again with both
+    flags false re-patches the layers with constant padding on both axes.
+
+    Returns:
+        The same `pipeline` object, modified in place.
+    """
+    def asymmetric_conv2d_convforward(self, input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None):
+        # Split the layer's padding tuple into an X-only and a Y-only part so each axis can use its own mode.
+        self.paddingX = (self._reversed_padding_repeated_twice[0], self._reversed_padding_repeated_twice[1], 0, 0)
+        self.paddingY = (0, 0, self._reversed_padding_repeated_twice[2], self._reversed_padding_repeated_twice[3])
+        working = torch.nn.functional.pad(input, self.paddingX, mode=x_mode)
+        working = torch.nn.functional.pad(working, self.paddingY, mode=y_mode)
+        # The input is already padded above, so the convolution itself runs with zero padding.
+        return torch.nn.functional.conv2d(working, weight, bias, self.stride, torch.nn.modules.utils._pair(0), self.dilation, self.groups)
+    # These are read by the closure above when a patched layer is called.
+    x_mode = 'circular' if x_axis else 'constant'
+    y_mode = 'circular' if y_axis else 'constant'
+    targets = [pipeline.vae, pipeline.text_encoder, pipeline.unet]
+    convolution_layers = []
+    for target in targets:
+        for module in target.modules():
+            if isinstance(module, torch.nn.Conv2d):
+                convolution_layers.append(module)
+    for layer in convolution_layers:
+        # LoRA-compatible convs without a LoRA layer get a stand-in callable that returns 0.
+        if isinstance(layer, LoRACompatibleConv) and layer.lora_layer is None:
+            layer.lora_layer = lambda * x: 0
+        # Bind the replacement as a method on this specific layer instance.
+        layer._conv_forward = asymmetric_conv2d_convforward.__get__(layer, torch.nn.Conv2d)
+    return pipeline
+
+pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True)
+pipeline.enable_model_cpu_offload()
+prompt = ["texture of a red brick wall"]
+seed = 123456
+generator = torch.Generator(device='cuda').manual_seed(seed)
+
+pipeline = seamless_tiling(pipeline=pipeline, x_axis=True, y_axis=True)
+image = pipeline(
+    prompt=prompt,
+    width=512,
+    height=512,
+    num_inference_steps=20,
+    guidance_scale=7,
+    num_images_per_prompt=1,
+    generator=generator
+).images[0]
+seamless_tiling(pipeline=pipeline, x_axis=False, y_axis=False)
+
+torch.cuda.empty_cache()
+image.save('image.png')
+```
@@ -1,7 +1,8 @@
 """
-    modeled after the textual_inversion.py / train_dreambooth.py and the work
-    of justinpinkney here: https://github.com/justinpinkney/stable-diffusion/blob/main/notebooks/imagic.ipynb
+modeled after the textual_inversion.py / train_dreambooth.py and the work
+of justinpinkney here: https://github.com/justinpinkney/stable-diffusion/blob/main/notebooks/imagic.ipynb
 """
+
 import inspect
 import warnings
 from typing import List, Optional, Union
@@ -440,7 +440,7 @@ def betas_for_alpha_bar(
            return math.exp(t * -12.0)

    else:
-        raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")

    betas = []
    for i in range(num_diffusion_timesteps):
@@ -513,9 +513,7 @@ class LCMSchedulerWithTimestamp(SchedulerMixin, ConfigMixin):
            there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
            otherwise it uses the alpha value at step 0.
        steps_offset (`int`, defaults to 0):
-            An offset added to the inference steps. You can use a combination of `offset=1` and
-            `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable
-            Diffusion.
+            An offset added to the inference steps, as required by some model families.
        prediction_type (`str`, defaults to `epsilon`, *optional*):
            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
@@ -348,7 +348,7 @@ def betas_for_alpha_bar(
            return math.exp(t * -12.0)

    else:
-        raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")

    betas = []
    for i in range(num_diffusion_timesteps):
@@ -418,9 +418,7 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
            there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
            otherwise it uses the alpha value at step 0.
        steps_offset (`int`, defaults to 0):
-            An offset added to the inference steps. You can use a combination of `offset=1` and
-            `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable
-            Diffusion.
+            An offset added to the inference steps, as required by some model families.
        prediction_type (`str`, defaults to `epsilon`, *optional*):
            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
@@ -40,7 +40,7 @@ from diffusers.utils import BaseOutput, check_min_version


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")


 class MarigoldDepthOutput(BaseOutput):
@@ -206,7 +206,7 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image: bool
            dimensions: ``batch x channels x height x width``.
    """

-    # checkpoint. TOD(Yiyi) - need to clean this up later
+    # checkpoint. #TODO(Yiyi) - need to clean this up later
    if image is None:
        raise ValueError("`image` input cannot be undefined.")

@@ -277,7 +277,7 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image: bool
        # images are in latent space and thus can't
        # be masked set masked_image to None
        # we assume that the checkpoint is not an inpainting
-        # checkpoint. TOD(Yiyi) - need to clean this up later
+        # checkpoint. #TODO(Yiyi) - need to clean this up later
        masked_image = None
    else:
        masked_image = image * (mask < 0.5)
@@ -452,7 +452,7 @@ class StableDiffusionXLInstantIDPipeline(StableDiffusionXLControlNetPipeline):

                xformers_version = version.parse(xformers.__version__)
                if xformers_version == version.parse("0.0.16"):
-                    logger.warn(
+                    logger.warning(
                        "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                    )
                self.enable_xformers_memory_efficient_attention()
@@ -81,7 +81,7 @@ def betas_for_alpha_bar(
            return math.exp(t * -12.0)

    else:
-        raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")

    betas = []
    for i in range(num_diffusion_timesteps):
@@ -171,9 +171,7 @@ class UFOGenScheduler(SchedulerMixin, ConfigMixin):
            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
        steps_offset (`int`, defaults to 0):
-            An offset added to the inference steps. You can use a combination of `offset=1` and
-            `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable
-            Diffusion.
+            An offset added to the inference steps, as required by some model families.
        rescale_betas_zero_snr (`bool`, defaults to `False`):
            Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
            dark samples instead of limiting it to samples with medium brightness. Loosely related to
@@ -1,6 +1,7 @@
 """
-    modified based on diffusion library from Huggingface: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+modified based on diffusion library from Huggingface: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
 """
+
 import inspect
 from typing import Callable, List, Optional, Union

@@ -224,7 +224,7 @@ class StableDiffusionIPEXPipeline(
        # 5. Prepare latent variables
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
-            self.unet.in_channels,
+            self.unet.config.in_channels,
            height,
            width,
            prompt_embeds.dtype,
@@ -679,7 +679,7 @@ class StableDiffusionIPEXPipeline(
        timesteps = self.scheduler.timesteps

        # 5. Prepare latent variables
-        num_channels_latents = self.unet.in_channels
+        num_channels_latents = self.unet.config.in_channels
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
@@ -917,7 +917,7 @@ class TensorRTStableDiffusionPipeline(StableDiffusionPipeline):
            text_embeddings = self.__encode_prompt(prompt, negative_prompt)

            # Pre-initialize latents
-            num_channels_latents = self.unet.in_channels
+            num_channels_latents = self.unet.config.in_channels
            latents = self.prepare_latents(
                batch_size,
                num_channels_latents,
@@ -35,7 +35,6 @@ def slerp(val, low, high):


 class UnCLIPTextInterpolationPipeline(DiffusionPipeline):
-
    """
    Pipeline for prompt-to-prompt interpolation on CLIP text embeddings and using the UnCLIP / Dall-E to decode them to images.

@@ -49,7 +48,7 @@ class UnCLIPTextInterpolationPipeline(DiffusionPipeline):
            Tokenizer of class
            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
        prior ([`PriorTransformer`]):
-            The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
        text_proj ([`UnCLIPTextProjModel`]):
            Utility class to prepare and combine the embeddings before they are passed to the decoder.
        decoder ([`UNet2DConditionModel`]):
@@ -72,7 +72,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)

@@ -308,7 +308,7 @@ def log_validation(vae, unet, args, accelerator, weight_dtype, step):

            tracker.log({"validation": formatted_images})
        else:
-            logger.warn(f"image logging not implemented for {tracker.name}")
+            logger.warning(f"image logging not implemented for {tracker.name}")

        del pipeline
        gc.collect()
@@ -1068,7 +1068,7 @@ def main(args):

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
-                logger.warn(
+                logger.warning(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
            unet.enable_xformers_memory_efficient_attention()
@@ -65,7 +65,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)

@@ -180,7 +180,7 @@ def log_validation(vae, args, accelerator, weight_dtype, step, unet=None, is_fin
            logger_name = "test" if is_final_validation else "validation"
            tracker.log({logger_name: formatted_images})
        else:
-            logger.warn(f"image logging not implemented for {tracker.name}")
+            logger.warning(f"image logging not implemented for {tracker.name}")

        del pipeline
        gc.collect()
@@ -928,7 +928,7 @@ def main(args):

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
-                logger.warn(
+                logger.warning(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
            unet.enable_xformers_memory_efficient_attention()
@@ -78,7 +78,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)

@@ -325,7 +325,7 @@ def log_validation(vae, unet, args, accelerator, weight_dtype, step):

            tracker.log({"validation": formatted_images})
        else:
-            logger.warn(f"image logging not implemented for {tracker.name}")
+            logger.warning(f"image logging not implemented for {tracker.name}")

        del pipeline
        gc.collect()
@@ -1083,7 +1083,7 @@ def main(args):

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
-                logger.warn(
+                logger.warning(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
            unet.enable_xformers_memory_efficient_attention()
@@ -71,7 +71,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)

@@ -285,7 +285,7 @@ def log_validation(vae, unet, args, accelerator, weight_dtype, step, name="targe

            tracker.log({f"validation/{name}": formatted_images})
        else:
-            logger.warn(f"image logging not implemented for {tracker.name}")
+            logger.warning(f"image logging not implemented for {tracker.name}")

        del pipeline
        gc.collect()
@@ -1023,7 +1023,7 @@ def main(args):

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
-                logger.warn(
+                logger.warning(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
            unet.enable_xformers_memory_efficient_attention()
@@ -77,7 +77,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)

@@ -303,7 +303,7 @@ def log_validation(vae, unet, args, accelerator, weight_dtype, step, name="targe

            tracker.log({f"validation/{name}": formatted_images})
        else:
-            logger.warn(f"image logging not implemented for {tracker.name}")
+            logger.warning(f"image logging not implemented for {tracker.name}")

        del pipeline
        gc.collect()
@@ -1083,7 +1083,7 @@ def main(args):

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
-                logger.warn(
+                logger.warning(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
            unet.enable_xformers_memory_efficient_attention()
@@ -60,7 +60,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)

@@ -178,7 +178,7 @@ def log_validation(

            tracker.log({tracker_key: formatted_images})
        else:
-            logger.warn(f"image logging not implemented for {tracker.name}")
+            logger.warning(f"image logging not implemented for {tracker.name}")

        del pipeline
        gc.collect()
@@ -861,7 +861,7 @@ def main(args):

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
-                logger.warn(
+                logger.warning(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
            unet.enable_xformers_memory_efficient_attention()
@@ -60,7 +60,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = logging.getLogger(__name__)

@@ -128,7 +128,7 @@ def log_validation(pipeline, pipeline_params, controlnet_params, tokenizer, args

        wandb.log({"validation": formatted_images})
    else:
-        logger.warn(f"image logging not implemented for {args.report_to}")
+        logger.warning(f"image logging not implemented for {args.report_to}")

    return image_logs

@@ -61,7 +61,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)

@@ -178,7 +178,7 @@ def log_validation(vae, unet, controlnet, args, accelerator, weight_dtype, step,

            tracker.log({tracker_key: formatted_images})
        else:
-            logger.warn(f"image logging not implemented for {tracker.name}")
+            logger.warning(f"image logging not implemented for {tracker.name}")

        del pipeline
        gc.collect()
@@ -929,7 +929,7 @@ def main(args):

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
-                logger.warn(
+                logger.warning(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
            unet.enable_xformers_memory_efficient_attention()
@@ -63,7 +63,7 @@ from diffusers.utils.import_utils import is_xformers_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)

@@ -904,7 +904,7 @@ def main(args):

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
-                logger.warn(
+                logger.warning(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
            attention_class = CustomDiffusionXFormersAttnProcessor
@@ -1178,7 +1178,7 @@ def main(args):
                        grads_text_encoder = text_encoder.get_input_embeddings().weight.grad
                    # Get the index for tokens that we want to zero the grads for
                    index_grads_to_zero = torch.arange(len(tokenizer)) != modifier_token_id[0]
-                    for i in range(len(modifier_token_id[1:])):
+                    for i in range(1, len(modifier_token_id)):
                        index_grads_to_zero = index_grads_to_zero & (
                            torch.arange(len(tokenizer)) != modifier_token_id[i]
                        )
@@ -63,7 +63,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)

@@ -987,7 +987,7 @@ def main(args):

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
-                logger.warn(
+                logger.warning(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
            unet.enable_xformers_memory_efficient_attention()
@@ -35,7 +35,7 @@ from diffusers.utils import check_min_version


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 # Cache compiled models across invocations of this script.
 cc.initialize_cache(os.path.expanduser("~/.cache/jax/compilation_cache"))
@@ -70,7 +70,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)

@@ -895,7 +895,7 @@ def main(args):

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
-                logger.warn(
+                logger.warning(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
            unet.enable_xformers_memory_efficient_attention()
@@ -75,7 +75,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)

@@ -1141,7 +1141,7 @@ def main(args):

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
-                logger.warn(
+                logger.warning(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, "
                    "please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
@@ -1317,14 +1317,14 @@ def main(args):

    # Optimizer creation
    if not (args.optimizer.lower() == "prodigy" or args.optimizer.lower() == "adamw"):
-        logger.warn(
+        logger.warning(
            f"Unsupported choice of optimizer: {args.optimizer}.Supported optimizers include [adamW, prodigy]."
            "Defaulting to adamW"
        )
        args.optimizer = "adamw"

    if args.use_8bit_adam and not args.optimizer.lower() == "adamw":
-        logger.warn(
+        logger.warning(
            f"use_8bit_adam is ignored when optimizer is not set to 'AdamW'. Optimizer was "
            f"set to {args.optimizer.lower()}"
        )
@@ -1358,11 +1358,11 @@ def main(args):
        optimizer_class = prodigyopt.Prodigy

        if args.learning_rate <= 0.1:
-            logger.warn(
+            logger.warning(
                "Learning rate is too low. When using prodigy, it's generally better to set learning rate around 1.0"
            )
        if args.train_text_encoder and args.text_encoder_lr:
-            logger.warn(
+            logger.warning(
                f"Learning rates were provided both for the unet and the text encoder- e.g. text_encoder_lr:"
                f" {args.text_encoder_lr} and learning_rate: {args.learning_rate}. "
                f"When using prodigy only learning_rate is used as the initial learning rate."
@@ -53,7 +53,7 @@ from diffusers.utils.torch_utils import is_compiled_module


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -488,7 +488,7 @@ def main():

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
-                logger.warn(
+                logger.warning(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
            unet.enable_xformers_memory_efficient_attention()
@@ -59,7 +59,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -580,7 +580,7 @@ def main():

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
-                logger.warn(
+                logger.warning(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
            unet.enable_xformers_memory_efficient_attention()
@@ -52,7 +52,7 @@ if is_wandb_available():


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -177,7 +177,7 @@ def log_validation(vae, image_encoder, image_processor, unet, args, accelerator,
                }
            )
        else:
-            logger.warn(f"image logging not implemented for {tracker.name}")
+            logger.warning(f"image logging not implemented for {tracker.name}")

    del pipeline
    torch.cuda.empty_cache()
@@ -534,7 +534,7 @@ def main():

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
-                logger.warn(
+                logger.warning(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
            unet.enable_xformers_memory_efficient_attention()
@@ -46,7 +46,7 @@ from diffusers.utils import check_min_version, is_wandb_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -46,7 +46,7 @@ from diffusers.utils import check_min_version, is_wandb_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -51,7 +51,7 @@ if is_wandb_available():


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -180,7 +180,7 @@ def log_validation(
                }
            )
        else:
-            logger.warn(f"image logging not implemented for {tracker.name}")
+            logger.warning(f"image logging not implemented for {tracker.name}")

    del pipeline
    torch.cuda.empty_cache()
@@ -219,7 +219,7 @@ def log_validation(unet, scheduler, args, accelerator, weight_dtype, step, name=
        if args.num_classes is not None:
            class_labels = list(range(args.num_classes))
        else:
-            logger.warn(
+            logger.warning(
                "The model is class-conditional but the number of classes is not set. The generated images will be"
                " unconditional rather than class-conditional."
            )
@@ -266,7 +266,7 @@ def log_validation(unet, scheduler, args, accelerator, weight_dtype, step, name=

            tracker.log({f"validation/{name}": formatted_images})
        else:
-            logger.warn(f"image logging not implemented for {tracker.name}")
+            logger.warning(f"image logging not implemented for {tracker.name}")

    del pipeline
    gc.collect()
@@ -863,14 +863,14 @@ def main(args):
    elif args.model_config_name_or_path is None:
        # TODO: use default architectures from iCT paper
        if not args.class_conditional and (args.num_classes is not None or args.class_embed_type is not None):
-            logger.warn(
+            logger.warning(
                f"`--class_conditional` is set to `False` but `--num_classes` is set to {args.num_classes} and"
                f" `--class_embed_type` is set to {args.class_embed_type}. These values will be overridden to `None`."
            )
            args.num_classes = None
            args.class_embed_type = None
        elif args.class_conditional and args.num_classes is None and args.class_embed_type is None:
-            logger.warn(
+            logger.warning(
                "`--class_conditional` is set to `True` but neither `--num_classes` nor `--class_embed_type` is set."
                "`class_conditional` will be overridden to `False`."
            )
@@ -996,7 +996,7 @@ def main(args):

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
-                logger.warn(
+                logger.warning(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
            unet.enable_xformers_memory_efficient_attention()
@@ -407,7 +407,7 @@ def log_validation(vae, unet, controlnet, args, accelerator, weight_dtype, step)

            tracker.log({"validation": formatted_images})
        else:
-            logger.warn(f"image logging not implemented for {tracker.name}")
+            logger.warning(f"image logging not implemented for {tracker.name}")

        del pipeline
        gc.collect()
@@ -1057,7 +1057,7 @@ def main(args):

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
-                logger.warn(
+                logger.warning(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
            unet.enable_xformers_memory_efficient_attention()
@@ -574,7 +574,7 @@ def main(args):

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
-                logger.warn(
+                logger.warning(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
            unet.enable_xformers_memory_efficient_attention()
@@ -672,7 +672,7 @@ def main(args):

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
-                logger.warn(
+                logger.warning(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
            unet.enable_xformers_memory_efficient_attention()
@@ -516,7 +516,7 @@ def main():

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
-                logger.warn(
+                logger.warning(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
            unet.enable_xformers_memory_efficient_attention()
@@ -608,7 +608,7 @@ def main():
    # Create the pipeline using using the trained modules and save it.
    if accelerator.is_main_process:
        if args.push_to_hub and args.only_save_embeds:
-            logger.warn("Enabling full model saving because --push_to_hub=True was specified.")
+            logger.warning("Enabling full model saving because --push_to_hub=True was specified.")
            save_full_model = True
        else:
            save_full_model = not args.only_save_embeds
@@ -541,7 +541,7 @@ def main():

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
-                logger.warn(
+                logger.warning(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
            unet.enable_xformers_memory_efficient_attention()
@@ -23,6 +23,7 @@ TODO:
 6. Integrate to training x
 7. Test
 """
+
 import copy
 import random

@@ -645,7 +645,7 @@ def main():

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
-                logger.warn(
+                logger.warning(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
            unet.enable_xformers_memory_efficient_attention()
@@ -901,7 +901,7 @@ def main():
    accelerator.wait_for_everyone()
    if accelerator.is_main_process:
        if args.push_to_hub and args.only_save_embeds:
-            logger.warn("Enabling full model saving because --push_to_hub=True was specified.")
+            logger.warning("Enabling full model saving because --push_to_hub=True was specified.")
            save_full_model = True
        else:
            save_full_model = not args.only_save_embeds
@@ -108,7 +108,7 @@ def log_validation(vae, text_encoder, tokenizer, unet, args, accelerator, weight
                }
            )
        else:
-            logger.warn(f"image logging not implemented for {tracker.name}")
+            logger.warning(f"image logging not implemented for {tracker.name}")

    del pipeline
    torch.cuda.empty_cache()
@@ -523,7 +523,7 @@ def main():

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
-                logger.warn(
+                logger.warning(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
            unet.enable_xformers_memory_efficient_attention()
@@ -687,7 +687,7 @@ def main():

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
-                logger.warn(
+                logger.warning(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
            unet.enable_xformers_memory_efficient_attention()
@@ -916,7 +916,7 @@ def main():
    accelerator.wait_for_everyone()
    if accelerator.is_main_process:
        if args.push_to_hub and not args.save_as_full_pipeline:
-            logger.warn("Enabling full model saving because --push_to_hub=True was specified.")
+            logger.warning("Enabling full model saving because --push_to_hub=True was specified.")
            save_full_model = True
        else:
            save_full_model = args.save_as_full_pipeline
@@ -410,7 +410,7 @@ def main(args):

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
-                logger.warn(
+                logger.warning(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
            model.enable_xformers_memory_efficient_attention()
@@ -637,7 +637,7 @@ def main(args):
                    generator=generator,
                    batch_size=args.eval_batch_size,
                    num_inference_steps=args.ddpm_num_inference_steps,
-                    output_type="numpy",
+                    output_type="np",
                ).images

                if args.use_ema:
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Conversion script for stable diffusion checkpoints which _only_ contain a controlnet. """
+"""Conversion script for stable diffusion checkpoints which _only_ contain a controlnet."""

 import argparse
 import re
@@ -629,7 +629,7 @@ def main(args):

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
-                logger.warn(
+                logger.warning(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
            unet.enable_xformers_memory_efficient_attention()
@@ -60,7 +60,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)

@@ -167,7 +167,7 @@ def log_validation(vae, unet, adapter, args, accelerator, weight_dtype, step):

            tracker.log({"validation": formatted_images})
        else:
-            logger.warn(f"image logging not implemented for {tracker.name}")
+            logger.warning(f"image logging not implemented for {tracker.name}")

        del pipeline
        gc.collect()
@@ -932,7 +932,7 @@ def main(args):

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
-                logger.warn(
+                logger.warning(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
            unet.enable_xformers_memory_efficient_attention()
@@ -56,7 +56,7 @@ if is_wandb_available():


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -183,7 +183,7 @@ def log_validation(vae, text_encoder, tokenizer, unet, args, accelerator, weight
                }
            )
        else:
-            logger.warn(f"image logging not implemented for {tracker.name}")
+            logger.warning(f"image logging not implemented for {tracker.name}")

    del pipeline
    torch.cuda.empty_cache()
@@ -608,7 +608,7 @@ def main():

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
-                logger.warn(
+                logger.warning(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
            unet.enable_xformers_memory_efficient_attention()
@@ -49,7 +49,7 @@ from diffusers.utils import check_min_version


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = logging.getLogger(__name__)

@@ -52,7 +52,7 @@ from diffusers.utils.torch_utils import is_compiled_module


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -497,7 +497,7 @@ def main():

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
-                logger.warn(
+                logger.warning(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
            unet.enable_xformers_memory_efficient_attention()
@@ -64,7 +64,7 @@ from diffusers.utils.torch_utils import is_compiled_module


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)

@@ -425,6 +425,11 @@ def parse_args(input_args=None):
        default=4,
        help=("The dimension of the LoRA update matrices."),
    )
+    parser.add_argument(
+        "--debug_loss",
+        action="store_true",
+        help="debug loss for each image, if filenames are available in the dataset",
+    )

    if input_args is not None:
        args = parser.parse_args(input_args)
@@ -603,6 +608,7 @@ def main(args):
    # Move unet, vae and text_encoder to device and cast to weight_dtype
    # The VAE is in float32 to avoid NaN losses.
    unet.to(accelerator.device, dtype=weight_dtype)
+
    if args.pretrained_vae_model_name_or_path is None:
        vae.to(accelerator.device, dtype=torch.float32)
    else:
@@ -616,7 +622,7 @@ def main(args):

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
-                logger.warn(
+                logger.warning(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
            unet.enable_xformers_memory_efficient_attention()
@@ -890,13 +896,17 @@ def main(args):
        tokens_one, tokens_two = tokenize_captions(examples)
        examples["input_ids_one"] = tokens_one
        examples["input_ids_two"] = tokens_two
+        if args.debug_loss:
+            fnames = [os.path.basename(image.filename) for image in examples[image_column] if image.filename]
+            if fnames:
+                examples["filenames"] = fnames
        return examples

    with accelerator.main_process_first():
        if args.max_train_samples is not None:
            dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
        # Set the training transforms
-        train_dataset = dataset["train"].with_transform(preprocess_train)
+        train_dataset = dataset["train"].with_transform(preprocess_train, output_all_columns=True)

    def collate_fn(examples):
        pixel_values = torch.stack([example["pixel_values"] for example in examples])
@@ -905,7 +915,7 @@ def main(args):
        crop_top_lefts = [example["crop_top_lefts"] for example in examples]
        input_ids_one = torch.stack([example["input_ids_one"] for example in examples])
        input_ids_two = torch.stack([example["input_ids_two"] for example in examples])
-        return {
+        result = {
            "pixel_values": pixel_values,
            "input_ids_one": input_ids_one,
            "input_ids_two": input_ids_two,
@@ -913,6 +923,11 @@ def main(args):
            "crop_top_lefts": crop_top_lefts,
        }

+        filenames = [example["filenames"] for example in examples if "filenames" in example]
+        if filenames:
+            result["filenames"] = filenames
+        return result
+
    # DataLoaders creation:
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
@@ -1105,7 +1120,9 @@ def main(args):
                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
                    loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights
                    loss = loss.mean()
-
+                if args.debug_loss and "filenames" in batch:
+                    for fname in batch["filenames"]:
+                        accelerator.log({"loss_for_" + fname: loss}, step=global_step)
                # Gather the losses across all processes for logging (if we use distributed training).
                avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
                train_loss += avg_loss.item() / args.gradient_accumulation_steps
@@ -54,7 +54,7 @@ from diffusers.utils.torch_utils import is_compiled_module


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)

@@ -712,7 +712,7 @@ def main(args):

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
-                logger.warn(
+                logger.warning(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
            unet.enable_xformers_memory_efficient_attention()
@@ -911,6 +911,7 @@ def main(args):
        )
        precomputed_dataset = precomputed_dataset.with_transform(preprocess_train)

+    del compute_vae_encodings_fn, compute_embeddings_fn, text_encoder_one, text_encoder_two
    del text_encoders, tokenizers, vae
    gc.collect()
    torch.cuda.empty_cache()
@@ -80,7 +80,7 @@ else:


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)

@@ -708,7 +708,7 @@ def main():

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
-                logger.warn(
+                logger.warning(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
            unet.enable_xformers_memory_efficient_attention()
@@ -966,7 +966,7 @@ def main():
    accelerator.wait_for_everyone()
    if accelerator.is_main_process:
        if args.push_to_hub and not args.save_as_full_pipeline:
-            logger.warn("Enabling full model saving because --push_to_hub=True was specified.")
+            logger.warning("Enabling full model saving because --push_to_hub=True was specified.")
            save_full_model = True
        else:
            save_full_model = args.save_as_full_pipeline
@@ -56,7 +56,7 @@ else:
 # ------------------------------------------------------------------------------

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = logging.getLogger(__name__)

@@ -76,7 +76,7 @@ else:


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)

@@ -711,7 +711,7 @@ def main():

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
-                logger.warn(
+                logger.warning(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
            unet.enable_xformers_memory_efficient_attention()
@@ -1022,7 +1022,7 @@ def main():
            )

        if args.push_to_hub and not args.save_as_full_pipeline:
-            logger.warn("Enabling full model saving because --push_to_hub=True was specified.")
+            logger.warning("Enabling full model saving because --push_to_hub=True was specified.")
            save_full_model = True
        else:
            save_full_model = args.save_as_full_pipeline
@@ -29,7 +29,7 @@ from diffusers.utils.import_utils import is_xformers_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -408,7 +408,7 @@ def main(args):

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
-                logger.warn(
+                logger.warning(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
            model.enable_xformers_memory_efficient_attention()
@@ -648,7 +648,7 @@ def main(args):
                    generator=generator,
                    batch_size=args.eval_batch_size,
                    num_inference_steps=args.ddpm_num_inference_steps,
-                    output_type="numpy",
+                    output_type="np",
                ).images

                if args.use_ema:
@@ -50,7 +50,7 @@ if is_wandb_available():


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -184,7 +184,7 @@ def log_validation(text_encoder, tokenizer, prior, args, accelerator, weight_dty
                }
            )
        else:
-            logger.warn(f"image logging not implemented for {tracker.name}")
+            logger.warning(f"image logging not implemented for {tracker.name}")

    del pipeline
    torch.cuda.empty_cache()
@@ -51,7 +51,7 @@ if is_wandb_available():


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
+check_min_version("0.28.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -182,7 +182,7 @@ def log_validation(text_encoder, tokenizer, prior, args, accelerator, weight_dty
                }
            )
        else:
-            logger.warn(f"image logging not implemented for {tracker.name}")
+            logger.warning(f"image logging not implemented for {tracker.name}")

    del pipeline
    torch.cuda.empty_cache()
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Conversion script for the LDM checkpoints. """
+"""Conversion script for the LDM checkpoints."""

 import argparse
 import json
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Conversion script for the LDM checkpoints. """
+"""Conversion script for the LDM checkpoints."""

 import argparse

@@ -1195,9 +1195,9 @@ def superres_check_against_original(dump_path, unet_checkpoint_path):
        if_II_model = IFStageIII(device="cuda", dir_or_name=orig_path, model_kwargs={"precision": "fp32"}).model

    batch_size = 1
-    channels = model.in_channels // 2
-    height = model.sample_size
-    width = model.sample_size
+    channels = model.config.in_channels // 2
+    height = model.config.sample_size
+    width = model.config.sample_size
    height = 1024
    width = 1024

@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Conversion script for the LDM checkpoints. """
+"""Conversion script for the LDM checkpoints."""

 import argparse
 import json
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-""" Conversion script for the LoRA's safetensors checkpoints. """
+"""Conversion script for the LoRA's safetensors checkpoints."""

 import argparse

@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Conversion script for the LDM checkpoints. """
+"""Conversion script for the LDM checkpoints."""

 import argparse

@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Conversion script for the NCSNPP checkpoints. """
+"""Conversion script for the NCSNPP checkpoints."""

 import argparse
 import json
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Conversion script for the AudioLDM2 checkpoints."""
+"""Conversion script for the AudioLDM2 checkpoints."""

 import argparse
 import re
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Conversion script for the AudioLDM checkpoints."""
+"""Conversion script for the AudioLDM checkpoints."""

 import argparse
 import re
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Conversion script for stable diffusion checkpoints which _only_ contain a controlnet. """
+"""Conversion script for stable diffusion checkpoints which _only_ contain a controlnet."""

 import argparse

@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Conversion script for the MusicLDM checkpoints."""
+"""Conversion script for the MusicLDM checkpoints."""

 import argparse
 import re
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Conversion script for the LDM checkpoints. """
+"""Conversion script for the LDM checkpoints."""

 import argparse
 import importlib
@@ -1,7 +1,7 @@
 # Run this script to convert the Stable Cascade model weights to a diffusers pipeline.
 import argparse
+from contextlib import nullcontext

-import accelerate
 import torch
 from safetensors.torch import load_file
 from transformers import (
@@ -18,23 +18,56 @@ from diffusers import (
    StableCascadeDecoderPipeline,
    StableCascadePriorPipeline,
 )
+from diffusers.loaders.single_file_utils import convert_stable_cascade_unet_single_file_to_diffusers
 from diffusers.models import StableCascadeUNet
 from diffusers.models.modeling_utils import load_model_dict_into_meta
 from diffusers.pipelines.wuerstchen import PaellaVQModel
+from diffusers.utils import is_accelerate_available


+if is_accelerate_available():
+    from accelerate import init_empty_weights
+
 parser = argparse.ArgumentParser(description="Convert Stable Cascade model weights to a diffusers pipeline")
-parser.add_argument("--model_path", type=str, default="../StableCascade", help="Location of Stable Cascade weights")
+parser.add_argument("--model_path", type=str, help="Location of Stable Cascade weights")
 parser.add_argument("--stage_c_name", type=str, default="stage_c.safetensors", help="Name of stage c checkpoint file")
 parser.add_argument("--stage_b_name", type=str, default="stage_b.safetensors", help="Name of stage b checkpoint file")
+parser.add_argument("--skip_stage_c", action="store_true", help="Skip converting stage c")
+parser.add_argument("--skip_stage_b", action="store_true", help="Skip converting stage b")
 parser.add_argument("--use_safetensors", action="store_true", help="Use SafeTensors for conversion")
-parser.add_argument("--save_org", type=str, default="diffusers", help="Hub organization to save the pipelines to")
+parser.add_argument(
+    "--prior_output_path", default="stable-cascade-prior", type=str, help="Output path to save the prior pipeline to"
+)
+parser.add_argument(
+    "--decoder_output_path",
+    type=str,
+    default="stable-cascade-decoder",
+    help="Output path to save the decoder pipeline to",
+)
+parser.add_argument(
+    "--combined_output_path",
+    type=str,
+    default="stable-cascade-combined",
+    help="Output path to save the combined pipeline to",
+)
+parser.add_argument("--save_combined", action="store_true")
 parser.add_argument("--push_to_hub", action="store_true", help="Push to hub")
+parser.add_argument("--variant", type=str, help="Set to bf16 to save bfloat16 weights")

 args = parser.parse_args()
+
+if args.skip_stage_b and args.skip_stage_c:
+    raise ValueError("At least one stage should be converted")
+if (args.skip_stage_b or args.skip_stage_c) and args.save_combined:
+    raise ValueError("Cannot skip stages when creating a combined pipeline")
+
 model_path = args.model_path

 device = "cpu"
+if args.variant == "bf16":
+    dtype = torch.bfloat16
+else:
+    dtype = torch.float32

 # set paths to model weights
 prior_checkpoint_path = f"{model_path}/{args.stage_c_name}"
@@ -52,164 +85,134 @@ tokenizer = AutoTokenizer.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b1
 feature_extractor = CLIPImageProcessor()
 image_encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")

-# Prior
-if args.use_safetensors:
-    orig_state_dict = load_file(prior_checkpoint_path, device=device)
-else:
-    orig_state_dict = torch.load(prior_checkpoint_path, map_location=device)
-
-state_dict = {}
-for key in orig_state_dict.keys():
-    if key.endswith("in_proj_weight"):
-        weights = orig_state_dict[key].chunk(3, 0)
-        state_dict[key.replace("attn.in_proj_weight", "to_q.weight")] = weights[0]
-        state_dict[key.replace("attn.in_proj_weight", "to_k.weight")] = weights[1]
-        state_dict[key.replace("attn.in_proj_weight", "to_v.weight")] = weights[2]
-    elif key.endswith("in_proj_bias"):
-        weights = orig_state_dict[key].chunk(3, 0)
-        state_dict[key.replace("attn.in_proj_bias", "to_q.bias")] = weights[0]
-        state_dict[key.replace("attn.in_proj_bias", "to_k.bias")] = weights[1]
-        state_dict[key.replace("attn.in_proj_bias", "to_v.bias")] = weights[2]
-    elif key.endswith("out_proj.weight"):
-        weights = orig_state_dict[key]
-        state_dict[key.replace("attn.out_proj.weight", "to_out.0.weight")] = weights
-    elif key.endswith("out_proj.bias"):
-        weights = orig_state_dict[key]
-        state_dict[key.replace("attn.out_proj.bias", "to_out.0.bias")] = weights
-    else:
-        state_dict[key] = orig_state_dict[key]
-
-
-with accelerate.init_empty_weights():
-    prior_model = StableCascadeUNet(
-        in_channels=16,
-        out_channels=16,
-        timestep_ratio_embedding_dim=64,
-        patch_size=1,
-        conditioning_dim=2048,
-        block_out_channels=[2048, 2048],
-        num_attention_heads=[32, 32],
-        down_num_layers_per_block=[8, 24],
-        up_num_layers_per_block=[24, 8],
-        down_blocks_repeat_mappers=[1, 1],
-        up_blocks_repeat_mappers=[1, 1],
-        block_types_per_layer=[
-            ["SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"],
-            ["SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"],
-        ],
-        clip_text_in_channels=1280,
-        clip_text_pooled_in_channels=1280,
-        clip_image_in_channels=768,
-        clip_seq=4,
-        kernel_size=3,
-        dropout=[0.1, 0.1],
-        self_attn=True,
-        timestep_conditioning_type=["sca", "crp"],
-        switch_level=[False],
-    )
-load_model_dict_into_meta(prior_model, state_dict)
-
 # scheduler for prior and decoder
 scheduler = DDPMWuerstchenScheduler()
+ctx = init_empty_weights if is_accelerate_available() else nullcontext

-# Prior pipeline
-prior_pipeline = StableCascadePriorPipeline(
-    prior=prior_model,
-    tokenizer=tokenizer,
-    text_encoder=text_encoder,
-    image_encoder=image_encoder,
-    scheduler=scheduler,
-    feature_extractor=feature_extractor,
-)
-prior_pipeline.save_pretrained(f"{args.save_org}/StableCascade-prior", push_to_hub=args.push_to_hub)
-
-# Decoder
-if args.use_safetensors:
-    orig_state_dict = load_file(decoder_checkpoint_path, device=device)
-else:
-    orig_state_dict = torch.load(decoder_checkpoint_path, map_location=device)
-
-state_dict = {}
-for key in orig_state_dict.keys():
-    if key.endswith("in_proj_weight"):
-        weights = orig_state_dict[key].chunk(3, 0)
-        state_dict[key.replace("attn.in_proj_weight", "to_q.weight")] = weights[0]
-        state_dict[key.replace("attn.in_proj_weight", "to_k.weight")] = weights[1]
-        state_dict[key.replace("attn.in_proj_weight", "to_v.weight")] = weights[2]
-    elif key.endswith("in_proj_bias"):
-        weights = orig_state_dict[key].chunk(3, 0)
-        state_dict[key.replace("attn.in_proj_bias", "to_q.bias")] = weights[0]
-        state_dict[key.replace("attn.in_proj_bias", "to_k.bias")] = weights[1]
-        state_dict[key.replace("attn.in_proj_bias", "to_v.bias")] = weights[2]
-    elif key.endswith("out_proj.weight"):
-        weights = orig_state_dict[key]
-        state_dict[key.replace("attn.out_proj.weight", "to_out.0.weight")] = weights
-    elif key.endswith("out_proj.bias"):
-        weights = orig_state_dict[key]
-        state_dict[key.replace("attn.out_proj.bias", "to_out.0.bias")] = weights
-    # rename clip_mapper to clip_txt_pooled_mapper
-    elif key.endswith("clip_mapper.weight"):
-        weights = orig_state_dict[key]
-        state_dict[key.replace("clip_mapper.weight", "clip_txt_pooled_mapper.weight")] = weights
-    elif key.endswith("clip_mapper.bias"):
-        weights = orig_state_dict[key]
-        state_dict[key.replace("clip_mapper.bias", "clip_txt_pooled_mapper.bias")] = weights
-    else:
-        state_dict[key] = orig_state_dict[key]
-
-with accelerate.init_empty_weights():
-    decoder = StableCascadeUNet(
-        in_channels=4,
-        out_channels=4,
-        timestep_ratio_embedding_dim=64,
-        patch_size=2,
-        conditioning_dim=1280,
-        block_out_channels=[320, 640, 1280, 1280],
-        down_num_layers_per_block=[2, 6, 28, 6],
-        up_num_layers_per_block=[6, 28, 6, 2],
-        down_blocks_repeat_mappers=[1, 1, 1, 1],
-        up_blocks_repeat_mappers=[3, 3, 2, 2],
-        num_attention_heads=[0, 0, 20, 20],
-        block_types_per_layer=[
-            ["SDCascadeResBlock", "SDCascadeTimestepBlock"],
-            ["SDCascadeResBlock", "SDCascadeTimestepBlock"],
-            ["SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"],
-            ["SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"],
-        ],
-        clip_text_pooled_in_channels=1280,
-        clip_seq=4,
-        effnet_in_channels=16,
-        pixel_mapper_in_channels=3,
-        kernel_size=3,
-        dropout=[0, 0, 0.1, 0.1],
-        self_attn=True,
-        timestep_conditioning_type=["sca"],
-    )
-load_model_dict_into_meta(decoder, state_dict)
-
-# VQGAN from Wuerstchen-V2
-vqmodel = PaellaVQModel.from_pretrained("warp-ai/wuerstchen", subfolder="vqgan")
-
-# Decoder pipeline
-decoder_pipeline = StableCascadeDecoderPipeline(
-    decoder=decoder, text_encoder=text_encoder, tokenizer=tokenizer, vqgan=vqmodel, scheduler=scheduler
-)
-decoder_pipeline.save_pretrained(f"{args.save_org}/StableCascade-decoder", push_to_hub=args.push_to_hub)
-
-# Stable Cascade combined pipeline
-stable_cascade_pipeline = StableCascadeCombinedPipeline(
-    # Decoder
-    text_encoder=text_encoder,
-    tokenizer=tokenizer,
-    decoder=decoder,
-    scheduler=scheduler,
-    vqgan=vqmodel,
+if not args.skip_stage_c:
    # Prior
-    prior_text_encoder=text_encoder,
-    prior_tokenizer=tokenizer,
-    prior_prior=prior_model,
-    prior_scheduler=scheduler,
-    prior_image_encoder=image_encoder,
-    prior_feature_extractor=feature_extractor,
-)
-stable_cascade_pipeline.save_pretrained(f"{args.save_org}/StableCascade", push_to_hub=args.push_to_hub)
+    if args.use_safetensors or prior_checkpoint_path.endswith(".safetensors"):
+        prior_orig_state_dict = load_file(prior_checkpoint_path, device=device)
+    else:  # non-.safetensors checkpoints are unpickled by torch.load; use trusted files only
+        prior_orig_state_dict = torch.load(prior_checkpoint_path, map_location=device)
+
+    prior_state_dict = convert_stable_cascade_unet_single_file_to_diffusers(prior_orig_state_dict)
+
+    with ctx():
+        prior_model = StableCascadeUNet(
+            in_channels=16,
+            out_channels=16,
+            timestep_ratio_embedding_dim=64,
+            patch_size=1,
+            conditioning_dim=2048,
+            block_out_channels=[2048, 2048],
+            num_attention_heads=[32, 32],
+            down_num_layers_per_block=[8, 24],
+            up_num_layers_per_block=[24, 8],
+            down_blocks_repeat_mappers=[1, 1],
+            up_blocks_repeat_mappers=[1, 1],
+            block_types_per_layer=[
+                ["SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"],
+                ["SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"],
+            ],
+            clip_text_in_channels=1280,
+            clip_text_pooled_in_channels=1280,
+            clip_image_in_channels=768,
+            clip_seq=4,
+            kernel_size=3,
+            dropout=[0.1, 0.1],
+            self_attn=True,
+            timestep_conditioning_type=["sca", "crp"],
+            switch_level=[False],
+        )
+    if is_accelerate_available():
+        load_model_dict_into_meta(prior_model, prior_state_dict)
+    else:
+        prior_model.load_state_dict(prior_state_dict)
+
+    # Prior pipeline
+    prior_pipeline = StableCascadePriorPipeline(
+        prior=prior_model,
+        tokenizer=tokenizer,
+        text_encoder=text_encoder,
+        image_encoder=image_encoder,
+        scheduler=scheduler,
+        feature_extractor=feature_extractor,
+    )
+    prior_pipeline.to(dtype).save_pretrained(
+        args.prior_output_path, push_to_hub=args.push_to_hub, variant=args.variant
+    )
+
+if not args.skip_stage_b:
+    # Decoder: a .safetensors checkpoint is always read with load_file, as torch.load cannot parse that format
+    if args.use_safetensors or decoder_checkpoint_path.endswith(".safetensors"):
+        decoder_orig_state_dict = load_file(decoder_checkpoint_path, device=device)
+    else:  # non-.safetensors checkpoints are unpickled by torch.load; use trusted files only
+        decoder_orig_state_dict = torch.load(decoder_checkpoint_path, map_location=device)
+
+    decoder_state_dict = convert_stable_cascade_unet_single_file_to_diffusers(decoder_orig_state_dict)
+    with ctx():
+        decoder = StableCascadeUNet(
+            in_channels=4,
+            out_channels=4,
+            timestep_ratio_embedding_dim=64,
+            patch_size=2,
+            conditioning_dim=1280,
+            block_out_channels=[320, 640, 1280, 1280],
+            down_num_layers_per_block=[2, 6, 28, 6],
+            up_num_layers_per_block=[6, 28, 6, 2],
+            down_blocks_repeat_mappers=[1, 1, 1, 1],
+            up_blocks_repeat_mappers=[3, 3, 2, 2],
+            num_attention_heads=[0, 0, 20, 20],
+            block_types_per_layer=[
+                ["SDCascadeResBlock", "SDCascadeTimestepBlock"],
+                ["SDCascadeResBlock", "SDCascadeTimestepBlock"],
+                ["SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"],
+                ["SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"],
+            ],
+            clip_text_pooled_in_channels=1280,
+            clip_seq=4,
+            effnet_in_channels=16,
+            pixel_mapper_in_channels=3,
+            kernel_size=3,
+            dropout=[0, 0, 0.1, 0.1],
+            self_attn=True,
+            timestep_conditioning_type=["sca"],
+        )
+
+    if is_accelerate_available():
+        load_model_dict_into_meta(decoder, decoder_state_dict)
+    else:
+        decoder.load_state_dict(decoder_state_dict)
+
+    # VQGAN from Wuerstchen-V2
+    vqmodel = PaellaVQModel.from_pretrained("warp-ai/wuerstchen", subfolder="vqgan")
+
+    # Decoder pipeline
+    decoder_pipeline = StableCascadeDecoderPipeline(
+        decoder=decoder, text_encoder=text_encoder, tokenizer=tokenizer, vqgan=vqmodel, scheduler=scheduler
+    )
+    decoder_pipeline.to(dtype).save_pretrained(
+        args.decoder_output_path, push_to_hub=args.push_to_hub, variant=args.variant
+    )
+
+if args.save_combined:
+    # Stable Cascade combined pipeline
+    stable_cascade_pipeline = StableCascadeCombinedPipeline(
+        # Decoder
+        text_encoder=text_encoder,
+        tokenizer=tokenizer,
+        decoder=decoder,
+        scheduler=scheduler,
+        vqgan=vqmodel,
+        # Prior
+        prior_text_encoder=text_encoder,
+        prior_tokenizer=tokenizer,
+        prior_prior=prior_model,
+        prior_scheduler=scheduler,
+        prior_image_encoder=image_encoder,
+        prior_feature_extractor=feature_extractor,
+    )
+    stable_cascade_pipeline.to(dtype).save_pretrained(
+        args.combined_output_path, push_to_hub=args.push_to_hub, variant=args.variant
+    )
@@ -0,0 +1,226 @@
+# Run this script to convert the Stable Cascade model weights to a diffusers pipeline.
+import argparse
+from contextlib import nullcontext
+
+import torch
+from safetensors.torch import load_file
+from transformers import (
+    AutoTokenizer,
+    CLIPConfig,
+    CLIPImageProcessor,
+    CLIPTextModelWithProjection,
+    CLIPVisionModelWithProjection,
+)
+
+from diffusers import (
+    DDPMWuerstchenScheduler,
+    StableCascadeCombinedPipeline,
+    StableCascadeDecoderPipeline,
+    StableCascadePriorPipeline,
+)
+from diffusers.loaders.single_file_utils import convert_stable_cascade_unet_single_file_to_diffusers
+from diffusers.models import StableCascadeUNet
+from diffusers.models.modeling_utils import load_model_dict_into_meta
+from diffusers.pipelines.wuerstchen import PaellaVQModel
+from diffusers.utils import is_accelerate_available
+
+
+if is_accelerate_available():
+    from accelerate import init_empty_weights
+
+parser = argparse.ArgumentParser(description="Convert Stable Cascade model weights to a diffusers pipeline")
+parser.add_argument("--model_path", type=str, required=True, help="Location of Stable Cascade weights")
+parser.add_argument(
+    "--stage_c_name", type=str, default="stage_c_lite.safetensors", help="Name of stage c checkpoint file"
+)
+parser.add_argument(
+    "--stage_b_name", type=str, default="stage_b_lite.safetensors", help="Name of stage b checkpoint file"
+)
+parser.add_argument("--skip_stage_c", action="store_true", help="Skip converting stage c")
+parser.add_argument("--skip_stage_b", action="store_true", help="Skip converting stage b")
+parser.add_argument("--use_safetensors", action="store_true", help="Use SafeTensors for conversion")
+parser.add_argument(
+    "--prior_output_path",
+    default="stable-cascade-prior-lite",
+    type=str,
+    help="Directory to save the prior pipeline to",
+)
+parser.add_argument(
+    "--decoder_output_path",
+    type=str,
+    default="stable-cascade-decoder-lite",
+    help="Directory to save the decoder pipeline to",
+)
+parser.add_argument(
+    "--combined_output_path",
+    type=str,
+    default="stable-cascade-combined-lite",
+    help="Directory to save the combined pipeline to",
+)
+parser.add_argument("--save_combined", action="store_true", help="Also save the combined prior + decoder pipeline")
+parser.add_argument("--push_to_hub", action="store_true", help="Push to hub")
+parser.add_argument("--variant", type=str, help="Set to bf16 to save bfloat16 weights")
+
+args = parser.parse_args()
+
+if args.skip_stage_b and args.skip_stage_c:
+    raise ValueError("At least one stage should be converted")
+if (args.skip_stage_b or args.skip_stage_c) and args.save_combined:
+    raise ValueError("Cannot skip stages when creating a combined pipeline")
+
+model_path = args.model_path
+
+device = "cpu"
+if args.variant == "bf16":
+    dtype = torch.bfloat16
+else:
+    dtype = torch.float32
+
+# set paths to model weights
+prior_checkpoint_path = f"{model_path}/{args.stage_c_name}"
+decoder_checkpoint_path = f"{model_path}/{args.stage_b_name}"
+
+# Clip Text encoder and tokenizer
+config = CLIPConfig.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k")
+config.text_config.projection_dim = config.projection_dim
+text_encoder = CLIPTextModelWithProjection.from_pretrained(
+    "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", config=config.text_config
+)
+tokenizer = AutoTokenizer.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k")
+
+# image processor
+feature_extractor = CLIPImageProcessor()
+image_encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")
+# scheduler for prior and decoder
+scheduler = DDPMWuerstchenScheduler()
+
+ctx = init_empty_weights if is_accelerate_available() else nullcontext
+
+if not args.skip_stage_c:
+    # Prior: a .safetensors checkpoint is always read with load_file, as torch.load cannot parse that format
+    if args.use_safetensors or prior_checkpoint_path.endswith(".safetensors"):
+        prior_orig_state_dict = load_file(prior_checkpoint_path, device=device)
+    else:  # non-.safetensors checkpoints are unpickled by torch.load; use trusted files only
+        prior_orig_state_dict = torch.load(prior_checkpoint_path, map_location=device)
+
+    prior_state_dict = convert_stable_cascade_unet_single_file_to_diffusers(prior_orig_state_dict)
+    with ctx():
+        prior_model = StableCascadeUNet(
+            in_channels=16,
+            out_channels=16,
+            timestep_ratio_embedding_dim=64,
+            patch_size=1,
+            conditioning_dim=1536,
+            block_out_channels=[1536, 1536],
+            num_attention_heads=[24, 24],
+            down_num_layers_per_block=[4, 12],
+            up_num_layers_per_block=[12, 4],
+            down_blocks_repeat_mappers=[1, 1],
+            up_blocks_repeat_mappers=[1, 1],
+            block_types_per_layer=[
+                ["SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"],
+                ["SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"],
+            ],
+            clip_text_in_channels=1280,
+            clip_text_pooled_in_channels=1280,
+            clip_image_in_channels=768,
+            clip_seq=4,
+            kernel_size=3,
+            dropout=[0.1, 0.1],
+            self_attn=True,
+            timestep_conditioning_type=["sca", "crp"],
+            switch_level=[False],
+        )
+
+    if is_accelerate_available():
+        load_model_dict_into_meta(prior_model, prior_state_dict)
+    else:
+        prior_model.load_state_dict(prior_state_dict)
+
+    # Prior pipeline
+    prior_pipeline = StableCascadePriorPipeline(
+        prior=prior_model,
+        tokenizer=tokenizer,
+        text_encoder=text_encoder,
+        image_encoder=image_encoder,
+        scheduler=scheduler,
+        feature_extractor=feature_extractor,
+    )
+    prior_pipeline.to(dtype).save_pretrained(
+        args.prior_output_path, push_to_hub=args.push_to_hub, variant=args.variant
+    )
+
+if not args.skip_stage_b:
+    # Decoder: a .safetensors checkpoint is always read with load_file, as torch.load cannot parse that format
+    if args.use_safetensors or decoder_checkpoint_path.endswith(".safetensors"):
+        decoder_orig_state_dict = load_file(decoder_checkpoint_path, device=device)
+    else:  # non-.safetensors checkpoints are unpickled by torch.load; use trusted files only
+        decoder_orig_state_dict = torch.load(decoder_checkpoint_path, map_location=device)
+
+    decoder_state_dict = convert_stable_cascade_unet_single_file_to_diffusers(decoder_orig_state_dict)
+
+    with ctx():
+        decoder = StableCascadeUNet(
+            in_channels=4,
+            out_channels=4,
+            timestep_ratio_embedding_dim=64,
+            patch_size=2,
+            conditioning_dim=1280,
+            block_out_channels=[320, 576, 1152, 1152],
+            down_num_layers_per_block=[2, 4, 14, 4],
+            up_num_layers_per_block=[4, 14, 4, 2],
+            down_blocks_repeat_mappers=[1, 1, 1, 1],
+            up_blocks_repeat_mappers=[2, 2, 2, 2],
+            num_attention_heads=[0, 9, 18, 18],
+            block_types_per_layer=[
+                ["SDCascadeResBlock", "SDCascadeTimestepBlock"],
+                ["SDCascadeResBlock", "SDCascadeTimestepBlock"],
+                ["SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"],
+                ["SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"],
+            ],
+            clip_text_pooled_in_channels=1280,
+            clip_seq=4,
+            effnet_in_channels=16,
+            pixel_mapper_in_channels=3,
+            kernel_size=3,
+            dropout=[0, 0, 0.1, 0.1],
+            self_attn=True,
+            timestep_conditioning_type=["sca"],
+        )
+
+    if is_accelerate_available():
+        load_model_dict_into_meta(decoder, decoder_state_dict)
+    else:
+        decoder.load_state_dict(decoder_state_dict)
+
+    # VQGAN from Wuerstchen-V2
+    vqmodel = PaellaVQModel.from_pretrained("warp-ai/wuerstchen", subfolder="vqgan")
+
+    # Decoder pipeline
+    decoder_pipeline = StableCascadeDecoderPipeline(
+        decoder=decoder, text_encoder=text_encoder, tokenizer=tokenizer, vqgan=vqmodel, scheduler=scheduler
+    )
+    decoder_pipeline.to(dtype).save_pretrained(
+        args.decoder_output_path, push_to_hub=args.push_to_hub, variant=args.variant
+    )
+
+if args.save_combined:
+    # Stable Cascade combined pipeline
+    stable_cascade_pipeline = StableCascadeCombinedPipeline(
+        # Decoder
+        text_encoder=text_encoder,
+        tokenizer=tokenizer,
+        decoder=decoder,
+        scheduler=scheduler,
+        vqgan=vqmodel,
+        # Prior
+        prior_text_encoder=text_encoder,
+        prior_tokenizer=tokenizer,
+        prior_prior=prior_model,
+        prior_scheduler=scheduler,
+        prior_image_encoder=image_encoder,
+        prior_feature_extractor=feature_extractor,
+    )
+    stable_cascade_pipeline.to(dtype).save_pretrained(
+        args.combined_output_path, push_to_hub=args.push_to_hub, variant=args.variant
+    )
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Conversion script for the Versatile Stable Diffusion checkpoints. """
+"""Conversion script for the Versatile Stable Diffusion checkpoints."""

 import argparse
 from argparse import Namespace
@@ -11,6 +11,7 @@ $ python convert_zero123_to_diffusers.py \
   --original_config_file /path/zero123/configs/sd-objaverse-finetune-c_concat-256.yaml
 ```
 """
+
 import argparse

 import torch
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Dhruv Nair	795f2c90fd	update	2024-03-22 10:44:25 +00:00
Dhruv Nair	84e2337807	update	2024-03-22 10:39:51 +00:00
Sayak Paul	9613576191	add: space for calculating memory usagee. (#7414 ) * add: space for calculating memory usahe. * Update docs/source/en/using-diffusers/loading.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --------- Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>	2024-03-22 08:43:21 +05:30
YiYi Xu	e4356d6488	add a "Community Scripts" section (#7358 ) * add * add tiling * fix * fix * fix * give community script its own readme * Update examples/community/README_community_scripts.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * Update examples/community/README_community_scripts.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * Update examples/community/README_community_scripts.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * Update examples/community/README_community_scripts.md --------- Co-authored-by: Alexis Rolland <alexis.rolland@ubisoft.com> Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>	2024-03-21 10:05:07 -10:00
Sayak Paul	82441460ef	[Docs] add missing output image (#7425 ) add missing output image	2024-03-21 09:22:06 -07:00
sayakpaul	3e1097cb63	Revert "add: space within docs to calculate mememory usage." This reverts commit `78990dd960`.	2024-03-21 08:33:02 +05:30
sayakpaul	78990dd960	add: space within docs to calculate mememory usage.	2024-03-21 08:32:37 +05:30
Yuanhao Zhai	405a1facd2	fix: enable unet_3d_condition to support time_cond_proj_dim (#7364 ) Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>	2024-03-21 07:46:32 +05:30
M. Tolga Cangöz	3028089e5e	Fix typos (#7411 ) * Fix typos * Fix typo in SVD.md	2024-03-20 18:46:47 -07:00
Sayak Paul	b536f39818	[Custom Pipelines with Custom Components] fix multiple things (#7304 ) * checking to improve pipelines. * more fixes. * add: tip to encourage the usage of revision * Apply suggestions from code review * retrigger ci --------- Co-authored-by: YiYi Xu <yixu310@gmail.com>	2024-03-20 18:49:00 +05:30
Sayak Paul	e25e525fde	[LoRA test suite] refactor the test suite and cleanse it (#7316 ) * cleanse and refactor lora testing suite. * more cleanup. * make check_if_lora_correctly_set a utility function * fix: typo * retrigger ci * style	2024-03-20 17:13:52 +05:30
Sayak Paul	de9adb907c	clean dep installation step in push_tests (#7382 ) * clean dep installation step in push_tests * fix: deps	2024-03-20 07:30:43 +05:30
Sayak Paul	bf861e65dc	[Chore] add: fives names to citations. (#7395 ) * add: four names to citations. * add: steven	2024-03-20 06:37:57 +05:30
Dhruv Nair	4da810b943	Remove insecure `torch.load` calls (#7393 ) update	2024-03-19 12:41:50 -10:00
Stephen	161c6e14b6	Change path to posix (modeling_utils.py) (#6781 ) * Change path to posix * running isort * run style and quality checks --------- Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>	2024-03-19 11:50:34 -10:00
laksjdjf	a6c9015c4e	Fix ControlNetModel.from_unet do not load add_embedding (#7269 ) * Fix ControlNetModel.from_unet do not load add_embedding * delete white space in blank line --------- Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>	2024-03-19 09:45:08 -10:00
PJC	e6a5f99e5c	Update pipeline_controlnet_sd_xl_img2img.py (#7353 ) * Update pipeline_controlnet_sd_xl_img2img.py fix: safetensors load error * fix for pass test --------- Co-authored-by: YiYi Xu <yixu310@gmail.com>	2024-03-19 09:29:39 -10:00
Dhruv Nair	80ff4ba63e	Fix issue with prompt embeds and latents in SD Cascade Decoder with multiple image embeddings for a single prompt. (#7381 ) * fix * update * update --------- Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>	2024-03-19 07:40:14 -10:00
Sayak Paul	b09a2aa308	[LoRA] fix `cross_attention_kwargs` problems and tighten tests (#7388 ) * debugging * let's see the numbers * let's see the numbers * let's see the numbers * restrict tolerance. * increase inference steps. * shallow copy of cross_attentionkwargs * remove print	2024-03-19 17:53:38 +05:30
YiYi Xu	63b6846849	[scheduler] fix a bug in add_noise (#7386 ) * fix * fix * add a tests * fix --------- Co-authored-by: Sayak Paul <spsayakpaul@gmail.com> Co-authored-by: yiyixuxu <yixu310@gmail,com>	2024-03-19 00:50:58 -10:00
lawfordp2017	139f707e6e	Correction for non-integral image resolutions with quantizations other than float32 (#7356 ) * Correction for non-integral image resolutions with quantizations other than float32. * Support for training, and use of diffusers-style casting.	2024-03-19 16:17:44 +05:30
Aryan	e4546fd5bb	[docs] Add missing copied from statements in TCD Scheduler (#7360 ) * add missing copied from statements in tcd scheduler * update docstring --------- Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>	2024-03-19 00:45:36 -10:00
Dhruv Nair	d44e31aec2	Add FreeInit Outputs to Docs Page (#7384 ) * update * fix	2024-03-19 14:13:41 +05:30
Sayak Paul	ce9825b56b	[LoRA] pop the LoRA scale so that it doesn't get propagated to the weeds (#7338 ) * pop scale from the top-level unet instead of getting it. * improve readability. * Apply suggestions from code review Co-authored-by: YiYi Xu <yixu310@gmail.com> * fix a little bit. --------- Co-authored-by: YiYi Xu <yixu310@gmail.com>	2024-03-19 09:12:05 +05:30
M. Tolga Cangöz	85f9d92883	Fix conditional statement in test_schedulers.py (#7323 ) Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>	2024-03-19 08:28:47 +05:30
M. Tolga Cangöz	916d9812a8	Update loading of config from a file in test_config.py (#7344 ) Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>	2024-03-18 11:47:36 -10:00
M. Tolga Cangöz	e6a8492242	Use PyTorch's conventional inplace functions (#7332 ) Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>	2024-03-18 09:12:15 -10:00
Beinsezii	ad0308b3f1	Add Cascade to Auto T2I + Decoder mappings (#7362 ) * Add Cascade to Auto T2I + Decoder mappings * ruff autofix --------- Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>	2024-03-18 08:58:20 -10:00
M. Tolga Cangöz	e97a633b63	Update access of configuration attributes (#7343 ) Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>	2024-03-18 08:53:29 -10:00
Sayak Paul	01ac37b331	[LoRA] Clean Kohya conversion utils (#7374 ) * clean up the kohya_conversion utility * state dict assignment	2024-03-18 06:53:37 -10:00
M. Tolga Cangöz	6a05b274cc	Fix Typos (#7325 ) * Fix PyTorch's convention for inplace functions * Fix import structure in __init__.py and update config loading logic in test_config.py * Update configuration access * Fix typos * Trim trailing white spaces * Fix typo in logger name * Revert "Fix PyTorch's convention for inplace functions" This reverts commit `f65dc4afcb`. * Fix typo in step_index property description * Revert "Update configuration access" This reverts commit `8d44e870b8`. * Revert "Fix import structure in __init__.py and update config loading logic in test_config.py" This reverts commit `2ad5e8bca2`. * Fix typos * Fix typos * Fix typos * Fix a typo: tranform -> transform	2024-03-18 09:48:40 -07:00
Anatoly Belikov	98d46a3f08	delete vae and text encoders after use in SDXL training script (#6693 ) delete vae and text encoders after use Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>	2024-03-18 20:03:53 +05:30
Dhruv Nair	4330a747d4	[Tests] Fix ControlNet Single File tests (#7315 ) * update * update --------- Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>	2024-03-18 11:28:59 +05:30
Sayak Paul	76de6a09fb	post-release v0.27.0 (#7329 ) * post-release * quality	2024-03-18 10:52:20 +05:30
Sayak Paul	25caf24ef9	Fix release workflow deps (#7339 ) * pop scale from the top-level unet instead of getting it. * improve readability. * fix: pypi workflow deps * revert	2024-03-16 07:18:11 +05:30
Abubakar Abid	8db3c9bc9f	Adds docs for `gradio.Interface.from_pipeline()` (#7346 ) * gradio docs * Update docs/source/en/api/pipelines/stable_diffusion/overview.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * changes * changes * changes * Update docs/source/en/api/pipelines/stable_diffusion/overview.md --------- Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>	2024-03-16 07:11:28 +05:30
Sayak Paul	e0e9f81971	add: torch to the pypi step. (#7328 )	2024-03-15 12:28:12 +05:30
M. Tolga Cangöz	5d848ec07c	[`Tests`] Update a deprecated parameter in test files and fix several typos (#7277 ) * Add properties and `IPAdapterTesterMixin` tests for `StableDiffusionPanoramaPipeline` * Fix variable name typo and update comments * Update deprecated `output_type="numpy"` to "np" in test files * Discard changes to src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py * Update test_stable_diffusion_panorama.py * Update numbers in README.md * Update get_guidance_scale_embedding method to use timesteps instead of w * Update number of checkpoints in README.md * Add type hints and fix var name * Fix PyTorch's convention for inplace functions * Fix a typo * Revert "Fix PyTorch's convention for inplace functions" This reverts commit `74350cf65b`. * Fix typos * Indent * Refactor get_guidance_scale_embedding method in LEditsPPPipelineStableDiffusionXL class	2024-03-14 12:17:35 -07:00
Dhruv Nair	4974b84564	Update Cascade Tests (#7324 ) * update * update * update	2024-03-14 20:51:22 +05:30
Linoy Tsaban	83062fb872	[Advanced DreamBooth LoRA SDXL] Support EDM-style training (follow up of #7126 ) (#7182 ) * add edm style training * style * finish adding edm training feature * import fix * fix latents mean * minor adjustments * add edm to readme * style * fix autocast and scheduler config issues when using edm * style --------- Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>	2024-03-14 18:40:14 +05:30
Suraj Patil	b6d7e31d10	add edm schedulers in doc (#7319 ) * add edm schedulers in doc * add in toctree * address reviewe comments	2024-03-14 11:52:25 +01:00
Anatoly Belikov	53e9aacc10	log loss per image (#7278 ) * log loss per image * add commandline param for per image loss logging * style * debug-loss -> debug_loss --------- Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>	2024-03-14 11:41:43 +05:30
Dhruv Nair	41424466e3	[Tests] Fix incorrect constant in VAE scaling test. (#7301 ) update	2024-03-14 10:24:01 +05:30
Sayak Paul	95de1981c9	add: pytest log installation (#7313 )	2024-03-14 10:01:16 +05:30
Kenneth Gerald Hamilton	0b45b58867	update get_order_list if statement (#7309 ) * update get_order_list if statement * revery	2024-03-13 18:29:42 -10:00
Beinsezii	d3986f18be	Change step_offset scheduler docstrings (#7128 ) * Change step_offset scheduler docstrings * Mention it may be needed by some models * More docstrings These ones failed literal S&R because I performed it case-sensitive which is fun. --------- Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>	2024-03-13 15:12:00 -10:00
Alexander Bonnet	ee6a3a993d	Fix typos in `UNet2DConditionModel` documentation (#7291 ) * fix typo in UNet2DConditionModel documentation * Fix indentation that may fix doc rendering * Fix squished doc lines	2024-03-13 09:31:29 -07:00
Michael	b300517305	Add Intro page of TCD (#7259 ) * add tcd intro * resolve repos * Apply suggestions from code review Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * revise NFEs related * change inpainting location --------- Co-authored-by: Sayak Paul <spsayakpaul@gmail.com> Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>	2024-03-13 09:21:51 -07:00
jnhuang	ac07b6dc6a	Fix Wrong Text-encoder Grad Setting in Custom_Diffusion Training (#7302 ) fix index in set textencoder grad Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>	2024-03-13 20:22:44 +05:30
Sayak Paul	46ab56a468	add: support for notifying maintainers about the nightly test status (#7117 ) * add: support for notifying maintainers about the nightly test status * add: a tempoerary workflow for validation. * cancel in progress. * runs-on * clean up * add: peft dep * change device. * multiple edits. * remove temp workflow.	2024-03-13 16:48:11 +05:30
Sayak Paul	038ff70023	[PyPI publishing] feat: automate the process of pypi publication to some extent. (#7270 ) * feat: automate the process of pypi publication to some extent. * utility to fetch the latest release branch * correct package name.	2024-03-13 16:27:59 +05:30
Manuel Brack	00eca4b887	[Pipeline] Add LEDITS++ pipelines (#6074 ) * Setup LEdits++ file structure * Fix import * LEditsPP Stable Diffusion pipeline * Include variable image aspect ratios * Implement LEDITS++ for SDXL * clean up LEditsPPPipelineStableDiffusion * Adjust inversion output * Added docu, more cleanup for LEditsPPPipelineStableDiffusion * clean up LEditsPPPipelineStableDiffusionXL * Update documentation * Fix documentation import * Add skeleton IF implementation * Fix documentation typo * Add LEDTIS docu to toctree * Add missing title * Finalize SD documentation * Finalize SD-XL documentation * Fix code style and quality * Fix typo * Fix return types * added LEditsPPPipelineIF; minor changes for LEditsPPPipelineStableDiffusion and LEditsPPPipelineStableDiffusionXL * Fix copy reference * add documentation for IF * Add first tests * Fix batching for SD-XL * Fix text encoding and perfect reconstruction for SD-XL * Add tests for SD-XL, minor changes * move user_mask to correct device, use cross_attention_kwargs also for inversion * Example docstring * Fix attention resolution for non-square images * Refactoring for PR review * Safely remove ledits_utils.py * Style fixes * Replace assertions with ValueError * Remove LEditsPPPipelineIF * Remove unecessary input checks * Refactoring of CrossAttnProcessor * Revert unecessary changes to scheduler * Remove first progress-bar in inversion * Refactor scheduler usage and reset * Use imageprocessor instead of custom logic * Fix scheduler init warning * Fix error when running the pipeline in fp16 * Update documentation wrt perfect inversion * Update tests * Fix code quality and copy consistency * Update LEditsPP import * Remove enable/disable methods that are now in StableDiffusionMixin * Change import in docs * Revert import structure change * Fix ledits imports --------- Co-authored-by: Katharina Kornmeier <katharina.kornmeier@stud.tu-darmstadt.de>	2024-03-13 12:43:47 +02:00
Dhruv Nair	30132aba30	Update Stable Cascade Conversion Scripts (#7271 ) * update * update * update * update * update * update * update * update * update --------- Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>	2024-03-13 12:35:44 +05:30
Dhruv Nair	a17d6d6858	Update Cascade documentation (#7257 ) * updates * update * update * Update docs/source/en/api/pipelines/stable_cascade.md Co-authored-by: Sayak Paul <spsayakpaul@gmail.com> * update * update * update * update --------- Co-authored-by: Sayak Paul <spsayakpaul@gmail.com> Co-authored-by: Kashif Rasul <kashif.rasul@gmail.com>	2024-03-13 11:29:59 +05:30
Sayak Paul	8efd9ce787	[Chore] clean residue from copy-pasting in the UNet single file loader (#7295 ) clean residue from copy-pasting	2024-03-13 11:20:13 +05:30
Dhruv Nair	299c16d0f5	Fix loading Img2Img refiner components in `from_single_file` (#7282 ) * update * update * update * update	2024-03-13 09:25:53 +05:30
Dhruv Nair	69f49195ac	Fix passing pooled prompt embeds to Cascade Decoder and Combined Pipeline (#7287 ) * update * update * update * update	2024-03-13 09:21:41 +05:30
Dhruv Nair	ed224f94ba	Add single file support for Stable Cascade (#7274 ) * update * update * update * update * update * update --------- Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>	2024-03-13 08:37:31 +05:30
Sayak Paul	531e719163	[LoRA] use the PyTorch classes wherever needed and start depcrecation cycles (#7204 ) * fix PyTorch classes and start deprecsation cycles. * remove args crafting for accommodating scale. * remove scale check in feedforward. * assert against nn.Linear and not CompatibleLinear. * remove conv_cls and lineaR_cls. * remove scale * 👋 scale. * fix: unet2dcondition * fix attention.py * fix: attention.py again * fix: unet_2d_blocks. * fix-copies. * more fixes. * fix: resnet.py * more fixes * fix i2vgenxl unet. * depcrecate scale gently. * fix-copies * Apply suggestions from code review Co-authored-by: YiYi Xu <yixu310@gmail.com> * quality * throw warning when scale is passed to the the BasicTransformerBlock class. * remove scale from signature. * cross_attention_kwargs, very nice catch by Yiyi * fix: logger.warn * make deprecation message clearer. * address final comments. * maintain same depcrecation message and also add it to activations. * address yiyi * fix copies * Apply suggestions from code review Co-authored-by: YiYi Xu <yixu310@gmail.com> * more depcrecation * fix-copies --------- Co-authored-by: YiYi Xu <yixu310@gmail.com>	2024-03-13 07:56:19 +05:30
Sayak Paul	4fbd310fd2	[Chore] switch to `logger.warning` (#7289 ) switch to logger.warning	2024-03-13 06:56:43 +05:30
Dhruv Nair	2ea28d69dc	Change `export_to_video` default (#6990 ) update Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>	2024-03-12 17:13:12 +05:30