up

2024-03-18 11:47:47 +01:00 · 2024-03-18 11:34:17 +01:00
284 changed files with 3098 additions and 13640 deletions
@@ -12,96 +12,110 @@ env:
  PYTEST_TIMEOUT: 600
  RUN_SLOW: yes
  RUN_NIGHTLY: yes
-  PIPELINE_USAGE_CUTOFF: 5000
  SLACK_API_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

 jobs:
-  setup_torch_cuda_pipeline_matrix:
-    name: Setup Torch Pipelines Matrix
-    runs-on: ubuntu-latest
-    outputs:
-      pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
-    steps:
-      - name: Checkout diffusers
-        uses: actions/checkout@v3
-        with:
-          fetch-depth: 2
-      - name: Set up Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: "3.8"
-      - name: Install dependencies
-        run: |
-          pip install -e .
-          pip install huggingface_hub
-      - name: Fetch Pipeline Matrix
-        id: fetch_pipeline_matrix
-        run: |
-          matrix=$(python utils/fetch_torch_cuda_pipeline_test_matrix.py)
-          echo $matrix
-          echo "pipeline_test_matrix=$matrix" >> $GITHUB_OUTPUT
-
-      - name: Pipeline Tests Artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v2
-        with:
-          name: test-pipelines.json
-          path: reports
-
-  run_nightly_tests_for_torch_pipelines:
-    name: Torch Pipelines CUDA Nightly Tests
-    needs: setup_torch_cuda_pipeline_matrix
+  run_nightly_tests:
    strategy:
      fail-fast: false
      matrix:
-        module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+        config:
+          - name: Nightly PyTorch CUDA tests on Ubuntu
+            framework: pytorch
+            runner: docker-gpu
+            image: diffusers/diffusers-pytorch-cuda
+            report: torch_cuda
+          - name: Nightly Flax TPU tests on Ubuntu
+            framework: flax
+            runner: docker-tpu
+            image: diffusers/diffusers-flax-tpu
+            report: flax_tpu
+          - name: Nightly ONNXRuntime CUDA tests on Ubuntu
+            framework: onnxruntime
+            runner: docker-gpu
+            image: diffusers/diffusers-onnxruntime-cuda
+            report: onnx_cuda
+
+    name: ${{ matrix.config.name }}
+
+    runs-on: ${{ matrix.config.runner }}
+
    container:
-      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
+      image: ${{ matrix.config.image }}
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ ${{ matrix.config.runner == 'docker-tpu' && '--privileged' || '--gpus 0'}}
+
+    defaults:
+      run:
+        shell: bash
+
    steps:
      - name: Checkout diffusers
        uses: actions/checkout@v3
        with:
          fetch-depth: 2
+
      - name: NVIDIA-SMI
-        run: nvidia-smi
-      
+        if: ${{ matrix.config.runner == 'docker-gpu' }}
+        run: |
+          nvidia-smi
+
      - name: Install dependencies
        run: |
-          apt-get update && apt-get install libsndfile1-dev libgl1 -y
          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
          python -m uv pip install -e [quality,test]
-          python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
+          python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers
+          python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate
          python -m uv pip install pytest-reportlog
-      
+
      - name: Environment
        run: |
          python utils/print_env.py
-      
-      - name: Nightly PyTorch CUDA checkpoint (pipelines) tests 
+
+      - name: Run nightly PyTorch CUDA tests
+        if: ${{ matrix.config.framework == 'pytorch' }}
        env:
          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-          # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
-          CUBLAS_WORKSPACE_CONFIG: :16:8
        run: |
+          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
          python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
            -s -v -k "not Flax and not Onnx" \
-            --make-reports=tests_pipeline_${{ matrix.module }}_cuda \
-            --report-log=tests_pipeline_${{ matrix.module }}_cuda.log \ 
-            tests/pipelines/${{ matrix.module }}
-      
+            --make-reports=tests_${{ matrix.config.report }} \
+            --report-log=${{ matrix.config.report }}.log \
+            tests/ 
+
+      - name: Run nightly Flax TPU tests
+        if: ${{ matrix.config.framework == 'flax' }}
+        env:
+          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+        run: |
+          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+          python -m pytest -n 0 \
+            -s -v -k "Flax" \
+            --make-reports=tests_${{ matrix.config.report }} \
+            --report-log=${{ matrix.config.report }}.log \
+            tests/
+
+      - name: Run nightly ONNXRuntime CUDA tests
+        if: ${{ matrix.config.framework == 'onnxruntime' }}  # only for the onnxruntime matrix entry
+        env:
+          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+        run: |
+          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+          python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
+            -s -v -k "Onnx" \
+            --make-reports=tests_${{ matrix.config.report }} \
+            --report-log=${{ matrix.config.report }}.log \
+            tests/
+
      - name: Failure short reports
        if: ${{ failure() }}
-        run: |
-          cat reports/tests_pipeline_${{ matrix.module }}_cuda_stats.txt
-          cat reports/tests_pipeline_${{ matrix.module }}_cuda_failures_short.txt
+        run: cat reports/tests_${{ matrix.config.report }}_failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
-          name: pipeline_${{ matrix.module }}_test_reports
+          name: ${{ matrix.config.report }}_test_reports
          path: reports
      
      - name: Generate Report and Notify Channel
@@ -110,248 +124,6 @@ jobs:
          pip install slack_sdk tabulate
          python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY

-  run_nightly_tests_for_other_torch_modules:
-    name: Torch Non-Pipelines CUDA Nightly Tests
-    runs-on: docker-gpu
-    container:
-      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
-    defaults:
-      run:
-        shell: bash
-    strategy:
-      matrix:
-        module: [models, schedulers, others, examples]
-    steps:
-    - name: Checkout diffusers
-      uses: actions/checkout@v3
-      with:
-        fetch-depth: 2
-
-    - name: Install dependencies
-      run: |
-        apt-get update && apt-get install libsndfile1-dev libgl1 -y
-        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m uv pip install -e [quality,test]
-        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
-        python -m uv pip install pytest-reportlog
-
-    - name: Environment
-      run: python utils/print_env.py
-
-    - name: Run nightly PyTorch CUDA tests for non-pipeline modules
-      if: ${{ matrix.module != 'examples'}} 
-      env:
-        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
-        CUBLAS_WORKSPACE_CONFIG: :16:8
-      run: |
-        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-          -s -v -k "not Flax and not Onnx" \
-          --make-reports=tests_torch_${{ matrix.module }}_cuda \
-          --report-log=tests_torch_${{ matrix.module }}_cuda.log \ 
-          tests/${{ matrix.module }}
-
-    - name: Run nightly example tests with Torch
-      if: ${{ matrix.module == 'examples' }}
-      env:
-        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
-        CUBLAS_WORKSPACE_CONFIG: :16:8
-      run: |
-        python -m uv pip install peft@git+https://github.com/huggingface/peft.git
-        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-          -s -v --make-reports=examples_torch_cuda \
-          --report-log=examples_torch_cuda.log \ 
-          examples/
-
-    - name: Failure short reports
-      if: ${{ failure() }}
-      run: |
-        cat reports/tests_torch_${{ matrix.module }}_cuda_stats.txt 
-        cat reports/tests_torch_${{ matrix.module }}_cuda_failures_short.txt
-
-    - name: Test suite reports artifacts
-      if: ${{ always() }}
-      uses: actions/upload-artifact@v2
-      with:
-        name: torch_${{ matrix.module }}_cuda_test_reports
-        path: reports
-
-    - name: Generate Report and Notify Channel
-      if: always()
-      run: |
-        pip install slack_sdk tabulate
-        python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
-
-  run_lora_nightly_tests:
-    name: Nightly LoRA Tests with PEFT and TORCH
-    runs-on: docker-gpu
-    container:
-      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
-    defaults:
-      run:
-        shell: bash
-    steps:
-    - name: Checkout diffusers
-      uses: actions/checkout@v3
-      with:
-        fetch-depth: 2
-
-    - name: Install dependencies
-      run: |
-        apt-get update && apt-get install libsndfile1-dev libgl1 -y
-        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m uv pip install -e [quality,test]
-        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
-        python -m uv pip install peft@git+https://github.com/huggingface/peft.git
-        python -m uv pip install pytest-reportlog
-
-    - name: Environment
-      run: python utils/print_env.py
-
-    - name: Run nightly LoRA tests with PEFT and Torch
-      env:
-        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
-        CUBLAS_WORKSPACE_CONFIG: :16:8
-      run: |
-        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-          -s -v -k "not Flax and not Onnx" \
-          --make-reports=tests_torch_lora_cuda \
-          --report-log=tests_torch_lora_cuda.log \ 
-          tests/lora
-    
-    - name: Failure short reports
-      if: ${{ failure() }}
-      run: |
-        cat reports/tests_torch_lora_cuda_stats.txt 
-        cat reports/tests_torch_lora_cuda_failures_short.txt
-
-    - name: Test suite reports artifacts
-      if: ${{ always() }}
-      uses: actions/upload-artifact@v2
-      with:
-        name: torch_lora_cuda_test_reports
-        path: reports
-
-    - name: Generate Report and Notify Channel
-      if: always()
-      run: |
-        pip install slack_sdk tabulate
-        python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
-  
-  run_flax_tpu_tests:
-    name: Nightly Flax TPU Tests
-    runs-on: docker-tpu
-    container:
-      image: diffusers/diffusers-flax-tpu
-      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --privileged
-    defaults:
-      run:
-        shell: bash
-    steps:
-    - name: Checkout diffusers
-      uses: actions/checkout@v3
-      with:
-        fetch-depth: 2
-
-    - name: Install dependencies
-      run: |
-        apt-get update && apt-get install libsndfile1-dev libgl1 -y
-        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m uv pip install -e [quality,test]
-        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
-        python -m uv pip install pytest-reportlog
-
-    - name: Environment
-      run: python utils/print_env.py
-
-    - name: Run nightly Flax TPU tests
-      env:
-        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-      run: |
-        python -m pytest -n 0 \
-          -s -v -k "Flax" \
-          --make-reports=tests_flax_tpu \
-          --report-log=tests_flax_tpu.log \ 
-          tests/
-
-    - name: Failure short reports
-      if: ${{ failure() }}
-      run: |
-        cat reports/tests_flax_tpu_stats.txt
-        cat reports/tests_flax_tpu_failures_short.txt
-
-    - name: Test suite reports artifacts
-      if: ${{ always() }}
-      uses: actions/upload-artifact@v2
-      with:
-        name: flax_tpu_test_reports
-        path: reports
-
-    - name: Generate Report and Notify Channel
-      if: always()
-      run: |
-        pip install slack_sdk tabulate
-        python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
-
-  run_nightly_onnx_tests:
-    name: Nightly ONNXRuntime CUDA tests on Ubuntu
-    runs-on: docker-gpu
-    container:
-      image: diffusers/diffusers-onnxruntime-cuda
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
-    
-    steps:
-    - name: Checkout diffusers
-      uses: actions/checkout@v3
-      with:
-        fetch-depth: 2
-
-    - name: NVIDIA-SMI
-      run: nvidia-smi
-
-    - name: Install dependencies
-      run: |
-        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m uv pip install -e [quality,test]
-        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
-        python -m uv pip install pytest-reportlog
-
-    - name: Environment
-      run: python utils/print_env.py
-    
-    - name: Run nightly ONNXRuntime CUDA tests
-      env:
-        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-      run: |
-        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-          -s -v -k "Onnx" \
-          --make-reports=tests_onnx_cuda \
-          --report-log=tests_onnx_cuda.log \ 
-          tests/
-
-    - name: Failure short reports
-      if: ${{ failure() }}
-      run: |
-        cat reports/tests_onnx_cuda_stats.txt
-        cat reports/tests_onnx_cuda_failures_short.txt
-
-    - name: Test suite reports artifacts
-      if: ${{ always() }}
-      uses: actions/upload-artifact@v2
-      with:
-        name: ${{ matrix.config.report }}_test_reports
-        path: reports
-    
-    - name: Generate Report and Notify Channel
-      if: always()
-      run: |
-        pip install slack_sdk tabulate
-        python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
-
  run_nightly_tests_apple_m1:
    name: Nightly PyTorch MPS tests on MacOS
    runs-on: [ self-hosted, apple-m1 ]
@@ -35,10 +35,6 @@ jobs:
        run: |
          ruff check examples tests src utils scripts
          ruff format examples tests src utils scripts --check
-      - name: Check if failure
-        if: ${{ failure() }}
-        run: |
-          echo "Quality check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make style && make quality'" >> $GITHUB_STEP_SUMMARY

  check_repository_consistency:
    needs: check_code_quality
@@ -58,10 +54,6 @@ jobs:
          python utils/check_copies.py
          python utils/check_dummies.py
          make deps_table_check_updated
-      - name: Check if failure
-        if: ${{ failure() }}
-        run: |
-          echo "Repo consistency check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make fix-copies'" >> $GITHUB_STEP_SUMMARY

  run_fast_tests:
    needs: [check_code_quality, check_repository_consistency]
@@ -113,4 +105,4 @@ jobs:
        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
          -s -v \
          --make-reports=tests_${{ matrix.config.report }} \
-          tests/lora/
+          tests/lora/test_lora_layers_peft.py
@@ -43,10 +43,6 @@ jobs:
        run: |
          ruff check examples tests src utils scripts
          ruff format examples tests src utils scripts --check
-      - name: Check if failure
-        if: ${{ failure() }}
-        run: |
-          echo "Quality check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make style && make quality'" >> $GITHUB_STEP_SUMMARY

  check_repository_consistency:
    needs: check_code_quality
@@ -66,10 +62,6 @@ jobs:
          python utils/check_copies.py
          python utils/check_dummies.py
          make deps_table_check_updated
-      - name: Check if failure
-        if: ${{ failure() }}
-        run: |
-          echo "Repo consistency check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make fix-copies'" >> $GITHUB_STEP_SUMMARY

  run_fast_tests:
    needs: [check_code_quality, check_repository_consistency]
@@ -21,7 +21,10 @@ env:
 jobs:
  setup_torch_cuda_pipeline_matrix:
    name: Setup Torch Pipelines CUDA Slow Tests Matrix
-    runs-on: ubuntu-latest
+    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    container:
+      image: diffusers/diffusers-pytorch-cpu # this is a CPU image, but we need it to fetch the matrix
+      options: --shm-size "16gb" --ipc host
    outputs:
      pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
    steps:
@@ -29,20 +32,24 @@ jobs:
        uses: actions/checkout@v3
        with:
          fetch-depth: 2
-      - name: Set up Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: "3.8"
      - name: Install dependencies
        run: |
-          pip install -e .
-          pip install huggingface_hub
+          apt-get update && apt-get install libsndfile1-dev libgl1 -y
+          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+          python -m uv pip install -e [quality,test]
+          python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
+
+      - name: Environment
+        run: |
+          python utils/print_env.py
+
      - name: Fetch Pipeline Matrix
        id: fetch_pipeline_matrix
        run: |
          matrix=$(python utils/fetch_torch_cuda_pipeline_test_matrix.py)
          echo $matrix
          echo "pipeline_test_matrix=$matrix" >> $GITHUB_OUTPUT
+
      - name: Pipeline Tests Artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
@@ -19,16 +19,6 @@ authors:
    family-names: Rasul
  - given-names: Mishig
    family-names: Davaadorj
-  - given-names: Dhruv
-    family-names: Nair
-  - given-names: Sayak
-    family-names: Paul
-  - given-names: Steven
-    family-names: Liu
-  - given-names: William
-    family-names: Berman
-  - given-names: Yiyi
-    family-names: Xu
  - given-names: Thomas
    family-names: Wolf
 repository-code: 'https://github.com/huggingface/diffusers'
@@ -238,7 +238,7 @@ We also want to thank @heejkoo for the very helpful overview of papers, code and

 ```bibtex
@misc{von-platen-etal-2022-diffusers,
-  author = {Patrick von Platen and Suraj Patil and Anton Lozhkov and Pedro Cuenca and Nathan Lambert and Kashif Rasul and Mishig Davaadorj and Dhruv Nair and Sayak Paul and William Berman and Yiyi Xu and Steven Liu and Thomas Wolf},
+  author = {Patrick von Platen and Suraj Patil and Anton Lozhkov and Pedro Cuenca and Nathan Lambert and Kashif Rasul and Mishig Davaadorj and Thomas Wolf},
  title = {Diffusers: State-of-the-art diffusion models},
  year = {2022},
  publisher = {GitHub},
@@ -408,29 +408,6 @@ Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers)

 </Tip>

-<table>
-    <tr>
-      <th align=center>Without FreeInit enabled</th>
-      <th align=center>With FreeInit enabled</th>
-    </tr>
-    <tr>
-        <td align=center>
-          panda playing a guitar
-          <br />
-          <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-no-freeinit.gif"
-              alt="panda playing a guitar"
-              style="width: 300px;" />
-        </td>
-        <td align=center>
-          panda playing a guitar
-          <br/>
-          <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-freeinit.gif"
-              alt="panda playing a guitar"
-              style="width: 300px;" />
-        </td>
-    </tr>
-</table>
-
 ## Using AnimateLCM

 [AnimateLCM](https://animatelcm.github.io/) is a motion module checkpoint and an [LCM LoRA](https://huggingface.co/docs/diffusers/using-diffusers/inference_with_lcm_lora) that have been created using a consistency learning strategy that decouples the distillation of the image generation priors and the motion generation priors.
@@ -88,7 +88,7 @@ accelerate config default

 Or if your environment doesn't support an interactive shell, like a notebook, you can use:

-```py
+```py
 from accelerate.utils import write_basic_config

 write_basic_config()
@@ -54,7 +54,7 @@ accelerate config default

 Or if your environment doesn't support an interactive shell, like a notebook, you can use:

-```py
+```py
 from accelerate.utils import write_basic_config

 write_basic_config()
@@ -84,7 +84,7 @@ Many of the basic parameters are described in the [DreamBooth](dreambooth#script
 - `--freeze_model`: freezes the key and value parameters in the cross-attention layer; the default is `crossattn_kv`, but you can set it to `crossattn` to train all the parameters in the cross-attention layer
 - `--concepts_list`: to learn multiple concepts, provide a path to a JSON file containing the concepts
 - `--modifier_token`: a special word used to represent the learned concept
- `--initializer_token`: a special word used to initialize the embeddings of the `modifier_token`
+- `--initializer_token`: a special word used to initialize the embeddings of the `modifier_token`

 ### Prior preservation loss

@@ -67,7 +67,7 @@ accelerate config default

 Or if your environment doesn't support an interactive shell, like a notebook, you can use:

-```py
+```py
 from accelerate.utils import write_basic_config

 write_basic_config()
@@ -180,7 +180,7 @@ elif args.pretrained_model_name_or_path:
        revision=args.revision,
        use_fast=False,
    )
-
+    
 # Load scheduler and models
 noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
 text_encoder = text_encoder_cls.from_pretrained(
@@ -51,7 +51,7 @@ accelerate config default

 Or if your environment doesn't support an interactive shell, like a notebook, you can use:

-```py
+```py
 from accelerate.utils import write_basic_config

 write_basic_config()
@@ -89,7 +89,7 @@ The dataset preprocessing code and training loop are found in the [`main()`](htt

 As with the script parameters, a walkthrough of the training script is provided in the [Text-to-image](text2image#training-script) training guide. Instead, this guide takes a look at the InstructPix2Pix relevant parts of the script.

-The script begins by modifying the [number of input channels](https://github.com/huggingface/diffusers/blob/64603389da01082055a901f2883c4810d1144edb/examples/instruct_pix2pix/train_instruct_pix2pix.py#L445) in the first convolutional layer of the UNet to account for InstructPix2Pix's additional conditioning image:
+The script begins by modifying the [number of input channels](https://github.com/huggingface/diffusers/blob/64603389da01082055a901f2883c4810d1144edb/examples/instruct_pix2pix/train_instruct_pix2pix.py#L445) in the first convolutional layer of the UNet to account for InstructPix2Pix's additional conditioning image:

 ```py
 in_channels = 8
@@ -59,7 +59,7 @@ accelerate config default

 Or if your environment doesn't support an interactive shell, like a notebook, you can use:

-```py
+```py
 from accelerate.utils import write_basic_config

 write_basic_config()
@@ -235,7 +235,7 @@ accelerate launch --mixed_precision="fp16"  train_text_to_image_prior.py \
  --validation_prompts="A robot pokemon, 4k photo" \
  --report_to="wandb" \
  --push_to_hub \
-  --output_dir="kandi2-prior-pokemon-model"
+  --output_dir="kandi2-prior-pokemon-model" 
 ```

 </hfoption>
@@ -259,7 +259,7 @@ accelerate launch --mixed_precision="fp16"  train_text_to_image_decoder.py \
  --validation_prompts="A robot pokemon, 4k photo" \
  --report_to="wandb" \
  --push_to_hub \
-  --output_dir="kandi2-decoder-pokemon-model"
+  --output_dir="kandi2-decoder-pokemon-model" 
 ```

 </hfoption>
@@ -53,7 +53,7 @@ accelerate config default

 Or if your environment doesn't support an interactive shell, like a notebook, you can use:

-```py
+```py
 from accelerate.utils import write_basic_config

 write_basic_config()
@@ -252,4 +252,4 @@ The SDXL training script is discussed in more detail in the [SDXL training](sdxl
 Congratulations on distilling a LCM model! To learn more about LCM, the following may be helpful:

 - Learn how to use [LCMs for inference](../using-diffusers/lcm) for text-to-image, image-to-image, and with LoRA checkpoints.
- Read the [SDXL in 4 steps with Latent Consistency LoRAs](https://huggingface.co/blog/lcm_lora) blog post to learn more about SDXL LCM-LoRA's for super fast inference, quality comparisons, benchmarks, and more.
+- Read the [SDXL in 4 steps with Latent Consistency LoRAs](https://huggingface.co/blog/lcm_lora) blog post to learn more about SDXL LCM-LoRA's for super fast inference, quality comparisons, benchmarks, and more.
@@ -59,7 +59,7 @@ accelerate config default

 Or if your environment doesn't support an interactive shell, like a notebook, you can use:

-```py
+```py
 from accelerate.utils import write_basic_config

 write_basic_config()
@@ -53,7 +53,7 @@ accelerate config default

 Or if your environment doesn't support an interactive shell, like a notebook, you can use:

-```py
+```py
 from accelerate.utils import write_basic_config

 write_basic_config()
@@ -69,7 +69,7 @@ accelerate config default

 Or if your environment doesn't support an interactive shell, like a notebook, you can use:

-```py
+```py
 from accelerate.utils import write_basic_config

 write_basic_config()
@@ -67,7 +67,7 @@ accelerate config default

 Or if your environment doesn't support an interactive shell, like a notebook, you can use:

-```py
+```py
 from accelerate.utils import write_basic_config

 write_basic_config()
@@ -51,7 +51,7 @@ accelerate config default

 Or if your environment doesn't support an interactive shell like a notebook, you can use:

-```py
+```py
 from accelerate.utils import write_basic_config

 write_basic_config()
@@ -53,7 +53,7 @@ accelerate config default

 Or if your environment doesn't support an interactive shell, like a notebook, you can use:

-```py
+```py
 from accelerate.utils import write_basic_config

 write_basic_config()
@@ -173,7 +173,7 @@ pipeline = AutoPipelineForText2Image.from_pretrained("path/to/saved/model", torc

 caption = "A cute bird pokemon holding a shield"
 images = pipeline(
-    caption,
+    caption, 
    width=1024,
    height=1536,
    prior_timesteps=DEFAULT_STAGE_C_TIMESTEPS,
@@ -45,7 +45,7 @@ Make sure to include the token `toy_face` in the prompt and then you can perform
 ```python
 prompt = "toy_face of a hacker with a hoodie"

-lora_scale = 0.9
+lora_scale = 0.9
 image = pipe(
    prompt, num_inference_steps=30, cross_attention_kwargs={"scale": lora_scale}, generator=torch.manual_seed(0)
 ).images[0]
@@ -114,7 +114,7 @@ To return to only using one adapter, use the [`~diffusers.loaders.UNet2DConditio
 pipe.set_adapters("toy")

 prompt = "toy_face of a hacker with a hoodie"
-lora_scale = 0.9
+lora_scale = 0.9
 image = pipe(
    prompt, num_inference_steps=30, cross_attention_kwargs={"scale": lora_scale}, generator=torch.manual_seed(0)
 ).images[0]
@@ -127,68 +127,11 @@ Or to disable all adapters entirely, use the [`~diffusers.loaders.UNet2DConditio
 pipe.disable_lora()

 prompt = "toy_face of a hacker with a hoodie"
+lora_scale= 0.9
 image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
 image
 ```

-![no-lora](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peft_integration/diffusers_peft_lora_inference_20_1.png)
-
-### Customize adapters strength
-For even more customization, you can control how strongly the adapter affects each part of the pipeline. For this, pass a dictionary with the control strengths (called "scales") to [`~diffusers.loaders.UNet2DConditionLoadersMixin.set_adapters`].
-
-For example, here's how you can turn on the adapter for the `down` parts, but turn it off for the `mid` and `up` parts:
-```python
-pipe.enable_lora()  # enable lora again, after we disabled it above
-prompt = "toy_face of a hacker with a hoodie, pixel art"
-adapter_weight_scales = { "unet": { "down": 1, "mid": 0, "up": 0} }
-pipe.set_adapters("pixel", adapter_weight_scales)
-image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
-image
-```
-
-![block-lora-text-and-down](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peft_integration/diffusers_peft_lora_inference_block_down.png)
-
-Let's see how turning off the `down` part and turning on the `mid` and `up` part respectively changes the image.
-```python
-adapter_weight_scales = { "unet": { "down": 0, "mid": 1, "up": 0} }
-pipe.set_adapters("pixel", adapter_weight_scales)
-image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
-image
-```
-
-![block-lora-text-and-mid](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peft_integration/diffusers_peft_lora_inference_block_mid.png)
-
-```python
-adapter_weight_scales = { "unet": { "down": 0, "mid": 0, "up": 1} }
-pipe.set_adapters("pixel", adapter_weight_scales)
-image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
-image
-```
-
-![block-lora-text-and-up](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peft_integration/diffusers_peft_lora_inference_block_up.png)
-
-Looks cool!
-
-This is a really powerful feature. You can use it to control the adapter strengths down to per-transformer level. And you can even use it for multiple adapters.
-```python
-adapter_weight_scales_toy = 0.5
-adapter_weight_scales_pixel = {
-    "unet": {
-        "down": 0.9,  # all transformers in the down-part will use scale 0.9
-        # "mid"  # because, in this example, "mid" is not given, all transformers in the mid part will use the default scale 1.0
-        "up": {
-            "block_0": 0.6,  # all 3 transformers in the 0th block in the up-part will use scale 0.6
-            "block_1": [0.4, 0.8, 1.0],  # the 3 transformers in the 1st block in the up-part will use scales 0.4, 0.8 and 1.0 respectively
-        }
-    }
-}
-pipe.set_adapters(["toy", "pixel"], [adapter_weight_scales_toy, adapter_weight_scales_pixel])
-image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
-image
-```
-
-![block-lora-mixed](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peft_integration/diffusers_peft_lora_inference_block_mixed.png)
-
 ## Manage active adapters

 You have attached multiple adapters in this tutorial, and if you're feeling a bit lost on what adapters have been attached to the pipeline's components, use the [`~diffusers.loaders.LoraLoaderMixin.get_active_adapters`] method to check the list of active adapters:
@@ -239,7 +239,5 @@ pipeline.to("cuda")
 prompt = "柴犬、カラフルアート"

 image = pipeline(prompt=prompt).images[0]
-```

-> [!TIP]
-> When using `trust_remote_code=True`, it is also strongly encouraged to pass a commit hash as a `revision` to make sure the author of the models did not update the code with some malicious new lines (unless you fully trust the authors of the models).
+```
@@ -60,23 +60,6 @@ repo_id = "runwayml/stable-diffusion-v1-5"
 pipe = StableDiffusionImg2ImgPipeline.from_pretrained(repo_id)
 ```

-You can use the Space below to gauge the memory requirements of a pipeline you want to load beforehand without downloading the pipeline checkpoints:
-
-<div class="block dark:hidden">
-	<iframe 
-        src="https://diffusers-compute-pipeline-size.hf.space?__theme=light"
-        width="850"
-        height="1600"
-    ></iframe>
-</div>
-<div class="hidden dark:block">
-    <iframe 
-        src="https://diffusers-compute-pipeline-size.hf.space?__theme=dark"
-        width="850"
-        height="1600"
-    ></iframe>
-</div>
-
 ### Local pipeline

 To load a diffusion pipeline locally, use [`git-lfs`](https://git-lfs.github.com/) to manually download the checkpoint (in this case, [`runwayml/stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5)) to your local disk. This creates a local folder, `./stable-diffusion-v1-5`, on your disk:
@@ -153,43 +153,18 @@ image
    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_attn_proc.png" />
 </div>

+<Tip>
+
+For both [`~loaders.LoraLoaderMixin.load_lora_weights`] and [`~loaders.UNet2DConditionLoadersMixin.load_attn_procs`], you can pass the `cross_attention_kwargs={"scale": 0.5}` parameter to adjust how much of the LoRA weights to use. A value of `0` is the same as only using the base model weights, and a value of `1` is equivalent to using the fully finetuned LoRA.
+
+</Tip>
+
 To unload the LoRA weights, use the [`~loaders.LoraLoaderMixin.unload_lora_weights`] method to discard the LoRA weights and restore the model to its original weights:

 ```py
 pipeline.unload_lora_weights()
 ```

-### Adjust LoRA weight scale
-
-For both [`~loaders.LoraLoaderMixin.load_lora_weights`] and [`~loaders.UNet2DConditionLoadersMixin.load_attn_procs`], you can pass the `cross_attention_kwargs={"scale": 0.5}` parameter to adjust how much of the LoRA weights to use. A value of `0` is the same as only using the base model weights, and a value of `1` is equivalent to using the fully finetuned LoRA.
-
-For more granular control on the amount of LoRA weights used per layer, you can use [`~loaders.LoraLoaderMixin.set_adapters`] and pass a dictionary specifying by how much to scale the weights in each layer by.
-```python
-pipe = ... # create pipeline
-pipe.load_lora_weights(..., adapter_name="my_adapter") 
-scales = {
-    "text_encoder": 0.5,
-    "text_encoder_2": 0.5,  # only usable if pipe has a 2nd text encoder
-    "unet": {
-        "down": 0.9,  # all transformers in the down-part will use scale 0.9
-        # "mid"  # in this example "mid" is not given, therefore all transformers in the mid part will use the default scale 1.0
-        "up": {
-            "block_0": 0.6,  # all 3 transformers in the 0th block in the up-part will use scale 0.6
-            "block_1": [0.4, 0.8, 1.0],  # the 3 transformers in the 1st block in the up-part will use scales 0.4, 0.8 and 1.0 respectively
-        }
-    }
-}
-pipe.set_adapters("my_adapter", scales)
-```
-
-This also works with multiple adapters - see [this guide](https://huggingface.co/docs/diffusers/tutorials/using_peft_for_inference#customize-adapters-strength) for how to do it.
-
-<Tip warning={true}>
-
-Currently, [`~loaders.LoraLoaderMixin.set_adapters`] only supports scaling attention weights. If a LoRA has other parts (e.g., resnets or down-/upsamplers), they will keep a scale of 1.0.
-
-</Tip>
-
 ### Kohya and TheLastBen

 Other popular LoRA trainers from the community include those by [Kohya](https://github.com/kohya-ss/sd-scripts/) and [TheLastBen](https://github.com/TheLastBen/fast-stable-diffusion). These trainers create different LoRA checkpoints than those trained by 🤗 Diffusers, but they can still be loaded in the same way.
@@ -21,7 +21,7 @@ This guide will show you how to use SVD to generate short videos from images.
 Before you begin, make sure you have the following libraries installed:

 ```py
-!pip install -q -U diffusers transformers accelerate
+!pip install -q -U diffusers transformers accelerate 
 ```

 There are two variants of this model, [SVD](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid) and [SVD-XT](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt). The SVD checkpoint is trained to generate 14 frames and the SVD-XT checkpoint is further finetuned to generate 25 frames.
@@ -86,7 +86,7 @@ Video generation is very memory intensive because you're essentially generating
 + frames = pipe(image, decode_chunk_size=2, generator=generator, num_frames=25).frames[0]
 ```

-Using all these tricks together should lower the memory requirement to less than 8GB VRAM.
+Using all these tricks together should lower the memory requirement to less than 8GB VRAM.

 ## Micro-conditioning

@@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.

 # 메모리와 속도

-메모리 또는 속도에 대해 🤗 Diffusers *추론*을 최적화하기 위한 몇 가지 기술과 아이디어를 제시합니다.
+메모리 또는 속도에 대해 🤗 Diffusers *추론*을 최적화하기 위한 몇 가지 기술과 아이디어를 제시합니다. 
 일반적으로, memory-efficient attention을 위해 [xFormers](https://github.com/facebookresearch/xformers) 사용을 추천하기 때문에, 추천하는 [설치 방법](xformers)을 보고 설치해 보세요.

 다음 설정이 성능과 메모리에 미치는 영향에 대해 설명합니다.
@@ -27,7 +27,7 @@ specific language governing permissions and limitations under the License.
 | memory-efficient attention | 2.63s  | x3.61   |

 <em>
-   NVIDIA TITAN RTX에서 50 DDIM 스텝의 "a photo of an astronaut riding a horse on mars" 프롬프트로 512x512 크기의 단일 이미지를 생성하였습니다.
+   NVIDIA TITAN RTX에서 50 DDIM 스텝의 "a photo of an astronaut riding a horse on mars" 프롬프트로 512x512 크기의 단일 이미지를 생성하였습니다. 
 </em>

 ## cuDNN auto-tuner 활성화하기
@@ -44,11 +44,11 @@ torch.backends.cudnn.benchmark = True

 ### fp32 대신 tf32 사용하기  (Ampere 및 이후 CUDA 장치들에서)

-Ampere 및 이후 CUDA 장치에서 행렬곱 및 컨볼루션은 TensorFloat32(TF32) 모드를 사용하여 더 빠르지만 약간 덜 정확할 수 있습니다.
-기본적으로 PyTorch는 컨볼루션에 대해 TF32 모드를 활성화하지만 행렬 곱셈은 활성화하지 않습니다.
-네트워크에 완전한 float32 정밀도가 필요한 경우가 아니면 행렬 곱셈에 대해서도 이 설정을 활성화하는 것이 좋습니다.
-이는 일반적으로 무시할 수 있는 수치의 정확도 손실이 있지만, 계산 속도를 크게 높일 수 있습니다.
-그것에 대해 [여기](https://huggingface.co/docs/transformers/v4.18.0/en/performance#tf32)서 더 읽을 수 있습니다.
+Ampere 및 이후 CUDA 장치에서 행렬곱 및 컨볼루션은 TensorFloat32(TF32) 모드를 사용하여 더 빠르지만 약간 덜 정확할 수 있습니다. 
+기본적으로 PyTorch는 컨볼루션에 대해 TF32 모드를 활성화하지만 행렬 곱셈은 활성화하지 않습니다. 
+네트워크에 완전한 float32 정밀도가 필요한 경우가 아니면 행렬 곱셈에 대해서도 이 설정을 활성화하는 것이 좋습니다. 
+이는 일반적으로 무시할 수 있는 수치의 정확도 손실이 있지만, 계산 속도를 크게 높일 수 있습니다. 
+그것에 대해 [여기](https://huggingface.co/docs/transformers/v4.18.0/en/performance#tf32)서 더 읽을 수 있습니다. 
 추론하기 전에 다음을 추가하기만 하면 됩니다:

 ```python
@@ -59,13 +59,13 @@ torch.backends.cuda.matmul.allow_tf32 = True

 ## 반정밀도 가중치

-더 많은 GPU 메모리를 절약하고 더 빠른 속도를 얻기 위해 모델 가중치를 반정밀도(half precision)로 직접 불러오고 실행할 수 있습니다.
+더 많은 GPU 메모리를 절약하고 더 빠른 속도를 얻기 위해 모델 가중치를 반정밀도(half precision)로 직접 불러오고 실행할 수 있습니다. 
 여기에는 `fp16`이라는 브랜치에 저장된 float16 버전의 가중치를 불러오고, 그 때 `float16` 유형을 사용하도록 PyTorch에 지시하는 작업이 포함됩니다.

 ```Python
 pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
-
+    
    torch_dtype=torch.float16,
 )
 pipe = pipe.to("cuda")
@@ -75,7 +75,7 @@ image = pipe(prompt).images[0]
 ```

 <Tip warning={true}>
-  어떤 파이프라인에서도 [`torch.autocast`](https://pytorch.org/docs/stable/amp.html#torch.autocast) 를 사용하는 것은 검은색 이미지를 생성할 수 있고, 순수한 float16 정밀도를 사용하는 것보다 항상 느리기 때문에 사용하지 않는 것이 좋습니다.
+  어떤 파이프라인에서도 [`torch.autocast`](https://pytorch.org/docs/stable/amp.html#torch.autocast) 를 사용하는 것은 검은색 이미지를 생성할 수 있고, 순수한 float16 정밀도를 사용하는 것보다 항상 느리기 때문에 사용하지 않는 것이 좋습니다. 
 </Tip>

 ## 추가 메모리 절약을 위한 슬라이스 어텐션
@@ -95,7 +95,7 @@ from diffusers import StableDiffusionPipeline

 pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
-
+    
    torch_dtype=torch.float16,
 )
 pipe = pipe.to("cuda")
@@ -122,7 +122,7 @@ from diffusers import StableDiffusionPipeline

 pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
-
+    
    torch_dtype=torch.float16,
 )
 pipe = pipe.to("cuda")
@@ -148,7 +148,7 @@ from diffusers import StableDiffusionPipeline

 pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
-
+    
    torch_dtype=torch.float16,
 )

@@ -165,7 +165,7 @@ image = pipe(prompt).images[0]
 또 다른 최적화 방법인 <a href="#model_offloading">모델 오프로딩</a>을 사용하는 것을 고려하십시오. 이는 훨씬 빠르지만 메모리 절약이 크지는 않습니다.
 </Tip>

-또한 ttention slicing과 연결해서 최소 메모리(< 2GB)로도 동작할 수 있습니다.
+또한 attention slicing과 연결해서 최소 메모리(< 2GB)로도 동작할 수 있습니다. 


 ```Python
@@ -174,7 +174,7 @@ from diffusers import StableDiffusionPipeline

 pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
-
+    
    torch_dtype=torch.float16,
 )

@@ -204,7 +204,7 @@ import torch
 from diffusers import StableDiffusionPipeline

 pipe = StableDiffusionPipeline.from_pretrained(
-    "runwayml/stable-diffusion-v1-5",
+    "runwayml/stable-diffusion-v1-5",  
    torch_dtype=torch.float16,
 )

@@ -355,7 +355,7 @@ unet_traced = torch.jit.load("unet_traced.pt")
 class TracedUNet(torch.nn.Module):
    def __init__(self):
        super().__init__()
-        self.in_channels = pipe.unet.config.in_channels
+        self.in_channels = pipe.unet.in_channels
        self.device = pipe.unet.device

    def forward(self, latent_model_input, t, encoder_hidden_states):
@@ -387,7 +387,7 @@ with torch.inference_mode():
 | A100-SXM4-40GB    	| 18.6it/s            	| 29.it/s                        	|
 | A100-SXM-80GB    	| 18.7it/s            	| 29.5it/s                        	|

-이를 활용하려면 다음을 만족해야 합니다:
+이를 활용하려면 다음을 만족해야 합니다: 
 - PyTorch > 1.12
 - Cuda 사용 가능
 - [xformers 라이브러리를 설치함](xformers)
@@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License.

 [[open-in-colab]]

-🧨 Diffusers는 사용자 친화적이며 유연한 도구 상자로, 사용사례에 맞게 diffusion 시스템을 구축 할 수 있도록 설계되었습니다. 이 도구 상자의 핵심은 모델과 스케줄러입니다. [`DiffusionPipeline`]은 편의를 위해 이러한 구성 요소를 번들로 제공하지만, 파이프라인을 분리하고 모델과 스케줄러를 개별적으로 사용해 새로운 diffusion 시스템을 만들 수도 있습니다.
+🧨 Diffusers는 사용자 친화적이며 유연한 도구 상자로, 사용사례에 맞게 diffusion 시스템을 구축 할 수 있도록 설계되었습니다. 이 도구 상자의 핵심은 모델과 스케줄러입니다. [`DiffusionPipeline`]은 편의를 위해 이러한 구성 요소를 번들로 제공하지만, 파이프라인을 분리하고 모델과 스케줄러를 개별적으로 사용해 새로운 diffusion 시스템을 만들 수도 있습니다. 

 이 튜토리얼에서는 기본 파이프라인부터 시작해 Stable Diffusion 파이프라인까지 진행하며 모델과 스케줄러를 사용해 추론을 위한 diffusion 시스템을 조립하는 방법을 배웁니다.

@@ -36,7 +36,7 @@ specific language governing permissions and limitations under the License.

 정말 쉽습니다. 그런데 파이프라인은 어떻게 이렇게 할 수 있었을까요? 파이프라인을 세분화하여 내부에서 어떤 일이 일어나고 있는지 살펴보겠습니다.

-위 예시에서 파이프라인에는 [`UNet2DModel`] 모델과 [`DDPMScheduler`]가 포함되어 있습니다. 파이프라인은 원하는 출력 크기의 랜덤 노이즈를 받아 모델을 여러번 통과시켜 이미지의 노이즈를 제거합니다. 각 timestep에서 모델은 *noise residual*을 예측하고 스케줄러는 이를 사용하여 노이즈가 적은 이미지를 예측합니다. 파이프라인은 지정된 추론 스텝수에 도달할 때까지 이 과정을 반복합니다.
+위 예시에서 파이프라인에는 [`UNet2DModel`] 모델과 [`DDPMScheduler`]가 포함되어 있습니다. 파이프라인은 원하는 출력 크기의 랜덤 노이즈를 받아 모델을 여러번 통과시켜 이미지의 노이즈를 제거합니다. 각 timestep에서 모델은 *noise residual*을 예측하고 스케줄러는 이를 사용하여 노이즈가 적은 이미지를 예측합니다. 파이프라인은 지정된 추론 스텝수에 도달할 때까지 이 과정을 반복합니다. 

 모델과 스케줄러를 별도로 사용하여 파이프라인을 다시 생성하기 위해 자체적인 노이즈 제거 프로세스를 작성해 보겠습니다.

@@ -210,7 +210,7 @@ Stable Diffusion 은 text-to-image *latent diffusion* 모델입니다. latent di

 ```py
 >>> latents = torch.randn(
-...     (batch_size, unet.config.in_channels, height // 8, width // 8),
+...     (batch_size, unet.in_channels, height // 8, width // 8),
 ...     generator=generator,
 ...     device=torch_device,
 ... )
@@ -42,7 +42,7 @@ Training examples show how to pretrain or fine-tune diffusion models for a varie
 | [**Dreambooth**](./dreambooth) | ✅ | - | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_dreambooth_training.ipynb)
 | [**ControlNet**](./controlnet) | ✅ | ✅ | -
 | [**InstructPix2Pix**](./instruct_pix2pix) | ✅ | ✅ | -
-| [**Reinforcement Learning for Control**](./reinforcement_learning)                    | - | - | coming soon.
+| [**Reinforcement Learning for Control**](https://github.com/huggingface/diffusers/blob/main/examples/reinforcement_learning/run_diffusers_locomotion.py)                    | - | - | coming soon.

 ## Community

@@ -308,6 +308,6 @@ accelerate launch train_dreambooth_lora_sdxl_advanced.py \
 Check out [these recommended practices](https://huggingface.co/blog/sdxl_lora_advanced_script#additional-good-practices)

 ## Running on Colab Notebook
-Check out [this notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/SDXL_Dreambooth_LoRA_advanced_example.ipynb).
+Check out [this notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/SDXL_DreamBooth_LoRA_advanced_example.ipynb)
 to train using the advanced features (including pivotal tuning), and [this notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/SDXL_DreamBooth_LoRA_.ipynb) to train on a free colab, using some of the advanced features (excluding pivotal tuning)

@@ -656,6 +656,7 @@ def parse_args(input_args=None):
    )
    parser.add_argument(
        "--use_dora",
+        type=bool,
        action="store_true",
        default=False,
        help=(
@@ -1,16 +1,13 @@
-# Community Pipeline Examples
+# Community Examples

 > **For more information about community pipelines, please have a look at [this issue](https://github.com/huggingface/diffusers/issues/841).**

-**Community pipeline** examples consist pipelines that have been added by the community.
-Please have a look at the following tables to get an overview of all community examples. Click on the **Code Example** to get a copy-and-paste ready code example that you can try out.
-If a community pipeline doesn't work as expected, please open an issue and ping the author on it.
-
-Please also check out our [Community Scripts](https://github.com/huggingface/diffusers/blob/main/examples/community/README_community_scripts.md) examples for tips and tricks that you can use with diffusers without having to run a community pipeline.
+**Community** examples consist of both inference and training examples that have been added by the community.
+Please have a look at the following table to get an overview of all community examples. Click on the **Code Example** to get a copy-and-paste ready code example that you can try out.
+If a community example doesn't work as expected, please open an issue and ping the author on it.

 | Example                                                                                                                               | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              | Code Example                                                                              | Colab                                                                                                                                                                                                              |                                                        Author |
 |:--------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------:|
-| HD-Painter                                                                                                                            | [HD-Painter](https://github.com/Picsart-AI-Research/HD-Painter) enables prompt-faithfull and high resolution (up to 2k) image inpainting upon any diffusion-based image inpainting method.                                                                                                                                                                                                                                                                                                               | [HD-Painter](#hd-painter)                                                                 | [![Hugging Face Space](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-yellow)](https://huggingface.co/spaces/PAIR/HD-Painter)                                                                              | [Manukyan Hayk](https://github.com/haikmanukyan) and [Sargsyan Andranik](https://github.com/AndranikSargsyan) |
 | Marigold Monocular Depth Estimation                                                                                                   | A universal monocular depth estimator, utilizing Stable Diffusion, delivering sharp predictions in the wild. (See the [project page](https://marigoldmonodepth.github.io) and [full codebase](https://github.com/prs-eth/marigold) for more details.)                                                                                                                                                                                                                                                        | [Marigold Depth Estimation](#marigold-depth-estimation)                                   | [![Hugging Face Space](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-yellow)](https://huggingface.co/spaces/toshas/marigold) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/12G8reD13DdpMie5ZQlaFNo2WCGeNUH-u?usp=sharing) | [Bingxin Ke](https://github.com/markkua) and [Anton Obukhov](https://github.com/toshas) |
 | LLM-grounded Diffusion (LMD+)                                                                                                         | LMD greatly improves the prompt following ability of text-to-image generation models by introducing an LLM as a front-end prompt parser and layout planner. [Project page.](https://llm-grounded-diffusion.github.io/) [See our full codebase (also with diffusers).](https://github.com/TonyLianLong/LLM-groundedDiffusion)                                                                                                                                                                                                                                                                                                                                                                                                                                   | [LLM-grounded Diffusion (LMD+)](#llm-grounded-diffusion)                             | [Huggingface Demo](https://huggingface.co/spaces/longlian/llm-grounded-diffusion) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1SXzMSeAB-LJYISb2yrUOdypLz4OYWUKj) |                [Long (Tony) Lian](https://tonylian.com/) |
 | CLIP Guided Stable Diffusion                                                                                                          | Doing CLIP guidance for text to image generation with Stable Diffusion                                                                                                                                                                                                                                                                                                                                                                                                                                   | [CLIP Guided Stable Diffusion](#clip-guided-stable-diffusion)                             | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/CLIP_Guided_Stable_diffusion_with_diffusers.ipynb) |                [Suraj Patil](https://github.com/patil-suraj/) |
@@ -76,48 +73,6 @@ pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", custo

 ## Example usages

-### HD-Painter
-
-Implementation of [HD-Painter: High-Resolution and Prompt-Faithful Text-Guided Image Inpainting with Diffusion Models](https://arxiv.org/abs/2312.14091).
-
-![teaser-img](https://raw.githubusercontent.com/Picsart-AI-Research/HD-Painter/main/__assets__/github/teaser.jpg)
-
-The abstract from the paper is:
-
-Recent progress in text-guided image inpainting, based on the unprecedented success of text-to-image diffusion models, has led to exceptionally realistic and visually plausible results.
-However, there is still significant potential for improvement in current text-to-image inpainting models, particularly in better aligning the inpainted area with user prompts and performing high-resolution inpainting.
-Therefore, in this paper we introduce _HD-Painter_, a completely **training-free** approach that **accurately follows to prompts** and coherently **scales to high-resolution** image inpainting.
-To this end, we design the _Prompt-Aware Introverted Attention (PAIntA)_ layer enhancing self-attention scores by prompt information and resulting in better text alignment generations.
-To further improve the prompt coherence we introduce the _Reweighting Attention Score Guidance (RASG)_ mechanism seamlessly integrating a post-hoc sampling strategy into general form of DDIM to prevent out-of-distribution latent shifts.
-Moreover, HD-Painter allows extension to larger scales by introducing a specialized super-resolution technique customized for inpainting, enabling the completion of missing regions in images of up to 2K resolution. 
-Our experiments demonstrate that HD-Painter surpasses existing state-of-the-art approaches qualitatively and quantitatively, achieving an impressive generation accuracy improvement of **61.4** vs **51.9**. 
-We will make the codes publicly available.
-
-You can find additional information about Text2Video-Zero in the [paper](https://arxiv.org/abs/2312.14091) or the [original codebase](https://github.com/Picsart-AI-Research/HD-Painter).
-
-#### Usage example
-
-```python
-import torch
-from diffusers import DiffusionPipeline, DDIMScheduler
-from diffusers.utils import load_image, make_image_grid
-
-pipe = DiffusionPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-2-inpainting",
-    custom_pipeline="hd_painter"
-)
-pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
-
-prompt = "wooden boat"
-init_image = load_image("https://raw.githubusercontent.com/Picsart-AI-Research/HD-Painter/main/__assets__/samples/images/2.jpg")
-mask_image = load_image("https://raw.githubusercontent.com/Picsart-AI-Research/HD-Painter/main/__assets__/samples/masks/2.png")
-
-image = pipe (prompt, init_image, mask_image, use_rasg = True, use_painta = True, generator=torch.manual_seed(12345)).images[0]
-
-make_image_grid([init_image, mask_image, image], rows=1, cols=3)
-
-```
-
 ### Marigold Depth Estimation

 Marigold is a universal monocular depth estimator that delivers accurate and sharp predictions in the wild. Based on Stable Diffusion, it is trained exclusively with synthetic depth data and excels in zero-shot adaptation to real-world imagery. This pipeline is an official implementation of the inference process. More details can be found on our [project page](https://marigoldmonodepth.github.io) and [full codebase](https://github.com/prs-eth/marigold) (also implemented with diffusers).
@@ -128,25 +83,14 @@ This depth estimation pipeline processes a single input image through multiple d

 ```python
 import numpy as np
-import torch
 from PIL import Image
 from diffusers import DiffusionPipeline
 from diffusers.utils import load_image

-# Original DDIM version (higher quality)
 pipe = DiffusionPipeline.from_pretrained(
-    "prs-eth/marigold-v1-0",
+    "Bingxin/Marigold",
    custom_pipeline="marigold_depth_estimation"
    # torch_dtype=torch.float16,                # (optional) Run with half-precision (16-bit float).
-    # variant="fp16",                           # (optional) Use with `torch_dtype=torch.float16`, to directly load fp16 checkpoint
-)
-
-# (New) LCM version (faster speed)
-pipe = DiffusionPipeline.from_pretrained(
-    "prs-eth/marigold-lcm-v1-0",
-    custom_pipeline="marigold_depth_estimation"
-    # torch_dtype=torch.float16,                # (optional) Run with half-precision (16-bit float).
-    # variant="fp16",                           # (optional) Use with `torch_dtype=torch.float16`, to directly load fp16 checkpoint
 )

 pipe.to("cuda")
@@ -155,21 +99,12 @@ img_path_or_url = "https://share.phys.ethz.ch/~pf/bingkedata/marigold/pipeline_e
 image: Image.Image = load_image(img_path_or_url)

 pipeline_output = pipe(
-    image,                    # Input image.
-    # ----- recommended setting for DDIM version -----
+    image,                  # Input image.
    # denoising_steps=10,     # (optional) Number of denoising steps of each inference pass. Default: 10.
    # ensemble_size=10,       # (optional) Number of inference passes in the ensemble. Default: 10.
-    # ------------------------------------------------
-    
-    # ----- recommended setting for LCM version ------
-    # denoising_steps=4,
-    # ensemble_size=5,
-    # -------------------------------------------------
-    
    # processing_res=768,     # (optional) Maximum resolution of processing. If set to 0: will not resize at all. Defaults to 768.
    # match_input_res=True,   # (optional) Resize depth prediction to match input resolution.
    # batch_size=0,           # (optional) Inference batch size, no bigger than `num_ensemble`. If set to 0, the script will automatically decide the proper batch size. Defaults to 0.
-    # seed=2024,              # (optional) Random seed can be set to ensure additional reproducibility. Default: None (unseeded). Note: forcing --batch_size 1 helps to increase reproducibility. To ensure full reproducibility, deterministic mode needs to be used.
    # color_map="Spectral",   # (optional) Colormap used to colorize the depth map. Defaults to "Spectral". Set to `None` to skip colormap generation.
    # show_progress_bar=True, # (optional) If true, will show progress bars of the inference progress.
 )
@@ -998,7 +933,7 @@ image = pipe(prompt, generator=generator, num_inference_steps=50).images[0]
 ### Checkpoint Merger Pipeline
 Based on the AUTOMATIC1111/webui for checkpoint merging. This is a custom pipeline that merges up to 3 pretrained model checkpoints as long as they are in the HuggingFace model_index.json format.

-The checkpoint merging is currently memory intensive as it modifies the weights of a DiffusionPipeline object in place. Expect at least 13GB RAM Usage on Kaggle GPU kernels and
+The checkpoint merging is currently memory intensive as it modifies the weights of a DiffusionPipeline object in place. Expect at least 13GB RAM Usage on Kaggle GPU kernels and
 on colab you might run out of the 12GB memory even while merging two checkpoints.

 Usage:-
@@ -1952,7 +1887,7 @@ In the above code, the `prompt2` is appended to the `prompt`, which is more than

 For more results, check out [PR #6114](https://github.com/huggingface/diffusers/pull/6114).

-### Example Images Mixing (with CoCa)
+## Example Images Mixing (with CoCa)
 ```python
 import requests
 from io import BytesIO
@@ -2999,7 +2934,7 @@ pipe(prompt =prompt, rp_args = rp_args)

 The Pipeline supports `compel` syntax. Input prompts using the `compel` structure will be automatically applied and processed.

-### Diffusion Posterior Sampling Pipeline
+## Diffusion Posterior Sampling Pipeline
 * Reference paper
    ```
    @article{chung2022diffusion,
@@ -3806,80 +3741,3 @@ onestep_image = pipe(prompt, num_inference_steps=1).images[0]
 # Multistep sampling
 multistep_image = pipe(prompt, num_inference_steps=4).images[0]
 ```
-
-# Perturbed-Attention Guidance
-
-[Project](https://ku-cvlab.github.io/Perturbed-Attention-Guidance/) / [arXiv](https://arxiv.org/abs/2403.17377) / [GitHub](https://github.com/KU-CVLAB/Perturbed-Attention-Guidance)
-
-This implementation is based on [Diffusers](https://huggingface.co/docs/diffusers/index). StableDiffusionPAGPipeline is a modification of StableDiffusionPipeline to support Perturbed-Attention Guidance (PAG).
-
-## Example Usage
-
-```
-import os
-import torch
-
-from accelerate.utils import set_seed
-
-from diffusers import StableDiffusionPipeline
-from diffusers.utils import load_image, make_image_grid
-from diffusers.utils.torch_utils import randn_tensor
-
-pipe = StableDiffusionPipeline.from_pretrained(
-    "runwayml/stable-diffusion-v1-5",
-    custom_pipeline="hyoungwoncho/sd_perturbed_attention_guidance",
-    torch_dtype=torch.float16
-)
-
-device="cuda"
-pipe = pipe.to(device)
-
-pag_scale = 5.0
-pag_applied_layers_index = ['m0']
-
-batch_size = 4
-seed=10
-
-base_dir = "./results/"
-grid_dir = base_dir + "/pag" + str(pag_scale) + "/"
-
-if not os.path.exists(grid_dir):
-    os.makedirs(grid_dir)
-
-set_seed(seed)
-
-latent_input = randn_tensor(shape=(batch_size,4,64,64),generator=None, device=device, dtype=torch.float16)
-
-output_baseline = pipe(
-    "",
-    width=512,
-    height=512,
-    num_inference_steps=50,
-    guidance_scale=0.0,
-    pag_scale=0.0,
-    pag_applied_layers_index=pag_applied_layers_index,
-    num_images_per_prompt=batch_size,
-    latents=latent_input
-).images
-
-output_pag = pipe(
-    "",
-    width=512,
-    height=512,
-    num_inference_steps=50,
-    guidance_scale=0.0,
-    pag_scale=5.0,
-    pag_applied_layers_index=pag_applied_layers_index,
-    num_images_per_prompt=batch_size,
-    latents=latent_input
-).images
-
-grid_image = make_image_grid(output_baseline + output_pag, rows=2, cols=batch_size)
-grid_image.save(grid_dir + "sample.png")
-```
-
-## PAG Parameters
-
-pag_scale : gudiance scale of PAG (ex: 5.0)
-
-pag_applied_layers_index : index of the layer to apply perturbation (ex: ['m0'])
@@ -1,232 +0,0 @@
-# Community Scripts
-
-**Community scripts** consist of inference examples using Diffusers pipelines that have been added by the community. 
-Please have a look at the following table to get an overview of all community examples. Click on the **Code Example** to get a copy-and-paste code example that you can try out.
-If a community script doesn't work as expected, please open an issue and ping the author on it.
-
-| Example                                                                                                                               | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              | Code Example                                                                              | Colab                                                                                                                                                                                                              |                                                        Author |
-|:--------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------:|
-| Using IP-Adapter with negative noise                                                                                                  | Using negative noise with IP-adapter to better control the generation (see the [original post](https://github.com/huggingface/diffusers/discussions/7167) on the forum for more details)                                                                                                                                                                                                                                                    | [IP-Adapter Negative Noise](#ip-adapter-negative-noise)                                   | | [Álvaro Somoza](https://github.com/asomoza)|
-| Asymmetric tiling                                                                                                  | Configure seamless image tiling independently for the X and Y axes                                                                                                                                                                                                      | [Asymmetric Tiling](#asymmetric-tiling)                                   | | [alexisrolland](https://github.com/alexisrolland)|
-
-
-## Example usages
-
-### IP Adapter Negative Noise
-
-Diffusers pipelines are fully integrated with IP-Adapter, which allows you to prompt the diffusion model with an image. However, it does not support negative image prompts (there is no `negative_ip_adapter_image` argument) the same way it supports negative text prompts. When you pass an `ip_adapter_image`, it will create a zero-filled tensor as a negative image. This script shows you how to create a negative noise from `ip_adapter_image` and use it to significantly improve the generation quality while preserving the composition of images.
-
-[cubiq](https://github.com/cubiq) initially developed this feature in his [repository](https://github.com/cubiq/ComfyUI_IPAdapter_plus). The community script was contributed by [asomoza](https://github.com/asomoza). You can find more details about this experiment in [this discussion](https://github.com/huggingface/diffusers/discussions/7167).
-
-IP-Adapter without negative noise
-|source|result|
-|---|---|
-|![20240229150812](https://github.com/huggingface/diffusers/assets/5442875/901d8bd8-7a59-4fe7-bda1-a0e0d6c7dffd)|![20240229163923_normal](https://github.com/huggingface/diffusers/assets/5442875/3432e25a-ece6-45f4-a3f4-fca354f40b5b)|
-
-IP-Adapter with negative noise
-|source|result|
-|---|---|
-|![20240229150812](https://github.com/huggingface/diffusers/assets/5442875/901d8bd8-7a59-4fe7-bda1-a0e0d6c7dffd)|![20240229163923](https://github.com/huggingface/diffusers/assets/5442875/736fd15a-36ba-40c0-a7d8-6ec1ac26f788)|
-
-```python
-import torch
-
-from diffusers import AutoencoderKL, DPMSolverMultistepScheduler, StableDiffusionXLPipeline
-from diffusers.models import ImageProjection
-from diffusers.utils import load_image
-
-
-def encode_image(
-    image_encoder,
-    feature_extractor,
-    image,
-    device,
-    num_images_per_prompt,
-    output_hidden_states=None,
-    negative_image=None,
-):
-    dtype = next(image_encoder.parameters()).dtype
-
-    if not isinstance(image, torch.Tensor):
-        image = feature_extractor(image, return_tensors="pt").pixel_values
-
-    image = image.to(device=device, dtype=dtype)
-    if output_hidden_states:
-        image_enc_hidden_states = image_encoder(image, output_hidden_states=True).hidden_states[-2]
-        image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
-
-        if negative_image is None:
-            uncond_image_enc_hidden_states = image_encoder(
-                torch.zeros_like(image), output_hidden_states=True
-            ).hidden_states[-2]
-        else:
-            if not isinstance(negative_image, torch.Tensor):
-                negative_image = feature_extractor(negative_image, return_tensors="pt").pixel_values
-            negative_image = negative_image.to(device=device, dtype=dtype)
-            uncond_image_enc_hidden_states = image_encoder(negative_image, output_hidden_states=True).hidden_states[-2]
-
-        uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
-        return image_enc_hidden_states, uncond_image_enc_hidden_states
-    else:
-        image_embeds = image_encoder(image).image_embeds
-        image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
-        uncond_image_embeds = torch.zeros_like(image_embeds)
-
-        return image_embeds, uncond_image_embeds
-
-
-@torch.no_grad()
-def prepare_ip_adapter_image_embeds(
-    unet,
-    image_encoder,
-    feature_extractor,
-    ip_adapter_image,
-    do_classifier_free_guidance,
-    device,
-    num_images_per_prompt,
-    ip_adapter_negative_image=None,
-):
-    if not isinstance(ip_adapter_image, list):
-        ip_adapter_image = [ip_adapter_image]
-
-    if len(ip_adapter_image) != len(unet.encoder_hid_proj.image_projection_layers):
-        raise ValueError(
-            f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
-        )
-
-    image_embeds = []
-    for single_ip_adapter_image, image_proj_layer in zip(
-        ip_adapter_image, unet.encoder_hid_proj.image_projection_layers
-    ):
-        output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
-        single_image_embeds, single_negative_image_embeds = encode_image(
-            image_encoder,
-            feature_extractor,
-            single_ip_adapter_image,
-            device,
-            1,
-            output_hidden_state,
-            negative_image=ip_adapter_negative_image,
-        )
-        single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
-        single_negative_image_embeds = torch.stack([single_negative_image_embeds] * num_images_per_prompt, dim=0)
-
-        if do_classifier_free_guidance:
-            single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-            single_image_embeds = single_image_embeds.to(device)
-
-        image_embeds.append(single_image_embeds)
-
-    return image_embeds
-
-
-vae = AutoencoderKL.from_pretrained(
-    "madebyollin/sdxl-vae-fp16-fix",
-    torch_dtype=torch.float16,
-).to("cuda")
-
-pipeline = StableDiffusionXLPipeline.from_pretrained(
-    "RunDiffusion/Juggernaut-XL-v9",
-    torch_dtype=torch.float16,
-    vae=vae,
-    variant="fp16",
-).to("cuda")
-
-pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
-pipeline.scheduler.config.use_karras_sigmas = True
-
-pipeline.load_ip_adapter(
-    "h94/IP-Adapter",
-    subfolder="sdxl_models",
-    weight_name="ip-adapter-plus_sdxl_vit-h.safetensors",
-    image_encoder_folder="models/image_encoder",
-)
-pipeline.set_ip_adapter_scale(0.7)
-
-ip_image = load_image("source.png")
-negative_ip_image = load_image("noise.png")
-
-image_embeds = prepare_ip_adapter_image_embeds(
-    unet=pipeline.unet,
-    image_encoder=pipeline.image_encoder,
-    feature_extractor=pipeline.feature_extractor,
-    ip_adapter_image=[[ip_image]],
-    do_classifier_free_guidance=True,
-    device="cuda",
-    num_images_per_prompt=1,
-    ip_adapter_negative_image=negative_ip_image,
-)
-
-
-prompt = "cinematic photo of a cyborg in the city, 4k, high quality, intricate, highly detailed"
-negative_prompt = "blurry, smooth, plastic"
-
-image = pipeline(
-    prompt=prompt,
-    negative_prompt=negative_prompt,
-    ip_adapter_image_embeds=image_embeds,
-    guidance_scale=6.0,
-    num_inference_steps=25,
-    generator=torch.Generator(device="cpu").manual_seed(1556265306),
-).images[0]
-
-image.save("result.png")
-```
-
-### Asymmetric Tiling
-Stable Diffusion is not trained to generate seamless textures. However, you can use this simple script to add tiling to your generation. This script was contributed by [alexisrolland](https://github.com/alexisrolland). See more details in [this issue](https://github.com/huggingface/diffusers/issues/556).
-
-
-|Generated|Tiled|
-|---|---|
-|![20240313003235_573631814](https://github.com/huggingface/diffusers/assets/5442875/eca174fb-06a4-464e-a3a7-00dbb024543e)|![wall](https://github.com/huggingface/diffusers/assets/5442875/b4aa774b-2a6a-4316-a8eb-8f30b5f4d024)|
-
-
-```py
-import torch
-from typing import Optional
-from diffusers import StableDiffusionPipeline
-from diffusers.models.lora import LoRACompatibleConv
-
-def seamless_tiling(pipeline, x_axis, y_axis):
-    def asymmetric_conv2d_convforward(self, input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None):
-        self.paddingX = (self._reversed_padding_repeated_twice[0], self._reversed_padding_repeated_twice[1], 0, 0)
-        self.paddingY = (0, 0, self._reversed_padding_repeated_twice[2], self._reversed_padding_repeated_twice[3])
-        working = torch.nn.functional.pad(input, self.paddingX, mode=x_mode)
-        working = torch.nn.functional.pad(working, self.paddingY, mode=y_mode)
-        return torch.nn.functional.conv2d(working, weight, bias, self.stride, torch.nn.modules.utils._pair(0), self.dilation, self.groups)
-    x_mode = 'circular' if x_axis else 'constant'
-    y_mode = 'circular' if y_axis else 'constant'
-    targets = [pipeline.vae, pipeline.text_encoder, pipeline.unet]
-    convolution_layers = []
-    for target in targets:
-        for module in target.modules():
-            if isinstance(module, torch.nn.Conv2d):
-                convolution_layers.append(module)
-    for layer in convolution_layers:
-        if isinstance(layer, LoRACompatibleConv) and layer.lora_layer is None:
-            layer.lora_layer = lambda * x: 0
-        layer._conv_forward = asymmetric_conv2d_convforward.__get__(layer, torch.nn.Conv2d)
-    return pipeline
-
-pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True)
-pipeline.enable_model_cpu_offload()
-prompt = ["texture of a red brick wall"]
-seed = 123456
-generator = torch.Generator(device='cuda').manual_seed(seed)
-
-pipeline = seamless_tiling(pipeline=pipeline, x_axis=True, y_axis=True)
-image = pipeline(
-    prompt=prompt,
-    width=512,
-    height=512,
-    num_inference_steps=20,
-    guidance_scale=7,
-    num_images_per_prompt=1,
-    generator=generator
-).images[0]
-seamless_tiling(pipeline=pipeline, x_axis=False, y_axis=False)
-
-torch.cuda.empty_cache()
-image.save('image.png')
-```
@@ -103,7 +103,7 @@ class CheckpointMergerPipeline(DiffusionPipeline):
        print(f"Combining with alpha={alpha}, interpolation mode={interp}")

        checkpoint_count = len(pretrained_model_name_or_path_list)
-        # Ignore result from model_index_json comparison of the two checkpoints
+        # Ignore result from model_index_json comparison of the two checkpoints
        force = kwargs.pop("force", False)

        # If less than 2 checkpoints, nothing to merge. If more than 3, not supported for now.
@@ -217,7 +217,7 @@ class CheckpointMergerPipeline(DiffusionPipeline):
                        ]
                        checkpoint_path_2 = files[0] if len(files) > 0 else None
                # For an attr if both checkpoint_path_1 and 2 are None, ignore.
-                # If at least one is present, deal with it according to interp method, of course only if the state_dict keys match.
+                # If at least one is present, deal with it according to interp method, of course only if the state_dict keys match.
                if checkpoint_path_1 is None and checkpoint_path_2 is None:
                    print(f"Skipping {attr}: not present in 2nd or 3d model")
                    continue
@@ -1,994 +0,0 @@
-import math
-import numbers
-from typing import Any, Callable, Dict, List, Optional, Union
-
-import torch
-import torch.nn.functional as F
-from torch import nn
-
-from diffusers.image_processor import PipelineImageInput
-from diffusers.models import AsymmetricAutoencoderKL, ImageProjection
-from diffusers.models.attention_processor import Attention, AttnProcessor
-from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
-from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint import (
-    StableDiffusionInpaintPipeline,
-    retrieve_timesteps,
-)
-from diffusers.utils import deprecate
-
-
-class RASGAttnProcessor:  # Attention processor that follows the default attention path but keeps the pre-softmax scores when the layer resolution matches scale_factor
-    def __init__(self, mask, token_idx, scale_factor):
-        self.attention_scores = None  # Stores the last output of the similarity matrix here. Each layer will get its own RASGAttnProcessor assigned
-        self.mask = mask
-        self.token_idx = token_idx
-        self.scale_factor = scale_factor
-        self.mask_resoltuion = mask.shape[-1] * mask.shape[-2]  # 64 x 64 if the image is 512x512
-
-    def __call__(
-        self,
-        attn: Attention,
-        hidden_states: torch.FloatTensor,
-        encoder_hidden_states: Optional[torch.FloatTensor] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        temb: Optional[torch.FloatTensor] = None,
-        scale: float = 1.0,  # accepted for interface compatibility; not referenced in this method's body
-    ) -> torch.Tensor:
-        # Same as the default AttnProcessor up until the part where similarity matrix gets saved
-        downscale_factor = self.mask_resoltuion // hidden_states.shape[1]  # assumes hidden_states.shape[1] is the token count (3D input) - TODO confirm
-        residual = hidden_states
-
-        if attn.spatial_norm is not None:
-            hidden_states = attn.spatial_norm(hidden_states, temb)
-
-        input_ndim = hidden_states.ndim
-
-        if input_ndim == 4:
-            batch_size, channel, height, width = hidden_states.shape
-            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
-
-        batch_size, sequence_length, _ = (
-            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
-        )
-        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
-
-        if attn.group_norm is not None:
-            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
-
-        query = attn.to_q(hidden_states)
-
-        if encoder_hidden_states is None:
-            encoder_hidden_states = hidden_states
-        elif attn.norm_cross:
-            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
-
-        key = attn.to_k(encoder_hidden_states)
-        value = attn.to_v(encoder_hidden_states)
-
-        query = attn.head_to_batch_dim(query)
-        key = attn.head_to_batch_dim(key)
-        value = attn.head_to_batch_dim(value)
-
-        # Automatically recognize the resolution and save the attention similarity values
-        # We need to use the values before the softmax function, hence the rewritten get_attention_scores function.
-        if downscale_factor == self.scale_factor**2:
-            self.attention_scores = get_attention_scores(attn, query, key, attention_mask)
-            attention_probs = self.attention_scores.softmax(dim=-1)
-            attention_probs = attention_probs.to(query.dtype)
-        else:
-            attention_probs = attn.get_attention_scores(query, key, attention_mask)  # Original code
-
-        hidden_states = torch.bmm(attention_probs, value)
-        hidden_states = attn.batch_to_head_dim(hidden_states)
-
-        # linear proj
-        hidden_states = attn.to_out[0](hidden_states)
-        # dropout
-        hidden_states = attn.to_out[1](hidden_states)
-
-        if input_ndim == 4:
-            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
-
-        if attn.residual_connection:
-            hidden_states = hidden_states + residual
-
-        hidden_states = hidden_states / attn.rescale_output_factor
-
-        return hidden_states
-
-
-class PAIntAAttnProcessor:  # Self-attention processor that rescales the pre-softmax self-attention scores with coefficients built from the mask and the cross-attention probabilities
-    def __init__(self, transformer_block, mask, token_idx, do_classifier_free_guidance, scale_factors):
-        self.transformer_block = transformer_block  # Stores the parent transformer block.
-        self.mask = mask
-        self.scale_factors = scale_factors
-        self.do_classifier_free_guidance = do_classifier_free_guidance
-        self.token_idx = token_idx
-        self.shape = mask.shape[2:]
-        self.mask_resoltuion = mask.shape[-1] * mask.shape[-2]  # 64 x 64
-        self.default_processor = AttnProcessor()
-
-    def __call__(
-        self,
-        attn: Attention,
-        hidden_states: torch.FloatTensor,
-        encoder_hidden_states: Optional[torch.FloatTensor] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        temb: Optional[torch.FloatTensor] = None,
-        scale: float = 1.0,
-    ) -> torch.Tensor:
-        # Automatically recognize the resolution of the current attention layer and resize the masks accordingly
-        downscale_factor = self.mask_resoltuion // hidden_states.shape[1]  # assumes hidden_states.shape[1] is the token count (3D input) - TODO confirm
-
-        mask = None
-        for factor in self.scale_factors:
-            if downscale_factor == factor**2:
-                shape = (self.shape[0] // factor, self.shape[1] // factor)
-                mask = F.interpolate(self.mask, shape, mode="bicubic")  # B, 1, H, W
-                break
-        if mask is None:
-            return self.default_processor(attn, hidden_states, encoder_hidden_states, attention_mask, temb, scale)
-
-        # STARTS HERE
-        residual = hidden_states
-        # Save the input hidden_states for later use
-        input_hidden_states = hidden_states
-
-        # ================================================== #
-        # =============== SELF ATTENTION 1 ================= #
-        # ================================================== #
-
-        if attn.spatial_norm is not None:
-            hidden_states = attn.spatial_norm(hidden_states, temb)
-
-        input_ndim = hidden_states.ndim
-
-        if input_ndim == 4:
-            batch_size, channel, height, width = hidden_states.shape
-            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
-
-        batch_size, sequence_length, _ = (
-            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
-        )
-        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
-
-        if attn.group_norm is not None:
-            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
-
-        query = attn.to_q(hidden_states)
-
-        if encoder_hidden_states is None:
-            encoder_hidden_states = hidden_states
-        elif attn.norm_cross:
-            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
-
-        key = attn.to_k(encoder_hidden_states)
-        value = attn.to_v(encoder_hidden_states)
-
-        query = attn.head_to_batch_dim(query)
-        key = attn.head_to_batch_dim(key)
-        value = attn.head_to_batch_dim(value)
-
-        # self_attention_probs = attn.get_attention_scores(query, key, attention_mask) # We can't use post-softmax attention scores in this case
-        self_attention_scores = get_attention_scores(
-            attn, query, key, attention_mask
-        )  # The custom function returns pre-softmax probabilities
-        self_attention_probs = self_attention_scores.softmax(
-            dim=-1
-        )  # Manually compute the probabilities here, the scores will be reused in the second part of PAIntA
-        self_attention_probs = self_attention_probs.to(query.dtype)
-
-        hidden_states = torch.bmm(self_attention_probs, value)
-        hidden_states = attn.batch_to_head_dim(hidden_states)
-
-        # linear proj
-        hidden_states = attn.to_out[0](hidden_states)
-        # dropout
-        hidden_states = attn.to_out[1](hidden_states)
-
-        # x = x + self.attn1(self.norm1(x))
-
-        if input_ndim == 4:
-            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
-
-        if attn.residual_connection:  # So many residuals everywhere
-            hidden_states = hidden_states + residual
-
-        self_attention_output_hidden_states = hidden_states / attn.rescale_output_factor
-
-        # ================================================== #
-        # ============ BasicTransformerBlock =============== #
-        # ================================================== #
-        # We use a hack by running the code from the BasicTransformerBlock that is between Self and Cross attentions here
-        # The other option would've been modifying the BasicTransformerBlock and adding this functionality here.
-        # I assumed that changing the BasicTransformerBlock would have been a bigger deal and decided to use this hack instead.
-
-        # The SelfAttention block receives the normalized latents from the BasicTransformerBlock,
-        # But the residual of the output is the non-normalized version.
-        # Therefore we unnormalize the input hidden state here
-        unnormalized_input_hidden_states = (
-            input_hidden_states + self.transformer_block.norm1.bias
-        ) * self.transformer_block.norm1.weight
-
-        # TODO: return if necessary
-        # if self.use_ada_layer_norm_zero:
-        #     attn_output = gate_msa.unsqueeze(1) * attn_output
-        # elif self.use_ada_layer_norm_single:
-        #     attn_output = gate_msa * attn_output
-
-        transformer_hidden_states = self_attention_output_hidden_states + unnormalized_input_hidden_states
-        if transformer_hidden_states.ndim == 4:
-            transformer_hidden_states = transformer_hidden_states.squeeze(1)
-
-        # TODO: return if necessary
-        # 2.5 GLIGEN Control
-        # if gligen_kwargs is not None:
-        #     transformer_hidden_states = self.fuser(transformer_hidden_states, gligen_kwargs["objs"])
-        # NOTE: we experimented with using GLIGEN and HDPainter together, the results were not that great
-
-        # 3. Cross-Attention
-        if self.transformer_block.use_ada_layer_norm:
-            # transformer_norm_hidden_states = self.transformer_block.norm2(transformer_hidden_states, timestep)
-            raise NotImplementedError()
-        elif self.transformer_block.use_ada_layer_norm_zero or self.transformer_block.use_layer_norm:
-            transformer_norm_hidden_states = self.transformer_block.norm2(transformer_hidden_states)
-        elif self.transformer_block.use_ada_layer_norm_single:
-            # For PixArt norm2 isn't applied here:
-            # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
-            transformer_norm_hidden_states = transformer_hidden_states
-        elif self.transformer_block.use_ada_layer_norm_continuous:
-            # transformer_norm_hidden_states = self.transformer_block.norm2(transformer_hidden_states, added_cond_kwargs["pooled_text_emb"])
-            raise NotImplementedError()
-        else:
-            raise ValueError("Incorrect norm")
-
-        if self.transformer_block.pos_embed is not None and self.transformer_block.use_ada_layer_norm_single is False:
-            transformer_norm_hidden_states = self.transformer_block.pos_embed(transformer_norm_hidden_states)
-
-        # ================================================== #
-        # ================= CROSS ATTENTION ================ #
-        # ================================================== #
-
-        # We do an initial pass of the CrossAttention up to obtaining the similarity matrix here.
-        # The similarity matrix is used to obtain scaling coefficients for the attention matrix of the self attention
-        # We reuse the previously computed self-attention matrix, and only repeat the steps after the softmax
-
-        cross_attention_input_hidden_states = (
-            transformer_norm_hidden_states  # Renaming the variable for the sake of readability
-        )
-
-        # TODO: check if classifier_free_guidance is being used before splitting here
-        if self.do_classifier_free_guidance:
-            # Our scaling coefficients depend only on the conditional part, so we split the inputs
-            (
-                _cross_attention_input_hidden_states_unconditional,
-                cross_attention_input_hidden_states_conditional,
-            ) = cross_attention_input_hidden_states.chunk(2)
-
-            # Same split for the encoder_hidden_states i.e. the tokens
-            # Since the SelfAttention processors don't get the encoder states as input, we inject them into the processor in the beginning.
-            _encoder_hidden_states_unconditional, encoder_hidden_states_conditional = self.encoder_hidden_states.chunk(
-                2
-            )
-        else:
-            cross_attention_input_hidden_states_conditional = cross_attention_input_hidden_states
-            encoder_hidden_states_conditional = self.encoder_hidden_states.chunk(2)  # NOTE(review): chunk(2) returns a tuple and is not unpacked here, unlike the branch above - looks like a bug, confirm
-
-        # Rename the variables for the sake of readability
-        # The part below is the beginning of the __call__ function of the following CrossAttention layer
-        cross_attention_hidden_states = cross_attention_input_hidden_states_conditional
-        cross_attention_encoder_hidden_states = encoder_hidden_states_conditional
-
-        attn2 = self.transformer_block.attn2
-
-        if attn2.spatial_norm is not None:
-            cross_attention_hidden_states = attn2.spatial_norm(cross_attention_hidden_states, temb)
-
-        input_ndim = cross_attention_hidden_states.ndim
-
-        if input_ndim == 4:
-            batch_size, channel, height, width = cross_attention_hidden_states.shape
-            cross_attention_hidden_states = cross_attention_hidden_states.view(
-                batch_size, channel, height * width
-            ).transpose(1, 2)
-
-        (
-            batch_size,
-            sequence_length,
-            _,
-        ) = cross_attention_hidden_states.shape  # It is definitely a cross attention, so no need for an if block
-        # TODO: change the attention_mask here
-        attention_mask = attn2.prepare_attention_mask(
-            None, sequence_length, batch_size
-        )  # I assume the attention mask is the same...
-
-        if attn2.group_norm is not None:
-            cross_attention_hidden_states = attn2.group_norm(cross_attention_hidden_states.transpose(1, 2)).transpose(
-                1, 2
-            )
-
-        query2 = attn2.to_q(cross_attention_hidden_states)
-
-        if attn2.norm_cross:
-            cross_attention_encoder_hidden_states = attn2.norm_encoder_hidden_states(
-                cross_attention_encoder_hidden_states
-            )
-
-        key2 = attn2.to_k(cross_attention_encoder_hidden_states)
-        query2 = attn2.head_to_batch_dim(query2)
-        key2 = attn2.head_to_batch_dim(key2)
-
-        cross_attention_probs = attn2.get_attention_scores(query2, key2, attention_mask)
-
-        # CrossAttention ends here, the remaining part is not used
-
-        # ================================================== #
-        # ================ SELF ATTENTION 2 ================ #
-        # ================================================== #
-        # DEJA VU!
-
-        mask = (mask > 0.5).to(self_attention_output_hidden_states.dtype)
-        m = mask.to(self_attention_output_hidden_states.device)
-        # m = rearrange(m, 'b c h w -> b (h w) c').contiguous()
-        m = m.permute(0, 2, 3, 1).reshape((m.shape[0], -1, m.shape[1])).contiguous()  # B HW 1
-        m = torch.matmul(m, m.permute(0, 2, 1)) + (1 - m)
-
-        # # Compute scaling coefficients for the similarity matrix
-        # # Select the cross attention values for the correct tokens only!
-        # cross_attention_probs = cross_attention_probs.mean(dim = 0)
-        # cross_attention_probs = cross_attention_probs[:, self.token_idx].sum(dim=1)
-
-        # cross_attention_probs = cross_attention_probs.reshape(shape)
-        # gaussian_smoothing = GaussianSmoothing(channels=1, kernel_size=3, sigma=0.5, dim=2).to(self_attention_output_hidden_states.device)
-        # cross_attention_probs = gaussian_smoothing(cross_attention_probs.unsqueeze(0))[0] # optional smoothing
-        # cross_attention_probs = cross_attention_probs.reshape(-1)
-        # cross_attention_probs = ((cross_attention_probs - torch.median(cross_attention_probs.ravel())) / torch.max(cross_attention_probs.ravel())).clip(0, 1)
-
-        # c = (1 - m) * cross_attention_probs.reshape(1, 1, -1) + m # PAIntA scaling coefficients
-
-        # Compute scaling coefficients for the similarity matrix
-        # Select the cross attention values for the correct tokens only!
-
-        batch_size, dims, channels = cross_attention_probs.shape
-        batch_size = batch_size // attn.heads
-        cross_attention_probs = cross_attention_probs.reshape((batch_size, attn.heads, dims, channels))  # B, D, HW, T
-
-        cross_attention_probs = cross_attention_probs.mean(dim=1)  # B, HW, T
-        cross_attention_probs = cross_attention_probs[..., self.token_idx].sum(dim=-1)  # B, HW
-        cross_attention_probs = cross_attention_probs.reshape((batch_size,) + shape)  # B, H, W
-
-        gaussian_smoothing = GaussianSmoothing(channels=1, kernel_size=3, sigma=0.5, dim=2).to(
-            self_attention_output_hidden_states.device
-        )
-        cross_attention_probs = gaussian_smoothing(cross_attention_probs[:, None])[:, 0]  # optional smoothing B, H, W
-
-        # Median normalization
-        cross_attention_probs = cross_attention_probs.reshape(batch_size, -1)  # B, HW
-        cross_attention_probs = (
-            cross_attention_probs - cross_attention_probs.median(dim=-1, keepdim=True).values
-        ) / cross_attention_probs.max(dim=-1, keepdim=True).values
-        cross_attention_probs = cross_attention_probs.clip(0, 1)
-
-        c = (1 - m) * cross_attention_probs.reshape(batch_size, 1, -1) + m
-        c = c.repeat_interleave(attn.heads, 0)  # BD, HW
-        if self.do_classifier_free_guidance:
-            c = torch.cat([c, c])  # 2BD, HW
-
-        # Rescaling the original self-attention matrix
-        self_attention_scores_rescaled = self_attention_scores * c
-        self_attention_probs_rescaled = self_attention_scores_rescaled.softmax(dim=-1)
-
-        # Continuing the self attention normally using the new matrix
-        hidden_states = torch.bmm(self_attention_probs_rescaled, value)
-        hidden_states = attn.batch_to_head_dim(hidden_states)
-
-        # linear proj
-        hidden_states = attn.to_out[0](hidden_states)
-        # dropout
-        hidden_states = attn.to_out[1](hidden_states)
-
-        if input_ndim == 4:  # NOTE(review): input_ndim and batch_size were reassigned in the cross-attention section above - confirm they still describe the self-attention input
-            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
-
-        if attn.residual_connection:
-            hidden_states = hidden_states + input_hidden_states
-
-        hidden_states = hidden_states / attn.rescale_output_factor
-
-        return hidden_states
-
-
-class StableDiffusionHDPainterPipeline(StableDiffusionInpaintPipeline):
-    def get_tokenized_prompt(self, prompt):
-        out = self.tokenizer(prompt)
-        return [self.tokenizer.decode(x) for x in out["input_ids"]]
-
-    def init_attn_processors(
-        self,
-        mask,
-        token_idx,
-        use_painta=True,
-        use_rasg=True,
-        painta_scale_factors=[2, 4],  # 64x64 -> [16x16, 32x32]
-        rasg_scale_factor=4,  # 64x64 -> 16x16
-        self_attention_layer_name="attn1",
-        cross_attention_layer_name="attn2",
-        list_of_painta_layer_names=None,
-        list_of_rasg_layer_names=None,
-    ):
-        default_processor = AttnProcessor()
-        width, height = mask.shape[-2:]
-        width, height = width // self.vae_scale_factor, height // self.vae_scale_factor
-
-        painta_scale_factors = [x * self.vae_scale_factor for x in painta_scale_factors]
-        rasg_scale_factor = self.vae_scale_factor * rasg_scale_factor
-
-        attn_processors = {}
-        for x in self.unet.attn_processors:
-            if (list_of_painta_layer_names is None and self_attention_layer_name in x) or (
-                list_of_painta_layer_names is not None and x in list_of_painta_layer_names
-            ):
-                if use_painta:
-                    transformer_block = self.unet.get_submodule(x.replace(".attn1.processor", ""))
-                    attn_processors[x] = PAIntAAttnProcessor(
-                        transformer_block, mask, token_idx, self.do_classifier_free_guidance, painta_scale_factors
-                    )
-                else:
-                    attn_processors[x] = default_processor
-            elif (list_of_rasg_layer_names is None and cross_attention_layer_name in x) or (
-                list_of_rasg_layer_names is not None and x in list_of_rasg_layer_names
-            ):
-                if use_rasg:
-                    attn_processors[x] = RASGAttnProcessor(mask, token_idx, rasg_scale_factor)
-                else:
-                    attn_processors[x] = default_processor
-
-        self.unet.set_attn_processor(attn_processors)
-        # import json
-        # with open('/home/hayk.manukyan/repos/diffusers/debug.txt', 'a')  as f:
-        #     json.dump({x:str(y) for x,y in self.unet.attn_processors.items()}, f, indent=4)
-
-    @torch.no_grad()
-    def __call__(
-        self,
-        prompt: Union[str, List[str]] = None,
-        image: PipelineImageInput = None,
-        mask_image: PipelineImageInput = None,
-        masked_image_latents: torch.FloatTensor = None,
-        height: Optional[int] = None,
-        width: Optional[int] = None,
-        padding_mask_crop: Optional[int] = None,
-        strength: float = 1.0,
-        num_inference_steps: int = 50,
-        timesteps: List[int] = None,
-        guidance_scale: float = 7.5,
-        positive_prompt: Optional[str] = "",
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        num_images_per_prompt: Optional[int] = 1,
-        eta: float = 0.01,
-        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        ip_adapter_image: Optional[PipelineImageInput] = None,
-        output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
-        clip_skip: int = None,
-        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
-        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
-        use_painta=True,
-        use_rasg=True,
-        self_attention_layer_name=".attn1",
-        cross_attention_layer_name=".attn2",
-        painta_scale_factors=[2, 4],  # 16 x 16 and 32 x 32
-        rasg_scale_factor=4,  # 16x16 by default
-        list_of_painta_layer_names=None,
-        list_of_rasg_layer_names=None,
-        **kwargs,
-    ):
-        callback = kwargs.pop("callback", None)
-        callback_steps = kwargs.pop("callback_steps", None)
-
-        if callback is not None:
-            deprecate(
-                "callback",
-                "1.0.0",
-                "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
-            )
-        if callback_steps is not None:
-            deprecate(
-                "callback_steps",
-                "1.0.0",
-                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
-            )
-
-        # 0. Default height and width to unet
-        height = height or self.unet.config.sample_size * self.vae_scale_factor
-        width = width or self.unet.config.sample_size * self.vae_scale_factor
-
-        #
-        prompt_no_positives = prompt
-        if isinstance(prompt, list):
-            prompt = [x + positive_prompt for x in prompt]
-        else:
-            prompt = prompt + positive_prompt
-
-        # 1. Check inputs
-        self.check_inputs(
-            prompt,
-            image,
-            mask_image,
-            height,
-            width,
-            strength,
-            callback_steps,
-            negative_prompt,
-            prompt_embeds,
-            negative_prompt_embeds,
-            callback_on_step_end_tensor_inputs,
-            padding_mask_crop,
-        )
-
-        self._guidance_scale = guidance_scale
-        self._clip_skip = clip_skip
-        self._cross_attention_kwargs = cross_attention_kwargs
-        self._interrupt = False
-
-        # 2. Define call parameters
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            batch_size = prompt_embeds.shape[0]
-
-        # assert batch_size == 1, "Does not work with batch size > 1 currently"
-
-        device = self._execution_device
-
-        # 3. Encode input prompt
-        text_encoder_lora_scale = (
-            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
-        )
-        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
-            prompt,
-            device,
-            num_images_per_prompt,
-            self.do_classifier_free_guidance,
-            negative_prompt,
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
-            lora_scale=text_encoder_lora_scale,
-            clip_skip=self.clip_skip,
-        )
-        # For classifier free guidance, we need to do two forward passes.
-        # Here we concatenate the unconditional and text embeddings into a single batch
-        # to avoid doing two forward passes
-        if self.do_classifier_free_guidance:
-            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
-
-        if ip_adapter_image is not None:
-            output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
-            image_embeds, negative_image_embeds = self.encode_image(
-                ip_adapter_image, device, num_images_per_prompt, output_hidden_state
-            )
-            if self.do_classifier_free_guidance:
-                image_embeds = torch.cat([negative_image_embeds, image_embeds])
-
-        # 4. set timesteps
-        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
-        timesteps, num_inference_steps = self.get_timesteps(
-            num_inference_steps=num_inference_steps, strength=strength, device=device
-        )
-        # check that number of inference steps is not < 1 - as this doesn't make sense
-        if num_inference_steps < 1:
-            raise ValueError(
-                f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline"
-                f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
-            )
-        # at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
-        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
-        # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
-        is_strength_max = strength == 1.0
-
-        # 5. Preprocess mask and image
-
-        if padding_mask_crop is not None:
-            crops_coords = self.mask_processor.get_crop_region(mask_image, width, height, pad=padding_mask_crop)
-            resize_mode = "fill"
-        else:
-            crops_coords = None
-            resize_mode = "default"
-
-        original_image = image
-        init_image = self.image_processor.preprocess(
-            image, height=height, width=width, crops_coords=crops_coords, resize_mode=resize_mode
-        )
-        init_image = init_image.to(dtype=torch.float32)
-
-        # 6. Prepare latent variables
-        num_channels_latents = self.vae.config.latent_channels
-        num_channels_unet = self.unet.config.in_channels
-        return_image_latents = num_channels_unet == 4
-
-        latents_outputs = self.prepare_latents(
-            batch_size * num_images_per_prompt,
-            num_channels_latents,
-            height,
-            width,
-            prompt_embeds.dtype,
-            device,
-            generator,
-            latents,
-            image=init_image,
-            timestep=latent_timestep,
-            is_strength_max=is_strength_max,
-            return_noise=True,
-            return_image_latents=return_image_latents,
-        )
-
-        if return_image_latents:
-            latents, noise, image_latents = latents_outputs
-        else:
-            latents, noise = latents_outputs
-
-        # 7. Prepare mask latent variables
-        mask_condition = self.mask_processor.preprocess(
-            mask_image, height=height, width=width, resize_mode=resize_mode, crops_coords=crops_coords
-        )
-
-        if masked_image_latents is None:
-            masked_image = init_image * (mask_condition < 0.5)
-        else:
-            masked_image = masked_image_latents
-
-        mask, masked_image_latents = self.prepare_mask_latents(
-            mask_condition,
-            masked_image,
-            batch_size * num_images_per_prompt,
-            height,
-            width,
-            prompt_embeds.dtype,
-            device,
-            generator,
-            self.do_classifier_free_guidance,
-        )
-
-        # 7.5 Setting up HD-Painter
-
-        # Get the indices of the tokens to be modified by both RASG and PAIntA
-        token_idx = list(range(1, self.get_tokenized_prompt(prompt_no_positives).index("<|endoftext|>"))) + [
-            self.get_tokenized_prompt(prompt).index("<|endoftext|>")
-        ]
-
-        # Setting up the attention processors
-        self.init_attn_processors(
-            mask_condition,
-            token_idx,
-            use_painta,
-            use_rasg,
-            painta_scale_factors=painta_scale_factors,
-            rasg_scale_factor=rasg_scale_factor,
-            self_attention_layer_name=self_attention_layer_name,
-            cross_attention_layer_name=cross_attention_layer_name,
-            list_of_painta_layer_names=list_of_painta_layer_names,
-            list_of_rasg_layer_names=list_of_rasg_layer_names,
-        )
-
-        # 8. Check that sizes of mask, masked image and latents match
-        if num_channels_unet == 9:
-            # default case for runwayml/stable-diffusion-inpainting
-            num_channels_mask = mask.shape[1]
-            num_channels_masked_image = masked_image_latents.shape[1]
-            if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels:
-                raise ValueError(
-                    f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
-                    f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
-                    f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
-                    f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
-                    " `pipeline.unet` or your `mask_image` or `image` input."
-                )
-        elif num_channels_unet != 4:
-            raise ValueError(
-                f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {self.unet.config.in_channels}."
-            )
-
-        # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
-        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
-
-        if use_rasg:
-            extra_step_kwargs["generator"] = None
-
-        # 9.1 Add image embeds for IP-Adapter
-        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
-
-        # 9.2 Optionally get Guidance Scale Embedding
-        timestep_cond = None
-        if self.unet.config.time_cond_proj_dim is not None:
-            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
-            timestep_cond = self.get_guidance_scale_embedding(
-                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
-            ).to(device=device, dtype=latents.dtype)
-
-        # 10. Denoising loop
-        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
-        self._num_timesteps = len(timesteps)
-        painta_active = True
-
-        with self.progress_bar(total=num_inference_steps) as progress_bar:
-            for i, t in enumerate(timesteps):
-                if self.interrupt:
-                    continue
-
-                if t < 500 and painta_active:
-                    self.init_attn_processors(
-                        mask_condition,
-                        token_idx,
-                        False,
-                        use_rasg,
-                        painta_scale_factors=painta_scale_factors,
-                        rasg_scale_factor=rasg_scale_factor,
-                        self_attention_layer_name=self_attention_layer_name,
-                        cross_attention_layer_name=cross_attention_layer_name,
-                        list_of_painta_layer_names=list_of_painta_layer_names,
-                        list_of_rasg_layer_names=list_of_rasg_layer_names,
-                    )
-                    painta_active = False
-
-                with torch.enable_grad():
-                    self.unet.zero_grad()
-                    latents = latents.detach()
-                    latents.requires_grad = True
-
-                    # expand the latents if we are doing classifier free guidance
-                    latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
-
-                    # concat latents, mask, masked_image_latents in the channel dimension
-                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-
-                    if num_channels_unet == 9:
-                        latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
-
-                    self.scheduler.latents = latents
-                    self.encoder_hidden_states = prompt_embeds
-                    for attn_processor in self.unet.attn_processors.values():
-                        attn_processor.encoder_hidden_states = prompt_embeds
-
-                    # predict the noise residual
-                    noise_pred = self.unet(
-                        latent_model_input,
-                        t,
-                        encoder_hidden_states=prompt_embeds,
-                        timestep_cond=timestep_cond,
-                        cross_attention_kwargs=self.cross_attention_kwargs,
-                        added_cond_kwargs=added_cond_kwargs,
-                        return_dict=False,
-                    )[0]
-
-                    # perform guidance
-                    if self.do_classifier_free_guidance:
-                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                        noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
-
-                    if use_rasg:
-                        # Perform RASG
-                        _, _, height, width = mask_condition.shape  # 512 x 512
-                        scale_factor = self.vae_scale_factor * rasg_scale_factor  # 8 * 4 = 32
-
-                        # TODO: Fix for > 1 batch_size
-                        rasg_mask = F.interpolate(
-                            mask_condition, (height // scale_factor, width // scale_factor), mode="bicubic"
-                        )[0, 0]  # mode is nearest by default, B, H, W
-
-                        # Aggregate the saved attention maps
-                        attn_map = []
-                        for processor in self.unet.attn_processors.values():
-                            if hasattr(processor, "attention_scores") and processor.attention_scores is not None:
-                                if self.do_classifier_free_guidance:
-                                    attn_map.append(processor.attention_scores.chunk(2)[1])  # (B/2) x H, 256, 77
-                                else:
-                                    attn_map.append(processor.attention_scores)  # B x H, 256, 77 ?
-
-                        attn_map = (
-                            torch.cat(attn_map)
-                            .mean(0)
-                            .permute(1, 0)
-                            .reshape((-1, height // scale_factor, width // scale_factor))
-                        )  # 77, 16, 16
-
-                        # Compute the attention score
-                        attn_score = -sum(
-                            [
-                                F.binary_cross_entropy_with_logits(x - 1.0, rasg_mask.to(device))
-                                for x in attn_map[token_idx]
-                            ]
-                        )
-
-                        # Backward the score and compute the gradients
-                        attn_score.backward()
-
-                        # Normalzie the gradients and compute the noise component
-                        variance_noise = latents.grad.detach()
-                        # print("VARIANCE SHAPE", variance_noise.shape)
-                        variance_noise -= torch.mean(variance_noise, [1, 2, 3], keepdim=True)
-                        variance_noise /= torch.std(variance_noise, [1, 2, 3], keepdim=True)
-                    else:
-                        variance_noise = None
-
-                # compute the previous noisy sample x_t -> x_t-1
-                latents = self.scheduler.step(
-                    noise_pred, t, latents, **extra_step_kwargs, return_dict=False, variance_noise=variance_noise
-                )[0]
-
-                if num_channels_unet == 4:
-                    init_latents_proper = image_latents
-                    if self.do_classifier_free_guidance:
-                        init_mask, _ = mask.chunk(2)
-                    else:
-                        init_mask = mask
-
-                    if i < len(timesteps) - 1:
-                        noise_timestep = timesteps[i + 1]
-                        init_latents_proper = self.scheduler.add_noise(
-                            init_latents_proper, noise, torch.tensor([noise_timestep])
-                        )
-
-                    latents = (1 - init_mask) * init_latents_proper + init_mask * latents
-
-                if callback_on_step_end is not None:
-                    callback_kwargs = {}
-                    for k in callback_on_step_end_tensor_inputs:
-                        callback_kwargs[k] = locals()[k]
-                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
-
-                    latents = callback_outputs.pop("latents", latents)
-                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
-                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
-                    mask = callback_outputs.pop("mask", mask)
-                    masked_image_latents = callback_outputs.pop("masked_image_latents", masked_image_latents)
-
-                # call the callback, if provided
-                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
-                    progress_bar.update()
-                    if callback is not None and i % callback_steps == 0:
-                        step_idx = i // getattr(self.scheduler, "order", 1)
-                        callback(step_idx, t, latents)
-
-        if not output_type == "latent":
-            condition_kwargs = {}
-            if isinstance(self.vae, AsymmetricAutoencoderKL):
-                init_image = init_image.to(device=device, dtype=masked_image_latents.dtype)
-                init_image_condition = init_image.clone()
-                init_image = self._encode_vae_image(init_image, generator=generator)
-                mask_condition = mask_condition.to(device=device, dtype=masked_image_latents.dtype)
-                condition_kwargs = {"image": init_image_condition, "mask": mask_condition}
-            image = self.vae.decode(
-                latents / self.vae.config.scaling_factor, return_dict=False, generator=generator, **condition_kwargs
-            )[0]
-            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
-        else:
-            image = latents
-            has_nsfw_concept = None
-
-        if has_nsfw_concept is None:
-            do_denormalize = [True] * image.shape[0]
-        else:
-            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
-
-        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
-
-        if padding_mask_crop is not None:
-            image = [self.image_processor.apply_overlay(mask_image, original_image, i, crops_coords) for i in image]
-
-        # Offload all models
-        self.maybe_free_model_hooks()
-
-        if not return_dict:
-            return (image, has_nsfw_concept)
-
-        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
-
-
-# ============= Utility Functions ============== #
-
-
-class GaussianSmoothing(nn.Module):
-    """
-    Apply gaussian smoothing on a
-    1d, 2d or 3d tensor. Filtering is performed seperately for each channel
-    in the input using a depthwise convolution.
-    Arguments:
-        channels (int, sequence): Number of channels of the input tensors. Output will
-            have this number of channels as well.
-        kernel_size (int, sequence): Size of the gaussian kernel.
-        sigma (float, sequence): Standard deviation of the gaussian kernel.
-        dim (int, optional): The number of dimensions of the data.
-            Default value is 2 (spatial).
-    """
-
-    def __init__(self, channels, kernel_size, sigma, dim=2):
-        super(GaussianSmoothing, self).__init__()
-        if isinstance(kernel_size, numbers.Number):
-            kernel_size = [kernel_size] * dim
-        if isinstance(sigma, numbers.Number):
-            sigma = [sigma] * dim
-
-        # The gaussian kernel is the product of the
-        # gaussian function of each dimension.
-        kernel = 1
-        meshgrids = torch.meshgrid([torch.arange(size, dtype=torch.float32) for size in kernel_size])
-        for size, std, mgrid in zip(kernel_size, sigma, meshgrids):
-            mean = (size - 1) / 2
-            kernel *= 1 / (std * math.sqrt(2 * math.pi)) * torch.exp(-(((mgrid - mean) / (2 * std)) ** 2))
-
-        # Make sure sum of values in gaussian kernel equals 1.
-        kernel = kernel / torch.sum(kernel)
-
-        # Reshape to depthwise convolutional weight
-        kernel = kernel.view(1, 1, *kernel.size())
-        kernel = kernel.repeat(channels, *[1] * (kernel.dim() - 1))
-
-        self.register_buffer("weight", kernel)
-        self.groups = channels
-
-        if dim == 1:
-            self.conv = F.conv1d
-        elif dim == 2:
-            self.conv = F.conv2d
-        elif dim == 3:
-            self.conv = F.conv3d
-        else:
-            raise RuntimeError("Only 1, 2 and 3 dimensions are supported. Received {}.".format(dim))
-
-    def forward(self, input):
-        """
-        Apply gaussian filter to input.
-        Arguments:
-            input (torch.Tensor): Input to apply gaussian filter on.
-        Returns:
-            filtered (torch.Tensor): Filtered output.
-        """
-        return self.conv(input, weight=self.weight.to(input.dtype), groups=self.groups, padding="same")
-
-
-def get_attention_scores(
-    self, query: torch.Tensor, key: torch.Tensor, attention_mask: torch.Tensor = None
-) -> torch.Tensor:
-    r"""
-    Compute the attention scores.
-
-    Args:
-        query (`torch.Tensor`): The query tensor.
-        key (`torch.Tensor`): The key tensor.
-        attention_mask (`torch.Tensor`, *optional*): The attention mask to use. If `None`, no mask is applied.
-
-    Returns:
-        `torch.Tensor`: The attention probabilities/scores.
-    """
-    if self.upcast_attention:
-        query = query.float()
-        key = key.float()
-
-    if attention_mask is None:
-        baddbmm_input = torch.empty(
-            query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device
-        )
-        beta = 0
-    else:
-        baddbmm_input = attention_mask
-        beta = 1
-
-    attention_scores = torch.baddbmm(
-        baddbmm_input,
-        query,
-        key.transpose(-1, -2),
-        beta=beta,
-        alpha=self.scale,
-    )
-    del baddbmm_input
-
-    if self.upcast_softmax:
-        attention_scores = attention_scores.float()
-
-    return attention_scores
@@ -1,8 +1,7 @@
 """
-modeled after the textual_inversion.py / train_dreambooth.py and the work
-of justinpinkney here: https://github.com/justinpinkney/stable-diffusion/blob/main/notebooks/imagic.ipynb
+    modeled after the textual_inversion.py / train_dreambooth.py and the work
+    of justinpinkney here: https://github.com/justinpinkney/stable-diffusion/blob/main/notebooks/imagic.ipynb
 """
-
 import inspect
 import warnings
 from typing import List, Optional, Union
@@ -440,7 +440,7 @@ def betas_for_alpha_bar(
            return math.exp(t * -12.0)

    else:
-        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+        raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")

    betas = []
    for i in range(num_diffusion_timesteps):
@@ -726,7 +726,7 @@ class LatentConsistencyModelWalkPipeline(
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
-                `._callback_tensor_inputs` attribute of your pipeline class.
+                `._callback_tensor_inputs` attribute of your pipeine class.
            embedding_interpolation_type (`str`, *optional*, defaults to `"lerp"`):
                The type of interpolation to use for interpolating between text embeddings. Choose between `"lerp"` and `"slerp"`.
            latent_interpolation_type (`str`, *optional*, defaults to `"slerp"`):
@@ -779,7 +779,7 @@ class LatentConsistencyModelWalkPipeline(
        else:
            batch_size = prompt_embeds.shape[0]
        if batch_size < 2:
-            raise ValueError(f"`prompt` must have length of at least 2 but found {batch_size}")
+            raise ValueError(f"`prompt` must have length of atleast 2 but found {batch_size}")
        if num_images_per_prompt != 1:
            raise ValueError("`num_images_per_prompt` must be `1` as no other value is supported yet")
        if prompt_embeds is not None:
@@ -883,7 +883,7 @@ class LatentConsistencyModelWalkPipeline(
                ) as batch_progress_bar:
                    for batch_index in range(0, bs, process_batch_size):
                        batch_inference_latents = inference_latents[batch_index : batch_index + process_batch_size]
-                        batch_inference_embeddings = inference_embeddings[
+                        batch_inference_embedddings = inference_embeddings[
                            batch_index : batch_index + process_batch_size
                        ]

@@ -892,7 +892,7 @@ class LatentConsistencyModelWalkPipeline(
                        )
                        timesteps = self.scheduler.timesteps

-                        current_bs = batch_inference_embeddings.shape[0]
+                        current_bs = batch_inference_embedddings.shape[0]
                        w = torch.tensor(self.guidance_scale - 1).repeat(current_bs)
                        w_embedding = self.get_guidance_scale_embedding(
                            w, embedding_dim=self.unet.config.time_cond_proj_dim
@@ -901,14 +901,14 @@ class LatentConsistencyModelWalkPipeline(
                        # 10. Perform inference for current batch
                        with self.progress_bar(total=num_inference_steps) as progress_bar:
                            for index, t in enumerate(timesteps):
-                                batch_inference_latents = batch_inference_latents.to(batch_inference_embeddings.dtype)
+                                batch_inference_latents = batch_inference_latents.to(batch_inference_embedddings.dtype)

                                # model prediction (v-prediction, eps, x)
                                model_pred = self.unet(
                                    batch_inference_latents,
                                    t,
                                    timestep_cond=w_embedding,
-                                    encoder_hidden_states=batch_inference_embeddings,
+                                    encoder_hidden_states=batch_inference_embedddings,
                                    cross_attention_kwargs=self.cross_attention_kwargs,
                                    return_dict=False,
                                )[0]
@@ -924,8 +924,8 @@ class LatentConsistencyModelWalkPipeline(
                                    callback_outputs = callback_on_step_end(self, index, t, callback_kwargs)

                                    batch_inference_latents = callback_outputs.pop("latents", batch_inference_latents)
-                                    batch_inference_embeddings = callback_outputs.pop(
-                                        "prompt_embeds", batch_inference_embeddings
+                                    batch_inference_embedddings = callback_outputs.pop(
+                                        "prompt_embeds", batch_inference_embedddings
                                    )
                                    w_embedding = callback_outputs.pop("w_embedding", w_embedding)
                                    denoised = callback_outputs.pop("denoised", denoised)
@@ -939,7 +939,7 @@ class LatentConsistencyModelWalkPipeline(
                                        step_idx = index // getattr(self.scheduler, "order", 1)
                                        callback(step_idx, t, batch_inference_latents)

-                        denoised = denoised.to(batch_inference_embeddings.dtype)
+                        denoised = denoised.to(batch_inference_embedddings.dtype)

                        # Note: This is not supported because you would get black images in your latent walk if
                        #       NSFW concept is detected
@@ -348,7 +348,7 @@ def betas_for_alpha_bar(
            return math.exp(t * -12.0)

    else:
-        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+        raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")

    betas = []
    for i in range(num_diffusion_timesteps):
@@ -530,7 +530,7 @@ class LLMGroundedDiffusionPipeline(
                )

        if len(phrases) != len(boxes):
            raise ValueError(
                "length of `phrases` and `boxes` has to be same, but"
                f" got: `phrases` {len(phrases)} != `boxes` {len(boxes)}"
            )
@@ -164,7 +164,7 @@ def get_prompts_tokens_with_weights(clip_tokenizer: CLIPTokenizer, prompt: str):
        text_tokens (list)
            A list contains token ids
        text_weight (list)
-            A list contains the correspondent weight of token ids
+            A list containing the corresponding weight of each token id

    Example:
        import torch
@@ -1028,7 +1028,7 @@ class SDXLLongPromptWeightingPipeline(
                # because `num_inference_steps` might be even given that every timestep
                # (except the highest one) is duplicated. If `num_inference_steps` is even it would
                # mean that we cut the timesteps in the middle of the denoising step
-                # (between 1st and 2nd derivative) which leads to incorrect results. By adding 1
+                # (between 1st and 2nd derivative) which leads to incorrect results. By adding 1
                # we ensure that the denoising process always ends after the 2nd derivate step of the scheduler
                num_inference_steps = num_inference_steps + 1

@@ -1531,7 +1531,7 @@ class SDXLLongPromptWeightingPipeline(
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
-                `._callback_tensor_inputs` attribute of your pipeline class.
+                `._callback_tensor_inputs` attribute of your pipeline class.

        Examples:

@@ -2131,7 +2131,7 @@ class SDXLLongPromptWeightingPipeline(
            **kwargs,
        )

-    # Override to properly handle the loading and unloading of the additional text encoder.
+    # Override to properly handle the loading and unloading of the additional text encoder.
    def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs):
        # We could have accessed the unet config from `lora_state_dict()` too. We pass
        # it here explicitly to be able to tell that it's coming from an SDXL
@@ -18,7 +18,6 @@
 # --------------------------------------------------------------------------


-import logging
 import math
 from typing import Dict, Union

@@ -26,7 +25,6 @@ import matplotlib
 import numpy as np
 import torch
 from PIL import Image
-from PIL.Image import Resampling
 from scipy.optimize import minimize
 from torch.utils.data import DataLoader, TensorDataset
 from tqdm.auto import tqdm
@@ -36,14 +34,13 @@ from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
    DiffusionPipeline,
-    LCMScheduler,
    UNet2DConditionModel,
 )
 from diffusers.utils import BaseOutput, check_min_version


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.25.0")
+check_min_version("0.28.0.dev0")


 class MarigoldDepthOutput(BaseOutput):
@@ -64,19 +61,6 @@ class MarigoldDepthOutput(BaseOutput):
    uncertainty: Union[None, np.ndarray]


-def get_pil_resample_method(method_str: str) -> Resampling:
-    resample_method_dic = {
-        "bilinear": Resampling.BILINEAR,
-        "bicubic": Resampling.BICUBIC,
-        "nearest": Resampling.NEAREST,
-    }
-    resample_method = resample_method_dic.get(method_str, None)
-    if resample_method is None:
-        raise ValueError(f"Unknown resampling method: {resample_method}")
-    else:
-        return resample_method
-
-
 class MarigoldPipeline(DiffusionPipeline):
    """
    Pipeline for monocular depth estimation using Marigold: https://marigoldmonodepth.github.io.
@@ -129,9 +113,7 @@ class MarigoldPipeline(DiffusionPipeline):
        ensemble_size: int = 10,
        processing_res: int = 768,
        match_input_res: bool = True,
-        resample_method: str = "bilinear",
        batch_size: int = 0,
-        seed: Union[int, None] = None,
        color_map: str = "Spectral",
        show_progress_bar: bool = True,
        ensemble_kwargs: Dict = None,
@@ -147,9 +129,7 @@ class MarigoldPipeline(DiffusionPipeline):
                If set to 0: will not resize at all.
            match_input_res (`bool`, *optional*, defaults to `True`):
                Resize depth prediction to match input resolution.
-                Only valid if `processing_res` > 0.
-            resample_method: (`str`, *optional*, defaults to `bilinear`):
-                Resampling method used to resize images and depth predictions. This can be one of `bilinear`, `bicubic` or `nearest`, defaults to: `bilinear`.
+                Only valid if `processing_res` > 0.
            denoising_steps (`int`, *optional*, defaults to `10`):
                Number of diffusion denoising steps (DDIM) during inference.
            ensemble_size (`int`, *optional*, defaults to `10`):
@@ -157,8 +137,6 @@ class MarigoldPipeline(DiffusionPipeline):
            batch_size (`int`, *optional*, defaults to `0`):
                Inference batch size, no bigger than `num_ensemble`.
                If set to 0, the script will automatically decide the proper batch size.
-            seed (`int`, *optional*, defaults to `None`)
-                Reproducibility seed.
            show_progress_bar (`bool`, *optional*, defaults to `True`):
                Display a progress bar of diffusion denoising.
            color_map (`str`, *optional*, defaults to `"Spectral"`, pass `None` to skip colorized depth map generation):
@@ -168,7 +146,8 @@ class MarigoldPipeline(DiffusionPipeline):
        Returns:
            `MarigoldDepthOutput`: Output class for Marigold monocular depth prediction pipeline, including:
            - **depth_np** (`np.ndarray`) Predicted depth map, with depth values in the range of [0, 1]
-            - **depth_colored** (`PIL.Image.Image`) Colorized depth map, with the shape of [3, H, W] and values in [0, 1], None if `color_map` is `None`
+            - **depth_colored** (`None` or `PIL.Image.Image`) Colorized depth map, with the shape of [3, H, W] and
+                    values in [0, 1]. None if `color_map` is `None`
            - **uncertainty** (`None` or `np.ndarray`) Uncalibrated uncertainty(MAD, median absolute deviation)
                    coming from ensembling. None if `ensemble_size = 1`
        """
@@ -179,21 +158,13 @@ class MarigoldPipeline(DiffusionPipeline):
        if not match_input_res:
            assert processing_res is not None, "Value error: `resize_output_back` is only valid with "
        assert processing_res >= 0
+        assert denoising_steps >= 1
        assert ensemble_size >= 1

-        # Check if denoising step is reasonable
-        self._check_inference_step(denoising_steps)
-
-        resample_method: Resampling = get_pil_resample_method(resample_method)
-
        # ----------------- Image Preprocess -----------------
        # Resize image
        if processing_res > 0:
-            input_image = self.resize_max_res(
-                input_image,
-                max_edge_resolution=processing_res,
-                resample_method=resample_method,
-            )
+            input_image = self.resize_max_res(input_image, max_edge_resolution=processing_res)
        # Convert the image to RGB, to 1.remove the alpha channel 2.convert B&W to 3-channel
        input_image = input_image.convert("RGB")
        image = np.asarray(input_image)
@@ -232,10 +203,9 @@ class MarigoldPipeline(DiffusionPipeline):
                rgb_in=batched_img,
                num_inference_steps=denoising_steps,
                show_pbar=show_progress_bar,
-                seed=seed,
            )
-            depth_pred_ls.append(depth_pred_raw.detach())
-        depth_preds = torch.concat(depth_pred_ls, dim=0).squeeze()
+            depth_pred_ls.append(depth_pred_raw.detach().clone())
+        depth_preds = torch.concat(depth_pred_ls, axis=0).squeeze()
        torch.cuda.empty_cache()  # clear vram cache for ensembling

        # ----------------- Test-time ensembling -----------------
@@ -257,7 +227,7 @@ class MarigoldPipeline(DiffusionPipeline):
        # Resize back to original resolution
        if match_input_res:
            pred_img = Image.fromarray(depth_pred)
-            pred_img = pred_img.resize(input_size, resample=resample_method)
+            pred_img = pred_img.resize(input_size)
            depth_pred = np.asarray(pred_img)

        # Clip output range
@@ -273,32 +243,12 @@ class MarigoldPipeline(DiffusionPipeline):
            depth_colored_img = Image.fromarray(depth_colored_hwc)
        else:
            depth_colored_img = None
-
        return MarigoldDepthOutput(
            depth_np=depth_pred,
            depth_colored=depth_colored_img,
            uncertainty=pred_uncert,
        )

-    def _check_inference_step(self, n_step: int):
-        """
-        Check if denoising step is reasonable
-        Args:
-            n_step (`int`): denoising steps
-        """
-        assert n_step >= 1
-
-        if isinstance(self.scheduler, DDIMScheduler):
-            if n_step < 10:
-                logging.warning(
-                    f"Too few denoising steps: {n_step}. Recommended to use the LCM checkpoint for few-step inference."
-                )
-        elif isinstance(self.scheduler, LCMScheduler):
-            if not 1 <= n_step <= 4:
-                logging.warning(f"Non-optimal setting of denoising steps: {n_step}. Recommended setting is 1-4 steps.")
-        else:
-            raise RuntimeError(f"Unsupported scheduler type: {type(self.scheduler)}")
-
    def _encode_empty_text(self):
        """
        Encode text embedding for empty prompt.
@@ -315,13 +265,7 @@ class MarigoldPipeline(DiffusionPipeline):
        self.empty_text_embed = self.text_encoder(text_input_ids)[0].to(self.dtype)

    @torch.no_grad()
-    def single_infer(
-        self,
-        rgb_in: torch.Tensor,
-        num_inference_steps: int,
-        seed: Union[int, None],
-        show_pbar: bool,
-    ) -> torch.Tensor:
+    def single_infer(self, rgb_in: torch.Tensor, num_inference_steps: int, show_pbar: bool) -> torch.Tensor:
        """
        Perform an individual depth prediction without ensembling.

@@ -342,20 +286,10 @@ class MarigoldPipeline(DiffusionPipeline):
        timesteps = self.scheduler.timesteps  # [T]

        # Encode image
-        rgb_latent = self.encode_rgb(rgb_in)
+        rgb_latent = self._encode_rgb(rgb_in)

        # Initial depth map (noise)
-        if seed is None:
-            rand_num_generator = None
-        else:
-            rand_num_generator = torch.Generator(device=device)
-            rand_num_generator.manual_seed(seed)
-        depth_latent = torch.randn(
-            rgb_latent.shape,
-            device=device,
-            dtype=self.dtype,
-            generator=rand_num_generator,
-        )  # [B, 4, h, w]
+        depth_latent = torch.randn(rgb_latent.shape, device=device, dtype=self.dtype)  # [B, 4, h, w]

        # Batched empty text embedding
        if self.empty_text_embed is None:
@@ -380,9 +314,9 @@ class MarigoldPipeline(DiffusionPipeline):
            noise_pred = self.unet(unet_input, t, encoder_hidden_states=batch_empty_text_embed).sample  # [B, 4, h, w]

            # compute the previous noisy sample x_t -> x_t-1
-            depth_latent = self.scheduler.step(noise_pred, t, depth_latent, generator=rand_num_generator).prev_sample
-
-        depth = self.decode_depth(depth_latent)
+            depth_latent = self.scheduler.step(noise_pred, t, depth_latent).prev_sample
+        torch.cuda.empty_cache()
+        depth = self._decode_depth(depth_latent)

        # clip prediction
        depth = torch.clip(depth, -1.0, 1.0)
@@ -391,7 +325,7 @@ class MarigoldPipeline(DiffusionPipeline):

        return depth

-    def encode_rgb(self, rgb_in: torch.Tensor) -> torch.Tensor:
+    def _encode_rgb(self, rgb_in: torch.Tensor) -> torch.Tensor:
        """
        Encode RGB image into latent.

@@ -410,7 +344,7 @@ class MarigoldPipeline(DiffusionPipeline):
        rgb_latent = mean * self.rgb_latent_scale_factor
        return rgb_latent

-    def decode_depth(self, depth_latent: torch.Tensor) -> torch.Tensor:
+    def _decode_depth(self, depth_latent: torch.Tensor) -> torch.Tensor:
        """
        Decode depth latent into depth map.

@@ -431,7 +365,7 @@ class MarigoldPipeline(DiffusionPipeline):
        return depth_mean

    @staticmethod
-    def resize_max_res(img: Image.Image, max_edge_resolution: int, resample_method=Resampling.BILINEAR) -> Image.Image:
+    def resize_max_res(img: Image.Image, max_edge_resolution: int) -> Image.Image:
        """
        Resize image to limit maximum edge length while keeping aspect ratio.

@@ -440,8 +374,6 @@ class MarigoldPipeline(DiffusionPipeline):
                Image to be resized.
            max_edge_resolution (`int`):
                Maximum edge length (pixel).
-            resample_method (`PIL.Image.Resampling`):
-                Resampling method used to resize images.

        Returns:
            `Image.Image`: Resized image.
@@ -452,7 +384,7 @@ class MarigoldPipeline(DiffusionPipeline):
        new_width = int(original_width * downscale_factor)
        new_height = int(original_height * downscale_factor)

-        resized_img = img.resize((new_width, new_height), resample=resample_method)
+        resized_img = img.resize((new_width, new_height))
        return resized_img

    @staticmethod
@@ -196,7 +196,7 @@ class StableDiffusionTilingPipeline(DiffusionPipeline, StableDiffusionExtrasMixi
            guidance_scale_tiles: specific weights for classifier-free guidance in each tile.
            guidance_scale_tiles: specific weights for classifier-free guidance in each tile. If None, the value provided in guidance_scale will be used.
            seed_tiles: specific seeds for the initialization latents in each tile. These will override the latents generated for the whole canvas using the standard seed parameter.
-            seed_tiles_mode: either "full" "exclusive". If "full", all the latents affected by the tile be overriden. If "exclusive", only the latents that are affected exclusively by this tile (and no other tiles) will be overriden.
+            seed_tiles_mode: either "full" or "exclusive". If "full", all the latents affected by the tile will be overridden. If "exclusive", only the latents that are affected exclusively by this tile (and no other tiles) will be overridden.
            seed_reroll_regions: a list of tuples in the form (start row, end row, start column, end column, seed) defining regions in pixel space for which the latents will be overriden using the given seed. Takes priority over seed_tiles.
            cpu_vae: the decoder from latent space to pixel space can require too mucho GPU RAM for large images. If you find out of memory errors at the end of the generation process, try setting this parameter to True to run the decoder in CPU. Slower, but should run without memory issues.

@@ -325,7 +325,7 @@ class StableDiffusionTilingPipeline(DiffusionPipeline, StableDiffusionExtrasMixi
        if accepts_eta:
            extra_step_kwargs["eta"] = eta

-        # Mask for tile weights strength
+        # Mask for tile weights strength
        tile_weights = self._gaussian_weights(tile_width, tile_height, batch_size)

        # Diffusion timesteps
@@ -832,7 +832,7 @@ class AnimateDiffControlNetPipeline(
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
-            callback_on_step_end (`Callable`, *optional*):
+            callback_on_step_end (`Callable`, *optional*):
                A function that calls at the end of each denoising steps during the inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
@@ -840,7 +840,7 @@ class AnimateDiffControlNetPipeline(
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
-                `._callback_tensor_inputs` attribute of your pipeline class.
+                `._callback_tensor_inputs` attribute of your pipeline class.

        Examples:

@@ -1280,7 +1280,7 @@ class DemoFusionSDXLPipeline(

        return output_images

-    # Override to properly handle the loading and unloading of the additional text encoder.
+    # Override to properly handle the loading and unloading of the additional text encoder.
    def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs):
        # We could have accessed the unet config from `lora_state_dict()` too. We pass
        # it here explicitly to be able to tell that it's coming from an SDXL
@@ -887,7 +887,7 @@ class StyleAlignedSDXLPipeline(
                # because `num_inference_steps` might be even given that every timestep
                # (except the highest one) is duplicated. If `num_inference_steps` is even it would
                # mean that we cut the timesteps in the middle of the denoising step
-                # (between 1st and 2nd derivative) which leads to incorrect results. By adding 1
+                # (between 1st and 2nd derivative) which leads to incorrect results. By adding 1
                # we ensure that the denoising process always ends after the 2nd derivate step of the scheduler
                num_inference_steps = num_inference_steps + 1

@@ -26,7 +26,7 @@ from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInver
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.models.lora import adjust_lora_scale_text_encoder
 from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
-from diffusers.pipelines.stable_diffusion_ldm3d.pipeline_stable_diffusion_ldm3d import LDM3DPipelineOutput
+from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_ldm3d import LDM3DPipelineOutput
 from diffusers.schedulers import DDPMScheduler, KarrasDiffusionSchedulers
 from diffusers.utils import (
    USE_PEFT_BACKEND,
@@ -206,7 +206,7 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image: bool
            dimensions: ``batch x channels x height x width``.
    """

-    # checkpoint. #TODO(Yiyi) - need to clean this up later
+    # checkpoint. TODO(Yiyi) - need to clean this up later
    if image is None:
        raise ValueError("`image` input cannot be undefined.")

@@ -277,7 +277,7 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image: bool
        # images are in latent space and thus can't
        # be masked set masked_image to None
        # we assume that the checkpoint is not an inpainting
-        # checkpoint. #TODO(Yiyi) - need to clean this up later
+        # checkpoint. TODO(Yiyi) - need to clean this up later
        masked_image = None
    else:
        masked_image = image * (mask < 0.5)
@@ -1073,7 +1073,7 @@ class StableDiffusionXLControlNetAdapterInpaintPipeline(
                # because `num_inference_steps` might be even given that every timestep
                # (except the highest one) is duplicated. If `num_inference_steps` is even it would
                # mean that we cut the timesteps in the middle of the denoising step
-                # (between 1st and 2nd derivative) which leads to incorrect results. By adding 1
+                # (between 1st and 2nd derivative) which leads to incorrect results. By adding 1
                # we ensure that the denoising process always ends after the 2nd derivate step of the scheduler
                num_inference_steps = num_inference_steps + 1

@@ -46,11 +46,6 @@ except Exception:

 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

-logger.warning(
-    "To use instant id pipelines, please make sure you have the `insightface` library installed: `pip install insightface`."
-    "Please refer to: https://huggingface.co/InstantX/InstantID for further instructions regarding inference"
-)
-

 def FeedForward(dim, mult=4):
    inner_dim = int(dim * mult)
@@ -706,7 +701,7 @@ class StableDiffusionXLInstantIDPipeline(StableDiffusionXLControlNetPipeline):
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
-                `._callback_tensor_inputs` attribute of your pipeline class.
+                `._callback_tensor_inputs` attribute of your pipeline class.

        Examples:

@@ -81,7 +81,7 @@ def betas_for_alpha_bar(
            return math.exp(t * -12.0)

    else:
-        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+        raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")

    betas = []
    for i in range(num_diffusion_timesteps):
@@ -1,7 +1,6 @@
 """
-modified based on diffusion library from Huggingface: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+    modified based on diffusion library from Huggingface: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
 """
-
 import inspect
 from typing import Callable, List, Optional, Union

@@ -224,7 +224,7 @@ class StableDiffusionIPEXPipeline(
        # 5. Prepare latent variables
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
-            self.unet.config.in_channels,
+            self.unet.in_channels,
            height,
            width,
            prompt_embeds.dtype,
@@ -679,7 +679,7 @@ class StableDiffusionIPEXPipeline(
        timesteps = self.scheduler.timesteps

        # 5. Prepare latent variables
-        num_channels_latents = self.unet.config.in_channels
+        num_channels_latents = self.unet.in_channels
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
@@ -917,7 +917,7 @@ class TensorRTStableDiffusionPipeline(StableDiffusionPipeline):
            text_embeddings = self.__encode_prompt(prompt, negative_prompt)

            # Pre-initialize latents
-            num_channels_latents = self.unet.config.in_channels
+            num_channels_latents = self.unet.in_channels
            latents = self.prepare_latents(
                batch_size,
                num_channels_latents,
@@ -35,6 +35,7 @@ def slerp(val, low, high):


 class UnCLIPTextInterpolationPipeline(DiffusionPipeline):
+
    """
    Pipeline for prompt-to-prompt interpolation on CLIP text embeddings and using the UnCLIP / Dall-E to decode them to images.

@@ -48,7 +49,7 @@ class UnCLIPTextInterpolationPipeline(DiffusionPipeline):
            Tokenizer of class
            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
        prior ([`PriorTransformer`]):
-            The canonical unCLIP prior to approximate the image embedding from the text embedding.
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
        text_proj ([`UnCLIPTextProjModel`]):
            Utility class to prepare and combine the embeddings before they are passed to the decoder.
        decoder ([`UNet2DConditionModel`]):
@@ -125,11 +125,7 @@ def log_validation(vae, unet, controlnet, args, accelerator, weight_dtype, step,
        )

    image_logs = []
-    inference_ctx = (
-        contextlib.nullcontext()
-        if (is_final_validation or torch.backends.mps.is_available())
-        else torch.autocast("cuda")
-    )
+    inference_ctx = contextlib.nullcontext() if is_final_validation else torch.autocast("cuda")

    for validation_prompt, validation_image in zip(validation_prompts, validation_images):
        validation_image = Image.open(validation_image).convert("RGB")
@@ -796,12 +792,6 @@ def main(args):

    logging_dir = Path(args.output_dir, args.logging_dir)

-    if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
-        # due to pytorch#99272, MPS does not yet support bfloat16.
-        raise ValueError(
-            "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
-        )
-
    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)

    accelerator = Accelerator(
@@ -259,17 +259,13 @@ The authors found that by using DoRA, both the learning capacity and training st
 > This is also aligned with some of the quantitative analysis shown in the paper. 

 **Usage**
-1. To use DoRA you need to upgrade the installation of `peft`: 
+1. To use DoRA you need to install `peft` from main: 
 ```bash
-pip install-U peft
+pip install git+https://github.com/huggingface/peft.git
 ```
 2. Enable DoRA training by adding this flag
 ```bash
 --use_dora
 ```
 **Inference** 
-The inference is the same as if you train a regular LoRA 🤗
-
-## Format compatibility
-
-You can pass `--output_kohya_format` to additionally generate a state dictionary which should be compatible with other platforms and tools such as Automatic 1111, Comfy, Kohya, etc. The `output_dir` will contain a file named "pytorch_lora_weights_kohya.safetensors".
+The inference is the same as if you train a regular LoRA 🤗
@@ -14,6 +14,7 @@
 # See the License for the specific language governing permissions and

 import argparse
+import contextlib
 import gc
 import itertools
 import json
@@ -40,7 +41,6 @@ from peft import LoraConfig, set_peft_model_state_dict
 from peft.utils import get_peft_model_state_dict
 from PIL import Image
 from PIL.ImageOps import exif_transpose
-from safetensors.torch import load_file, save_file
 from torch.utils.data import Dataset
 from torchvision import transforms
 from torchvision.transforms.functional import crop
@@ -62,9 +62,7 @@ from diffusers.optimization import get_scheduler
 from diffusers.training_utils import _set_state_dict_into_text_encoder, cast_training_params, compute_snr
 from diffusers.utils import (
    check_min_version,
-    convert_all_state_dict_to_peft,
    convert_state_dict_to_diffusers,
-    convert_state_dict_to_kohya,
    convert_unet_state_dict_to_peft,
    is_wandb_available,
 )
@@ -207,18 +205,11 @@ def log_validation(
    generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
    # Currently the context determination is a bit hand-wavy. We can improve it in the future if there's a better
    # way to condition it. Reference: https://github.com/huggingface/diffusers/pull/7126#issuecomment-1968523051
-    enable_autocast = True
-    if torch.backends.mps.is_available() or (
-        accelerator.mixed_precision == "fp16" or accelerator.mixed_precision == "bf16"
-    ):
-        enable_autocast = False
-    if "playground" in args.pretrained_model_name_or_path:
-        enable_autocast = False
+    inference_ctx = (
+        contextlib.nullcontext() if "playground" in args.pretrained_model_name_or_path else torch.cuda.amp.autocast()
+    )

-    with torch.autocast(
-        accelerator.device.type,
-        enabled=enable_autocast,
-    ):
+    with inference_ctx:
        images = [pipeline(**pipeline_args, generator=generator).images[0] for _ in range(args.num_validation_images)]

    for tracker in accelerator.trackers:
@@ -236,8 +227,7 @@ def log_validation(
            )

    del pipeline
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
+    torch.cuda.empty_cache()

    return images

@@ -406,11 +396,6 @@ def parse_args(input_args=None):
        default="lora-dreambooth-model",
        help="The output directory where the model predictions and checkpoints will be written.",
    )
-    parser.add_argument(
-        "--output_kohya_format",
-        action="store_true",
-        help="Flag to additionally generate final state dict in the Kohya format so that it becomes compatible with A111, Comfy, Kohya, etc.",
-    )
    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
    parser.add_argument(
        "--resolution",
@@ -974,12 +959,6 @@ def main(args):
    if args.do_edm_style_training and args.snr_gamma is not None:
        raise ValueError("Min-SNR formulation is not supported when conducting EDM-style training.")

-    if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
-        # due to pytorch#99272, MPS does not yet support bfloat16.
-        raise ValueError(
-            "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
-        )
-
    logging_dir = Path(args.output_dir, args.logging_dir)

    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
@@ -1022,8 +1001,7 @@ def main(args):
        cur_class_images = len(list(class_images_dir.iterdir()))

        if cur_class_images < args.num_class_images:
-            has_supported_fp16_accelerator = torch.cuda.is_available() or torch.backends.mps.is_available()
-            torch_dtype = torch.float16 if has_supported_fp16_accelerator else torch.float32
+            torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32
            if args.prior_generation_precision == "fp32":
                torch_dtype = torch.float32
            elif args.prior_generation_precision == "fp16":
@@ -1148,12 +1126,6 @@ def main(args):
    elif accelerator.mixed_precision == "bf16":
        weight_dtype = torch.bfloat16

-    if torch.backends.mps.is_available() and weight_dtype == torch.bfloat16:
-        # due to pytorch#99272, MPS does not yet support bfloat16.
-        raise ValueError(
-            "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
-        )
-
    # Move unet, vae and text_encoder to device and cast to weight_dtype
    unet.to(accelerator.device, dtype=weight_dtype)

@@ -1298,7 +1270,7 @@ def main(args):

    # Enable TF32 for faster training on Ampere GPUs,
    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
-    if args.allow_tf32 and torch.cuda.is_available():
+    if args.allow_tf32:
        torch.backends.cuda.matmul.allow_tf32 = True

    if args.scale_lr:
@@ -1475,8 +1447,7 @@ def main(args):
    if not args.train_text_encoder and not train_dataset.custom_instance_prompts:
        del tokenizers, text_encoders
        gc.collect()
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
+        torch.cuda.empty_cache()

    # If custom instance prompts are NOT provided (i.e. the instance prompt is used for all images),
    # pack the statically computed variables appropriately here. This is so that we don't
@@ -1919,11 +1890,6 @@ def main(args):
            text_encoder_lora_layers=text_encoder_lora_layers,
            text_encoder_2_lora_layers=text_encoder_2_lora_layers,
        )
-        if args.output_kohya_format:
-            lora_state_dict = load_file(f"{args.output_dir}/pytorch_lora_weights.safetensors")
-            peft_state_dict = convert_all_state_dict_to_peft(lora_state_dict)
-            kohya_state_dict = convert_state_dict_to_kohya(peft_state_dict)
-            save_file(kohya_state_dict, f"{args.output_dir}/pytorch_lora_weights_kohya.safetensors")

        # Final inference
        # Load previous pipeline
@@ -71,7 +71,12 @@ TORCH_DTYPE_MAPPING = {"fp32": torch.float32, "fp16": torch.float16, "bf16": tor


 def log_validation(
-    pipeline, args, accelerator, generator, global_step, is_final_validation=False, enable_autocast=True
+    pipeline,
+    args,
+    accelerator,
+    generator,
+    global_step,
+    is_final_validation=False,
 ):
    logger.info(
        f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
@@ -91,7 +96,7 @@ def log_validation(
        else Image.open(image_url_or_path).convert("RGB")
    )(args.val_image_url_or_path)

-    with torch.autocast(accelerator.device.type, enabled=enable_autocast):
+    with torch.autocast(str(accelerator.device).replace(":0", ""), enabled=accelerator.mixed_precision == "fp16"):
        edited_images = []
        # Run inference
        for val_img_idx in range(args.num_validation_images):
@@ -492,13 +497,6 @@ def main():
            ),
        )
    logging_dir = os.path.join(args.output_dir, args.logging_dir)
-
-    if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
-        # due to pytorch#99272, MPS does not yet support bfloat16.
-        raise ValueError(
-            "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
-        )
-
    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
    accelerator = Accelerator(
        gradient_accumulation_steps=args.gradient_accumulation_steps,
@@ -983,13 +981,6 @@ def main():
    if accelerator.is_main_process:
        accelerator.init_trackers("instruct-pix2pix-xl", config=vars(args))

-    # Some configurations require autocast to be disabled.
-    enable_autocast = True
-    if torch.backends.mps.is_available() or (
-        accelerator.mixed_precision == "fp16" or accelerator.mixed_precision == "bf16"
-    ):
-        enable_autocast = False
-
    # Train!
    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

@@ -1202,7 +1193,6 @@ def main():
                        generator,
                        global_step,
                        is_final_validation=False,
-                        enable_autocast=enable_autocast,
                    )

                    if args.use_ema:
@@ -1252,7 +1242,6 @@ def main():
                generator,
                global_step,
                is_final_validation=True,
-                enable_autocast=enable_autocast,
            )

    accelerator.end_training()
@@ -1,121 +0,0 @@
-This project is an attempt to check if it's possible to apply [ORPO](https://arxiv.org/abs/2403.07691) to a text-conditioned diffusion model to align it on preference data WITHOUT a reference model. The implementation is based on https://github.com/huggingface/trl/pull/1435/.
-
-> [!WARNING] 
-> We assume that MSE in the diffusion formulation approximates the log-probs as required by ORPO (hat-tip to [@kashif](https://github.com/kashif) for the idea). So, please consider this to be extremely experimental.
-
-## Training
-
-Here's a training command you can use on a 40GB A100 to validate things on a [small preference
-dataset](https://hf.co/datasets/kashif/pickascore): 
-
-```bash
-accelerate launch train_diffusion_orpo_sdxl_lora.py \
-  --pretrained_model_name_or_path=stabilityai/stable-diffusion-xl-base-1.0  \
-  --pretrained_vae_model_name_or_path=madebyollin/sdxl-vae-fp16-fix \
-  --output_dir="diffusion-sdxl-orpo" \
-  --mixed_precision="fp16" \
-  --dataset_name=kashif/pickascore \
-  --train_batch_size=8 \
-  --gradient_accumulation_steps=2 \
-  --gradient_checkpointing \
-  --use_8bit_adam \
-  --rank=8 \
-  --learning_rate=1e-5 \
-  --report_to="wandb" \
-  --lr_scheduler="constant" \
-  --lr_warmup_steps=0 \
-  --max_train_steps=2000 \
-  --checkpointing_steps=500 \
-  --run_validation --validation_steps=50 \
-  --seed="0" \
-  --report_to="wandb" \
-  --push_to_hub
-```
-
-We also provide a simple script to scale up the training on the [yuvalkirstain/pickapic_v2](https://huggingface.co/datasets/yuvalkirstain/pickapic_v2) dataset:
-
-```bash
-accelerate launch --multi_gpu train_diffusion_orpo_sdxl_lora_wds.py \
-  --pretrained_model_name_or_path=stabilityai/stable-diffusion-xl-base-1.0  \
-  --pretrained_vae_model_name_or_path=madebyollin/sdxl-vae-fp16-fix \
-  --dataset_path="pipe:aws s3 cp s3://diffusion-preference-opt/{00000..00644}.tar -" \
-  --output_dir="diffusion-sdxl-orpo-wds" \
-  --mixed_precision="fp16" \
-  --gradient_accumulation_steps=1 \
-  --gradient_checkpointing \
-  --use_8bit_adam \
-  --rank=8 \
-  --dataloader_num_workers=8 \
-  --learning_rate=3e-5 \
-  --report_to="wandb" \
-  --lr_scheduler="constant" \
-  --lr_warmup_steps=0 \
-  --max_train_steps=50000 \
-  --checkpointing_steps=2000 \
-  --run_validation --validation_steps=500 \
-  --seed="0" \
-  --report_to="wandb" \
-  --push_to_hub
-```
-
-We tested the above on a node of 8 H100s but it should also work on A100s. It requires the `webdataset` library for faster dataloading. Note that we kept the dataset shards on an S3 bucket but it should also be possible to have them stored locally.
-
-You can use the code below to convert the original dataset into `webdataset` shards:
-
-```python
-import os
-import io
-import ray
-import webdataset as wds
-from datasets import Dataset
-from PIL import Image
-
-ray.init(num_cpus=8)
-
-
-def convert_to_image(im_bytes):
-    return Image.open(io.BytesIO(im_bytes)).convert("RGB")
-
-def main():
-    dataset_path = "/pickapic_v2/data"
-    wds_shards_path = "/pickapic_v2_webdataset"
-    # get all .parquet files in the dataset path
-    dataset_files = [
-        os.path.join(dataset_path, f)
-        for f in os.listdir(dataset_path)
-        if f.endswith(".parquet")
-    ]
-
-    @ray.remote
-    def create_shard(path):
-        # get basename of the file
-        basename = os.path.basename(path)
-        # get the shard number data-00123-of-01034.parquet -> 00123
-        shard_num = basename.split("-")[1]
-        dataset = Dataset.from_parquet(path)
-        # create a webdataset shard
-        shard = wds.TarWriter(os.path.join(wds_shards_path, f"{shard_num}.tar"))
-        
-        for i, example in enumerate(dataset):
-            wds_example = {
-                "__key__": str(i),
-                "original_prompt.txt": example["caption"],
-                "jpg_0.jpg": convert_to_image(example["jpg_0"]),
-                "jpg_1.jpg": convert_to_image(example["jpg_1"]),
-                "label_0.txt": str(example["label_0"]),
-                "label_1.txt": str(example["label_1"])
-            }
-            shard.write(wds_example)
-        shard.close()
-
-    futures = [create_shard.remote(path) for path in dataset_files]
-    ray.get(futures)
-
-
-if __name__ == "__main__":
-    main()
-```
-
-## Inference
-
-Refer to [sayakpaul/diffusion-sdxl-orpo](https://huggingface.co/sayakpaul/diffusion-sdxl-orpo) for an experimental checkpoint.
@@ -1,7 +0,0 @@
-datasets
-accelerate
-transformers
-torchvision
-wandb
-peft
-webdataset
@@ -23,7 +23,6 @@ TODO:
 6. Integrate to training x
 7. Test
 """
-
 import copy
 import random

@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Conversion script for stable diffusion checkpoints which _only_ contain a controlnet."""
+""" Conversion script for stable diffusion checkpoints which _only_ contain a controlnet. """

 import argparse
 import re
@@ -1005,7 +1005,7 @@ class PromptDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
-                `._callback_tensor_inputs` attribute of your pipeline class.
+                `._callback_tensor_inputs` attribute of your pipeline class.

        Examples:

@@ -501,12 +501,6 @@ def main(args):

    logging_dir = Path(args.output_dir, args.logging_dir)

-    if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
-        # due to pytorch#99272, MPS does not yet support bfloat16.
-        raise ValueError(
-            "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
-        )
-
    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
    kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
    accelerator = Accelerator(
@@ -979,13 +973,6 @@ def main(args):
    if accelerator.is_main_process:
        accelerator.init_trackers("text2image-fine-tune", config=vars(args))

-    # Some configurations require autocast to be disabled.
-    enable_autocast = True
-    if torch.backends.mps.is_available() or (
-        accelerator.mixed_precision == "fp16" or accelerator.mixed_precision == "bf16"
-    ):
-        enable_autocast = False
-
    # Train!
    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

@@ -1212,10 +1199,7 @@ def main(args):
                generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
                pipeline_args = {"prompt": args.validation_prompt}

-                with torch.autocast(
-                    accelerator.device.type,
-                    enabled=enable_autocast,
-                ):
+                with torch.cuda.amp.autocast():
                    images = [
                        pipeline(**pipeline_args, generator=generator).images[0]
                        for _ in range(args.num_validation_images)
@@ -590,12 +590,6 @@ def main(args):

    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)

-    if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
-        # due to pytorch#99272, MPS does not yet support bfloat16.
-        raise ValueError(
-            "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
-        )
-
    accelerator = Accelerator(
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        mixed_precision=args.mixed_precision,
@@ -917,7 +911,6 @@ def main(args):
        )
        precomputed_dataset = precomputed_dataset.with_transform(preprocess_train)

-    del compute_vae_encodings_fn, compute_embeddings_fn, text_encoder_one, text_encoder_two
    del text_encoders, tokenizers, vae
    gc.collect()
    torch.cuda.empty_cache()
@@ -986,13 +979,6 @@ def main(args):
        model = model._orig_mod if is_compiled_module(model) else model
        return model

-    # Some configurations require autocast to be disabled.
-    enable_autocast = True
-    if torch.backends.mps.is_available() or (
-        accelerator.mixed_precision == "fp16" or accelerator.mixed_precision == "bf16"
-    ):
-        enable_autocast = False
-
    # Train!
    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

@@ -1226,10 +1212,7 @@ def main(args):
                generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
                pipeline_args = {"prompt": args.validation_prompt}

-                with torch.autocast(
-                    accelerator.device.type,
-                    enabled=enable_autocast,
-                ):
+                with torch.cuda.amp.autocast():
                    images = [
                        pipeline(**pipeline_args, generator=generator, num_inference_steps=25).images[0]
                        for _ in range(args.num_validation_images)
@@ -1284,7 +1267,7 @@ def main(args):
        if args.validation_prompt and args.num_validation_images > 0:
            pipeline = pipeline.to(accelerator.device)
            generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
-            with torch.autocast(accelerator.device.type, enabled=enable_autocast):
+            with torch.cuda.amp.autocast():
                images = [
                    pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0]
                    for _ in range(args.num_validation_images)
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Conversion script for the LDM checkpoints."""
+""" Conversion script for the LDM checkpoints. """

 import argparse
 import json
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Conversion script for the LDM checkpoints."""
+""" Conversion script for the LDM checkpoints. """

 import argparse

@@ -1195,9 +1195,9 @@ def superres_check_against_original(dump_path, unet_checkpoint_path):
        if_II_model = IFStageIII(device="cuda", dir_or_name=orig_path, model_kwargs={"precision": "fp32"}).model

    batch_size = 1
-    channels = model.config.in_channels // 2
-    height = model.config.sample_size
-    width = model.config.sample_size
+    channels = model.in_channels // 2
+    height = model.sample_size
+    width = model.sample_size
    height = 1024
    width = 1024

@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Conversion script for the LDM checkpoints."""
+""" Conversion script for the LDM checkpoints. """

 import argparse
 import json
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-"""Conversion script for the LoRA's safetensors checkpoints."""
+""" Conversion script for the LoRA's safetensors checkpoints. """

 import argparse

@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Conversion script for the LDM checkpoints."""
+""" Conversion script for the LDM checkpoints. """

 import argparse

@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Conversion script for the NCSNPP checkpoints."""
+""" Conversion script for the NCSNPP checkpoints. """

 import argparse
 import json
@@ -1,172 +0,0 @@
-import argparse
-import os
-
-import torch
-from safetensors import safe_open
-
-from diffusers import Transformer3DModel
-
-
-ckpt_id = "PixArt-alpha/PixArt-alpha"
-# https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/scripts/inference.py#L125
-interpolation_scale = {256: 0.5, 512: 1}
-
-
-def main(args):
-    state_dict = {}
-    with safe_open(args.orig_ckpt_path, framework="pt", device="cpu") as f:
-        for k in f.keys():
-            state_dict[k] = f.get_tensor(k)
-    converted_state_dict = {}
-
-    # Patch embeddings.
-    converted_state_dict["pos_embed.proj.weight"] = state_dict.pop("x_embedder.proj.weight")
-    converted_state_dict["pos_embed.proj.bias"] = state_dict.pop("x_embedder.proj.bias")
-
-    # Caption projection.
-    converted_state_dict["caption_projection.linear_1.weight"] = state_dict.pop("y_embedder.y_proj.fc1.weight")
-    converted_state_dict["caption_projection.linear_1.bias"] = state_dict.pop("y_embedder.y_proj.fc1.bias")
-    converted_state_dict["caption_projection.linear_2.weight"] = state_dict.pop("y_embedder.y_proj.fc2.weight")
-    converted_state_dict["caption_projection.linear_2.bias"] = state_dict.pop("y_embedder.y_proj.fc2.bias")
-
-    # AdaLN-single LN
-    converted_state_dict["adaln_single.emb.timestep_embedder.linear_1.weight"] = state_dict.pop(
-        "t_embedder.mlp.0.weight"
-    )
-    converted_state_dict["adaln_single.emb.timestep_embedder.linear_1.bias"] = state_dict.pop("t_embedder.mlp.0.bias")
-    converted_state_dict["adaln_single.emb.timestep_embedder.linear_2.weight"] = state_dict.pop(
-        "t_embedder.mlp.2.weight"
-    )
-    converted_state_dict["adaln_single.emb.timestep_embedder.linear_2.bias"] = state_dict.pop("t_embedder.mlp.2.bias")
-
-    # Shared norm.
-    converted_state_dict["adaln_single.linear.weight"] = state_dict.pop("t_block.1.weight")
-    converted_state_dict["adaln_single.linear.bias"] = state_dict.pop("t_block.1.bias")
-
-    for depth in range(28):
-        # Transformer blocks.
-        converted_state_dict[f"transformer_blocks.{depth}.scale_shift_table"] = state_dict.pop(
-            f"blocks.{depth}.scale_shift_table"
-        )
-
-        # Attention is all you need 🤘
-
-        # Self attention.
-        q, k, v = torch.chunk(state_dict.pop(f"blocks.{depth}.attn.qkv.weight"), 3, dim=0)
-        q_bias, k_bias, v_bias = torch.chunk(state_dict.pop(f"blocks.{depth}.attn.qkv.bias"), 3, dim=0)
-        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_q.weight"] = q
-        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_q.bias"] = q_bias
-        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_k.weight"] = k
-        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_k.bias"] = k_bias
-        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_v.weight"] = v
-        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_v.bias"] = v_bias
-        # Projection.
-        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_out.0.weight"] = state_dict.pop(
-            f"blocks.{depth}.attn.proj.weight"
-        )
-        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_out.0.bias"] = state_dict.pop(
-            f"blocks.{depth}.attn.proj.bias"
-        )
-
-        # Temporal attention.
-        q, k, v = torch.chunk(state_dict.pop(f"blocks.{depth}.attn_temp.qkv.weight"), 3, dim=0)
-        q_bias, k_bias, v_bias = torch.chunk(state_dict.pop(f"blocks.{depth}.attn_temp.qkv.bias"), 3, dim=0)
-        converted_state_dict[f"transformer_blocks.{depth}.attn_temporal.to_q.weight"] = q
-        converted_state_dict[f"transformer_blocks.{depth}.attn_temporal.to_q.bias"] = q_bias
-        converted_state_dict[f"transformer_blocks.{depth}.attn_temporal.to_k.weight"] = k
-        converted_state_dict[f"transformer_blocks.{depth}.attn_temporal.to_k.bias"] = k_bias
-        converted_state_dict[f"transformer_blocks.{depth}.attn_temporal.to_v.weight"] = v
-        converted_state_dict[f"transformer_blocks.{depth}.attn_temporal.to_v.bias"] = v_bias
-        # Projection.
-        converted_state_dict[f"transformer_blocks.{depth}.attn_temporal.to_out.0.weight"] = state_dict.pop(
-            f"blocks.{depth}.attn_temp.proj.weight"
-        )
-        converted_state_dict[f"transformer_blocks.{depth}.attn_temporal.to_out.0.bias"] = state_dict.pop(
-            f"blocks.{depth}.attn_temp.proj.bias"
-        )
-
-        # Feed-forward.
-        converted_state_dict[f"transformer_blocks.{depth}.ff.net.0.proj.weight"] = state_dict.pop(
-            f"blocks.{depth}.mlp.fc1.weight"
-        )
-        converted_state_dict[f"transformer_blocks.{depth}.ff.net.0.proj.bias"] = state_dict.pop(
-            f"blocks.{depth}.mlp.fc1.bias"
-        )
-        converted_state_dict[f"transformer_blocks.{depth}.ff.net.2.weight"] = state_dict.pop(
-            f"blocks.{depth}.mlp.fc2.weight"
-        )
-        converted_state_dict[f"transformer_blocks.{depth}.ff.net.2.bias"] = state_dict.pop(
-            f"blocks.{depth}.mlp.fc2.bias"
-        )
-
-        # Cross-attention.
-        q = state_dict.pop(f"blocks.{depth}.cross_attn.q_linear.weight")
-        q_bias = state_dict.pop(f"blocks.{depth}.cross_attn.q_linear.bias")
-        k, v = torch.chunk(state_dict.pop(f"blocks.{depth}.cross_attn.kv_linear.weight"), 2, dim=0)
-        k_bias, v_bias = torch.chunk(state_dict.pop(f"blocks.{depth}.cross_attn.kv_linear.bias"), 2, dim=0)
-
-        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_q.weight"] = q
-        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_q.bias"] = q_bias
-        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_k.weight"] = k
-        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_k.bias"] = k_bias
-        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_v.weight"] = v
-        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_v.bias"] = v_bias
-
-        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_out.0.weight"] = state_dict.pop(
-            f"blocks.{depth}.cross_attn.proj.weight"
-        )
-        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_out.0.bias"] = state_dict.pop(
-            f"blocks.{depth}.cross_attn.proj.bias"
-        )
-
-    # Final block.
-    converted_state_dict["proj_out.weight"] = state_dict.pop("final_layer.linear.weight")
-    converted_state_dict["proj_out.bias"] = state_dict.pop("final_layer.linear.bias")
-    converted_state_dict["scale_shift_table"] = state_dict.pop("final_layer.scale_shift_table")
-    converted_state_dict["pos_embed_temporal"] = state_dict.pop("pos_embed_temporal")
-
-    # DiT XL/2
-    transformer = Transformer3DModel(
-        sample_size=(16, args.image_size // 8, args.image_size // 8),
-        patch_size=(1, 2, 2),
-        num_layers=28,
-        attention_head_dim=72,
-        num_attention_heads=16,
-        in_channels=4,
-        out_channels=8,
-        cross_attention_dim=1152,
-        num_embeds_ada_norm=1000,
-        norm_eps=1e-6,
-        caption_channels=4096,
-    )
-    transformer.load_state_dict(converted_state_dict, strict=True)
-
-    assert transformer.pos_embed.pos_embed is not None
-    state_dict.pop("pos_embed")
-    state_dict.pop("y_embedder.y_embedding")
-    assert len(state_dict) == 0, f"State dict is not empty, {state_dict.keys()}"
-
-    num_model_params = sum(p.numel() for p in transformer.parameters())
-    print(f"Total number of transformer parameters: {num_model_params}")
-
-    transformer.save_pretrained(os.path.join(args.dump_path, "transformer"))
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument(
-        "--orig_ckpt_path", default=None, type=str, required=False, help="Path to the checkpoint to convert."
-    )
-    parser.add_argument(
-        "--image_size",
-        default=256,
-        type=int,
-        choices=[256, 512],
-        required=False,
-        help="Image size of pretrained model, either 256 or 512.",
-    )
-    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output pipeline.")
-
-    args = parser.parse_args()
-    main(args)
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Conversion script for the AudioLDM2 checkpoints."""
+""" Conversion script for the AudioLDM2 checkpoints."""

 import argparse
 import re
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Conversion script for the AudioLDM checkpoints."""
+""" Conversion script for the AudioLDM checkpoints."""

 import argparse
 import re
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Conversion script for stable diffusion checkpoints which _only_ contain a controlnet."""
+""" Conversion script for stable diffusion checkpoints which _only_ contain a controlnet. """

 import argparse

@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Conversion script for the MusicLDM checkpoints."""
+""" Conversion script for the MusicLDM checkpoints."""

 import argparse
 import re
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Conversion script for the LDM checkpoints."""
+""" Conversion script for the LDM checkpoints. """

 import argparse
 import importlib
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Conversion script for the Versatile Stable Diffusion checkpoints."""
+""" Conversion script for the Versatile Stable Diffusion checkpoints. """

 import argparse
 from argparse import Namespace
@@ -11,7 +11,6 @@ $ python convert_zero123_to_diffusers.py \
   --original_config_file /path/zero123/configs/sd-objaverse-finetune-c_concat-256.yaml
 ```
 """
-
 import argparse

 import torch
@@ -72,11 +72,7 @@ To create the package for PyPI.
 9. Upload the final version to the actual PyPI:
   twine upload dist/* -r pypi

-10. Prepare the release notes and publish them on GitHub once everything is looking hunky-dory. You can use the following
-    Space to fetch all the commits applicable for the release: https://huggingface.co/spaces/lysandre/github-release. Repo should
-    be `huggingface/diffusers`. `tag` should be the previous release tag (v0.26.1, for example), and `branch` should be
-    the latest release branch (v0.27.0-release, for example). It denotes all commits that have happened on branch
-    v0.27.0-release after the tag v0.26.1 was created.
+10. Prepare the release notes and publish them on GitHub once everything is looking hunky-dory.

 11. Run `make post-release` (or, for a patch release, `make post-patch`). If you were on a branch for the release,
    you need to go back to main before executing this.
@@ -85,8 +81,9 @@ To create the package for PyPI.
 import os
 import re
 import sys
+from distutils.core import Command

-from setuptools import Command, find_packages, setup
+from setuptools import find_packages, setup


 # IMPORTANT:
@@ -166,7 +163,7 @@ def deps_list(*pkgs):

 class DepsTableUpdateCommand(Command):
    """
-    A custom command that updates the dependency table.
+    A custom distutils command that updates the dependency table.
    usage: python setup.py deps_table_update
    """

@@ -90,7 +90,6 @@ else:
            "T2IAdapter",
            "T5FilmDecoder",
            "Transformer2DModel",
-            "Transformer3DModel",
            "UNet1DModel",
            "UNet2DConditionModel",
            "UNet2DModel",
@@ -257,7 +256,6 @@ else:
            "LEditsPPPipelineStableDiffusion",
            "LEditsPPPipelineStableDiffusionXL",
            "MusicLDMPipeline",
-            "OpenSoraPipeline",
            "PaintByExamplePipeline",
            "PIAPipeline",
            "PixArtAlphaPipeline",
@@ -485,7 +483,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            T2IAdapter,
            T5FilmDecoder,
            Transformer2DModel,
-            Transformer3DModel,
            UNet1DModel,
            UNet2DConditionModel,
            UNet2DModel,
@@ -631,7 +628,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            LEditsPPPipelineStableDiffusion,
            LEditsPPPipelineStableDiffusionXL,
            MusicLDMPipeline,
-            OpenSoraPipeline,
            PaintByExamplePipeline,
            PIAPipeline,
            PixArtAlphaPipeline,
@@ -13,8 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""ConfigMixin base class and utilities."""
-
+""" ConfigMixin base class and utilities."""
 import dataclasses
 import functools
 import importlib
@@ -19,7 +19,7 @@ import torch
 from huggingface_hub.utils import validate_hf_hub_args
 from safetensors import safe_open

-from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, load_state_dict
+from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT
 from ..utils import (
    _get_model_file,
    is_accelerate_available,
@@ -182,7 +182,7 @@ class IPAdapterMixin:
                            elif key.startswith("ip_adapter."):
                                state_dict["ip_adapter"][key.replace("ip_adapter.", "")] = f.get_tensor(key)
                else:
-                    state_dict = load_state_dict(model_file)
+                    state_dict = torch.load(model_file, map_location="cpu")
            else:
                state_dict = pretrained_model_name_or_path_or_dict

@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import copy
 import inspect
 import os
 from pathlib import Path
@@ -26,7 +25,7 @@ from packaging import version
 from torch import nn

 from .. import __version__
-from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, load_state_dict
+from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT
 from ..utils import (
    USE_PEFT_BACKEND,
    _get_model_file,
@@ -37,7 +36,6 @@ from ..utils import (
    get_adapter_name,
    get_peft_kwargs,
    is_accelerate_available,
-    is_peft_version,
    is_transformers_available,
    logging,
    recurse_remove_peft_layers,
@@ -115,7 +113,7 @@ class LoraLoaderMixin:
        # First, ensure that the checkpoint is a compatible one and can be successfully loaded.
        state_dict, network_alphas = self.lora_state_dict(pretrained_model_name_or_path_or_dict, **kwargs)

-        is_correct_format = all("lora" in key or "dora_scale" in key for key in state_dict.keys())
+        is_correct_format = all("lora" in key for key in state_dict.keys())
        if not is_correct_format:
            raise ValueError("Invalid LoRA checkpoint.")

@@ -283,7 +281,7 @@ class LoraLoaderMixin:
                    subfolder=subfolder,
                    user_agent=user_agent,
                )
-                state_dict = load_state_dict(model_file)
+                state_dict = torch.load(model_file, map_location="cpu")
        else:
            state_dict = pretrained_model_name_or_path_or_dict

@@ -453,15 +451,6 @@ class LoraLoaderMixin:
                    rank[key] = val.shape[1]

            lora_config_kwargs = get_peft_kwargs(rank, network_alphas, state_dict, is_unet=True)
-            if "use_dora" in lora_config_kwargs:
-                if lora_config_kwargs["use_dora"]:
-                    if is_peft_version("<", "0.9.0"):
-                        raise ValueError(
-                            "You need `peft` 0.9.0 at least to use DoRA-enabled LoRAs. Please upgrade your installation of `peft`."
-                        )
-                else:
-                    if is_peft_version("<", "0.9.0"):
-                        lora_config_kwargs.pop("use_dora")
            lora_config = LoraConfig(**lora_config_kwargs)

            # adapter_name
@@ -583,15 +572,6 @@ class LoraLoaderMixin:
                    }

                lora_config_kwargs = get_peft_kwargs(rank, network_alphas, text_encoder_lora_state_dict, is_unet=False)
-                if "use_dora" in lora_config_kwargs:
-                    if lora_config_kwargs["use_dora"]:
-                        if is_peft_version("<", "0.9.0"):
-                            raise ValueError(
-                                "You need `peft` 0.9.0 at least to use DoRA-enabled LoRAs. Please upgrade your installation of `peft`."
-                            )
-                    else:
-                        if is_peft_version("<", "0.9.0"):
-                            lora_config_kwargs.pop("use_dora")
                lora_config = LoraConfig(**lora_config_kwargs)

                # adapter_name
@@ -674,13 +654,6 @@ class LoraLoaderMixin:
                    rank[key] = val.shape[1]

            lora_config_kwargs = get_peft_kwargs(rank, network_alphas, state_dict)
-            if "use_dora" in lora_config_kwargs:
-                if lora_config_kwargs["use_dora"] and is_peft_version("<", "0.9.0"):
-                    raise ValueError(
-                        "You need `peft` 0.9.0 at least to use DoRA-enabled LoRAs. Please upgrade your installation of `peft`."
-                    )
-                else:
-                    lora_config_kwargs.pop("use_dora")
            lora_config = LoraConfig(**lora_config_kwargs)

            # adapter_name
@@ -986,7 +959,7 @@ class LoraLoaderMixin:
        self,
        adapter_names: Union[List[str], str],
        text_encoder: Optional["PreTrainedModel"] = None,  # noqa: F821
-        text_encoder_weights: Optional[Union[float, List[float], List[None]]] = None,
+        text_encoder_weights: List[float] = None,
    ):
        """
        Sets the adapter layers for the text encoder.
@@ -1004,20 +977,15 @@ class LoraLoaderMixin:
            raise ValueError("PEFT backend is required for this method.")

        def process_weights(adapter_names, weights):
-            # Expand weights into a list, one entry per adapter
-            # e.g. for 2 adapters:  7 -> [7,7] ; [3, None] -> [3, None]
-            if not isinstance(weights, list):
-                weights = [weights] * len(adapter_names)
+            if weights is None:  # no weights supplied: every adapter gets the default scale of 1.0
+                weights = [1.0] * len(adapter_names)
+            elif isinstance(weights, float):  # one scalar applies to all adapters
+                weights = [weights] * len(adapter_names)  # broadcast, otherwise the length check below fails

            if len(adapter_names) != len(weights):
                raise ValueError(
                    f"Length of adapter names {len(adapter_names)} is not equal to the length of the weights {len(weights)}"
                )
-
-            # Set None values to default of 1.0
-            # e.g. [7,7] -> [7,7] ; [3, None] -> [3,1]
-            weights = [w if w is not None else 1.0 for w in weights]
-
            return weights

        adapter_names = [adapter_names] if isinstance(adapter_names, str) else adapter_names
@@ -1065,77 +1033,17 @@ class LoraLoaderMixin:
    def set_adapters(
        self,
        adapter_names: Union[List[str], str],
-        adapter_weights: Optional[Union[float, Dict, List[float], List[Dict]]] = None,
+        adapter_weights: Optional[List[float]] = None,
    ):
-        adapter_names = [adapter_names] if isinstance(adapter_names, str) else adapter_names
-
-        adapter_weights = copy.deepcopy(adapter_weights)
-
-        # Expand weights into a list, one entry per adapter
-        if not isinstance(adapter_weights, list):
-            adapter_weights = [adapter_weights] * len(adapter_names)
-
-        if len(adapter_names) != len(adapter_weights):
-            raise ValueError(
-                f"Length of adapter names {len(adapter_names)} is not equal to the length of the weights {len(adapter_weights)}"
-            )
-
-        # Decompose weights into weights for unet, text_encoder and text_encoder_2
-        unet_lora_weights, text_encoder_lora_weights, text_encoder_2_lora_weights = [], [], []
-
-        list_adapters = self.get_list_adapters()  # eg {"unet": ["adapter1", "adapter2"], "text_encoder": ["adapter2"]}
-        all_adapters = {
-            adapter for adapters in list_adapters.values() for adapter in adapters
-        }  # eg ["adapter1", "adapter2"]
-        invert_list_adapters = {
-            adapter: [part for part, adapters in list_adapters.items() if adapter in adapters]
-            for adapter in all_adapters
-        }  # eg {"adapter1": ["unet"], "adapter2": ["unet", "text_encoder"]}
-
-        for adapter_name, weights in zip(adapter_names, adapter_weights):
-            if isinstance(weights, dict):
-                unet_lora_weight = weights.pop("unet", None)
-                text_encoder_lora_weight = weights.pop("text_encoder", None)
-                text_encoder_2_lora_weight = weights.pop("text_encoder_2", None)
-
-                if len(weights) > 0:
-                    raise ValueError(
-                        f"Got invalid key '{weights.keys()}' in lora weight dict for adapter {adapter_name}."
-                    )
-
-                if text_encoder_2_lora_weight is not None and not hasattr(self, "text_encoder_2"):
-                    logger.warning(
-                        "Lora weight dict contains text_encoder_2 weights but will be ignored because pipeline does not have text_encoder_2."
-                    )
-
-                # warn if adapter doesn't have parts specified by adapter_weights
-                for part_weight, part_name in zip(
-                    [unet_lora_weight, text_encoder_lora_weight, text_encoder_2_lora_weight],
-                    ["unet", "text_encoder", "text_encoder_2"],
-                ):
-                    if part_weight is not None and part_name not in invert_list_adapters[adapter_name]:
-                        logger.warning(
-                            f"Lora weight dict for adapter '{adapter_name}' contains {part_name}, but this will be ignored because {adapter_name} does not contain weights for {part_name}. Valid parts for {adapter_name} are: {invert_list_adapters[adapter_name]}."
-                        )
-
-            else:
-                unet_lora_weight = weights
-                text_encoder_lora_weight = weights
-                text_encoder_2_lora_weight = weights
-
-            unet_lora_weights.append(unet_lora_weight)
-            text_encoder_lora_weights.append(text_encoder_lora_weight)
-            text_encoder_2_lora_weights.append(text_encoder_2_lora_weight)
-
        unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet
        # Handle the UNET
-        unet.set_adapters(adapter_names, unet_lora_weights)
+        unet.set_adapters(adapter_names, adapter_weights)

        # Handle the Text Encoder
        if hasattr(self, "text_encoder"):
-            self.set_adapters_for_text_encoder(adapter_names, self.text_encoder, text_encoder_lora_weights)
+            self.set_adapters_for_text_encoder(adapter_names, self.text_encoder, adapter_weights)
        if hasattr(self, "text_encoder_2"):
-            self.set_adapters_for_text_encoder(adapter_names, self.text_encoder_2, text_encoder_2_lora_weights)
+            self.set_adapters_for_text_encoder(adapter_names, self.text_encoder_2, adapter_weights)

    def disable_lora(self):
        if not USE_PEFT_BACKEND:
@@ -1335,7 +1243,7 @@ class StableDiffusionXLLoraLoaderMixin(LoraLoaderMixin):
            unet_config=self.unet.config,
            **kwargs,
        )
-        is_correct_format = all("lora" in key or "dora_scale" in key for key in state_dict.keys())
+        is_correct_format = all("lora" in key for key in state_dict.keys())
        if not is_correct_format:
            raise ValueError("Invalid LoRA checkpoint.")

@@ -14,7 +14,7 @@

 import re

-from ..utils import is_peft_version, logging
+from ..utils import logging


 logger = logging.get_logger(__name__)
@@ -128,15 +128,6 @@ def _convert_kohya_lora_to_diffusers(state_dict, unet_name="unet", text_encoder_
    te_state_dict = {}
    te2_state_dict = {}
    network_alphas = {}
-    is_unet_dora_lora = any("dora_scale" in k and "lora_unet_" in k for k in state_dict)
-    is_te_dora_lora = any("dora_scale" in k and ("lora_te_" in k or "lora_te1_" in k) for k in state_dict)
-    is_te2_dora_lora = any("dora_scale" in k and "lora_te2_" in k for k in state_dict)
-
-    if is_unet_dora_lora or is_te_dora_lora or is_te2_dora_lora:
-        if is_peft_version("<", "0.9.0"):
-            raise ValueError(
-                "You need `peft` 0.9.0 at least to use DoRA-enabled LoRAs. Please upgrade your installation of `peft`."
-            )

    # every down weight has a corresponding up weight and potentially an alpha weight
    lora_keys = [k for k in state_dict.keys() if k.endswith("lora_down.weight")]
@@ -207,19 +198,8 @@ def _convert_kohya_lora_to_diffusers(state_dict, unet_name="unet", text_encoder_
                unet_state_dict[diffusers_name] = state_dict.pop(key)
                unet_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)

-            if is_unet_dora_lora:
-                dora_scale_key_to_replace = "_lora.down." if "_lora.down." in diffusers_name else ".lora.down."
-                unet_state_dict[
-                    diffusers_name.replace(dora_scale_key_to_replace, ".lora_magnitude_vector.")
-                ] = state_dict.pop(key.replace("lora_down.weight", "dora_scale"))
-
-        elif lora_name.startswith(("lora_te_", "lora_te1_", "lora_te2_")):
-            if lora_name.startswith(("lora_te_", "lora_te1_")):
-                key_to_replace = "lora_te_" if lora_name.startswith("lora_te_") else "lora_te1_"
-            else:
-                key_to_replace = "lora_te2_"
-
-            diffusers_name = key.replace(key_to_replace, "").replace("_", ".")
+        elif lora_name.startswith("lora_te_"):
+            diffusers_name = key.replace("lora_te_", "").replace("_", ".")
            diffusers_name = diffusers_name.replace("text.model", "text_model")
            diffusers_name = diffusers_name.replace("self.attn", "self_attn")
            diffusers_name = diffusers_name.replace("q.proj.lora", "to_q_lora")
@@ -227,35 +207,52 @@ def _convert_kohya_lora_to_diffusers(state_dict, unet_name="unet", text_encoder_
            diffusers_name = diffusers_name.replace("v.proj.lora", "to_v_lora")
            diffusers_name = diffusers_name.replace("out.proj.lora", "to_out_lora")
            if "self_attn" in diffusers_name:
-                if lora_name.startswith(("lora_te_", "lora_te1_")):
-                    te_state_dict[diffusers_name] = state_dict.pop(key)
-                    te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
-                else:
-                    te2_state_dict[diffusers_name] = state_dict.pop(key)
-                    te2_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
+                te_state_dict[diffusers_name] = state_dict.pop(key)
+                te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
            elif "mlp" in diffusers_name:
                # Be aware that this is the new diffusers convention and the rest of the code might
                # not utilize it yet.
                diffusers_name = diffusers_name.replace(".lora.", ".lora_linear_layer.")
-                if lora_name.startswith(("lora_te_", "lora_te1_")):
-                    te_state_dict[diffusers_name] = state_dict.pop(key)
-                    te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
-                else:
-                    te2_state_dict[diffusers_name] = state_dict.pop(key)
-                    te2_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
+                te_state_dict[diffusers_name] = state_dict.pop(key)
+                te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)

-            if (is_te_dora_lora or is_te2_dora_lora) and lora_name.startswith(("lora_te_", "lora_te1_", "lora_te2_")):
-                dora_scale_key_to_replace_te = (
-                    "_lora.down." if "_lora.down." in diffusers_name else ".lora_linear_layer."
-                )
-                if lora_name.startswith(("lora_te_", "lora_te1_")):
-                    te_state_dict[
-                        diffusers_name.replace(dora_scale_key_to_replace_te, ".lora_magnitude_vector.")
-                    ] = state_dict.pop(key.replace("lora_down.weight", "dora_scale"))
-                elif lora_name.startswith("lora_te2_"):
-                    te2_state_dict[
-                        diffusers_name.replace(dora_scale_key_to_replace_te, ".lora_magnitude_vector.")
-                    ] = state_dict.pop(key.replace("lora_down.weight", "dora_scale"))
+        # (sayakpaul): Duplicate code. Needs to be cleaned.
+        elif lora_name.startswith("lora_te1_"):
+            diffusers_name = key.replace("lora_te1_", "").replace("_", ".")
+            diffusers_name = diffusers_name.replace("text.model", "text_model")
+            diffusers_name = diffusers_name.replace("self.attn", "self_attn")
+            diffusers_name = diffusers_name.replace("q.proj.lora", "to_q_lora")
+            diffusers_name = diffusers_name.replace("k.proj.lora", "to_k_lora")
+            diffusers_name = diffusers_name.replace("v.proj.lora", "to_v_lora")
+            diffusers_name = diffusers_name.replace("out.proj.lora", "to_out_lora")
+            if "self_attn" in diffusers_name:
+                te_state_dict[diffusers_name] = state_dict.pop(key)
+                te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
+            elif "mlp" in diffusers_name:
+                # Be aware that this is the new diffusers convention and the rest of the code might
+                # not utilize it yet.
+                diffusers_name = diffusers_name.replace(".lora.", ".lora_linear_layer.")
+                te_state_dict[diffusers_name] = state_dict.pop(key)
+                te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
+
+        # (sayakpaul): Duplicate code. Needs to be cleaned.
+        elif lora_name.startswith("lora_te2_"):
+            diffusers_name = key.replace("lora_te2_", "").replace("_", ".")
+            diffusers_name = diffusers_name.replace("text.model", "text_model")
+            diffusers_name = diffusers_name.replace("self.attn", "self_attn")
+            diffusers_name = diffusers_name.replace("q.proj.lora", "to_q_lora")
+            diffusers_name = diffusers_name.replace("k.proj.lora", "to_k_lora")
+            diffusers_name = diffusers_name.replace("v.proj.lora", "to_v_lora")
+            diffusers_name = diffusers_name.replace("out.proj.lora", "to_out_lora")
+            if "self_attn" in diffusers_name:
+                te2_state_dict[diffusers_name] = state_dict.pop(key)
+                te2_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
+            elif "mlp" in diffusers_name:
+                # Be aware that this is the new diffusers convention and the rest of the code might
+                # not utilize it yet.
+                diffusers_name = diffusers_name.replace(".lora.", ".lora_linear_layer.")
+                te2_state_dict[diffusers_name] = state_dict.pop(key)
+                te2_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)

        # Rename the alphas so that they can be mapped appropriately.
        if lora_name_alpha in state_dict:
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Conversion script for the Stable Diffusion checkpoints."""
+""" Conversion script for the Stable Diffusion checkpoints."""

 import os
 import re
@@ -50,8 +50,6 @@ if is_transformers_available():
 if is_accelerate_available():
    from accelerate import init_empty_weights

-    from ..models.modeling_utils import load_model_dict_into_meta
-
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

 CONFIG_URLS = {
@@ -979,6 +977,8 @@ def create_diffusers_controlnet_model_from_ldm(
        controlnet = ControlNetModel(**diffusers_config)

    if is_accelerate_available():
+        from ..models.modeling_utils import load_model_dict_into_meta
+
        unexpected_keys = load_model_dict_into_meta(
            controlnet, diffusers_format_controlnet_checkpoint, dtype=torch_dtype
        )
@@ -1155,6 +1155,8 @@ def create_text_encoder_from_ldm_clip_checkpoint(config_name, checkpoint, local_
                text_model_dict[diffusers_key] = checkpoint[key]

    if is_accelerate_available():
+        from ..models.modeling_utils import load_model_dict_into_meta
+
        unexpected_keys = load_model_dict_into_meta(text_model, text_model_dict, dtype=torch_dtype)
        if text_model._keys_to_ignore_on_load_unexpected is not None:
            for pat in text_model._keys_to_ignore_on_load_unexpected:
@@ -1248,6 +1250,8 @@ def create_text_encoder_from_open_clip_checkpoint(
            text_model_dict[diffusers_key] = checkpoint[key]

    if is_accelerate_available():
+        from ..models.modeling_utils import load_model_dict_into_meta
+
        unexpected_keys = load_model_dict_into_meta(text_model, text_model_dict, dtype=torch_dtype)
        if text_model._keys_to_ignore_on_load_unexpected is not None:
            for pat in text_model._keys_to_ignore_on_load_unexpected:
@@ -1313,6 +1317,8 @@ def create_diffusers_unet_model_from_ldm(
        unet = UNet2DConditionModel(**unet_config)

    if is_accelerate_available():
+        from ..models.modeling_utils import load_model_dict_into_meta
+
        unexpected_keys = load_model_dict_into_meta(unet, diffusers_format_unet_checkpoint, dtype=torch_dtype)
        if unet._keys_to_ignore_on_load_unexpected is not None:
            for pat in unet._keys_to_ignore_on_load_unexpected:
@@ -1373,6 +1379,8 @@ def create_diffusers_vae_model_from_ldm(
        vae = AutoencoderKL(**vae_config)

    if is_accelerate_available():
+        from ..models.modeling_utils import load_model_dict_into_meta
+
        unexpected_keys = load_model_dict_into_meta(vae, diffusers_format_vae_checkpoint, dtype=torch_dtype)
        if vae._keys_to_ignore_on_load_unexpected is not None:
            for pat in vae._keys_to_ignore_on_load_unexpected:
@@ -18,7 +18,6 @@ import torch
 from huggingface_hub.utils import validate_hf_hub_args
 from torch import nn

-from ..models.modeling_utils import load_state_dict
 from ..utils import _get_model_file, is_accelerate_available, is_transformers_available, logging


@@ -101,7 +100,7 @@ def load_textual_inversion_state_dicts(pretrained_model_name_or_paths, **kwargs)
                    subfolder=subfolder,
                    user_agent=user_agent,
                )
-                state_dict = load_state_dict(model_file)
+                state_dict = torch.load(model_file, map_location="cpu")
        else:
            state_dict = pretrained_model_name_or_path

@@ -31,7 +31,7 @@ from ..models.embeddings import (
    IPAdapterPlusImageProjection,
    MultiIPAdapterImageProjection,
 )
-from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, load_model_dict_into_meta, load_state_dict
+from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, load_model_dict_into_meta
 from ..utils import (
    USE_PEFT_BACKEND,
    _get_model_file,
@@ -47,7 +47,6 @@ from .single_file_utils import (
    infer_stable_cascade_single_file_config,
    load_single_file_model_checkpoint,
 )
-from .unet_loader_utils import _maybe_expand_lora_scales
 from .utils import AttnProcsLayers


@@ -215,7 +214,7 @@ class UNet2DConditionLoadersMixin:
                    subfolder=subfolder,
                    user_agent=user_agent,
                )
-                state_dict = load_state_dict(model_file)
+                state_dict = torch.load(model_file, map_location="cpu")
        else:
            state_dict = pretrained_model_name_or_path_or_dict

@@ -565,7 +564,7 @@ class UNet2DConditionLoadersMixin:
    def set_adapters(
        self,
        adapter_names: Union[List[str], str],
-        weights: Optional[Union[float, Dict, List[float], List[Dict], List[None]]] = None,
+        weights: Optional[Union[List[float], float]] = None,
    ):
        """
        Set the currently active adapters for use in the UNet.
@@ -598,9 +597,9 @@ class UNet2DConditionLoadersMixin:

        adapter_names = [adapter_names] if isinstance(adapter_names, str) else adapter_names

-        # Expand weights into a list, one entry per adapter
-        # examples for e.g. 2 adapters:  [{...}, 7] -> [7,7] ; None -> [None, None]
-        if not isinstance(weights, list):
+        if weights is None:
+            weights = [1.0] * len(adapter_names)
+        elif isinstance(weights, float):
            weights = [weights] * len(adapter_names)

        if len(adapter_names) != len(weights):
@@ -608,13 +607,6 @@ class UNet2DConditionLoadersMixin:
                f"Length of adapter names {len(adapter_names)} is not equal to the length of their weights {len(weights)}."
            )

-        # Set None values to default of 1.0
-        # e.g. [{...}, 7] -> [{...}, 7] ; [None, None] -> [1.0, 1.0]
-        weights = [w if w is not None else 1.0 for w in weights]
-
-        # e.g. [{...}, 7] -> [{expanded dict...}, 7]
-        weights = _maybe_expand_lora_scales(self, weights)
-
        set_weights_and_activate_adapters(self, adapter_names, weights)

    def disable_lora(self):
@@ -1,154 +0,0 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import copy
-from typing import TYPE_CHECKING, Dict, List, Union
-
-from ..utils import logging
-
-
-if TYPE_CHECKING:
-    # import here to avoid circular imports
-    from ..models import UNet2DConditionModel
-
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-
-
-def _translate_into_actual_layer_name(name):
-    """Translate user-friendly name (e.g. 'mid') into actual layer name (e.g. 'mid_block.attentions.0')"""
-    if name == "mid":
-        return "mid_block.attentions.0"
-
-    updown, block, attn = name.split(".")
-
-    updown = updown.replace("down", "down_blocks").replace("up", "up_blocks")
-    block = block.replace("block_", "")
-    attn = "attentions." + attn
-
-    return ".".join((updown, block, attn))
-
-
-def _maybe_expand_lora_scales(unet: "UNet2DConditionModel", weight_scales: List[Union[float, Dict]]):
-    blocks_with_transformer = {
-        "down": [i for i, block in enumerate(unet.down_blocks) if hasattr(block, "attentions")],
-        "up": [i for i, block in enumerate(unet.up_blocks) if hasattr(block, "attentions")],
-    }
-    transformer_per_block = {"down": unet.config.layers_per_block, "up": unet.config.layers_per_block + 1}
-
-    expanded_weight_scales = [
-        _maybe_expand_lora_scales_for_one_adapter(
-            weight_for_adapter, blocks_with_transformer, transformer_per_block, unet.state_dict()
-        )
-        for weight_for_adapter in weight_scales
-    ]
-
-    return expanded_weight_scales
-
-
-def _maybe_expand_lora_scales_for_one_adapter(
-    scales: Union[float, Dict],
-    blocks_with_transformer: Dict[str, int],
-    transformer_per_block: Dict[str, int],
-    state_dict: None,
-):
-    """
-    Expands the inputs into a more granular dictionary. See the example below for more details.
-
-    Parameters:
-        scales (`Union[float, Dict]`):
-            Scales dict to expand.
-        blocks_with_transformer (`Dict[str, int]`):
-            Dict with keys 'up' and 'down', showing which blocks have transformer layers
-        transformer_per_block (`Dict[str, int]`):
-            Dict with keys 'up' and 'down', showing how many transformer layers each block has
-
-    E.g. turns
-    ```python
-    scales = {
-        'down': 2,
-        'mid': 3,
-        'up': {
-            'block_0': 4,
-            'block_1': [5, 6, 7]
-        }
-    }
-    blocks_with_transformer = {
-        'down': [1,2],
-        'up': [0,1]
-    }
-    transformer_per_block = {
-        'down': 2,
-        'up': 3
-    }
-    ```
-    into
-    ```python
-    {
-        'down.block_1.0': 2,
-        'down.block_1.1': 2,
-        'down.block_2.0': 2,
-        'down.block_2.1': 2,
-        'mid': 3,
-        'up.block_0.0': 4,
-        'up.block_0.1': 4,
-        'up.block_0.2': 4,
-        'up.block_1.0': 5,
-        'up.block_1.1': 6,
-        'up.block_1.2': 7,
-    }
-    ```
-    """
-    if sorted(blocks_with_transformer.keys()) != ["down", "up"]:
-        raise ValueError("blocks_with_transformer needs to be a dict with keys `'down' and `'up'`")
-
-    if sorted(transformer_per_block.keys()) != ["down", "up"]:
-        raise ValueError("transformer_per_block needs to be a dict with keys `'down' and `'up'`")
-
-    if not isinstance(scales, dict):
-        # don't expand if scales is a single number
-        return scales
-
-    scales = copy.deepcopy(scales)
-
-    if "mid" not in scales:
-        scales["mid"] = 1
-
-    for updown in ["up", "down"]:
-        if updown not in scales:
-            scales[updown] = 1
-
-        # eg {"down": 1} to {"down": {"block_1": 1, "block_2": 1}}}
-        if not isinstance(scales[updown], dict):
-            scales[updown] = {f"block_{i}": scales[updown] for i in blocks_with_transformer[updown]}
-
-        # eg {"down": "block_1": 1}} to {"down": "block_1": [1, 1]}}
-        for i in blocks_with_transformer[updown]:
-            block = f"block_{i}"
-            if not isinstance(scales[updown][block], list):
-                scales[updown][block] = [scales[updown][block] for _ in range(transformer_per_block[updown])]
-
-        # eg {"down": "block_1": [1, 1]}}  to {"down.block_1.0": 1, "down.block_1.1": 1}
-        for i in blocks_with_transformer[updown]:
-            block = f"block_{i}"
-            for tf_idx, value in enumerate(scales[updown][block]):
-                scales[f"{updown}.{block}.{tf_idx}"] = value
-
-        del scales[updown]
-
-    for layer in scales.keys():
-        if not any(_translate_into_actual_layer_name(layer) in module for module in state_dict.keys()):
-            raise ValueError(
-                f"Can't set lora scale for layer {layer}. It either doesn't exist in this unet or it has no attentions."
-            )
-
-    return {_translate_into_actual_layer_name(name): weight for name, weight in scales.items()}
@@ -38,7 +38,6 @@ if is_torch_available():
    _import_structure["transformers.prior_transformer"] = ["PriorTransformer"]
    _import_structure["transformers.t5_film_transformer"] = ["T5FilmDecoder"]
    _import_structure["transformers.transformer_2d"] = ["Transformer2DModel"]
-    _import_structure["transformers.transformer_3d"] = ["Transformer3DModel"]
    _import_structure["transformers.transformer_temporal"] = ["TransformerTemporalModel"]
    _import_structure["unets.unet_1d"] = ["UNet1DModel"]
    _import_structure["unets.unet_2d"] = ["UNet2DModel"]
@@ -76,7 +75,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            PriorTransformer,
            T5FilmDecoder,
            Transformer2DModel,
-            Transformer3DModel,
            TransformerTemporalModel,
        )
        from .unets import (
@@ -424,7 +424,7 @@ class Attention(nn.Module):
        # If doesn't apply LoRA do `add_k_proj` or `add_v_proj`
        is_lora_activated.pop("add_k_proj", None)
        is_lora_activated.pop("add_v_proj", None)
-        # 2. else it is not possible that only some layers have LoRA activated
+        # 2. else it is not possible that only some layers have LoRA activated
        if not all(is_lora_activated.values()):
            raise ValueError(
                f"Make sure that either all layers or no layers have LoRA activated, but have {is_lora_activated}"
@@ -767,7 +767,18 @@ class AttnProcessor:
        query = attn.to_q(hidden_states)

        if encoder_hidden_states is None:
-            encoder_hidden_states = hidden_states
+            # encoder_hidden_states = hidden_states
+            batch, seq, dim = hidden_states.shape
+            height = width = seq**0.5
+            # reshape to (batch, height, width, dim)
+            encoder_hidden_states = hidden_states.view(batch, height, width, dim)
+            # reshape to (batch, dim, height, width)
+            encoder_hidden_states = encoder_hidden_states.permute(0, 3, 1, 2)
+            encoder_hidden_states = torch.nn.functional.avg_pool2d(hidden_states, kernel_size=4)
+            # reshape to (batch, dim, seq)
+            encoder_hidden_states = encoder_hidden_states.view(batch, dim, -1)
+            # reshape to (batch, seq, dim)
+            encoder_hidden_states = encoder_hidden_states.permute(0, 2, 1)
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

@@ -1259,7 +1270,18 @@ class AttnProcessor2_0:
        query = attn.to_q(hidden_states)

        if encoder_hidden_states is None:
-            encoder_hidden_states = hidden_states
+            # encoder_hidden_states = hidden_states
+            batch, seq, dim = hidden_states.shape
+            height = width = seq**0.5
+            # reshape to (batch, height, width, dim)
+            encoder_hidden_states = hidden_states.view(batch, height, width, dim)
+            # reshape to (batch, dim, height, width)
+            encoder_hidden_states = encoder_hidden_states.permute(0, 3, 1, 2)
+            encoder_hidden_states = torch.nn.functional.avg_pool2d(hidden_states, kernel_size=4)
+            # reshape to (batch, dim, seq)
+            encoder_hidden_states = encoder_hidden_states.view(batch, dim, -1)
+            # reshape to (batch, seq, dim)
+            encoder_hidden_states = encoder_hidden_states.permute(0, 2, 1)
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

@@ -2098,7 +2120,7 @@ class LoRAAttnAddedKVProcessor(nn.Module):

 class IPAdapterAttnProcessor(nn.Module):
    r"""
-    Attention processor for Multiple IP-Adapters.
+    Attention processor for Multiple IP-Adapters.

    Args:
        hidden_size (`int`):
@@ -2152,8 +2174,8 @@ class IPAdapterAttnProcessor(nn.Module):
                encoder_hidden_states, ip_hidden_states = encoder_hidden_states
            else:
                deprecation_message = (
-                    "You have passed a tensor as `encoder_hidden_states`. This is deprecated and will be removed in a future release."
-                    " Please make sure to update your script to pass `encoder_hidden_states` as a tuple to suppress this warning."
+                    "You have passed a tensor as `encoder_hidden_states`.This is deprecated and will be removed in a future release."
+                    " Please make sure to update your script to pass `encoder_hidden_states` as a tuple to supress this warning."
                )
                deprecate("encoder_hidden_states not a tuple", "1.0.0", deprecation_message, standard_warn=False)
                end_pos = encoder_hidden_states.shape[1] - self.num_tokens[0]
@@ -2253,7 +2275,7 @@ class IPAdapterAttnProcessor(nn.Module):

 class IPAdapterAttnProcessor2_0(torch.nn.Module):
    r"""
-    Attention processor for IP-Adapter for PyTorch 2.0.
+    Attention processor for IP-Adapter for PyTorch 2.0.

    Args:
        hidden_size (`int`):
@@ -2312,8 +2334,8 @@ class IPAdapterAttnProcessor2_0(torch.nn.Module):
                encoder_hidden_states, ip_hidden_states = encoder_hidden_states
            else:
                deprecation_message = (
-                    "You have passed a tensor as `encoder_hidden_states`. This is deprecated and will be removed in a future release."
-                    " Please make sure to update your script to pass `encoder_hidden_states` as a tuple to suppress this warning."
+                    "You have passed a tensor as `encoder_hidden_states`.This is deprecated and will be removed in a future release."
+                    " Please make sure to update your script to pass `encoder_hidden_states` as a tuple to supress this warning."
                )
                deprecate("encoder_hidden_states not a tuple", "1.0.0", deprecation_message, standard_warn=False)
                end_pos = encoder_hidden_states.shape[1] - self.num_tokens[0]
@@ -63,8 +63,7 @@ class ConsistencyDecoderVAE(ModelMixin, ConfigMixin):
        ...     "runwayml/stable-diffusion-v1-5", vae=vae, torch_dtype=torch.float16
        ... ).to("cuda")

-        >>> image = pipe("horse", generator=torch.manual_seed(0)).images[0]
-        >>> image
+        >>> pipe("horse", generator=torch.manual_seed(0)).images
        ```
    """

@@ -73,7 +72,6 @@ class ConsistencyDecoderVAE(ModelMixin, ConfigMixin):
        self,
        scaling_factor: float = 0.18215,
        latent_channels: int = 4,
-        sample_size: int = 32,
        encoder_act_fn: str = "silu",
        encoder_block_out_channels: Tuple[int, ...] = (128, 256, 512, 512),
        encoder_double_z: bool = True,
@@ -155,16 +153,6 @@ class ConsistencyDecoderVAE(ModelMixin, ConfigMixin):
        self.use_slicing = False
        self.use_tiling = False

-        # only relevant if vae tiling is enabled
-        self.tile_sample_min_size = self.config.sample_size
-        sample_size = (
-            self.config.sample_size[0]
-            if isinstance(self.config.sample_size, (list, tuple))
-            else self.config.sample_size
-        )
-        self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1)))
-        self.tile_overlap_factor = 0.25
-
    # Copied from diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL.enable_tiling
    def enable_tiling(self, use_tiling: bool = True):
        r"""
@@ -284,7 +272,7 @@ class ConsistencyDecoderVAE(ModelMixin, ConfigMixin):
        Args:
            x (`torch.FloatTensor`): Input batch of images.
            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether to return a [`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] instead of a plain
+                Whether to return a [`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] instead of a plain
                tuple.

        Returns:
@@ -317,19 +305,6 @@ class ConsistencyDecoderVAE(ModelMixin, ConfigMixin):
        return_dict: bool = True,
        num_inference_steps: int = 2,
    ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]:
-        """
-        Decodes the input latent vector `z` using the consistency decoder VAE model.
-
-        Args:
-            z (torch.FloatTensor): The input latent vector.
-            generator (Optional[torch.Generator]): The random number generator. Default is None.
-            return_dict (bool): Whether to return the output as a dictionary. Default is True.
-            num_inference_steps (int): The number of inference steps. Default is 2.
-
-        Returns:
-            Union[DecoderOutput, Tuple[torch.FloatTensor]]: The decoded output.
-
-        """
        z = (z * self.config.scaling_factor - self.means) / self.stds

        scale_factor = 2 ** (len(self.config.block_out_channels) - 1)
@@ -370,9 +345,7 @@ class ConsistencyDecoderVAE(ModelMixin, ConfigMixin):
            b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent)
        return b

-    def tiled_encode(
-        self, x: torch.FloatTensor, return_dict: bool = True
-    ) -> Union[ConsistencyDecoderVAEOutput, Tuple]:
+    def tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True) -> ConsistencyDecoderVAEOutput:
        r"""Encode a batch of images using a tiled encoder.

        When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
@@ -281,7 +281,7 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlNetMixin):
        elif encoder_hid_dim_type == "text_image_proj":
            # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
            # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
-            # case when `addition_embed_type == "text_image_proj"` (Kandinsky 2.1)`
+            # case when `addition_embed_type == "text_image_proj"` (Kandinsky 2.1)`
            self.encoder_hid_proj = TextImageProjection(
                text_embed_dim=encoder_hid_dim,
                image_embed_dim=cross_attention_dim,
@@ -330,7 +330,7 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlNetMixin):
        elif addition_embed_type == "text_image":
            # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
            # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
-            # case when `addition_embed_type == "text_image"` (Kandinsky 2.1)`
+            # case when `addition_embed_type == "text_image"` (Kandinsky 2.1)`
            self.add_embedding = TextImageTimeEmbedding(
                text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
            )
@@ -509,9 +509,6 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlNetMixin):
            if controlnet.class_embedding:
                controlnet.class_embedding.load_state_dict(unet.class_embedding.state_dict())

-            if hasattr(controlnet, "add_embedding"):
-                controlnet.add_embedding.load_state_dict(unet.add_embedding.state_dict())
-
            controlnet.down_blocks.load_state_dict(unet.down_blocks.state_dict())
            controlnet.mid_block.load_state_dict(unet.mid_block.state_dict())

@@ -187,75 +187,6 @@ class PatchEmbed(nn.Module):
        return (latent + pos_embed).to(latent.dtype)


-class PatchEmbed3D(nn.Module):
-    """Video to Patch Embedding"""
-
-    def __init__(
-        self,
-        height=224,
-        width=224,
-        patch_size=(1, 2, 2),
-        in_channels=3,
-        embed_dim=768,
-        layer_norm=False,
-        bias=True,
-        interpolation_scale=1,
-    ):
-        super().__init__()
-
-        num_patches = (height // patch_size[1]) * (width // patch_size[2])
-        self.layer_norm = layer_norm
-        self.emed_dim = embed_dim
-
-        self.proj = nn.Conv3d(in_channels, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias)
-        if layer_norm:
-            self.norm = nn.LayerNorm(embed_dim, elementwise_affine=False, eps=1e-6)
-        else:
-            self.norm = None
-
-        self.patch_size = patch_size
-        # See:
-        # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L161
-        self.height, self.width = height // patch_size[1], width // patch_size[2]
-        self.base_size = height // patch_size[1]
-        self.interpolation_scale = interpolation_scale
-        pos_embed = get_2d_sincos_pos_embed(
-            embed_dim, int(num_patches**0.5), base_size=self.base_size, interpolation_scale=self.interpolation_scale
-        )
-        self.register_buffer("pos_embed", torch.from_numpy(pos_embed).float().unsqueeze(0), persistent=False)
-
-    def forward(self, latent):
-        height, width = latent.shape[-2] // self.patch_size[1], latent.shape[-1] // self.patch_size[2]
-
-        latent = self.proj(latent)  # (B C T H W)
-
-        if self.layer_norm:
-            batch_size, _, num_frames, height, width = latent.size()
-            latent = latent.flatten(2).transpose(1, 2)
-            latent = self.norm(latent)
-            latent = latent.transpose(1, 2).view(batch_size, self.emed_dim, num_frames, height, width)
-
-        latent = latent.flatten(3).permute(0, 2, 3, 1)  # BCTHW -> BT(HW)C
-
-        # Interpolate positional embeddings if needed.
-        # (For PixArt-Alpha: https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L162C151-L162C160)
-        if self.height != height or self.width != width:
-            pos_embed = get_2d_sincos_pos_embed(
-                embed_dim=self.pos_embed.shape[-1],
-                grid_size=(height, width),
-                base_size=self.base_size,
-                interpolation_scale=self.interpolation_scale,
-            )
-            pos_embed = torch.from_numpy(pos_embed)
-            pos_embed = pos_embed.float().unsqueeze(0).to(latent.device)
-        else:
-            pos_embed = self.pos_embed
-
-        latent = (latent + pos_embed).to(latent.dtype)
-        latent = latent.flatten(1, 2)  # BT(H*W)C -> B(T*H*W)C
-        return latent
-
-
 class TimestepEmbedding(nn.Module):
    def __init__(
        self,
@@ -12,8 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""PyTorch - Flax general utilities."""
-
+""" PyTorch - Flax general utilities."""
 import re

 import jax.numpy as jnp
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""PyTorch - Flax general utilities."""
+""" PyTorch - Flax general utilities."""

 from pickle import UnpicklingError

@@ -20,7 +20,6 @@ import os
 import re
 from collections import OrderedDict
 from functools import partial
-from pathlib import Path
 from typing import Any, Callable, List, Optional, Tuple, Union

 import safetensors
@@ -108,12 +107,7 @@ def load_state_dict(checkpoint_file: Union[str, os.PathLike], variant: Optional[
        if file_extension == SAFETENSORS_FILE_EXTENSION:
            return safetensors.torch.load_file(checkpoint_file, device="cpu")
        else:
-            weights_only_kwarg = {"weights_only": True} if is_torch_version(">=", "1.13") else {}
-            return torch.load(
-                checkpoint_file,
-                map_location="cpu",
-                **weights_only_kwarg,
-            )
+            return torch.load(checkpoint_file, map_location="cpu")
    except Exception as e:
        try:
            with open(checkpoint_file) as f:
@@ -373,18 +367,18 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
        # Save the model
        if safe_serialization:
            safetensors.torch.save_file(
-                state_dict, Path(save_directory, weights_name).as_posix(), metadata={"format": "pt"}
+                state_dict, os.path.join(save_directory, weights_name), metadata={"format": "pt"}
            )
        else:
-            torch.save(state_dict, Path(save_directory, weights_name).as_posix())
+            torch.save(state_dict, os.path.join(save_directory, weights_name))

-        logger.info(f"Model weights saved in {Path(save_directory, weights_name).as_posix()}")
+        logger.info(f"Model weights saved in {os.path.join(save_directory, weights_name)}")

        if push_to_hub:
            # Create a new empty model card and eventually tag it
            model_card = load_or_create_model_card(repo_id, token=token)
            model_card = populate_model_card(model_card)
-            model_card.save(Path(save_directory, "README.md").as_posix())
+            model_card.save(os.path.join(save_directory, "README.md"))

            self._upload_folder(
                save_directory,
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
patil-suraj	ea238e821b	up	2024-03-18 11:47:47 +01:00
patil-suraj	b6d1d670fc	up	2024-03-18 11:34:17 +01:00