update

[Workflows] remove installation of redundant modules from flax PR tests (#7662)
remove installation of redundant modules from flax PR tests
2024-04-17 12:07:13 +00:00 · 2024-04-17 15:16:04 +05:30 · 2024-04-17 15:10:38 +05:30 · 2024-04-16 15:02:55 -10:00 · 2024-04-16 17:58:27 -07:00 · 2024-04-16 22:15:55 +05:30
424 changed files with 31363 additions and 5991 deletions
@@ -31,7 +31,6 @@ jobs:
          nvidia-smi
      - name: Install dependencies
        run: |
-          apt-get update && apt-get install libsndfile1-dev libgl1 -y
          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
          python -m uv pip install -e [quality,test]
          python -m uv pip install pandas peft
@@ -20,7 +20,7 @@ env:

 jobs:
  test-build-docker-images:
-    runs-on: ubuntu-latest
+    runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
    if: github.event_name == 'pull_request'
    steps:
      - name: Set up Docker Buildx
@@ -50,7 +50,7 @@ jobs:
        if: steps.file_changes.outputs.all != ''

  build-and-push-docker-images:
-    runs-on: ubuntu-latest
+    runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
    if: github.event_name != 'pull_request'
    
    permissions:
@@ -73,13 +73,13 @@ jobs:
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
-
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v1
      - name: Login to Docker Hub
        uses: docker/login-action@v2
        with:
          username: ${{ env.REGISTRY }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
-
      - name: Build and push
        uses: docker/build-push-action@v3
        with:
@@ -1,6 +1,7 @@
-name: Nightly tests on main
+name: Nightly and release tests on main/release branch

 on:
+  workflow_dispatch:
  schedule:
    - cron: "0 0 * * *" # every day at midnight

@@ -12,110 +13,95 @@ env:
  PYTEST_TIMEOUT: 600
  RUN_SLOW: yes
  RUN_NIGHTLY: yes
+  PIPELINE_USAGE_CUTOFF: 5000
  SLACK_API_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

 jobs:
-  run_nightly_tests:
-    strategy:
-      fail-fast: false
-      matrix:
-        config:
-          - name: Nightly PyTorch CUDA tests on Ubuntu
-            framework: pytorch
-            runner: docker-gpu
-            image: diffusers/diffusers-pytorch-cuda
-            report: torch_cuda
-          - name: Nightly Flax TPU tests on Ubuntu
-            framework: flax
-            runner: docker-tpu
-            image: diffusers/diffusers-flax-tpu
-            report: flax_tpu
-          - name: Nightly ONNXRuntime CUDA tests on Ubuntu
-            framework: onnxruntime
-            runner: docker-gpu
-            image: diffusers/diffusers-onnxruntime-cuda
-            report: onnx_cuda
-
-    name: ${{ matrix.config.name }}
-
-    runs-on: ${{ matrix.config.runner }}
-
-    container:
-      image: ${{ matrix.config.image }}
-      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ ${{ matrix.config.runner == 'docker-tpu' && '--privileged' || '--gpus 0'}}
-
-    defaults:
-      run:
-        shell: bash
-
+  setup_torch_cuda_pipeline_matrix:
+    name: Setup Torch Pipelines Matrix
+    runs-on: ubuntu-latest
+    outputs:
+      pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
    steps:
      - name: Checkout diffusers
        uses: actions/checkout@v3
        with:
          fetch-depth: 2
-
-      - name: NVIDIA-SMI
-        if: ${{ matrix.config.runner == 'docker-gpu' }}
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.8"
+      - name: Install dependencies
        run: |
-          nvidia-smi
+          pip install -e .
+          pip install huggingface_hub
+      - name: Fetch Pipeline Matrix
+        id: fetch_pipeline_matrix
+        run: |
+          matrix=$(python utils/fetch_torch_cuda_pipeline_test_matrix.py)
+          echo $matrix
+          echo "pipeline_test_matrix=$matrix" >> $GITHUB_OUTPUT

+      - name: Pipeline Tests Artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: test-pipelines.json
+          path: reports
+
+  run_nightly_tests_for_torch_pipelines:
+    name: Torch Pipelines CUDA Nightly Tests
+    needs: setup_torch_cuda_pipeline_matrix
+    strategy:
+      fail-fast: false
+      matrix:
+        module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
+    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    container:
+      image: diffusers/diffusers-pytorch-cuda
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
+    steps:
+      - name: Checkout diffusers
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 2
+      - name: NVIDIA-SMI
+        run: nvidia-smi
+      
      - name: Install dependencies
        run: |
          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
          python -m uv pip install -e [quality,test]
-          python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers
-          python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate
+          python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
          python -m uv pip install pytest-reportlog
-
+      
      - name: Environment
        run: |
          python utils/print_env.py
-
-      - name: Run nightly PyTorch CUDA tests
-        if: ${{ matrix.config.framework == 'pytorch' }}
+      
+      - name: Nightly PyTorch CUDA checkpoint (pipelines) tests 
        env:
          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+          # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
+          CUBLAS_WORKSPACE_CONFIG: :16:8
        run: |
-          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
          python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
            -s -v -k "not Flax and not Onnx" \
-            --make-reports=tests_${{ matrix.config.report }} \
-            --report-log=${{ matrix.config.report }}.log \
-            tests/ 
-
-      - name: Run nightly Flax TPU tests
-        if: ${{ matrix.config.framework == 'flax' }}
-        env:
-          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-        run: |
-          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-          python -m pytest -n 0 \
-            -s -v -k "Flax" \
-            --make-reports=tests_${{ matrix.config.report }} \
-            --report-log=${{ matrix.config.report }}.log \
-            tests/
-
-      - name: Run nightly ONNXRuntime CUDA tests
-        if: ${{ matrix.config.framework == 'onnxruntime' }}
-        env:
-          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-        run: |
-          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-          python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-            -s -v -k "Onnx" \
-            --make-reports=tests_${{ matrix.config.report }} \
-            --report-log=${{ matrix.config.report }}.log \ 
-            tests/
-
+            --make-reports=tests_pipeline_${{ matrix.module }}_cuda \
+            --report-log=tests_pipeline_${{ matrix.module }}_cuda.log \
+            tests/pipelines/${{ matrix.module }}
+      
      - name: Failure short reports
        if: ${{ failure() }}
-        run: cat reports/tests_${{ matrix.config.report }}_failures_short.txt
+        run: |
+          cat reports/tests_pipeline_${{ matrix.module }}_cuda_stats.txt
+          cat reports/tests_pipeline_${{ matrix.module }}_cuda_failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
-          name: ${{ matrix.config.report }}_test_reports
+          name: pipeline_${{ matrix.module }}_test_reports
          path: reports
      
      - name: Generate Report and Notify Channel
@@ -124,9 +110,251 @@ jobs:
          pip install slack_sdk tabulate
          python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY

+  run_nightly_tests_for_other_torch_modules:
+    name: Torch Non-Pipelines CUDA Nightly Tests
+    runs-on: docker-gpu
+    container:
+      image: diffusers/diffusers-pytorch-cuda
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
+    defaults:
+      run:
+        shell: bash
+    strategy:
+      matrix:
+        module: [models, schedulers, others, examples]
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: Install dependencies
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install -e [quality,test]
+        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
+        python -m uv pip install pytest-reportlog
+
+    - name: Environment
+      run: python utils/print_env.py
+
+    - name: Run nightly PyTorch CUDA tests for non-pipeline modules
+      if: ${{ matrix.module != 'examples'}}
+      env:
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
+        CUBLAS_WORKSPACE_CONFIG: :16:8
+      run: |
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
+          -s -v -k "not Flax and not Onnx" \
+          --make-reports=tests_torch_${{ matrix.module }}_cuda \
+          --report-log=tests_torch_${{ matrix.module }}_cuda.log \
+          tests/${{ matrix.module }}
+
+    - name: Run nightly example tests with Torch
+      if: ${{ matrix.module == 'examples' }}
+      env:
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
+        CUBLAS_WORKSPACE_CONFIG: :16:8
+      run: |
+        python -m uv pip install peft@git+https://github.com/huggingface/peft.git
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
+          -s -v --make-reports=examples_torch_cuda \
+          --report-log=examples_torch_cuda.log \
+          examples/
+
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: |
+        cat reports/tests_torch_${{ matrix.module }}_cuda_stats.txt 
+        cat reports/tests_torch_${{ matrix.module }}_cuda_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v2
+      with:
+        name: torch_${{ matrix.module }}_cuda_test_reports
+        path: reports
+
+    - name: Generate Report and Notify Channel
+      if: always()
+      run: |
+        pip install slack_sdk tabulate
+        python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
+
+  run_lora_nightly_tests:
+    name: Nightly LoRA Tests with PEFT and TORCH
+    runs-on: docker-gpu
+    container:
+      image: diffusers/diffusers-pytorch-cuda
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
+    defaults:
+      run:
+        shell: bash
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: Install dependencies
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install -e [quality,test]
+        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
+        python -m uv pip install peft@git+https://github.com/huggingface/peft.git
+        python -m uv pip install pytest-reportlog
+
+    - name: Environment
+      run: python utils/print_env.py
+
+    - name: Run nightly LoRA tests with PEFT and Torch
+      env:
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
+        CUBLAS_WORKSPACE_CONFIG: :16:8
+      run: |
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
+          -s -v -k "not Flax and not Onnx" \
+          --make-reports=tests_torch_lora_cuda \
+          --report-log=tests_torch_lora_cuda.log \
+          tests/lora
+    
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: |
+        cat reports/tests_torch_lora_cuda_stats.txt 
+        cat reports/tests_torch_lora_cuda_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v2
+      with:
+        name: torch_lora_cuda_test_reports
+        path: reports
+
+    - name: Generate Report and Notify Channel
+      if: always()
+      run: |
+        pip install slack_sdk tabulate
+        python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
+  
+  run_flax_tpu_tests:
+    name: Nightly Flax TPU Tests
+    runs-on: docker-tpu
+    if: github.event_name == 'schedule'
+    
+    container:
+      image: diffusers/diffusers-flax-tpu
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --privileged
+    defaults:
+      run:
+        shell: bash
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: Install dependencies
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install -e [quality,test]
+        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
+        python -m uv pip install pytest-reportlog
+
+    - name: Environment
+      run: python utils/print_env.py
+
+    - name: Run nightly Flax TPU tests
+      env:
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+      run: |
+        python -m pytest -n 0 \
+          -s -v -k "Flax" \
+          --make-reports=tests_flax_tpu \
+          --report-log=tests_flax_tpu.log \
+          tests/
+
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: |
+        cat reports/tests_flax_tpu_stats.txt
+        cat reports/tests_flax_tpu_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v2
+      with:
+        name: flax_tpu_test_reports
+        path: reports
+
+    - name: Generate Report and Notify Channel
+      if: always()
+      run: |
+        pip install slack_sdk tabulate
+        python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
+
+  run_nightly_onnx_tests:
+    name: Nightly ONNXRuntime CUDA tests on Ubuntu
+    runs-on: docker-gpu
+    container:
+      image: diffusers/diffusers-onnxruntime-cuda
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
+    
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: NVIDIA-SMI
+      run: nvidia-smi
+
+    - name: Install dependencies
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install -e [quality,test]
+        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
+        python -m uv pip install pytest-reportlog
+
+    - name: Environment
+      run: python utils/print_env.py
+    
+    - name: Run nightly ONNXRuntime CUDA tests
+      env:
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+      run: |
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
+          -s -v -k "Onnx" \
+          --make-reports=tests_onnx_cuda \
+          --report-log=tests_onnx_cuda.log \
+          tests/
+
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: |
+        cat reports/tests_onnx_cuda_stats.txt
+        cat reports/tests_onnx_cuda_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v2
+      with:
+        name: onnx_cuda_test_reports  # literal name: this job defines no matrix
+        path: reports
+    
+    - name: Generate Report and Notify Channel
+      if: always()
+      run: |
+        pip install slack_sdk tabulate
+        python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
+
  run_nightly_tests_apple_m1:
    name: Nightly PyTorch MPS tests on MacOS
    runs-on: [ self-hosted, apple-m1 ]
+    if: github.event_name == 'schedule'

    steps:
      - name: Checkout diffusers
@@ -32,7 +32,6 @@ jobs:
        fetch-depth: 0
    - name: Install dependencies
      run: |
-        apt-get update && apt-get install libsndfile1-dev libgl1 -y
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
    - name: Environment
@@ -89,7 +88,6 @@ jobs:

    - name: Install dependencies
      run: |
-        apt-get update && apt-get install libsndfile1-dev libgl1 -y
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m pip install -e [quality,test]
        python -m pip install accelerate
@@ -147,7 +145,6 @@ jobs:

    - name: Install dependencies
      run: |
-        apt-get update && apt-get install libsndfile1-dev libgl1 -y
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m pip install -e [quality,test]

@@ -32,9 +32,11 @@ jobs:
          python -m pip install --upgrade pip
          pip install .[quality]
      - name: Check quality
+        run: make quality
+      - name: Check if failure
+        if: ${{ failure() }}
        run: |
-          ruff check examples tests src utils scripts
-          ruff format examples tests src utils scripts --check
+          echo "Quality check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make style && make quality'" >> $GITHUB_STEP_SUMMARY

  check_repository_consistency:
    needs: check_code_quality
@@ -49,11 +51,15 @@ jobs:
        run: |
          python -m pip install --upgrade pip
          pip install .[quality]
-      - name: Check quality
+      - name: Check repo consistency
        run: |
          python utils/check_copies.py
          python utils/check_dummies.py
          make deps_table_check_updated
+      - name: Check if failure
+        if: ${{ failure() }}
+        run: |
+          echo "Repo consistency check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make fix-copies'" >> $GITHUB_STEP_SUMMARY

  run_fast_tests:
    needs: [check_code_quality, check_repository_consistency]
@@ -65,7 +71,7 @@ jobs:

    name: LoRA - ${{ matrix.lib-versions }}

-    runs-on: docker-cpu
+    runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]

    container:
      image: diffusers/diffusers-pytorch-cpu
@@ -83,11 +89,10 @@ jobs:

    - name: Install dependencies
      run: |
-        apt-get update && apt-get install libsndfile1-dev libgl1 -y
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
        if [ "${{ matrix.lib-versions }}" == "main" ]; then
-            python -m uv pip install -U peft@git+https://github.com/huggingface/peft.git
+            python -m pip install -U peft@git+https://github.com/huggingface/peft.git
            python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git
            python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
        else
@@ -102,7 +107,7 @@ jobs:
    - name: Run fast PyTorch LoRA CPU tests with PEFT backend
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
+        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
          -s -v \
          --make-reports=tests_${{ matrix.config.report }} \
-          tests/lora/test_lora_layers_peft.py
+          tests/lora/
@@ -40,9 +40,11 @@ jobs:
          python -m pip install --upgrade pip
          pip install .[quality]
      - name: Check quality
+        run: make quality
+      - name: Check if failure
+        if: ${{ failure() }}
        run: |
-          ruff check examples tests src utils scripts
-          ruff format examples tests src utils scripts --check
+          echo "Quality check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make style && make quality'" >> $GITHUB_STEP_SUMMARY

  check_repository_consistency:
    needs: check_code_quality
@@ -57,11 +59,15 @@ jobs:
        run: |
          python -m pip install --upgrade pip
          pip install .[quality]
-      - name: Check quality
+      - name: Check repo consistency
        run: |
          python utils/check_copies.py
          python utils/check_dummies.py
          make deps_table_check_updated
+      - name: Check if failure
+        if: ${{ failure() }}
+        run: |
+          echo "Repo consistency check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make fix-copies'" >> $GITHUB_STEP_SUMMARY

  run_fast_tests:
    needs: [check_code_quality, check_repository_consistency]
@@ -71,22 +77,22 @@ jobs:
        config:
          - name: Fast PyTorch Pipeline CPU tests
            framework: pytorch_pipelines
-            runner: docker-cpu
+            runner: [ self-hosted, intel-cpu, 32-cpu, 256-ram, ci ]
            image: diffusers/diffusers-pytorch-cpu
            report: torch_cpu_pipelines
          - name: Fast PyTorch Models & Schedulers CPU tests
            framework: pytorch_models
-            runner: docker-cpu
+            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
            image: diffusers/diffusers-pytorch-cpu
            report: torch_cpu_models_schedulers
          - name: Fast Flax CPU tests
            framework: flax
-            runner: docker-cpu
+            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
            image: diffusers/diffusers-flax-cpu
            report: flax_cpu
          - name: PyTorch Example CPU tests
            framework: pytorch_examples
-            runner: docker-cpu
+            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
            image: diffusers/diffusers-pytorch-cpu
            report: torch_example_cpu

@@ -110,7 +116,6 @@ jobs:

    - name: Install dependencies
      run: |
-        apt-get update && apt-get install libsndfile1-dev libgl1 -y
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
        python -m uv pip install accelerate
@@ -124,7 +129,7 @@ jobs:
      if: ${{ matrix.config.framework == 'pytorch_pipelines' }}
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
+        python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile \
          -s -v -k "not Flax and not Onnx" \
          --make-reports=tests_${{ matrix.config.report }} \
          tests/pipelines
@@ -133,7 +138,7 @@ jobs:
      if: ${{ matrix.config.framework == 'pytorch_models' }}
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
+        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
          -s -v -k "not Flax and not Onnx and not Dependency" \
          --make-reports=tests_${{ matrix.config.report }} \
          tests/models tests/schedulers tests/others
@@ -142,7 +147,7 @@ jobs:
      if: ${{ matrix.config.framework == 'flax' }}
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
+        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
          -s -v -k "Flax" \
          --make-reports=tests_${{ matrix.config.report }} \
          tests
@@ -152,7 +157,7 @@ jobs:
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install peft
-        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
+        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
          --make-reports=tests_${{ matrix.config.report }} \
          examples

@@ -175,7 +180,7 @@ jobs:
        config:
          - name: Hub tests for models, schedulers, and pipelines
            framework: hub_tests_pytorch
-            runner: docker-cpu
+            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
            image: diffusers/diffusers-pytorch-cpu
            report: torch_hub

@@ -199,7 +204,6 @@ jobs:

    - name: Install dependencies
      run: |
-        apt-get update && apt-get install libsndfile1-dev libgl1 -y
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]

@@ -21,10 +21,7 @@ env:
 jobs:
  setup_torch_cuda_pipeline_matrix:
    name: Setup Torch Pipelines CUDA Slow Tests Matrix
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
-    container:
-      image: diffusers/diffusers-pytorch-cpu # this is a CPU image, but we need it to fetch the matrix
-      options: --shm-size "16gb" --ipc host
+    runs-on: ubuntu-latest
    outputs:
      pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
    steps:
@@ -32,24 +29,20 @@ jobs:
        uses: actions/checkout@v3
        with:
          fetch-depth: 2
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.8"
      - name: Install dependencies
        run: |
-          apt-get update && apt-get install libsndfile1-dev libgl1 -y
-          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-          python -m uv pip install -e [quality,test]
-          python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
-
-      - name: Environment
-        run: |
-          python utils/print_env.py
-
+          pip install -e .
+          pip install huggingface_hub
      - name: Fetch Pipeline Matrix
        id: fetch_pipeline_matrix
        run: |
          matrix=$(python utils/fetch_torch_cuda_pipeline_test_matrix.py)
          echo $matrix
          echo "pipeline_test_matrix=$matrix" >> $GITHUB_OUTPUT
-
      - name: Pipeline Tests Artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
@@ -78,7 +71,6 @@ jobs:
          nvidia-smi
      - name: Install dependencies
        run: |
-          apt-get update && apt-get install libsndfile1-dev libgl1 -y
          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
          python -m uv pip install -e [quality,test]
          python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
@@ -128,7 +120,6 @@ jobs:

    - name: Install dependencies
      run: |
-        apt-get update && apt-get install libsndfile1-dev libgl1 -y
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
@@ -178,11 +169,10 @@ jobs:

    - name: Install dependencies
      run: |
-        apt-get update && apt-get install libsndfile1-dev libgl1 -y
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
-        python -m uv pip install peft@git+https://github.com/huggingface/peft.git
+        python -m pip install -U peft@git+https://github.com/huggingface/peft.git

    - name: Environment
      run: |
@@ -229,7 +219,6 @@ jobs:

    - name: Install dependencies
      run: |
-        apt-get update && apt-get install libsndfile1-dev libgl1 -y
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
@@ -277,7 +266,6 @@ jobs:

    - name: Install dependencies
      run: |
-        apt-get update && apt-get install libsndfile1-dev libgl1 -y
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
@@ -29,22 +29,22 @@ jobs:
        config:
          - name: Fast PyTorch CPU tests on Ubuntu
            framework: pytorch
-            runner: docker-cpu
+            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
            image: diffusers/diffusers-pytorch-cpu
            report: torch_cpu
          - name: Fast Flax CPU tests on Ubuntu
            framework: flax
-            runner: docker-cpu
+            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
            image: diffusers/diffusers-flax-cpu
            report: flax_cpu
          - name: Fast ONNXRuntime CPU tests on Ubuntu
            framework: onnxruntime
-            runner: docker-cpu
+            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
            image: diffusers/diffusers-onnxruntime-cpu
            report: onnx_cpu
          - name: PyTorch Example CPU tests on Ubuntu
            framework: pytorch_examples
-            runner: docker-cpu
+            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
            image: diffusers/diffusers-pytorch-cpu
            report: torch_example_cpu

@@ -68,7 +68,6 @@ jobs:

    - name: Install dependencies
      run: |
-        apt-get update && apt-get install libsndfile1-dev libgl1 -y
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]

@@ -81,7 +80,7 @@ jobs:
      if: ${{ matrix.config.framework == 'pytorch' }}
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
+        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
          -s -v -k "not Flax and not Onnx" \
          --make-reports=tests_${{ matrix.config.report }} \
          tests/
@@ -90,7 +89,7 @@ jobs:
      if: ${{ matrix.config.framework == 'flax' }}
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
+        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
          -s -v -k "Flax" \
          --make-reports=tests_${{ matrix.config.report }} \
          tests/
@@ -99,7 +98,7 @@ jobs:
      if: ${{ matrix.config.framework == 'onnxruntime' }}
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
+        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
          -s -v -k "Onnx" \
          --make-reports=tests_${{ matrix.config.report }} \
          tests/
@@ -109,7 +108,7 @@ jobs:
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install peft
-        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
+        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
          --make-reports=tests_${{ matrix.config.report }} \
          examples

@@ -0,0 +1,30 @@
+name: Update Diffusers metadata
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - main
+      - update_diffusers_metadata*
+
+jobs:
+  update_metadata:
+    runs-on: ubuntu-22.04
+    defaults:
+      run:
+        shell: bash -l {0}
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Setup environment
+        run: |
+          pip install --upgrade pip
+          pip install datasets pandas
+          pip install .[torch]
+
+      - name: Update metadata
+        env:
+          HUGGING_FACE_HUB_TOKEN: ${{ secrets.SAYAK_HF_TOKEN }}
+        run: |
+          python utils/update_metadata.py --commit_sha ${{ github.sha }}
@@ -19,6 +19,16 @@ authors:
    family-names: Rasul
  - given-names: Mishig
    family-names: Davaadorj
+  - given-names: Dhruv
+    family-names: Nair
+  - given-names: Sayak
+    family-names: Paul
+  - given-names: Steven
+    family-names: Liu
+  - given-names: William
+    family-names: Berman
+  - given-names: Yiyi
+    family-names: Xu
  - given-names: Thomas
    family-names: Wolf
 repository-code: 'https://github.com/huggingface/diffusers'
@@ -42,6 +42,7 @@ repo-consistency:
 quality:
 	ruff check $(check_dirs) setup.py
 	ruff format --check $(check_dirs) setup.py
+	doc-builder style src/diffusers docs/source --max_len 119 --check_only
 	python utils/check_doc_toc.py

 # Format source code automatically and check is there are any problems left that need manual fixing
@@ -55,6 +56,7 @@ extra_style_checks:
 style:
 	ruff check $(check_dirs) setup.py --fix
 	ruff format $(check_dirs) setup.py
+	doc-builder style src/diffusers docs/source --max_len 119
 	${MAKE} autogenerate_code
 	${MAKE} extra_style_checks

@@ -238,7 +238,7 @@ We also want to thank @heejkoo for the very helpful overview of papers, code and

 ```bibtex
@misc{von-platen-etal-2022-diffusers,
-  author = {Patrick von Platen and Suraj Patil and Anton Lozhkov and Pedro Cuenca and Nathan Lambert and Kashif Rasul and Mishig Davaadorj and Thomas Wolf},
+  author = {Patrick von Platen and Suraj Patil and Anton Lozhkov and Pedro Cuenca and Nathan Lambert and Kashif Rasul and Mishig Davaadorj and Dhruv Nair and Sayak Paul and William Berman and Yiyi Xu and Steven Liu and Thomas Wolf},
  title = {Diffusers: State-of-the-art diffusion models},
  year = {2022},
  publisher = {GitHub},
@@ -12,6 +12,7 @@ RUN apt update && \
                   curl \
                   ca-certificates \
                   libsndfile1-dev \
+                   libgl1 \
                   python3.8 \
                   python3-pip \
                   python3.8-venv && \
@@ -12,6 +12,7 @@ RUN apt update && \
                   curl \
                   ca-certificates \
                   libsndfile1-dev \
+                   libgl1 \
                   python3.8 \
                   python3-pip \
                   python3.8-venv && \
@@ -12,6 +12,7 @@ RUN apt update && \
                   curl \
                   ca-certificates \
                   libsndfile1-dev \
+                   libgl1 \
                   python3.8 \
                   python3-pip \
                   python3.8-venv && \
@@ -12,6 +12,7 @@ RUN apt update && \
                   curl \
                   ca-certificates \
                   libsndfile1-dev \
+                   libgl1 \
                   python3.8 \
                   python3-pip \
                   python3.8-venv && \
@@ -71,7 +71,7 @@
    - local: using-diffusers/control_brightness
      title: Control image brightness
    - local: using-diffusers/weighted_prompts
-      title: Prompt weighting
+      title: Prompt techniques
    - local: using-diffusers/freeu
      title: Improve generation quality with FreeU
    title: Techniques
@@ -86,6 +86,8 @@
      title: Kandinsky
    - local: using-diffusers/controlnet
      title: ControlNet
+    - local: using-diffusers/t2i_adapter
+      title: T2I-Adapter
    - local: using-diffusers/shap-e
      title: Shap-E
    - local: using-diffusers/diffedit
@@ -170,6 +172,8 @@
      title: Token merging
    - local: optimization/deepcache
      title: DeepCache
+    - local: optimization/tgate
+      title: TGATE
    title: General optimizations
  - sections:
    - local: using-diffusers/stable_diffusion_jax_how_to
@@ -280,6 +284,10 @@
      title: ControlNet
    - local: api/pipelines/controlnet_sdxl
      title: ControlNet with Stable Diffusion XL
+    - local: api/pipelines/controlnetxs
+      title: ControlNet-XS
+    - local: api/pipelines/controlnetxs_sdxl
+      title: ControlNet-XS with Stable Diffusion XL
    - local: api/pipelines/dance_diffusion
      title: Dance Diffusion
    - local: api/pipelines/ddim
@@ -358,7 +366,7 @@
      - local: api/pipelines/stable_diffusion/ldm3d_diffusion
        title: LDM3D Text-to-(RGB, Depth), Text-to-(RGB-pano, Depth-pano), LDM3D Upscaler
      - local: api/pipelines/stable_diffusion/adapter
-        title: Stable Diffusion T2I-Adapter
+        title: T2I-Adapter
      - local: api/pipelines/stable_diffusion/gligen
        title: GLIGEN (Grounded Language-to-Image Generation)
      title: Stable Diffusion
@@ -408,6 +408,29 @@ Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers)

 </Tip>

+<table>
+    <tr>
+      <th align=center>Without FreeInit enabled</th>
+      <th align=center>With FreeInit enabled</th>
+    </tr>
+    <tr>
+        <td align=center>
+          panda playing a guitar
+          <br />
+          <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-no-freeinit.gif"
+              alt="panda playing a guitar"
+              style="width: 300px;" />
+        </td>
+        <td align=center>
+          panda playing a guitar
+          <br/>
+          <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-freeinit.gif"
+              alt="panda playing a guitar"
+              style="width: 300px;" />
+        </td>
+    </tr>
+</table>
+
 ## Using AnimateLCM

 [AnimateLCM](https://animatelcm.github.io/) is a motion module checkpoint and an [LCM LoRA](https://huggingface.co/docs/diffusers/using-diffusers/inference_with_lcm_lora) that have been created using a consistency learning strategy that decouples the distillation of the image generation priors and the motion generation priors.
@@ -20,7 +20,8 @@ The abstract of the paper is the following:

 *Although audio generation shares commonalities across different types of audio, such as speech, music, and sound effects, designing models for each type requires careful consideration of specific objectives and biases that can significantly differ from those of other types. To bring us closer to a unified perspective of audio generation, this paper proposes a framework that utilizes the same learning method for speech, music, and sound effect generation. Our framework introduces a general representation of audio, called "language of audio" (LOA). Any audio can be translated into LOA based on AudioMAE, a self-supervised pre-trained representation learning model. In the generation process, we translate any modalities into LOA by using a GPT-2 model, and we perform self-supervised audio generation learning with a latent diffusion model conditioned on LOA. The proposed framework naturally brings advantages such as in-context learning abilities and reusable self-supervised pretrained AudioMAE and latent diffusion models. Experiments on the major benchmarks of text-to-audio, text-to-music, and text-to-speech demonstrate state-of-the-art or competitive performance against previous approaches. Our code, pretrained model, and demo are available at [this https URL](https://audioldm.github.io/audioldm2).*

-This pipeline was contributed by [sanchit-gandhi](https://huggingface.co/sanchit-gandhi). The original codebase can be found at [haoheliu/audioldm2](https://github.com/haoheliu/audioldm2).
+This pipeline was contributed by [sanchit-gandhi](https://huggingface.co/sanchit-gandhi) and [Nguyễn Công Tú Anh](https://github.com/tuanh123789). The original codebase can be 
+found at [haoheliu/audioldm2](https://github.com/haoheliu/audioldm2). 

 ## Tips

@@ -36,6 +37,8 @@ See table below for details on the three checkpoints:
 | [audioldm2](https://huggingface.co/cvssp/audioldm2)             | Text-to-audio | 350M            | 1.1B             | 1150k             |
 | [audioldm2-large](https://huggingface.co/cvssp/audioldm2-large) | Text-to-audio | 750M            | 1.5B             | 1150k             |
 | [audioldm2-music](https://huggingface.co/cvssp/audioldm2-music) | Text-to-music | 350M            | 1.1B             | 665k              |
+| [audioldm2-gigaspeech](https://huggingface.co/anhnct/audioldm2_gigaspeech) | Text-to-speech | 350M            | 1.1B             |10k              |
+| [audioldm2-ljspeech](https://huggingface.co/anhnct/audioldm2_ljspeech) | Text-to-speech | 350M            | 1.1B             |              |

 ### Constructing a prompt

@@ -53,7 +56,7 @@ See table below for details on the three checkpoints:
 * The quality of the generated waveforms can vary significantly based on the seed. Try generating with different seeds until you find a satisfactory generation.
 * Multiple waveforms can be generated in one go: set `num_waveforms_per_prompt` to a value greater than 1. Automatic scoring will be performed between the generated waveforms and prompt text, and the audios ranked from best to worst accordingly.

-The following example demonstrates how to construct good music generation using the aforementioned tips: [example](https://huggingface.co/docs/diffusers/main/en/api/pipelines/audioldm2#diffusers.AudioLDM2Pipeline.__call__.example).
+The following example demonstrates how to construct good music and speech generation using the aforementioned tips: [example](https://huggingface.co/docs/diffusers/main/en/api/pipelines/audioldm2#diffusers.AudioLDM2Pipeline.__call__.example).

 <Tip>

@@ -1,3 +1,15 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
 # ControlNet-XS

 ControlNet-XS was introduced in [ControlNet-XS](https://vislearn.github.io/ControlNet-XS/) by Denis Zavadski and Carsten Rother. It is based on the observation that the control model in the [original ControlNet](https://huggingface.co/papers/2302.05543) can be made much smaller and still produce good results.
@@ -12,5 +24,16 @@ Here's the overview from the [project page](https://vislearn.github.io/ControlNe

 This model was contributed by [UmerHA](https://twitter.com/UmerHAdil). ❤️

+<Tip>

-> 🧠 Make sure to check out the Schedulers [guide](https://huggingface.co/docs/diffusers/main/en/using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](https://huggingface.co/docs/diffusers/main/en/using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
+Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
+
+</Tip>
+
+## StableDiffusionControlNetXSPipeline
+[[autodoc]] StableDiffusionControlNetXSPipeline
+	- all
+	- __call__
+
+## StableDiffusionPipelineOutput
+[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput
@@ -1,3 +1,15 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
 # ControlNet-XS with Stable Diffusion XL

 ControlNet-XS was introduced in [ControlNet-XS](https://vislearn.github.io/ControlNet-XS/) by Denis Zavadski and Carsten Rother. It is based on the observation that the control model in the [original ControlNet](https://huggingface.co/papers/2302.05543) can be made much smaller and still produce good results.
@@ -12,4 +24,22 @@ Here's the overview from the [project page](https://vislearn.github.io/ControlNe

 This model was contributed by [UmerHA](https://twitter.com/UmerHAdil). ❤️

-> 🧠 Make sure to check out the Schedulers [guide](https://huggingface.co/docs/diffusers/main/en/using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](https://huggingface.co/docs/diffusers/main/en/using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
+<Tip warning={true}>
+
+🧪 Many of the SDXL ControlNet checkpoints are experimental, and there is a lot of room for improvement. Feel free to open an [Issue](https://github.com/huggingface/diffusers/issues/new/choose) and leave us feedback on how we can improve!
+
+</Tip>
+
+<Tip>
+
+Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
+
+</Tip>
+
+## StableDiffusionXLControlNetXSPipeline
+[[autodoc]] StableDiffusionXLControlNetXSPipeline
+	- all
+	- __call__
+
+## StableDiffusionPipelineOutput
+[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput
@@ -10,9 +10,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# Text-to-Image Generation with Adapter Conditioning
-
-## Overview
+# T2I-Adapter

 [T2I-Adapter: Learning Adapters to Dig out More Controllable Ability for Text-to-Image Diffusion Models](https://arxiv.org/abs/2302.08453) by Chong Mou, Xintao Wang, Liangbin Xie, Jian Zhang, Zhongang Qi, Ying Shan, Xiaohu Qie.

@@ -24,236 +22,26 @@ The abstract of the paper is the following:

 This model was contributed by the community contributor [HimariO](https://github.com/HimariO) ❤️ .

-## Available Pipelines:
-
-| Pipeline | Tasks | Demo
-|---|---|:---:|
-| [StableDiffusionAdapterPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py) | *Text-to-Image Generation with T2I-Adapter Conditioning* | -
-| [StableDiffusionXLAdapterPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py) | *Text-to-Image Generation with T2I-Adapter Conditioning on StableDiffusion-XL* | -
-
-## Usage example with the base model of StableDiffusion-1.4/1.5
-
-In the following we give a simple example of how to use a *T2I-Adapter* checkpoint with Diffusers for inference based on StableDiffusion-1.4/1.5.
-All adapters use the same pipeline.
-
- 1. Images are first converted into the appropriate *control image* format.
- 2. The *control image* and *prompt* are passed to the [`StableDiffusionAdapterPipeline`].
-
-Let's have a look at a simple example using the [Color Adapter](https://huggingface.co/TencentARC/t2iadapter_color_sd14v1).
-
-```python
-from diffusers.utils import load_image, make_image_grid
-
-image = load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/color_ref.png")
-```
-
-![img](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/color_ref.png)
-
-
-Then we can create our color palette by simply resizing it to 8 by 8 pixels and then scaling it back to original size.
-
-```python
-from PIL import Image
-
-color_palette = image.resize((8, 8))
-color_palette = color_palette.resize((512, 512), resample=Image.Resampling.NEAREST)
-```
-
-Let's take a look at the processed image.
-
-![img](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/color_palette.png)
-
-
-Next, create the adapter pipeline
-
-```py
-import torch
-from diffusers import StableDiffusionAdapterPipeline, T2IAdapter
-
-adapter = T2IAdapter.from_pretrained("TencentARC/t2iadapter_color_sd14v1", torch_dtype=torch.float16)
-pipe = StableDiffusionAdapterPipeline.from_pretrained(
-    "CompVis/stable-diffusion-v1-4",
-    adapter=adapter,
-    torch_dtype=torch.float16,
-)
-pipe.to("cuda")
-```
-
-Finally, pass the prompt and control image to the pipeline
-
-```py
-# fix the random seed, so you will get the same result as the example
-generator = torch.Generator("cuda").manual_seed(7)
-
-out_image = pipe(
-    "At night, glowing cubes in front of the beach",
-    image=color_palette,
-    generator=generator,
-).images[0]
-make_image_grid([image, color_palette, out_image], rows=1, cols=3)
-```
-
-![img](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/color_output.png)
-
-## Usage example with the base model of StableDiffusion-XL
-
-In the following we give a simple example of how to use a *T2I-Adapter* checkpoint with Diffusers for inference based on StableDiffusion-XL.
-All adapters use the same pipeline.
-
- 1. Images are first downloaded into the appropriate *control image* format.
- 2. The *control image* and *prompt* are passed to the [`StableDiffusionXLAdapterPipeline`].
-
-Let's have a look at a simple example using the [Sketch Adapter](https://huggingface.co/Adapter/t2iadapter/tree/main/sketch_sdxl_1.0).
-
-```python
-from diffusers.utils import load_image, make_image_grid
-
-sketch_image = load_image("https://huggingface.co/Adapter/t2iadapter/resolve/main/sketch.png").convert("L")
-```
-
-![img](https://huggingface.co/Adapter/t2iadapter/resolve/main/sketch.png)
-
-Then, create the adapter pipeline
-
-```py
-import torch
-from diffusers import (
-    T2IAdapter,
-    StableDiffusionXLAdapterPipeline,
-    DDPMScheduler
-)
-
-model_id = "stabilityai/stable-diffusion-xl-base-1.0"
-adapter = T2IAdapter.from_pretrained("Adapter/t2iadapter", subfolder="sketch_sdxl_1.0", torch_dtype=torch.float16, adapter_type="full_adapter_xl")
-scheduler = DDPMScheduler.from_pretrained(model_id, subfolder="scheduler")
-
-pipe = StableDiffusionXLAdapterPipeline.from_pretrained(
-    model_id, adapter=adapter, safety_checker=None, torch_dtype=torch.float16, variant="fp16", scheduler=scheduler
-)
-
-pipe.to("cuda")
-```
-
-Finally, pass the prompt and control image to the pipeline
-
-```py
-# fix the random seed, so you will get the same result as the example
-generator = torch.Generator().manual_seed(42)
-
-sketch_image_out = pipe(
-    prompt="a photo of a dog in real world, high quality",
-    negative_prompt="extra digit, fewer digits, cropped, worst quality, low quality",
-    image=sketch_image,
-    generator=generator,
-    guidance_scale=7.5
-).images[0]
-make_image_grid([sketch_image, sketch_image_out], rows=1, cols=2)
-```
-
-![img](https://huggingface.co/Adapter/t2iadapter/resolve/main/sketch_output.png)
-
-## Available checkpoints
-
-Non-diffusers checkpoints can be found under [TencentARC/T2I-Adapter](https://huggingface.co/TencentARC/T2I-Adapter/tree/main/models).
-
-### T2I-Adapter with Stable Diffusion 1.4
-
-| Model Name | Control Image Overview| Control Image Example | Generated Image Example |
-|---|---|---|---|
-|[TencentARC/t2iadapter_color_sd14v1](https://huggingface.co/TencentARC/t2iadapter_color_sd14v1)<br/> *Trained with spatial color palette* | An image with 8x8 color palette.|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/color_sample_input.png"><img width="64" style="margin:0;padding:0;" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/color_sample_input.png"/></a>|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/color_sample_output.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/color_sample_output.png"/></a>|
-|[TencentARC/t2iadapter_canny_sd14v1](https://huggingface.co/TencentARC/t2iadapter_canny_sd14v1)<br/> *Trained with canny edge detection* | A monochrome image with white edges on a black background.|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/canny_sample_input.png"><img width="64" style="margin:0;padding:0;" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/canny_sample_input.png"/></a>|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/canny_sample_output.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/canny_sample_output.png"/></a>|
-|[TencentARC/t2iadapter_sketch_sd14v1](https://huggingface.co/TencentARC/t2iadapter_sketch_sd14v1)<br/> *Trained with [PidiNet](https://github.com/zhuoinoulu/pidinet) edge detection* | A hand-drawn monochrome image with white outlines on a black background.|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/sketch_sample_input.png"><img width="64" style="margin:0;padding:0;" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/sketch_sample_input.png"/></a>|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/sketch_sample_output.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/sketch_sample_output.png"/></a>|
-|[TencentARC/t2iadapter_depth_sd14v1](https://huggingface.co/TencentARC/t2iadapter_depth_sd14v1)<br/> *Trained with Midas depth estimation*  | A grayscale image with black representing deep areas and white representing shallow areas.|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/depth_sample_input.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/depth_sample_input.png"/></a>|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/depth_sample_output.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/depth_sample_output.png"/></a>|
-|[TencentARC/t2iadapter_openpose_sd14v1](https://huggingface.co/TencentARC/t2iadapter_openpose_sd14v1)<br/> *Trained with OpenPose bone image*  | A [OpenPose bone](https://github.com/CMU-Perceptual-Computing-Lab/openpose) image.|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/openpose_sample_input.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/openpose_sample_input.png"/></a>|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/openpose_sample_output.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/openpose_sample_output.png"/></a>|
-|[TencentARC/t2iadapter_keypose_sd14v1](https://huggingface.co/TencentARC/t2iadapter_keypose_sd14v1)<br/> *Trained with mmpose skeleton image*  | A [mmpose skeleton](https://github.com/open-mmlab/mmpose) image.|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_sample_input.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_sample_input.png"/></a>|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_sample_output.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_sample_output.png"/></a>|
-|[TencentARC/t2iadapter_seg_sd14v1](https://huggingface.co/TencentARC/t2iadapter_seg_sd14v1)<br/>*Trained with semantic segmentation*  | An [custom](https://github.com/TencentARC/T2I-Adapter/discussions/25) segmentation protocol image.|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/seg_sample_input.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/seg_sample_input.png"/></a>|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/seg_sample_output.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/seg_sample_output.png"/></a> |
-|[TencentARC/t2iadapter_canny_sd15v2](https://huggingface.co/TencentARC/t2iadapter_canny_sd15v2)||
-|[TencentARC/t2iadapter_depth_sd15v2](https://huggingface.co/TencentARC/t2iadapter_depth_sd15v2)||
-|[TencentARC/t2iadapter_sketch_sd15v2](https://huggingface.co/TencentARC/t2iadapter_sketch_sd15v2)||
-|[TencentARC/t2iadapter_zoedepth_sd15v1](https://huggingface.co/TencentARC/t2iadapter_zoedepth_sd15v1)||
-|[Adapter/t2iadapter, subfolder='sketch_sdxl_1.0'](https://huggingface.co/Adapter/t2iadapter/tree/main/sketch_sdxl_1.0)||
-|[Adapter/t2iadapter, subfolder='canny_sdxl_1.0'](https://huggingface.co/Adapter/t2iadapter/tree/main/canny_sdxl_1.0)||
-|[Adapter/t2iadapter, subfolder='openpose_sdxl_1.0'](https://huggingface.co/Adapter/t2iadapter/tree/main/openpose_sdxl_1.0)||
-
-## Combining multiple adapters
-
-[`MultiAdapter`] can be used for applying multiple conditionings at once.
-
-Here we use the keypose adapter for the character posture and the depth adapter for creating the scene.
-
-```py
-from diffusers.utils import load_image, make_image_grid
-
-cond_keypose = load_image(
-    "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_sample_input.png"
-)
-cond_depth = load_image(
-    "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/depth_sample_input.png"
-)
-cond = [cond_keypose, cond_depth]
-
-prompt = ["A man walking in an office room with a nice view"]
-```
-
-The two control images look as such:
-
-![img](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_sample_input.png)
-![img](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/depth_sample_input.png)
-
-
-`MultiAdapter` combines keypose and depth adapters.
-
-`adapter_conditioning_scale` balances the relative influence of the different adapters.
-
-```py
-import torch
-from diffusers import StableDiffusionAdapterPipeline, MultiAdapter, T2IAdapter
-
-adapters = MultiAdapter(
-    [
-        T2IAdapter.from_pretrained("TencentARC/t2iadapter_keypose_sd14v1"),
-        T2IAdapter.from_pretrained("TencentARC/t2iadapter_depth_sd14v1"),
-    ]
-)
-adapters = adapters.to(torch.float16)
-
-pipe = StableDiffusionAdapterPipeline.from_pretrained(
-    "CompVis/stable-diffusion-v1-4",
-    torch_dtype=torch.float16,
-    adapter=adapters,
-).to("cuda")
-
-image = pipe(prompt, cond, adapter_conditioning_scale=[0.8, 0.8]).images[0]
-make_image_grid([cond_keypose, cond_depth, image], rows=1, cols=3)
-```
-
-![img](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_depth_sample_output.png)
-
-
-## T2I-Adapter vs ControlNet
-
-T2I-Adapter is similar to [ControlNet](https://huggingface.co/docs/diffusers/main/en/api/pipelines/controlnet).
-T2I-Adapter uses a smaller auxiliary network which is only run once for the entire diffusion process.
-However, T2I-Adapter performs slightly worse than ControlNet.
-
 ## StableDiffusionAdapterPipeline
+
 [[autodoc]] StableDiffusionAdapterPipeline
-	- all
-	- __call__
-	- enable_attention_slicing
-	- disable_attention_slicing
-	- enable_vae_slicing
-	- disable_vae_slicing
-	- enable_xformers_memory_efficient_attention
-	- disable_xformers_memory_efficient_attention
+    - all
+    - __call__
+    - enable_attention_slicing
+    - disable_attention_slicing
+    - enable_vae_slicing
+    - disable_vae_slicing
+    - enable_xformers_memory_efficient_attention
+    - disable_xformers_memory_efficient_attention

 ## StableDiffusionXLAdapterPipeline
+
 [[autodoc]] StableDiffusionXLAdapterPipeline
-	- all
-	- __call__
-	- enable_attention_slicing
-	- disable_attention_slicing
-	- enable_vae_slicing
-	- disable_vae_slicing
-	- enable_xformers_memory_efficient_attention
-	- disable_xformers_memory_efficient_attention
+    - all
+    - __call__
+    - enable_attention_slicing
+    - disable_attention_slicing
+    - enable_vae_slicing
+    - disable_vae_slicing
+    - enable_xformers_memory_efficient_attention
+    - disable_xformers_memory_efficient_attention
@@ -0,0 +1,179 @@
+# T-GATE
+
+[T-GATE](https://github.com/HaozheLiu-ST/T-GATE/tree/main) accelerates inference for [Stable Diffusion](../api/pipelines/stable_diffusion/overview), [PixArt](../api/pipelines/pixart), and [Latent Consistency Model](../api/pipelines/latent_consistency_models.md) pipelines by skipping the cross-attention calculation once it converges. This method doesn't require any additional training and it can speed up inference by 10-50%. T-GATE is also compatible with other optimization methods like [DeepCache](./deepcache).
+
+Before you begin, make sure you install T-GATE.
+
+```bash
+pip install tgate
+pip install -U torch diffusers transformers accelerate DeepCache
+```
+
+
+To use T-GATE with a pipeline, you need to use its corresponding loader.
+
+| Pipeline | T-GATE Loader |
+|---|---|
+| PixArt | TgatePixArtLoader |
+| Stable Diffusion XL | TgateSDXLLoader |
+| Stable Diffusion XL + DeepCache | TgateSDXLDeepCacheLoader |
+| Stable Diffusion | TgateSDLoader |
+| Stable Diffusion + DeepCache | TgateSDDeepCacheLoader |
+
+Next, create a `TgateLoader` with a pipeline, the gate step (the time step to stop calculating the cross attention), and the number of inference steps. Then call the `tgate` method on the pipeline with a prompt, gate step, and the number of inference steps.
+
+Let's see how to enable this for several different pipelines.
+
+<hfoptions id="pipelines">
+<hfoption id="PixArt">
+
+Accelerate `PixArtAlphaPipeline` with T-GATE:
+
+```py
+import torch
+from diffusers import PixArtAlphaPipeline
+from tgate import TgatePixArtLoader
+
+pipe = PixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", torch_dtype=torch.float16)
+pipe = TgatePixArtLoader(
+       pipe,
+       gate_step=8,
+       num_inference_steps=25,
+).to("cuda")
+
+image = pipe.tgate(
+       "An alpaca made of colorful building blocks, cyberpunk.",
+       gate_step=8,
+       num_inference_steps=25,
+).images[0]
+```
+</hfoption>
+<hfoption id="Stable Diffusion XL"> 
+
+Accelerate `StableDiffusionXLPipeline` with T-GATE:
+
+```py
+import torch
+from diffusers import StableDiffusionXLPipeline
+from diffusers import DPMSolverMultistepScheduler
+
+pipe = StableDiffusionXLPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0",
+            torch_dtype=torch.float16,
+            variant="fp16",
+            use_safetensors=True,
+)
+pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+
+from tgate import TgateSDXLLoader
+gate_step = 10
+inference_step = 25
+pipe = TgateSDXLLoader(
+       pipe,
+       gate_step=gate_step,
+       num_inference_steps=inference_step,
+).to("cuda")
+
+image = pipe.tgate(
+        "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
+        gate_step=gate_step,
+        num_inference_steps=inference_step
+).images[0]
+```
+</hfoption>
+<hfoption id="StableDiffusionXL with DeepCache">
+
+Accelerate `StableDiffusionXLPipeline` with [DeepCache](https://github.com/horseee/DeepCache) and T-GATE:
+
+```py
+import torch
+from diffusers import StableDiffusionXLPipeline
+from diffusers import DPMSolverMultistepScheduler
+
+pipe = StableDiffusionXLPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0",
+            torch_dtype=torch.float16,
+            variant="fp16",
+            use_safetensors=True,
+)
+pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+
+from tgate import TgateSDXLDeepCacheLoader
+gate_step = 10
+inference_step = 25
+pipe = TgateSDXLDeepCacheLoader(
+       pipe,
+       cache_interval=3,
+       cache_branch_id=0,
+).to("cuda")
+
+image = pipe.tgate(
+        "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
+        gate_step=gate_step,
+        num_inference_steps=inference_step
+).images[0]
+```
+</hfoption>
+<hfoption id="Latent Consistency Model">
+
+Accelerate `latent-consistency/lcm-sdxl` with T-GATE:
+
+```py
+import torch
+from diffusers import StableDiffusionXLPipeline
+from diffusers import UNet2DConditionModel, LCMScheduler
+from diffusers import DPMSolverMultistepScheduler
+
+unet = UNet2DConditionModel.from_pretrained(
+    "latent-consistency/lcm-sdxl",
+    torch_dtype=torch.float16,
+    variant="fp16",
+)
+pipe = StableDiffusionXLPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    unet=unet,
+    torch_dtype=torch.float16,
+    variant="fp16",
+)
+pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
+
+from tgate import TgateSDXLLoader
+gate_step = 1
+inference_step = 4
+pipe = TgateSDXLLoader(
+       pipe,
+       gate_step=gate_step,
+       num_inference_steps=inference_step,
+       lcm=True
+).to("cuda")
+
+image = pipe.tgate(
+        "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
+        gate_step=gate_step,
+        num_inference_steps=inference_step
+).images[0]
+```
+</hfoption>
+</hfoptions>
+
+T-GATE also supports [`StableDiffusionPipeline`] and [PixArt-alpha/PixArt-LCM-XL-2-1024-MS](https://hf.co/PixArt-alpha/PixArt-LCM-XL-2-1024-MS).
+
+## Benchmarks
+| Model                 | MACs     | Param     | Latency | Zero-shot 10K-FID on MS-COCO |
+|-----------------------|----------|-----------|---------|---------------------------|
+| SD-1.5                | 16.938T  | 859.520M  | 7.032s  | 23.927                    |
+| SD-1.5 w/ T-GATE       | 9.875T   | 815.557M  | 4.313s  | 20.789                    |
+| SD-2.1                | 38.041T  | 865.785M  | 16.121s | 22.609                    |
+| SD-2.1 w/ T-GATE       | 22.208T  | 815.433M  | 9.878s  | 19.940                    |
+| SD-XL                 | 149.438T | 2.570B    | 53.187s | 24.628                    |
+| SD-XL w/ T-GATE        | 84.438T  | 2.024B    | 27.932s | 22.738                    |
+| Pixart-Alpha          | 107.031T | 611.350M  | 61.502s | 38.669                    |
+| Pixart-Alpha w/ T-GATE | 65.318T  | 462.585M  | 37.867s | 35.825                    |
+| DeepCache (SD-XL)     | 57.888T  | -         | 19.931s | 23.755                    |
+| DeepCache w/ T-GATE    | 43.868T  | -         | 14.666s | 23.999                    |
+| LCM (SD-XL)           | 11.955T  | 2.570B    | 3.805s  | 25.044                    |
+| LCM w/ T-GATE          | 11.171T  | 2.024B    | 3.533s  | 25.028                    |
+| LCM (Pixart-Alpha)    | 8.563T   | 611.350M  | 4.733s  | 36.086                    |
+| LCM w/ T-GATE          | 7.623T   | 462.585M  | 4.543s  | 37.048                    |
+
+The latency is tested on an NVIDIA 1080TI, MACs and Params are calculated with [calflops](https://github.com/MrYxJ/calculate-flops.pytorch), and the FID is calculated with [PytorchFID](https://github.com/mseitzer/pytorch-fid).
@@ -88,7 +88,7 @@ accelerate config default

 Or if your environment doesn't support an interactive shell, like a notebook, you can use:

-```bash
+```py
 from accelerate.utils import write_basic_config

 write_basic_config()
@@ -54,7 +54,7 @@ accelerate config default

 Or if your environment doesn't support an interactive shell, like a notebook, you can use:

-```bash
+```py
 from accelerate.utils import write_basic_config

 write_basic_config()
@@ -84,7 +84,7 @@ Many of the basic parameters are described in the [DreamBooth](dreambooth#script
 - `--freeze_model`: freezes the key and value parameters in the cross-attention layer; the default is `crossattn_kv`, but you can set it to `crossattn` to train all the parameters in the cross-attention layer
 - `--concepts_list`: to learn multiple concepts, provide a path to a JSON file containing the concepts
 - `--modifier_token`: a special word used to represent the learned concept
- `--initializer_token`:
+- `--initializer_token`: a special word used to initialize the embeddings of the `modifier_token`

 ### Prior preservation loss

@@ -52,6 +52,76 @@ To learn more, take a look at the [Distributed Inference with 🤗 Accelerate](h

 </Tip>

+### Device placement
+
+> [!WARNING]
+> This feature is experimental and its APIs might change in the future. 
+
+With Accelerate, you can use the `device_map` to determine how to distribute the models of a pipeline across multiple devices. This is useful in situations where you have more than one GPU.
+
+For example, if you have two 8GB GPUs, then using [`~DiffusionPipeline.enable_model_cpu_offload`] may not work so well because:
+
+* it only works on a single GPU
+* a single model might not fit on a single GPU ([`~DiffusionPipeline.enable_sequential_cpu_offload`] might work but it will be extremely slow and it is also limited to a single GPU)
+
+To make use of both GPUs, you can use the "balanced" device placement strategy which splits the models across all available GPUs.
+
+> [!WARNING]
+> Only the "balanced" strategy is supported at the moment, and we plan to support additional mapping strategies in the future.
+
+```diff
+from diffusers import DiffusionPipeline
+import torch
+
+pipeline = DiffusionPipeline.from_pretrained(
+-    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True,
++    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True, device_map="balanced"
+)
+image = pipeline("a dog").images[0]
+image
+```
+
+You can also pass a dictionary to enforce the maximum GPU memory that can be used on each device:
+
+```diff
+from diffusers import DiffusionPipeline
+import torch
+
+max_memory = {0:"1GB", 1:"1GB"}
+pipeline = DiffusionPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5",
+    torch_dtype=torch.float16, 
+    use_safetensors=True, 
+    device_map="balanced",
+   max_memory=max_memory
+)
+image = pipeline("a dog").images[0]
+image
+```
+
+If a device is not present in `max_memory`, then it will be completely ignored and will not participate in the device placement. 
+
+By default, Diffusers uses the maximum memory of all devices. If the models don't fit on the GPUs, they are offloaded to the CPU. If the CPU doesn't have enough memory, then you might see an error. In that case, you could defer to using [`~DiffusionPipeline.enable_sequential_cpu_offload`] and [`~DiffusionPipeline.enable_model_cpu_offload`].
+
+Call [`~DiffusionPipeline.reset_device_map`] to reset the `device_map` of a pipeline. This is also necessary if you want to use methods like `to()`, [`~DiffusionPipeline.enable_sequential_cpu_offload`], and [`~DiffusionPipeline.enable_model_cpu_offload`] on a pipeline that was device-mapped.
+
+```py
+pipeline.reset_device_map()
+```
+
+Once a pipeline has been device-mapped, you can also access its device map via `hf_device_map`:
+
+```py
+print(pipeline.hf_device_map)
+```
+
+An example device map would look like so:
+
+
+```bash
+{'unet': 1, 'vae': 1, 'safety_checker': 0, 'text_encoder': 0}
+```
+
 ## PyTorch Distributed

 PyTorch supports [`DistributedDataParallel`](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) which enables data parallelism.
@@ -67,7 +67,7 @@ accelerate config default

 Or if your environment doesn't support an interactive shell, like a notebook, you can use:

-```bash
+```py
 from accelerate.utils import write_basic_config

 write_basic_config()
@@ -180,7 +180,7 @@ elif args.pretrained_model_name_or_path:
        revision=args.revision,
        use_fast=False,
    )
-    
+
 # Load scheduler and models
 noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
 text_encoder = text_encoder_cls.from_pretrained(
@@ -51,7 +51,7 @@ accelerate config default

 Or if your environment doesn't support an interactive shell, like a notebook, you can use:

-```bash
+```py
 from accelerate.utils import write_basic_config

 write_basic_config()
@@ -89,7 +89,7 @@ The dataset preprocessing code and training loop are found in the [`main()`](htt

 As with the script parameters, a walkthrough of the training script is provided in the [Text-to-image](text2image#training-script) training guide. Instead, this guide takes a look at the InstructPix2Pix relevant parts of the script.

-The script begins by modifing the [number of input channels](https://github.com/huggingface/diffusers/blob/64603389da01082055a901f2883c4810d1144edb/examples/instruct_pix2pix/train_instruct_pix2pix.py#L445) in the first convolutional layer of the UNet to account for InstructPix2Pix's additional conditioning image:
+The script begins by modifying the [number of input channels](https://github.com/huggingface/diffusers/blob/64603389da01082055a901f2883c4810d1144edb/examples/instruct_pix2pix/train_instruct_pix2pix.py#L445) in the first convolutional layer of the UNet to account for InstructPix2Pix's additional conditioning image:

 ```py
 in_channels = 8
@@ -59,7 +59,7 @@ accelerate config default

 Or if your environment doesn't support an interactive shell, like a notebook, you can use:

-```bash
+```py
 from accelerate.utils import write_basic_config

 write_basic_config()
@@ -235,7 +235,7 @@ accelerate launch --mixed_precision="fp16"  train_text_to_image_prior.py \
  --validation_prompts="A robot pokemon, 4k photo" \
  --report_to="wandb" \
  --push_to_hub \
-  --output_dir="kandi2-prior-pokemon-model" 
+  --output_dir="kandi2-prior-pokemon-model"
 ```

 </hfoption>
@@ -259,7 +259,7 @@ accelerate launch --mixed_precision="fp16"  train_text_to_image_decoder.py \
  --validation_prompts="A robot pokemon, 4k photo" \
  --report_to="wandb" \
  --push_to_hub \
-  --output_dir="kandi2-decoder-pokemon-model" 
+  --output_dir="kandi2-decoder-pokemon-model"
 ```

 </hfoption>
@@ -53,7 +53,7 @@ accelerate config default

 Or if your environment doesn't support an interactive shell, like a notebook, you can use:

-```bash
+```py
 from accelerate.utils import write_basic_config

 write_basic_config()
@@ -252,4 +252,4 @@ The SDXL training script is discussed in more detail in the [SDXL training](sdxl
 Congratulations on distilling a LCM model! To learn more about LCM, the following may be helpful:

 - Learn how to use [LCMs for inference](../using-diffusers/lcm) for text-to-image, image-to-image, and with LoRA checkpoints.
- Read the [SDXL in 4 steps with Latent Consistency LoRAs](https://huggingface.co/blog/lcm_lora) blog post to learn more about SDXL LCM-LoRA's for super fast inference, quality comparisons, benchmarks, and more.
+- Read the [SDXL in 4 steps with Latent Consistency LoRAs](https://huggingface.co/blog/lcm_lora) blog post to learn more about SDXL LCM-LoRA's for super fast inference, quality comparisons, benchmarks, and more.
@@ -59,7 +59,7 @@ accelerate config default

 Or if your environment doesn't support an interactive shell, like a notebook, you can use:

-```bash
+```py
 from accelerate.utils import write_basic_config

 write_basic_config()
@@ -53,7 +53,7 @@ accelerate config default

 Or if your environment doesn't support an interactive shell, like a notebook, you can use:

-```bash
+```py
 from accelerate.utils import write_basic_config

 write_basic_config()
@@ -69,7 +69,7 @@ accelerate config default

 Or if your environment doesn't support an interactive shell, like a notebook, you can use:

-```bash
+```py
 from accelerate.utils import write_basic_config

 write_basic_config()
@@ -67,7 +67,7 @@ accelerate config default

 Or if your environment doesn't support an interactive shell, like a notebook, you can use:

-```bash
+```py
 from accelerate.utils import write_basic_config

 write_basic_config()
@@ -51,7 +51,7 @@ accelerate config default

 Or if your environment doesn't support an interactive shell like a notebook, you can use:

-```bash
+```py
 from accelerate.utils import write_basic_config

 write_basic_config()
@@ -53,7 +53,7 @@ accelerate config default

 Or if your environment doesn't support an interactive shell, like a notebook, you can use:

-```bash
+```py
 from accelerate.utils import write_basic_config

 write_basic_config()
@@ -173,7 +173,7 @@ pipeline = AutoPipelineForText2Image.from_pretrained("path/to/saved/model", torc

 caption = "A cute bird pokemon holding a shield"
 images = pipeline(
-    caption, 
+    caption,
    width=1024,
    height=1536,
    prior_timesteps=DEFAULT_STAGE_C_TIMESTEPS,
@@ -45,7 +45,7 @@ Make sure to include the token `toy_face` in the prompt and then you can perform
 ```python
 prompt = "toy_face of a hacker with a hoodie"

-lora_scale= 0.9
+lora_scale = 0.9
 image = pipe(
    prompt, num_inference_steps=30, cross_attention_kwargs={"scale": lora_scale}, generator=torch.manual_seed(0)
 ).images[0]
@@ -114,7 +114,7 @@ To return to only using one adapter, use the [`~diffusers.loaders.UNet2DConditio
 pipe.set_adapters("toy")

 prompt = "toy_face of a hacker with a hoodie"
-lora_scale= 0.9
+lora_scale = 0.9
 image = pipe(
    prompt, num_inference_steps=30, cross_attention_kwargs={"scale": lora_scale}, generator=torch.manual_seed(0)
 ).images[0]
@@ -127,11 +127,68 @@ Or to disable all adapters entirely, use the [`~diffusers.loaders.UNet2DConditio
 pipe.disable_lora()

 prompt = "toy_face of a hacker with a hoodie"
-lora_scale= 0.9
 image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
 image
 ```

+![no-lora](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peft_integration/diffusers_peft_lora_inference_20_1.png)
+
+### Customize adapters strength
+For even more customization, you can control how strongly the adapter affects each part of the pipeline. For this, pass a dictionary with the control strengths (called "scales") to [`~diffusers.loaders.UNet2DConditionLoadersMixin.set_adapters`].
+
+For example, here's how you can turn on the adapter for the `down` parts, but turn it off for the `mid` and `up` parts:
+```python
+pipe.enable_lora()  # enable lora again, after we disabled it above
+prompt = "toy_face of a hacker with a hoodie, pixel art"
+adapter_weight_scales = { "unet": { "down": 1, "mid": 0, "up": 0} }
+pipe.set_adapters("pixel", adapter_weight_scales)
+image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
+image
+```
+
+![block-lora-text-and-down](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peft_integration/diffusers_peft_lora_inference_block_down.png)
+
+Let's see how turning off the `down` part and turning on the `mid` and `up` part respectively changes the image.
+```python
+adapter_weight_scales = { "unet": { "down": 0, "mid": 1, "up": 0} }
+pipe.set_adapters("pixel", adapter_weight_scales)
+image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
+image
+```
+
+![block-lora-text-and-mid](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peft_integration/diffusers_peft_lora_inference_block_mid.png)
+
+```python
+adapter_weight_scales = { "unet": { "down": 0, "mid": 0, "up": 1} }
+pipe.set_adapters("pixel", adapter_weight_scales)
+image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
+image
+```
+
+![block-lora-text-and-up](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peft_integration/diffusers_peft_lora_inference_block_up.png)
+
+Looks cool!
+
+This is a really powerful feature. You can use it to control the adapter strengths down to per-transformer level. And you can even use it for multiple adapters.
+```python
+adapter_weight_scales_toy = 0.5
+adapter_weight_scales_pixel = {
+    "unet": {
+        "down": 0.9,  # all transformers in the down-part will use scale 0.9
+        # "mid"  # because, in this example, "mid" is not given, all transformers in the mid part will use the default scale 1.0
+        "up": {
+            "block_0": 0.6,  # all 3 transformers in the 0th block in the up-part will use scale 0.6
+            "block_1": [0.4, 0.8, 1.0],  # the 3 transformers in the 1st block in the up-part will use scales 0.4, 0.8 and 1.0 respectively
+        }
+    }
+}
+pipe.set_adapters(["toy", "pixel"], [adapter_weight_scales_toy, adapter_weight_scales_pixel])
+image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
+image
+```
+
+![block-lora-mixed](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peft_integration/diffusers_peft_lora_inference_block_mixed.png)
+
 ## Manage active adapters

 You have attached multiple adapters in this tutorial, and if you're feeling a bit lost on what adapters have been attached to the pipeline's components, use the [`~diffusers.loaders.LoraLoaderMixin.get_active_adapters`] method to check the list of active adapters:
@@ -148,9 +148,9 @@ pipeline = AutoPipelineForText2Image.from_pretrained(
    use_safetensors=True
 ).to("cuda")

-image = pipe(
-    prompt = "A croissant shaped like a cute bear."
-    negative_prompt = "Deformed, ugly, bad anatomy"
+image = pipeline(
+    prompt="A croissant shaped like a cute bear.",
+    negative_prompt="Deformed, ugly, bad anatomy",
    callback_on_step_end=decode_tensors,
    callback_on_step_end_tensor_inputs=["latents"],
 ).images[0]
@@ -239,5 +239,7 @@ pipeline.to("cuda")
 prompt = "柴犬、カラフルアート"

 image = pipeline(prompt=prompt).images[0]
+```

-```
+> [!TIP]
+> When using `trust_remote_code=True`, it is also strongly encouraged to pass a commit hash as a `revision` to make sure the author of the models did not update the code with some malicious new lines (unless you fully trust the authors of the models).
@@ -60,6 +60,23 @@ repo_id = "runwayml/stable-diffusion-v1-5"
 pipe = StableDiffusionImg2ImgPipeline.from_pretrained(repo_id)
 ```

+You can use the Space below to gauge the memory requirements of a pipeline you want to load beforehand without downloading the pipeline checkpoints:
+
+<div class="block dark:hidden">
+	<iframe 
+        src="https://diffusers-compute-pipeline-size.hf.space?__theme=light"
+        width="850"
+        height="1600"
+    ></iframe>
+</div>
+<div class="hidden dark:block">
+    <iframe 
+        src="https://diffusers-compute-pipeline-size.hf.space?__theme=dark"
+        width="850"
+        height="1600"
+    ></iframe>
+</div>
+
 ### Local pipeline

 To load a diffusion pipeline locally, use [`git-lfs`](https://git-lfs.github.com/) to manually download the checkpoint (in this case, [`runwayml/stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5)) to your local disk. This creates a local folder, `./stable-diffusion-v1-5`, on your disk:
@@ -162,6 +179,210 @@ stable_diffusion_img2img = StableDiffusionImg2ImgPipeline(
 )
 ```

+### Switch loaded pipelines
+
+There are many diffuser pipelines that use the same pre-trained model as [`StableDiffusionPipeline`] and [`StableDiffusionXLPipeline`], but they implement specific features to help you achieve better generation results. This guide will show you how to use the `from_pipe` API to create multiple pipelines without increasing memory usage. By using this approach, you can easily switch between pipelines to use different features.
+
+Let's take an example where we first create a [`StableDiffusionPipeline`] and then reuse the already loaded model components to create a [`StableDiffusionSAGPipeline`] to enhance generation quality.
+
+We will generate an image of a bear eating pizza using Stable Diffusion with the IP-Adapter.
+
+```python
+from diffusers import DiffusionPipeline, StableDiffusionSAGPipeline
+import torch
+import gc
+from diffusers.utils import load_image
+from accelerate.utils import compute_module_sizes
+
+base_repo = "SG161222/Realistic_Vision_V6.0_B1_noVAE"
+num_inference_steps = 50
+image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_neg_embed.png")
+prompt="bear eats pizza"
+negative_prompt = "wrong white balance, dark, sketches,worst quality,low quality"
+
+pipe_sd = DiffusionPipeline.from_pretrained(base_repo, torch_dtype=torch.float16)
+pipe_sd.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
+pipe_sd.set_ip_adapter_scale(0.6)
+pipe_sd.to("cuda")
+
+generator = torch.Generator(device="cpu").manual_seed(33)
+out_sd = pipe_sd(
+    prompt=prompt,
+    negative_prompt=negative_prompt, 
+    ip_adapter_image=image,
+    num_inference_steps=num_inference_steps,
+    generator=generator,
+).images[0]
+```
+
+Let's take a look at the image and also print out the memory used.
+
+<div class="flex justify-center">
+  <img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/from_pipe_out_sd_0.png"/>
+</div>
+
+```python
+def bytes_to_giga_bytes(bytes):
+    return bytes / 1024 / 1024 / 1024
+print(
+    f"Max memory allocated: {bytes_to_giga_bytes(torch.cuda.max_memory_allocated())} GB"
+)
+```
+
+```bash
+Max memory allocated: 4.406213283538818 GB
+```
+
+Now, we can use `from_pipe` to switch to the SAG pipeline. 
+
+```python
+pipe_sag = StableDiffusionSAGPipeline.from_pipe(
+    pipe_sd,
+)
+```
+
+It already has IP-Adapter loaded so that you can pass the same bear image as `ip_adapter_image`
+
+```python
+generator = torch.Generator(device="cpu").manual_seed(33)
+out_sag = pipe_sag(
+    prompt = prompt, 
+    negative_prompt=negative_prompt, 
+    ip_adapter_image=image,
+    num_inference_steps=num_inference_steps,
+    generator=generator,
+    guidance_scale=1.0,
+    sag_scale=0.75).images[0]
+```
+
+You can see a pretty nice improvement in the output
+
+<div class="flex justify-center">
+  <img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/from_pipe_out_sag_1.png"/>
+</div>
+
+Now we have both `StableDiffusionPipeline` and `StableDiffusionSAGPipeline` co-existing with the same loaded model components. You can use them interchangeably without additional memory.
+
+```python
+print(
+    f"Max memory allocated: {bytes_to_giga_bytes(torch.cuda.max_memory_allocated())} GB"
+)
+```
+
+```bash
+Max memory allocated: 4.406213283538818 GB
+```
+
+Let's unload the IP adapter from the SAG pipeline. It's important to note that methods like `load_ip_adapter` and `unload_ip_adapter` modify the state of the model components. Therefore, when you use these methods on one pipeline, it will affect all other pipelines that share the same model components.
+
+```python
+pipe_sag.unload_ip_adapter()
+```
+
+If you try to use the Stable Diffusion pipeline with IP adapter again, it will fail
+
+```python
+generator = torch.Generator(device="cpu").manual_seed(33)
+out_sd = pipe_sd(
+    prompt=prompt,
+    negative_prompt=negative_prompt, 
+    ip_adapter_image=image,
+    num_inference_steps=num_inference_steps,
+    generator=generator,
+).images[0]
+```
+
+```bash
+AttributeError: 'NoneType' object has no attribute 'image_projection_layers'
+```
+
+Please note that the pipeline methods may not function properly on a new pipeline created using the `from_pipe` method. For instance, the `enable_model_cpu_offload` method installs hooks to the model components based on a unique offloading sequence for each pipeline. Therefore, if the models are executed in a different order in the new pipeline, the CPU offloading may not work correctly.
+
+To ensure proper functionality, we recommend re-applying the pipeline methods on the new pipeline created using the `from_pipe` method.
+
+You can also add or subtract model components when you create new pipelines. Let's now create an AnimateDiff pipeline with an additional `MotionAdapter` module.
+
+```python
+from diffusers import AnimateDiffPipeline, MotionAdapter, DDIMScheduler
+from diffusers.utils import export_to_gif
+
+adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16)
+
+pipe_animate = AnimateDiffPipeline.from_pipe(pipe_sd, motion_adapter=adapter)
+pipe_animate.scheduler = DDIMScheduler.from_config(pipe_animate.scheduler.config, beta_schedule="linear")
+# load ip_adapter again and load lora weights
+pipe_animate.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
+pipe_animate.load_lora_weights("guoyww/animatediff-motion-lora-zoom-out", adapter_name="zoom-out")
+pipe_animate.to("cuda")
+
+generator = torch.Generator(device="cpu").manual_seed(33)
+pipe_animate.set_adapters("zoom-out", adapter_weights=0.75)
+out = pipe_animate(
+    prompt= prompt,
+    num_frames=16,
+    num_inference_steps=num_inference_steps,
+    ip_adapter_image = image,
+    generator=generator,
+).frames[0]
+export_to_gif(out, "out_animate.gif")
+```
+<div class="flex justify-center">
+  <img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/from_pipe_out_animate_3.gif"/>
+</div>
+
+
+When creating multiple pipelines using the `from_pipe` method, it is important to note that the memory requirement will be determined by the pipeline with the highest memory usage. This means that regardless of the number of pipelines you create, the total memory requirement will always be the same as the highest memory requirement among the pipelines.
+
+For example, if we have created three pipelines - `StableDiffusionPipeline`, `StableDiffusionSAGPipeline`, and `AnimateDiffPipeline` - and the `AnimateDiffPipeline` has the highest memory requirement, then the total memory usage will be based on the memory requirement of the `AnimateDiffPipeline`.
+
+Therefore, creating additional pipelines will not add up to the total memory requirement. Each pipeline can be used interchangeably without any additional memory overhead.
+
+
+Did you know that you can use `from_pipe` with a community pipeline? Let me show you an example of using a long negative prompt and prompt weighting!
+
+```python
+pipe_lpw = DiffusionPipeline.from_pipe(
+    pipe_sd,
+    custom_pipeline="lpw_stable_diffusion",
+).to("cuda")
+
+prompt = "best_quality (1girl:1.3) bow bride brown_hair closed_mouth frilled_bow frilled_hair_tubes frills (full_body:1.3) fox_ear hair_bow hair_tubes happy hood japanese_clothes kimono long_sleeves red_bow smile solo tabi uchikake white_kimono wide_sleeves cherry_blossoms"
+neg_prompt = "lowres, bad_anatomy, error_body, error_hair, error_arm, error_hands, bad_hands, error_fingers, bad_fingers, missing_fingers, error_legs, bad_legs, multiple_legs, missing_legs, error_lighting, error_shadow, error_reflection, text, error, extra_digit, fewer_digits, cropped, worst_quality, low_quality, normal_quality, jpeg_artifacts, signature, watermark, username, blurry"
+generator = torch.Generator(device="cpu").manual_seed(33)
+out_lpw = pipe_lpw.text2img(
+    prompt, 
+    negative_prompt=neg_prompt, 
+    width=512,height=512,
+    max_embeddings_multiples=3, 
+    num_inference_steps=num_inference_steps,
+    generator=generator,
+    ).images[0]
+```
+
+<div class="flex justify-center">
+  <img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/from_pipe_out_lpw_4.png"/>
+</div>
+
+Let's run `StableDiffusionPipeline` with the same inputs to compare. The result from the long prompt weighting pipeline is more aligned with the text prompt.
+
+```python
+generator = torch.Generator(device="cpu").manual_seed(33)
+out_sd = pipe_sd(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    generator=generator,
+    num_inference_steps=num_inference_steps,
+).images[0]
+out_sd
+```
+<div class="flex justify-center">
+  <img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/from_pipe_out_sd_5.png"/>
+</div>
+
+
+You can easily switch between different pipelines using the `from_pipe` method, similar to turning on and off a feature on your pipeline. To switch between tasks, you can use the `from_pipe` method with `AutoPipeline`, which automatically identifies the pipeline class based on the task. You can find more information about this feature at the [AutoPipe Guide](https://huggingface.co/docs/diffusers/tutorials/autopipeline).
+
+
 ## Checkpoint variants

 A checkpoint variant is usually a checkpoint whose weights are:
@@ -153,18 +153,43 @@ image
    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_attn_proc.png" />
 </div>

-<Tip>
-
-For both [`~loaders.LoraLoaderMixin.load_lora_weights`] and [`~loaders.UNet2DConditionLoadersMixin.load_attn_procs`], you can pass the `cross_attention_kwargs={"scale": 0.5}` parameter to adjust how much of the LoRA weights to use. A value of `0` is the same as only using the base model weights, and a value of `1` is equivalent to using the fully finetuned LoRA.
-
-</Tip>
-
 To unload the LoRA weights, use the [`~loaders.LoraLoaderMixin.unload_lora_weights`] method to discard the LoRA weights and restore the model to its original weights:

 ```py
 pipeline.unload_lora_weights()
 ```

+### Adjust LoRA weight scale
+
+For both [`~loaders.LoraLoaderMixin.load_lora_weights`] and [`~loaders.UNet2DConditionLoadersMixin.load_attn_procs`], you can pass the `cross_attention_kwargs={"scale": 0.5}` parameter to adjust how much of the LoRA weights to use. A value of `0` is the same as only using the base model weights, and a value of `1` is equivalent to using the fully finetuned LoRA.
+
+For more granular control on the amount of LoRA weights used per layer, you can use [`~loaders.LoraLoaderMixin.set_adapters`] and pass a dictionary specifying how much to scale the weights in each layer.
+```python
+pipe = ... # create pipeline
+pipe.load_lora_weights(..., adapter_name="my_adapter") 
+scales = {
+    "text_encoder": 0.5,
+    "text_encoder_2": 0.5,  # only usable if pipe has a 2nd text encoder
+    "unet": {
+        "down": 0.9,  # all transformers in the down-part will use scale 0.9
+        # "mid"  # in this example "mid" is not given, therefore all transformers in the mid part will use the default scale 1.0
+        "up": {
+            "block_0": 0.6,  # all 3 transformers in the 0th block in the up-part will use scale 0.6
+            "block_1": [0.4, 0.8, 1.0],  # the 3 transformers in the 1st block in the up-part will use scales 0.4, 0.8 and 1.0 respectively
+        }
+    }
+}
+pipe.set_adapters("my_adapter", scales)
+```
+
+This also works with multiple adapters - see [this guide](https://huggingface.co/docs/diffusers/tutorials/using_peft_for_inference#customize-adapters-strength) for how to do it.
+
+<Tip warning={true}>
+
+Currently, [`~loaders.LoraLoaderMixin.set_adapters`] only supports scaling attention weights. If a LoRA has other parts (e.g., resnets or down-/upsamplers), they will keep a scale of 1.0.
+
+</Tip>
+
 ### Kohya and TheLastBen

 Other popular LoRA trainers from the community include those by [Kohya](https://github.com/kohya-ss/sd-scripts/) and [TheLastBen](https://github.com/TheLastBen/fast-stable-diffusion). These trainers create different LoRA checkpoints than those trained by 🤗 Diffusers, but they can still be loaded in the same way.
@@ -21,7 +21,7 @@ This guide will show you how to use SVD to generate short videos from images.
 Before you begin, make sure you have the following libraries installed:

 ```py
-!pip install -q -U diffusers transformers accelerate 
+!pip install -q -U diffusers transformers accelerate
 ```

 There are two variants of this model, [SVD](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid) and [SVD-XT](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt). The SVD checkpoint is trained to generate 14 frames and the SVD-XT checkpoint is further finetuned to generate 25 frames.
@@ -86,7 +86,7 @@ Video generation is very memory intensive because you're essentially generating
 + frames = pipe(image, decode_chunk_size=2, generator=generator, num_frames=25).frames[0]
 ```

-Using all these tricks togethere should lower the memory requirement to less than 8GB VRAM.
+Using all these tricks together should lower the memory requirement to less than 8GB VRAM.

 ## Micro-conditioning

@@ -0,0 +1,219 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# T2I-Adapter
+
+[T2I-Adapter](https://hf.co/papers/2302.08453) is a lightweight adapter for controlling and providing more accurate
+structure guidance for text-to-image models. It works by learning an alignment between the internal knowledge of the
+text-to-image model and an external control signal, such as edge detection or depth estimation.
+
+The T2I-Adapter design is simple: the condition is passed to four feature extraction blocks and three downsample
+blocks. This makes it fast and easy to train different adapters for different conditions which can be plugged into the
+text-to-image model. T2I-Adapter is similar to [ControlNet](controlnet) except it is smaller (~77M parameters) and
+faster because it only runs once during the diffusion process. The downside is that performance may be slightly worse
+than ControlNet.
+
+This guide will show you how to use T2I-Adapter with different Stable Diffusion models and how you can compose multiple
+T2I-Adapters to impose more than one condition.
+
+> [!TIP]
+> There are several T2I-Adapters available for different conditions, such as color palette, depth, sketch, pose, and
+> segmentation. Check out the [TencentARC](https://hf.co/TencentARC) repository to try them out!
+
+Before you begin, make sure you have the following libraries installed.
+
+```py
+# uncomment to install the necessary libraries in Colab
+#!pip install -q diffusers accelerate controlnet-aux==0.0.7
+```
+
+## Text-to-image
+
+Text-to-image models rely on a prompt to generate an image, but sometimes, text alone may not be enough to provide more
+accurate structural guidance. T2I-Adapter allows you to provide an additional control image to guide the generation
+process. For example, you can provide a canny image (a white outline of an image on a black background) to guide the
+model to generate an image with a similar structure.
+
+<hfoptions id="stablediffusion">
+<hfoption id="Stable Diffusion 1.5">
+
+Create a canny image with the [opencv-library](https://github.com/opencv/opencv-python).
+
+```py
+import cv2
+import numpy as np
+from PIL import Image
+from diffusers.utils import load_image
+
+image = load_image("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png")
+image = np.array(image)
+
+low_threshold = 100
+high_threshold = 200
+
+image = cv2.Canny(image, low_threshold, high_threshold)
+image = Image.fromarray(image)
+```
+
+Now load a T2I-Adapter conditioned on [canny images](https://hf.co/TencentARC/t2iadapter_canny_sd15v2) and pass it to
+the [`StableDiffusionAdapterPipeline`].
+
+```py
+import torch
+from diffusers import StableDiffusionAdapterPipeline, T2IAdapter
+
+adapter = T2IAdapter.from_pretrained("TencentARC/t2iadapter_canny_sd15v2", torch_dtype=torch.float16)
+pipeline = StableDiffusionAdapterPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5",
+    adapter=adapter,
+    torch_dtype=torch.float16,
+)
+pipeline.to("cuda")
+```
+
+Finally, pass your prompt and control image to the pipeline.
+
+```py
+generator = torch.Generator("cuda").manual_seed(0)
+
+image = pipeline(
+    prompt="cinematic photo of a plush and soft midcentury style rug on a wooden floor, 35mm photograph, film, professional, 4k, highly detailed",
+    image=image,
+    generator=generator,
+).images[0]
+image
+```
+
+<div class="flex justify-center">
+  <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/t2i-sd1.5.png"/>
+</div>
+
+</hfoption>
+<hfoption id="Stable Diffusion XL">
+
+Create a canny image with the [controlnet-aux](https://github.com/huggingface/controlnet_aux) library.
+
+```py
+from controlnet_aux.canny import CannyDetector
+from diffusers.utils import load_image
+
+canny_detector = CannyDetector()
+
+image = load_image("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png")
+image = canny_detector(image, detect_resolution=384, image_resolution=1024)
+```
+
+Now load a T2I-Adapter conditioned on [canny images](https://hf.co/TencentARC/t2i-adapter-canny-sdxl-1.0) and pass it
+to the [`StableDiffusionXLAdapterPipeline`].
+
+```py
+import torch
+from diffusers import StableDiffusionXLAdapterPipeline, T2IAdapter, EulerAncestralDiscreteScheduler, AutoencoderKL
+
+scheduler = EulerAncestralDiscreteScheduler.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", subfolder="scheduler")
+vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
+adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16)
+pipeline = StableDiffusionXLAdapterPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    adapter=adapter,
+    vae=vae,
+    scheduler=scheduler,
+    torch_dtype=torch.float16,
+    variant="fp16",
+)
+pipeline.to("cuda")
+```
+
+Finally, pass your prompt and control image to the pipeline.
+
+```py
+generator = torch.Generator("cuda").manual_seed(0)
+
+image = pipeline(
+  prompt="cinematic photo of a plush and soft midcentury style rug on a wooden floor, 35mm photograph, film, professional, 4k, highly detailed",
+  image=image,
+  generator=generator,
+).images[0]
+image
+```
+
+<div class="flex justify-center">
+  <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/t2i-sdxl.png"/>
+</div>
+
+</hfoption>
+</hfoptions>
+
+## MultiAdapter
+
+T2I-Adapters are also composable, allowing you to use more than one adapter to impose multiple control conditions on an
+image. For example, you can use a pose map to provide structural control and a depth map for depth control. This is
+enabled by the [`MultiAdapter`] class.
+
+Let's condition a text-to-image model with a pose and depth adapter. Create your pose and depth images and place them in a list.
+
+```py
+from diffusers.utils import load_image
+
+pose_image = load_image(
+    "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_sample_input.png"
+)
+depth_image = load_image(
+    "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/depth_sample_input.png"
+)
+cond = [pose_image, depth_image]
+prompt = ["Santa Claus walking into an office room with a beautiful city view"]
+```
+
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/depth_sample_input.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">depth image</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_sample_input.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">pose image</figcaption>
+  </div>
+</div>
+
+Load the corresponding pose and depth adapters as a list in the [`MultiAdapter`] class.
+
+```py
+import torch
+from diffusers import StableDiffusionAdapterPipeline, MultiAdapter, T2IAdapter
+
+adapters = MultiAdapter(
+    [
+        T2IAdapter.from_pretrained("TencentARC/t2iadapter_keypose_sd14v1"),
+        T2IAdapter.from_pretrained("TencentARC/t2iadapter_depth_sd14v1"),
+    ]
+)
+adapters = adapters.to(torch.float16)
+```
+
+Finally, load a [`StableDiffusionAdapterPipeline`] with the adapters, and pass your prompt and conditioned images to
+it. Use the `adapter_conditioning_scale` parameter to adjust the weight of each adapter on the image.
+
+```py
+pipeline = StableDiffusionAdapterPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    torch_dtype=torch.float16,
+    adapter=adapters,
+).to("cuda")
+
+image = pipeline(prompt, cond, adapter_conditioning_scale=[0.7, 0.7]).images[0]
+image
+```
+
+<div class="flex justify-center">
+  <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/t2i-multi.png"/>
+</div>
@@ -10,10 +10,209 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# Prompt weighting
+# Prompt techniques

 [[open-in-colab]]

+Prompts are important because they describe what you want a diffusion model to generate. The best prompts are detailed, specific, and well-structured to help the model realize your vision. But crafting a great prompt takes time and effort and sometimes it may not be enough because language and words can be imprecise. This is where you need to boost your prompt with other techniques, such as prompt enhancing and prompt weighting, to get the results you want.
+
+This guide will show you how you can use these prompt techniques to generate high-quality images with less effort and adjust the weight of certain keywords in a prompt.
+
+## Prompt engineering
+
+> [!TIP]
+> This is not an exhaustive guide on prompt engineering, but it will help you understand the necessary parts of a good prompt. We encourage you to continue experimenting with different prompts and combine them in new ways to see what works best. As you write more prompts, you'll develop an intuition for what works and what doesn't!
+
+New diffusion models do a pretty good job of generating high-quality images from a basic prompt, but it is still important to create a well-written prompt to get the best results. Here are a few tips for writing a good prompt:
+
+1. What is the image *medium*? Is it a photo, a painting, a 3D illustration, or something else?
+2. What is the image *subject*? Is it a person, animal, object, or scene?
+3. What *details* would you like to see in the image? This is where you can get really creative and have a lot of fun experimenting with different words to bring your image to life. For example, what is the lighting like? What is the vibe and aesthetic? What kind of art or illustration style are you looking for? The more specific and precise words you use, the better the model will understand what you want to generate.
+
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/plain-prompt.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">"A photo of a banana-shaped couch in a living room"</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/detail-prompt.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">"A vibrant yellow banana-shaped couch sits in a cozy living room, its curve cradling a pile of colorful cushions. on the wooden floor, a patterned rug adds a touch of eclectic charm, and a potted plant sits in the corner, reaching towards the sunlight filtering through the windows"</figcaption>
+  </div>
+</div>
+
+## Prompt enhancing with GPT2
+
+Prompt enhancing is a technique for quickly improving prompt quality without spending too much effort constructing one. It uses a model like GPT2 pretrained on Stable Diffusion text prompts to automatically enrich a prompt with additional important keywords to generate high-quality images.
+
+The technique works by curating a list of specific keywords and forcing the model to generate those words to enhance the original prompt. This way, your prompt can be "a cat" and GPT2 can enhance the prompt to "cinematic film still of a cat basking in the sun on a roof in Turkey, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain quality sharp focus beautiful detailed intricate stunning amazing epic".
+
+> [!TIP]
+> You should also use an [*offset noise*](https://www.crosslabs.org//blog/diffusion-with-offset-noise) LoRA to improve the contrast in bright and dark images and create better lighting overall. This [LoRA](https://hf.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_offset_example-lora_1.0.safetensors) is available from [stabilityai/stable-diffusion-xl-base-1.0](https://hf.co/stabilityai/stable-diffusion-xl-base-1.0).
+
+Start by defining certain styles and a list of words (you can check out a more comprehensive list of [words](https://hf.co/LykosAI/GPT-Prompt-Expansion-Fooocus-v2/blob/main/positive.txt) and [styles](https://github.com/lllyasviel/Fooocus/tree/main/sdxl_styles) used by Fooocus) to enhance a prompt with.
+
+```py
+import torch
+from transformers import GenerationConfig, GPT2LMHeadModel, GPT2Tokenizer, LogitsProcessor, LogitsProcessorList
+from diffusers import StableDiffusionXLPipeline
+
+styles = {
+    "cinematic": "cinematic film still of {prompt}, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain",
+    "anime": "anime artwork of {prompt}, anime style, key visual, vibrant, studio anime, highly detailed",
+    "photographic": "cinematic photo of {prompt}, 35mm photograph, film, professional, 4k, highly detailed",
+    "comic": "comic of {prompt}, graphic illustration, comic art, graphic novel art, vibrant, highly detailed",
+    "lineart": "line art drawing {prompt}, professional, sleek, modern, minimalist, graphic, line art, vector graphics",
+    "pixelart": " pixel-art {prompt}, low-res, blocky, pixel art style, 8-bit graphics",
+}
+
+words = [
+    "aesthetic", "astonishing", "beautiful", "breathtaking", "composition", "contrasted", "epic", "moody", "enhanced",
+    "exceptional", "fascinating", "flawless", "glamorous", "glorious", "illumination", "impressive", "improved",
+    "inspirational", "magnificent", "majestic", "hyperrealistic", "smooth", "sharp", "focus", "stunning", "detailed",
+    "intricate", "dramatic", "high", "quality", "perfect", "light", "ultra", "highly", "radiant", "satisfying",
+    "soothing", "sophisticated", "stylish", "sublime", "terrific", "touching", "timeless", "wonderful", "unbelievable",
+    "elegant", "awesome", "amazing", "dynamic", "trendy",
+]
+```
+
+You may have noticed in the `words` list, there are certain words that can be paired together to create something more meaningful. For example, the words "high" and "quality" can be combined to create "high quality". Let's pair these words together and remove the words that can't be paired.
+
+```py
+word_pairs = ["highly detailed", "high quality", "enhanced quality", "perfect composition", "dynamic light"]
+
+def find_and_order_pairs(s, pairs):
+    words = s.split()
+    found_pairs = []
+    for pair in pairs:
+        pair_words = pair.split()
+        if pair_words[0] in words and pair_words[1] in words:
+            found_pairs.append(pair)
+            words.remove(pair_words[0])
+            words.remove(pair_words[1])
+
+    for word in words[:]:
+        for pair in pairs:
+            if word in pair.split():
+                words.remove(word)
+                break
+    ordered_pairs = ", ".join(found_pairs)
+    remaining_s = ", ".join(words)
+    return ordered_pairs, remaining_s
+```
+
+Next, implement a custom [`~transformers.LogitsProcessor`] class that assigns tokens in the `words` list a value of 0 and assigns tokens not in the `words` list a negative value so they aren't picked during generation. This way, generation is biased towards words in the `words` list. After a word from the list is used, it is also assigned a negative value so it isn't picked again.
+
+```py
+class CustomLogitsProcessor(LogitsProcessor):
+    def __init__(self, bias):
+        super().__init__()
+        self.bias = bias
+
+    def __call__(self, input_ids, scores):
+        if len(input_ids.shape) == 2:
+            last_token_id = input_ids[0, -1]
+            self.bias[last_token_id] = -1e10
+        return scores + self.bias
+
+word_ids = [tokenizer.encode(word, add_prefix_space=True)[0] for word in words]
+bias = torch.full((tokenizer.vocab_size,), -float("Inf")).to("cuda")
+bias[word_ids] = 0
+processor = CustomLogitsProcessor(bias)
+processor_list = LogitsProcessorList([processor])
+```
+
+Combine the prompt and the `cinematic` style prompt defined in the `styles` dictionary earlier.
+
+```py
+prompt = "a cat basking in the sun on a roof in Turkey"
+style = "cinematic"
+
+prompt = styles[style].format(prompt=prompt)
+prompt
+"cinematic film still of a cat basking in the sun on a roof in Turkey, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain"
+```
+
+Load a GPT2 tokenizer and model from the [Gustavosta/MagicPrompt-Stable-Diffusion](https://huggingface.co/Gustavosta/MagicPrompt-Stable-Diffusion) checkpoint (this specific checkpoint is trained to generate prompts) to enhance the prompt.
+
+```py
+tokenizer = GPT2Tokenizer.from_pretrained("Gustavosta/MagicPrompt-Stable-Diffusion")
+model = GPT2LMHeadModel.from_pretrained("Gustavosta/MagicPrompt-Stable-Diffusion", torch_dtype=torch.float16).to(
+    "cuda"
+)
+model.eval()
+
+inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+token_count = inputs["input_ids"].shape[1]
+max_new_tokens = 50 - token_count
+
+generation_config = GenerationConfig(
+    penalty_alpha=0.7,
+    top_k=50,
+    eos_token_id=model.config.eos_token_id,
+    pad_token_id=model.config.eos_token_id,
+    pad_token=model.config.pad_token_id,
+    do_sample=True,
+)
+
+with torch.no_grad():
+    generated_ids = model.generate(
+        input_ids=inputs["input_ids"],
+        attention_mask=inputs["attention_mask"],
+        max_new_tokens=max_new_tokens,
+        generation_config=generation_config,
+        logits_processor=processor_list,
+    )
+```
+
+Then you can combine the input prompt and the generated prompt. Feel free to take a look at what the generated prompt (`generated_part`) is, the word pairs that were found (`pairs`), and the remaining words (`words`). This is all packed together in the `enhanced_prompt`.
+
+```py
+output_tokens = [tokenizer.decode(generated_id, skip_special_tokens=True) for generated_id in generated_ids]
+input_part, generated_part = output_tokens[0][: len(prompt)], output_tokens[0][len(prompt) :]
+pairs, words = find_and_order_pairs(generated_part, word_pairs)
+formatted_generated_part = pairs + ", " + words
+enhanced_prompt = input_part + ", " + formatted_generated_part
+enhanced_prompt
+["cinematic film still of a cat basking in the sun on a roof in Turkey, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain quality sharp focus beautiful detailed intricate stunning amazing epic"]
+```
+
+Finally, load a pipeline and the offset noise LoRA with a *low weight* to generate an image with the enhanced prompt.
+
+```py
+pipeline = StableDiffusionXLPipeline.from_pretrained(
+    "RunDiffusion/Juggernaut-XL-v9", torch_dtype=torch.float16, variant="fp16"
+).to("cuda")
+
+pipeline.load_lora_weights(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    weight_name="sd_xl_offset_example-lora_1.0.safetensors",
+    adapter_name="offset",
+)
+pipeline.set_adapters(["offset"], adapter_weights=[0.2])
+
+image = pipeline(
+    enhanced_prompt,
+    width=1152,
+    height=896,
+    guidance_scale=7.5,
+    num_inference_steps=25,
+).images[0]
+image
+```
+
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/non-enhanced-prompt.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">"a cat basking in the sun on a roof in Turkey"</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/enhanced-prompt.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">"cinematic film still of a cat basking in the sun on a roof in Turkey, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain"</figcaption>
+  </div>
+</div>
+
+## Prompt weighting
+
 Prompt weighting provides a way to emphasize or de-emphasize certain parts of a prompt, allowing for more control over the generated image. A prompt can include several concepts, which get turned into contextualized text embeddings. The embeddings are used by the model to condition its cross-attention layers to generate an image (read the Stable Diffusion [blog post](https://huggingface.co/blog/stable_diffusion) to learn more about how it works).

 Prompt weighting works by increasing or decreasing the scale of the text embedding vector that corresponds to its concept in the prompt because you may not necessarily want the model to focus on all concepts equally. The easiest way to prepare the prompt-weighted embeddings is to use [Compel](https://github.com/damian0815/compel), a text prompt-weighting and blending library. Once you have the prompt-weighted embeddings, you can pass them to any pipeline that has a [`prompt_embeds`](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline.__call__.prompt_embeds) (and optionally [`negative_prompt_embeds`](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline.__call__.negative_prompt_embeds)) parameter, such as [`StableDiffusionPipeline`], [`StableDiffusionControlNetPipeline`], and [`StableDiffusionXLPipeline`].
@@ -55,7 +254,7 @@ image
  <img class="rounded-xl" src="https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/compel/forest_0.png"/>
 </div>

-## Weighting
+### Weighting

 You'll notice there is no "ball" in the image! Let's use compel to upweight the concept of "ball" in the prompt. Create a [`Compel`](https://github.com/damian0815/compel/blob/main/doc/compel.md#compel-objects) object, and pass it a tokenizer and text encoder:

@@ -123,7 +322,7 @@ image
  <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-pos-neg.png"/>
 </div>

-## Blending
+### Blending

 You can also create a weighted *blend* of prompts by adding `.blend()` to a list of prompts and passing it some weights. Your blend may not always produce the result you expect because it breaks some assumptions about how the text encoder functions, so just have fun and experiment with it!

@@ -139,7 +338,7 @@ image
  <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-blend.png"/>
 </div>

-## Conjunction
+### Conjunction

 A conjunction diffuses each prompt independently and concatenates their results by their weighted sum. Add `.and()` to the end of a list of prompts to create a conjunction:

@@ -155,7 +354,7 @@ image
  <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-conj.png"/>
 </div>

-## Textual inversion
+### Textual inversion

 [Textual inversion](../training/text_inversion) is a technique for learning a specific concept from some images which you can use to generate new images conditioned on that concept.

@@ -195,7 +394,7 @@ image
  <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-text-inversion.png"/>
 </div>

-## DreamBooth
+### DreamBooth

 [DreamBooth](../training/dreambooth) is a technique for generating contextualized images of a subject given just a few images of the subject to train on. It is similar to textual inversion, but DreamBooth trains the full model whereas textual inversion only fine-tunes the text embeddings. This means you should use [`~DiffusionPipeline.from_pretrained`] to load the DreamBooth model (feel free to browse the [Stable Diffusion Dreambooth Concepts Library](https://huggingface.co/sd-dreambooth-library) for 100+ trained models):

@@ -221,7 +420,7 @@ image
  <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-dreambooth.png"/>
 </div>

-## Stable Diffusion XL
+### Stable Diffusion XL

 Stable Diffusion XL (SDXL) has two tokenizers and text encoders so its usage is a bit different. To address this, you should pass both tokenizers and encoders to the `Compel` class:

@@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.

 # 메모리와 속도

-메모리 또는 속도에 대해 🤗 Diffusers *추론*을 최적화하기 위한 몇 가지 기술과 아이디어를 제시합니다. 
+메모리 또는 속도에 대해 🤗 Diffusers *추론*을 최적화하기 위한 몇 가지 기술과 아이디어를 제시합니다.
 일반적으로, memory-efficient attention을 위해 [xFormers](https://github.com/facebookresearch/xformers) 사용을 추천하기 때문에, 추천하는 [설치 방법](xformers)을 보고 설치해 보세요.

 다음 설정이 성능과 메모리에 미치는 영향에 대해 설명합니다.
@@ -27,7 +27,7 @@ specific language governing permissions and limitations under the License.
 | memory-efficient attention | 2.63s  | x3.61   |

 <em>
-   NVIDIA TITAN RTX에서 50 DDIM 스텝의 "a photo of an astronaut riding a horse on mars" 프롬프트로 512x512 크기의 단일 이미지를 생성하였습니다. 
+   NVIDIA TITAN RTX에서 50 DDIM 스텝의 "a photo of an astronaut riding a horse on mars" 프롬프트로 512x512 크기의 단일 이미지를 생성하였습니다.
 </em>

 ## cuDNN auto-tuner 활성화하기
@@ -44,11 +44,11 @@ torch.backends.cudnn.benchmark = True

 ### fp32 대신 tf32 사용하기  (Ampere 및 이후 CUDA 장치들에서)

-Ampere 및 이후 CUDA 장치에서 행렬곱 및 컨볼루션은 TensorFloat32(TF32) 모드를 사용하여 더 빠르지만 약간 덜 정확할 수 있습니다. 
-기본적으로 PyTorch는 컨볼루션에 대해 TF32 모드를 활성화하지만 행렬 곱셈은 활성화하지 않습니다. 
-네트워크에 완전한 float32 정밀도가 필요한 경우가 아니면 행렬 곱셈에 대해서도 이 설정을 활성화하는 것이 좋습니다. 
-이는 일반적으로 무시할 수 있는 수치의 정확도 손실이 있지만, 계산 속도를 크게 높일 수 있습니다. 
-그것에 대해 [여기](https://huggingface.co/docs/transformers/v4.18.0/en/performance#tf32)서 더 읽을 수 있습니다. 
+Ampere 및 이후 CUDA 장치에서 행렬곱 및 컨볼루션은 TensorFloat32(TF32) 모드를 사용하여 더 빠르지만 약간 덜 정확할 수 있습니다.
+기본적으로 PyTorch는 컨볼루션에 대해 TF32 모드를 활성화하지만 행렬 곱셈은 활성화하지 않습니다.
+네트워크에 완전한 float32 정밀도가 필요한 경우가 아니면 행렬 곱셈에 대해서도 이 설정을 활성화하는 것이 좋습니다.
+이는 일반적으로 무시할 수 있는 수치의 정확도 손실이 있지만, 계산 속도를 크게 높일 수 있습니다.
+그것에 대해 [여기](https://huggingface.co/docs/transformers/v4.18.0/en/performance#tf32)서 더 읽을 수 있습니다.
 추론하기 전에 다음을 추가하기만 하면 됩니다:

 ```python
@@ -59,13 +59,13 @@ torch.backends.cuda.matmul.allow_tf32 = True

 ## 반정밀도 가중치

-더 많은 GPU 메모리를 절약하고 더 빠른 속도를 얻기 위해 모델 가중치를 반정밀도(half precision)로 직접 불러오고 실행할 수 있습니다. 
+더 많은 GPU 메모리를 절약하고 더 빠른 속도를 얻기 위해 모델 가중치를 반정밀도(half precision)로 직접 불러오고 실행할 수 있습니다.
 여기에는 `fp16`이라는 브랜치에 저장된 float16 버전의 가중치를 불러오고, 그 때 `float16` 유형을 사용하도록 PyTorch에 지시하는 작업이 포함됩니다.

 ```Python
 pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
-    
+
    torch_dtype=torch.float16,
 )
 pipe = pipe.to("cuda")
@@ -75,7 +75,7 @@ image = pipe(prompt).images[0]
 ```

 <Tip warning={true}>
-  어떤 파이프라인에서도 [`torch.autocast`](https://pytorch.org/docs/stable/amp.html#torch.autocast) 를 사용하는 것은 검은색 이미지를 생성할 수 있고, 순수한 float16 정밀도를 사용하는 것보다 항상 느리기 때문에 사용하지 않는 것이 좋습니다. 
+  어떤 파이프라인에서도 [`torch.autocast`](https://pytorch.org/docs/stable/amp.html#torch.autocast) 를 사용하는 것은 검은색 이미지를 생성할 수 있고, 순수한 float16 정밀도를 사용하는 것보다 항상 느리기 때문에 사용하지 않는 것이 좋습니다.
 </Tip>

 ## 추가 메모리 절약을 위한 슬라이스 어텐션
@@ -95,7 +95,7 @@ from diffusers import StableDiffusionPipeline

 pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
-    
+
    torch_dtype=torch.float16,
 )
 pipe = pipe.to("cuda")
@@ -122,7 +122,7 @@ from diffusers import StableDiffusionPipeline

 pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
-    
+
    torch_dtype=torch.float16,
 )
 pipe = pipe.to("cuda")
@@ -148,7 +148,7 @@ from diffusers import StableDiffusionPipeline

 pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
-    
+
    torch_dtype=torch.float16,
 )

@@ -165,7 +165,7 @@ image = pipe(prompt).images[0]
 또 다른 최적화 방법인 <a href="#model_offloading">모델 오프로딩</a>을 사용하는 것을 고려하십시오. 이는 훨씬 빠르지만 메모리 절약이 크지는 않습니다.
 </Tip>

-또한 ttention slicing과 연결해서 최소 메모리(< 2GB)로도 동작할 수 있습니다. 
+또한 attention slicing과 연결해서 최소 메모리(< 2GB)로도 동작할 수 있습니다.


 ```Python
@@ -174,7 +174,7 @@ from diffusers import StableDiffusionPipeline

 pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
-    
+
    torch_dtype=torch.float16,
 )

@@ -204,7 +204,7 @@ import torch
 from diffusers import StableDiffusionPipeline

 pipe = StableDiffusionPipeline.from_pretrained(
-    "runwayml/stable-diffusion-v1-5",  
+    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
 )

@@ -355,7 +355,7 @@ unet_traced = torch.jit.load("unet_traced.pt")
 class TracedUNet(torch.nn.Module):
    def __init__(self):
        super().__init__()
-        self.in_channels = pipe.unet.in_channels
+        self.in_channels = pipe.unet.config.in_channels
        self.device = pipe.unet.device

    def forward(self, latent_model_input, t, encoder_hidden_states):
@@ -387,7 +387,7 @@ with torch.inference_mode():
 | A100-SXM4-40GB    	| 18.6it/s            	| 29.it/s                        	|
 | A100-SXM-80GB    	| 18.7it/s            	| 29.5it/s                        	|

-이를 활용하려면 다음을 만족해야 합니다: 
+이를 활용하려면 다음을 만족해야 합니다:
 - PyTorch > 1.12
 - Cuda 사용 가능
 - [xformers 라이브러리를 설치함](xformers)
@@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License.

 [[open-in-colab]]

-🧨 Diffusers는 사용자 친화적이며 유연한 도구 상자로, 사용사례에 맞게 diffusion 시스템을 구축 할 수 있도록 설계되었습니다. 이 도구 상자의 핵심은 모델과 스케줄러입니다. [`DiffusionPipeline`]은 편의를 위해 이러한 구성 요소를 번들로 제공하지만, 파이프라인을 분리하고 모델과 스케줄러를 개별적으로 사용해 새로운 diffusion 시스템을 만들 수도 있습니다. 
+🧨 Diffusers는 사용자 친화적이며 유연한 도구 상자로, 사용사례에 맞게 diffusion 시스템을 구축 할 수 있도록 설계되었습니다. 이 도구 상자의 핵심은 모델과 스케줄러입니다. [`DiffusionPipeline`]은 편의를 위해 이러한 구성 요소를 번들로 제공하지만, 파이프라인을 분리하고 모델과 스케줄러를 개별적으로 사용해 새로운 diffusion 시스템을 만들 수도 있습니다.

 이 튜토리얼에서는 기본 파이프라인부터 시작해 Stable Diffusion 파이프라인까지 진행하며 모델과 스케줄러를 사용해 추론을 위한 diffusion 시스템을 조립하는 방법을 배웁니다.

@@ -36,7 +36,7 @@ specific language governing permissions and limitations under the License.

 정말 쉽습니다. 그런데 파이프라인은 어떻게 이렇게 할 수 있었을까요? 파이프라인을 세분화하여 내부에서 어떤 일이 일어나고 있는지 살펴보겠습니다.

-위 예시에서 파이프라인에는 [`UNet2DModel`] 모델과 [`DDPMScheduler`]가 포함되어 있습니다. 파이프라인은 원하는 출력 크기의 랜덤 노이즈를 받아 모델을 여러번 통과시켜 이미지의 노이즈를 제거합니다. 각 timestep에서 모델은 *noise residual*을 예측하고 스케줄러는 이를 사용하여 노이즈가 적은 이미지를 예측합니다. 파이프라인은 지정된 추론 스텝수에 도달할 때까지 이 과정을 반복합니다. 
+위 예시에서 파이프라인에는 [`UNet2DModel`] 모델과 [`DDPMScheduler`]가 포함되어 있습니다. 파이프라인은 원하는 출력 크기의 랜덤 노이즈를 받아 모델을 여러번 통과시켜 이미지의 노이즈를 제거합니다. 각 timestep에서 모델은 *noise residual*을 예측하고 스케줄러는 이를 사용하여 노이즈가 적은 이미지를 예측합니다. 파이프라인은 지정된 추론 스텝수에 도달할 때까지 이 과정을 반복합니다.

 모델과 스케줄러를 별도로 사용하여 파이프라인을 다시 생성하기 위해 자체적인 노이즈 제거 프로세스를 작성해 보겠습니다.

@@ -210,7 +210,7 @@ Stable Diffusion 은 text-to-image *latent diffusion* 모델입니다. latent di

 ```py
 >>> latents = torch.randn(
-...     (batch_size, unet.in_channels, height // 8, width // 8),
+...     (batch_size, unet.config.in_channels, height // 8, width // 8),
 ...     generator=generator,
 ...     device=torch_device,
 ... )
@@ -42,7 +42,7 @@ Training examples show how to pretrain or fine-tune diffusion models for a varie
 | [**Dreambooth**](./dreambooth) | ✅ | - | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_dreambooth_training.ipynb)
 | [**ControlNet**](./controlnet) | ✅ | ✅ | -
 | [**InstructPix2Pix**](./instruct_pix2pix) | ✅ | ✅ | -
-| [**Reinforcement Learning for Control**](https://github.com/huggingface/diffusers/blob/main/examples/reinforcement_learning/run_diffusers_locomotion.py)                    | - | - | coming soon.
+| [**Reinforcement Learning for Control**](./reinforcement_learning)                    | - | - | coming soon.

 ## Community

@@ -308,6 +308,6 @@ accelerate launch train_dreambooth_lora_sdxl_advanced.py \
 Check out [these recommended practices](https://huggingface.co/blog/sdxl_lora_advanced_script#additional-good-practices)

 ## Running on Colab Notebook
-Check out [this notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/SDXL_DreamBooth_LoRA_advanced_example.ipynb). 
+Check out [this notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/SDXL_Dreambooth_LoRA_advanced_example.ipynb)
 to train using the advanced features (including pivotal tuning), and [this notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/SDXL_DreamBooth_LoRA_.ipynb) to train on a free colab, using some of the advanced features (excluding pivotal tuning)

@@ -23,6 +23,7 @@ import os
 import re
 import shutil
 import warnings
+from contextlib import nullcontext
 from pathlib import Path
 from typing import List, Optional

@@ -656,7 +657,6 @@ def parse_args(input_args=None):
    )
    parser.add_argument(
        "--use_dora",
-        type=bool,
        action="store_true",
        default=False,
        help=(
@@ -1845,7 +1845,12 @@ def main(args):
                generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
                pipeline_args = {"prompt": args.validation_prompt}

-                with torch.cuda.amp.autocast():
+                # Autocast is skipped on MPS; every other backend autocasts on the accelerator's device type.
+                # The `with` block must sit beside the if/else (not inside `else`) so validation images
+                # are generated on every backend.
+                if torch.backends.mps.is_available():
+                    autocast_ctx = nullcontext()
+                else:
+                    autocast_ctx = torch.autocast(accelerator.device.type)
+
+                with autocast_ctx:
                    images = [
                        pipeline(**pipeline_args, generator=generator).images[0]
                        for _ in range(args.num_validation_images)
@@ -14,7 +14,6 @@
 # See the License for the specific language governing permissions and

 import argparse
-import contextlib
 import gc
 import hashlib
 import itertools
@@ -26,6 +25,7 @@ import random
 import re
 import shutil
 import warnings
+from contextlib import nullcontext
 from pathlib import Path
 from typing import List, Optional

@@ -2192,13 +2192,12 @@ def main(args):
                # run inference
                generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
                pipeline_args = {"prompt": args.validation_prompt}
-                inference_ctx = (
-                    contextlib.nullcontext()
-                    if "playground" in args.pretrained_model_name_or_path
-                    else torch.cuda.amp.autocast()
-                )
+                if torch.backends.mps.is_available() or "playground" in args.pretrained_model_name_or_path:
+                    autocast_ctx = nullcontext()
+                else:
+                    autocast_ctx = torch.autocast(accelerator.device.type)

-                with inference_ctx:
+                with autocast_ctx:
                    images = [
                        pipeline(**pipeline_args, generator=generator).images[0]
                        for _ in range(args.num_validation_images)
@@ -430,6 +430,9 @@ def main(args):
        log_with=args.report_to,
        project_config=accelerator_project_config,
    )
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False

    if accelerator.is_main_process:
        os.makedirs(args.output_dir, exist_ok=True)
@@ -0,0 +1,232 @@
+# Community Scripts
+
+**Community scripts** consist of inference examples using Diffusers pipelines that have been added by the community. 
+Please have a look at the following table to get an overview of all community examples. Click on the **Code Example** to get a copy-and-paste code example that you can try out.
+If a community script doesn't work as expected, please open an issue and ping the author on it.
+
+| Example                                                                                                                               | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              | Code Example                                                                              | Colab                                                                                                                                                                                                              |                                                        Author |
+|:--------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------:|
+| Using IP-Adapter with negative noise                                                                                                  | Using negative noise with IP-adapter to better control the generation (see the [original post](https://github.com/huggingface/diffusers/discussions/7167) on the forum for more details)                                                                                                                                                                                                                                                    | [IP-Adapter Negative Noise](#ip-adapter-negative-noise)                                   | | [Álvaro Somoza](https://github.com/asomoza)|
+| Asymmetric Tiling                                                                                                  | Configure seamless image tiling independently for the X and Y axes                                                                                                                                                                                                      | [Asymmetric Tiling](#asymmetric-tiling)                                   | | [alexisrolland](https://github.com/alexisrolland)|
+
+
+## Example usages
+
+### IP Adapter Negative Noise
+
+Diffusers pipelines are fully integrated with IP-Adapter, which allows you to prompt the diffusion model with an image. However, it does not support negative image prompts (there is no `negative_ip_adapter_image` argument) the same way it supports negative text prompts. When you pass an `ip_adapter_image`, it will create a zero-filled tensor as a negative image. This script shows you how to create a negative noise from `ip_adapter_image` and use it to significantly improve the generation quality while preserving the composition of images.
+
+[cubiq](https://github.com/cubiq) initially developed this feature in his [repository](https://github.com/cubiq/ComfyUI_IPAdapter_plus). The community script was contributed by [asomoza](https://github.com/asomoza). You can find more details about this experimentation in [this discussion](https://github.com/huggingface/diffusers/discussions/7167).
+
+IP-Adapter without negative noise
+|source|result|
+|---|---|
+|![20240229150812](https://github.com/huggingface/diffusers/assets/5442875/901d8bd8-7a59-4fe7-bda1-a0e0d6c7dffd)|![20240229163923_normal](https://github.com/huggingface/diffusers/assets/5442875/3432e25a-ece6-45f4-a3f4-fca354f40b5b)|
+
+IP-Adapter with negative noise
+|source|result|
+|---|---|
+|![20240229150812](https://github.com/huggingface/diffusers/assets/5442875/901d8bd8-7a59-4fe7-bda1-a0e0d6c7dffd)|![20240229163923](https://github.com/huggingface/diffusers/assets/5442875/736fd15a-36ba-40c0-a7d8-6ec1ac26f788)|
+
+```python
+import torch
+
+from diffusers import AutoencoderKL, DPMSolverMultistepScheduler, StableDiffusionXLPipeline
+from diffusers.models import ImageProjection
+from diffusers.utils import load_image
+
+
+def encode_image(
+    image_encoder,
+    feature_extractor,
+    image,
+    device,
+    num_images_per_prompt,
+    output_hidden_states=None,
+    negative_image=None,
+):
+    """Encode an IP-Adapter image together with its negative (unconditional) counterpart.
+
+    Args:
+        image_encoder: Vision encoder called on pixel values. Its output must expose
+            `hidden_states` (when `output_hidden_states` is truthy) or `image_embeds`.
+        feature_extractor: Preprocessor applied to any input that is not already a `torch.Tensor`.
+        image: Conditioning image(s); a tensor, or anything `feature_extractor` accepts.
+        device: Device the pixel values are moved to before encoding.
+        num_images_per_prompt: Repeat count applied along dim 0 of both returned tensors.
+        output_hidden_states: When truthy, the penultimate hidden states (`hidden_states[-2]`)
+            are returned instead of `image_embeds`.
+        negative_image: Optional negative image. When `None`, the unconditional output is the
+            encoding of an all-zero image (hidden-state path) or an all-zero tensor (embeds path).
+
+    Returns:
+        A `(conditional, unconditional)` tuple of tensors.
+    """
+    dtype = next(image_encoder.parameters()).dtype
+
+    def _pixel_values(img):
+        # Non-tensor inputs go through the feature extractor; tensors are used as given.
+        if not isinstance(img, torch.Tensor):
+            img = feature_extractor(img, return_tensors="pt").pixel_values
+        return img.to(device=device, dtype=dtype)
+
+    image = _pixel_values(image)
+    if negative_image is not None:
+        negative_image = _pixel_values(negative_image)
+
+    if output_hidden_states:
+        image_enc_hidden_states = image_encoder(image, output_hidden_states=True).hidden_states[-2]
+        image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+
+        # Without a negative image, an all-zero image is encoded as the unconditional input.
+        uncond_input = torch.zeros_like(image) if negative_image is None else negative_image
+        uncond_image_enc_hidden_states = image_encoder(uncond_input, output_hidden_states=True).hidden_states[-2]
+        uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+        return image_enc_hidden_states, uncond_image_enc_hidden_states
+
+    image_embeds = image_encoder(image).image_embeds
+    image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+    if negative_image is None:
+        uncond_image_embeds = torch.zeros_like(image_embeds)
+    else:
+        # Honor the negative image on the `image_embeds` path too, instead of silently dropping it.
+        uncond_image_embeds = image_encoder(negative_image).image_embeds
+        uncond_image_embeds = uncond_image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+
+    return image_embeds, uncond_image_embeds
+
+
+@torch.no_grad()
+def prepare_ip_adapter_image_embeds(
+    unet,
+    image_encoder,
+    feature_extractor,
+    ip_adapter_image,
+    do_classifier_free_guidance,
+    device,
+    num_images_per_prompt,
+    ip_adapter_negative_image=None,
+):
+    """Build one embedding tensor per IP-Adapter loaded on `unet`.
+
+    `ip_adapter_image` is wrapped in a list when it is not one already, and must then contain
+    exactly one entry per layer in `unet.encoder_hid_proj.image_projection_layers`; otherwise a
+    `ValueError` is raised. Each entry is encoded with `encode_image` (hidden states are requested
+    for every projection layer that is not an `ImageProjection`), stacked `num_images_per_prompt`
+    times along a new leading dim and, under classifier-free guidance, prefixed with the negative
+    embeddings and moved to `device`.
+
+    Returns:
+        A list of tensors, in the same order as the projection layers.
+    """
+    adapter_images = ip_adapter_image if isinstance(ip_adapter_image, list) else [ip_adapter_image]
+    projection_layers = unet.encoder_hid_proj.image_projection_layers
+
+    if len(adapter_images) != len(projection_layers):
+        raise ValueError(
+            f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(adapter_images)} images and {len(projection_layers)} IP Adapters."
+        )
+
+    def _embed(adapter_image, projection_layer):
+        # Plain `ImageProjection` layers consume pooled embeds; any other layer type gets hidden states.
+        needs_hidden_states = not isinstance(projection_layer, ImageProjection)
+        positive, negative = encode_image(
+            image_encoder,
+            feature_extractor,
+            adapter_image,
+            device,
+            1,
+            needs_hidden_states,
+            negative_image=ip_adapter_negative_image,
+        )
+        positive = torch.stack([positive] * num_images_per_prompt, dim=0)
+        negative = torch.stack([negative] * num_images_per_prompt, dim=0)
+
+        if not do_classifier_free_guidance:
+            return positive
+        # Negative embeddings come first, followed by the positive ones.
+        return torch.cat([negative, positive]).to(device)
+
+    return [_embed(adapter_image, layer) for adapter_image, layer in zip(adapter_images, projection_layers)]
+
+
+# Load a standalone VAE in half precision and place it on the GPU.
+vae = AutoencoderKL.from_pretrained(
+    "madebyollin/sdxl-vae-fp16-fix",
+    torch_dtype=torch.float16,
+).to("cuda")
+
+# Build the SDXL pipeline around that VAE.
+pipeline = StableDiffusionXLPipeline.from_pretrained(
+    "RunDiffusion/Juggernaut-XL-v9",
+    torch_dtype=torch.float16,
+    vae=vae,
+    variant="fp16",
+).to("cuda")
+
+# Replace the scheduler with DPMSolverMultistep (reusing the existing config) and turn on Karras sigmas.
+pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
+pipeline.scheduler.config.use_karras_sigmas = True
+
+# Load the IP-Adapter weights and set the adapter scale.
+pipeline.load_ip_adapter(
+    "h94/IP-Adapter",
+    subfolder="sdxl_models",
+    weight_name="ip-adapter-plus_sdxl_vit-h.safetensors",
+    image_encoder_folder="models/image_encoder",
+)
+pipeline.set_ip_adapter_scale(0.7)
+
+# `source.png` is the image prompt and `noise.png` the negative image.
+# NOTE(review): both are presumably local files supplied by the user — confirm.
+ip_image = load_image("source.png")
+negative_ip_image = load_image("noise.png")
+
+# Pre-compute the IP-Adapter embeddings so the negative image can be injected;
+# the result is passed to the pipeline through `ip_adapter_image_embeds` below.
+image_embeds = prepare_ip_adapter_image_embeds(
+    unet=pipeline.unet,
+    image_encoder=pipeline.image_encoder,
+    feature_extractor=pipeline.feature_extractor,
+    ip_adapter_image=[[ip_image]],
+    do_classifier_free_guidance=True,
+    device="cuda",
+    num_images_per_prompt=1,
+    ip_adapter_negative_image=negative_ip_image,
+)
+
+
+prompt = "cinematic photo of a cyborg in the city, 4k, high quality, intricate, highly detailed"
+negative_prompt = "blurry, smooth, plastic"
+
+# Generate with a fixed CPU seed and keep the first image.
+image = pipeline(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    ip_adapter_image_embeds=image_embeds,
+    guidance_scale=6.0,
+    num_inference_steps=25,
+    generator=torch.Generator(device="cpu").manual_seed(1556265306),
+).images[0]
+
+image.save("result.png")
+```
+
+### Asymmetric Tiling
+Stable Diffusion is not trained to generate seamless textures. However, you can use this simple script to add tiling to your generation. This script is contributed by [alexisrolland](https://github.com/alexisrolland). See more details in [this issue](https://github.com/huggingface/diffusers/issues/556).
+
+
+|Generated|Tiled|
+|---|---|
+|![20240313003235_573631814](https://github.com/huggingface/diffusers/assets/5442875/eca174fb-06a4-464e-a3a7-00dbb024543e)|![wall](https://github.com/huggingface/diffusers/assets/5442875/b4aa774b-2a6a-4316-a8eb-8f30b5f4d024)|
+
+
+```py
+import torch
+from typing import Optional
+from diffusers import StableDiffusionPipeline
+from diffusers.models.lora import LoRACompatibleConv
+
+def seamless_tiling(pipeline, x_axis, y_axis):
+    """Patch every `torch.nn.Conv2d` in the pipeline's VAE, text encoder and UNet to pad per axis.
+
+    An axis whose flag is truthy is padded in `'circular'` mode; otherwise `'constant'` mode is used.
+    The pipeline is modified in place and returned.
+    """
+    x_mode = 'circular' if x_axis else 'constant'
+    y_mode = 'circular' if y_axis else 'constant'
+
+    def asymmetric_conv2d_convforward(self, input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None):
+        # Split the layer's own padding into an X-only and a Y-only tuple so each axis gets its own mode.
+        reversed_padding = self._reversed_padding_repeated_twice
+        self.paddingX = (reversed_padding[0], reversed_padding[1], 0, 0)
+        self.paddingY = (0, 0, reversed_padding[2], reversed_padding[3])
+        padded = torch.nn.functional.pad(input, self.paddingX, mode=x_mode)
+        padded = torch.nn.functional.pad(padded, self.paddingY, mode=y_mode)
+        # Padding was applied manually above, so the convolution itself runs with zero padding.
+        return torch.nn.functional.conv2d(
+            padded, weight, bias, self.stride, torch.nn.modules.utils._pair(0), self.dilation, self.groups
+        )
+
+    convolution_layers = [
+        module
+        for target in (pipeline.vae, pipeline.text_encoder, pipeline.unet)
+        for module in target.modules()
+        if isinstance(module, torch.nn.Conv2d)
+    ]
+
+    for conv in convolution_layers:
+        # LoRA-compatible convs without a LoRA layer get a stand-in callable that returns 0.
+        if isinstance(conv, LoRACompatibleConv) and conv.lora_layer is None:
+            conv.lora_layer = lambda *x: 0
+        # Bind the replacement forward to this specific layer instance.
+        conv._conv_forward = asymmetric_conv2d_convforward.__get__(conv, torch.nn.Conv2d)
+
+    return pipeline
+
+# Load Stable Diffusion v1.5 in half precision and enable model CPU offload.
+pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True)
+pipeline.enable_model_cpu_offload()
+prompt = ["texture of a red brick wall"]
+seed = 123456
+generator = torch.Generator(device='cuda').manual_seed(seed)
+
+# Switch the conv layers to circular padding on both axes before generating.
+pipeline = seamless_tiling(pipeline=pipeline, x_axis=True, y_axis=True)
+image = pipeline(
+    prompt=prompt,
+    width=512,
+    height=512,
+    num_inference_steps=20,
+    guidance_scale=7,
+    num_images_per_prompt=1,
+    generator=generator
+).images[0]
+# Switch both axes back to 'constant' padding once the image has been generated.
+seamless_tiling(pipeline=pipeline, x_axis=False, y_axis=False)
+
+torch.cuda.empty_cache()
+image.save('image.png')
+```
@@ -103,7 +103,7 @@ class CheckpointMergerPipeline(DiffusionPipeline):
        print(f"Combining with alpha={alpha}, interpolation mode={interp}")

        checkpoint_count = len(pretrained_model_name_or_path_list)
-        # Ignore result from model_index_json comparision of the two checkpoints
+        # Ignore result from model_index_json comparison of the two checkpoints
        force = kwargs.pop("force", False)

        # If less than 2 checkpoints, nothing to merge. If more than 3, not supported for now.
@@ -217,7 +217,7 @@ class CheckpointMergerPipeline(DiffusionPipeline):
                        ]
                        checkpoint_path_2 = files[0] if len(files) > 0 else None
                # For an attr if both checkpoint_path_1 and 2 are None, ignore.
-                # If atleast one is present, deal with it according to interp method, of course only if the state_dict keys match.
+                # If at least one is present, deal with it according to interp method, of course only if the state_dict keys match.
                if checkpoint_path_1 is None and checkpoint_path_2 is None:
                    print(f"Skipping {attr}: not present in 2nd or 3d model")
                    continue
@@ -0,0 +1,994 @@
+import math
+import numbers
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from diffusers.image_processor import PipelineImageInput
+from diffusers.models import AsymmetricAutoencoderKL, ImageProjection
+from diffusers.models.attention_processor import Attention, AttnProcessor
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint import (
+    StableDiffusionInpaintPipeline,
+    retrieve_timesteps,
+)
+from diffusers.utils import deprecate
+
+
+class RASGAttnProcessor:
+    """Attention processor that records pre-softmax attention scores at one chosen resolution.
+
+    The forward path follows the same steps as the default attention processor. When the layer's
+    resolution matches `scale_factor`, the raw (pre-softmax) similarity scores are kept in
+    `self.attention_scores` before the softmax is applied.
+    """
+
+    def __init__(self, mask, token_idx, scale_factor):
+        """Store the mask, token indices and the scale factor at which scores are captured.
+
+        `mask` and `token_idx` are only stored here; `__call__` does not read them.
+        """
+        self.attention_scores = None  # Stores the last output of the similarity matrix here. Each layer will get its own RASGAttnProcessor assigned
+        self.mask = mask
+        self.token_idx = token_idx
+        self.scale_factor = scale_factor
+        # Number of mask pixels (product of the last two mask dims); the attribute name's spelling is kept as-is.
+        self.mask_resoltuion = mask.shape[-1] * mask.shape[-2]  # 64 x 64 if the image is 512x512
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        temb: Optional[torch.FloatTensor] = None,
+        scale: float = 1.0,
+    ) -> torch.Tensor:
+        """Run attention through `attn`, saving pre-softmax scores when the resolution matches.
+
+        `scale` is accepted but not used in this method. Returns the attention output after the
+        output projection, dropout, optional residual connection and rescaling.
+        """
+        # Same as the default AttnProcessor up until the part where similarity matrix gets saved
+        # Ratio of mask pixels to `hidden_states.shape[1]`.
+        # NOTE(review): this is read before the 4D reshape below, so it presumably expects
+        # (batch, tokens, dim) input here — confirm.
+        downscale_factor = self.mask_resoltuion // hidden_states.shape[1]
+        residual = hidden_states
+
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        # 4D input is flattened to (batch, height * width, channel) and restored after attention.
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states)
+
+        # Without encoder states, keys and values are computed from the hidden states themselves.
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+
+        # Automatically recognize the resolution and save the attention similarity values
+        # We need to use the values before the softmax function, hence the rewritten get_attention_scores function.
+        if downscale_factor == self.scale_factor**2:
+            # `get_attention_scores` here is a free function that is not part of this class.
+            self.attention_scores = get_attention_scores(attn, query, key, attention_mask)
+            attention_probs = self.attention_scores.softmax(dim=-1)
+            attention_probs = attention_probs.to(query.dtype)
+        else:
+            attention_probs = attn.get_attention_scores(query, key, attention_mask)  # Original code
+
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class PAIntAAttnProcessor:
+    def __init__(self, transformer_block, mask, token_idx, do_classifier_free_guidance, scale_factors):
+        self.transformer_block = transformer_block  # Stores the parent transformer block.
+        self.mask = mask
+        self.scale_factors = scale_factors
+        self.do_classifier_free_guidance = do_classifier_free_guidance
+        self.token_idx = token_idx
+        self.shape = mask.shape[2:]
+        self.mask_resoltuion = mask.shape[-1] * mask.shape[-2]  # 64 x 64
+        self.default_processor = AttnProcessor()
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        temb: Optional[torch.FloatTensor] = None,
+        scale: float = 1.0,
+    ) -> torch.Tensor:
+        # Automatically recognize the resolution of the current attention layer and resize the masks accordingly
+        downscale_factor = self.mask_resoltuion // hidden_states.shape[1]
+
+        mask = None
+        for factor in self.scale_factors:
+            if downscale_factor == factor**2:
+                shape = (self.shape[0] // factor, self.shape[1] // factor)
+                mask = F.interpolate(self.mask, shape, mode="bicubic")  # B, 1, H, W
+                break
+        if mask is None:
+            return self.default_processor(attn, hidden_states, encoder_hidden_states, attention_mask, temb, scale)
+
+        # STARTS HERE
+        residual = hidden_states
+        # Save the input hidden_states for later use
+        input_hidden_states = hidden_states
+
+        # ================================================== #
+        # =============== SELF ATTENTION 1 ================= #
+        # ================================================== #
+
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+
+        # self_attention_probs = attn.get_attention_scores(query, key, attention_mask) # We can't use post-softmax attention scores in this case
+        self_attention_scores = get_attention_scores(
+            attn, query, key, attention_mask
+        )  # The custom function returns pre-softmax probabilities
+        self_attention_probs = self_attention_scores.softmax(
+            dim=-1
+        )  # Manually compute the probabilities here, the scores will be reused in the second part of PAIntA
+        self_attention_probs = self_attention_probs.to(query.dtype)
+
+        hidden_states = torch.bmm(self_attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        # x = x + self.attn1(self.norm1(x))
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:  # So many residuals everywhere
+            hidden_states = hidden_states + residual
+
+        self_attention_output_hidden_states = hidden_states / attn.rescale_output_factor
+
+        # ================================================== #
+        # ============ BasicTransformerBlock =============== #
+        # ================================================== #
+        # We use a hack by running the code from the BasicTransformerBlock that is between Self and Cross attentions here
+        # The other option would've been modifying the BasicTransformerBlock and adding this functionality here.
+        # I assumed that changing the BasicTransformerBlock would have been a bigger deal and decided to use this hack isntead.
+
+        # The SelfAttention block recieves the normalized latents from the BasicTransformerBlock,
+        # But the residual of the output is the non-normalized version.
+        # Therefore we unnormalize the input hidden state here
+        unnormalized_input_hidden_states = (
+            input_hidden_states + self.transformer_block.norm1.bias
+        ) * self.transformer_block.norm1.weight
+
+        # TODO: return if neccessary
+        # if self.use_ada_layer_norm_zero:
+        #     attn_output = gate_msa.unsqueeze(1) * attn_output
+        # elif self.use_ada_layer_norm_single:
+        #     attn_output = gate_msa * attn_output
+
+        transformer_hidden_states = self_attention_output_hidden_states + unnormalized_input_hidden_states
+        if transformer_hidden_states.ndim == 4:
+            transformer_hidden_states = transformer_hidden_states.squeeze(1)
+
+        # TODO: return if neccessary
+        # 2.5 GLIGEN Control
+        # if gligen_kwargs is not None:
+        #     transformer_hidden_states = self.fuser(transformer_hidden_states, gligen_kwargs["objs"])
+        # NOTE: we experimented with using GLIGEN and HDPainter together, the results were not that great
+
+        # 3. Cross-Attention
+        if self.transformer_block.use_ada_layer_norm:
+            # transformer_norm_hidden_states = self.transformer_block.norm2(transformer_hidden_states, timestep)
+            raise NotImplementedError()
+        elif self.transformer_block.use_ada_layer_norm_zero or self.transformer_block.use_layer_norm:
+            transformer_norm_hidden_states = self.transformer_block.norm2(transformer_hidden_states)
+        elif self.transformer_block.use_ada_layer_norm_single:
+            # For PixArt norm2 isn't applied here:
+            # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
+            transformer_norm_hidden_states = transformer_hidden_states
+        elif self.transformer_block.use_ada_layer_norm_continuous:
+            # transformer_norm_hidden_states = self.transformer_block.norm2(transformer_hidden_states, added_cond_kwargs["pooled_text_emb"])
+            raise NotImplementedError()
+        else:
+            raise ValueError("Incorrect norm")
+
+        if self.transformer_block.pos_embed is not None and self.transformer_block.use_ada_layer_norm_single is False:
+            transformer_norm_hidden_states = self.transformer_block.pos_embed(transformer_norm_hidden_states)
+
+        # ================================================== #
+        # ================= CROSS ATTENTION ================ #
+        # ================================================== #
+
+        # We do an initial pass of the CrossAttention up to obtaining the similarity matrix here.
+        # The similarity matrix is used to obtain scaling coefficients for the attention matrix of the self attention
+        # We reuse the previously computed self-attention matrix, and only repeat the steps after the softmax
+
+        cross_attention_input_hidden_states = (
+            transformer_norm_hidden_states  # Renaming the variable for the sake of readability
+        )
+
+        # TODO: check if classifier_free_guidance is being used before splitting here
+        if self.do_classifier_free_guidance:
+            # Our scaling coefficients depend only on the conditional part, so we split the inputs
+            (
+                _cross_attention_input_hidden_states_unconditional,
+                cross_attention_input_hidden_states_conditional,
+            ) = cross_attention_input_hidden_states.chunk(2)
+
+            # Same split for the encoder_hidden_states i.e. the tokens
+            # Since the SelfAttention processors don't get the encoder states as input, we inject them into the processor in the begining.
+            _encoder_hidden_states_unconditional, encoder_hidden_states_conditional = self.encoder_hidden_states.chunk(
+                2
+            )
+        else:
+            cross_attention_input_hidden_states_conditional = cross_attention_input_hidden_states
+            encoder_hidden_states_conditional = self.encoder_hidden_states.chunk(2)
+
+        # Rename the variables for the sake of readability
+        # The part below is the beginning of the __call__ function of the following CrossAttention layer
+        cross_attention_hidden_states = cross_attention_input_hidden_states_conditional
+        cross_attention_encoder_hidden_states = encoder_hidden_states_conditional
+
+        attn2 = self.transformer_block.attn2
+
+        if attn2.spatial_norm is not None:
+            cross_attention_hidden_states = attn2.spatial_norm(cross_attention_hidden_states, temb)
+
+        input_ndim = cross_attention_hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = cross_attention_hidden_states.shape
+            cross_attention_hidden_states = cross_attention_hidden_states.view(
+                batch_size, channel, height * width
+            ).transpose(1, 2)
+
+        (
+            batch_size,
+            sequence_length,
+            _,
+        ) = cross_attention_hidden_states.shape  # It is definitely a cross attention, so no need for an if block
+        # TODO: change the attention_mask here
+        attention_mask = attn2.prepare_attention_mask(
+            None, sequence_length, batch_size
+        )  # I assume the attention mask is the same...
+
+        if attn2.group_norm is not None:
+            cross_attention_hidden_states = attn2.group_norm(cross_attention_hidden_states.transpose(1, 2)).transpose(
+                1, 2
+            )
+
+        query2 = attn2.to_q(cross_attention_hidden_states)
+
+        if attn2.norm_cross:
+            cross_attention_encoder_hidden_states = attn2.norm_encoder_hidden_states(
+                cross_attention_encoder_hidden_states
+            )
+
+        key2 = attn2.to_k(cross_attention_encoder_hidden_states)
+        query2 = attn2.head_to_batch_dim(query2)
+        key2 = attn2.head_to_batch_dim(key2)
+
+        cross_attention_probs = attn2.get_attention_scores(query2, key2, attention_mask)
+
+        # CrossAttention ends here, the remaining part is not used
+
+        # ================================================== #
+        # ================ SELF ATTENTION 2 ================ #
+        # ================================================== #
+        # DEJA VU!
+
+        mask = (mask > 0.5).to(self_attention_output_hidden_states.dtype)
+        m = mask.to(self_attention_output_hidden_states.device)
+        # m = rearrange(m, 'b c h w -> b (h w) c').contiguous()
+        m = m.permute(0, 2, 3, 1).reshape((m.shape[0], -1, m.shape[1])).contiguous()  # B HW 1
+        m = torch.matmul(m, m.permute(0, 2, 1)) + (1 - m)
+
+        # # Compute scaling coefficients for the similarity matrix
+        # # Select the cross attention values for the correct tokens only!
+        # cross_attention_probs = cross_attention_probs.mean(dim = 0)
+        # cross_attention_probs = cross_attention_probs[:, self.token_idx].sum(dim=1)
+
+        # cross_attention_probs = cross_attention_probs.reshape(shape)
+        # gaussian_smoothing = GaussianSmoothing(channels=1, kernel_size=3, sigma=0.5, dim=2).to(self_attention_output_hidden_states.device)
+        # cross_attention_probs = gaussian_smoothing(cross_attention_probs.unsqueeze(0))[0] # optional smoothing
+        # cross_attention_probs = cross_attention_probs.reshape(-1)
+        # cross_attention_probs = ((cross_attention_probs - torch.median(cross_attention_probs.ravel())) / torch.max(cross_attention_probs.ravel())).clip(0, 1)
+
+        # c = (1 - m) * cross_attention_probs.reshape(1, 1, -1) + m # PAIntA scaling coefficients
+
+        # Compute scaling coefficients for the similarity matrix
+        # Select the cross attention values for the correct tokens only!
+
+        batch_size, dims, channels = cross_attention_probs.shape
+        batch_size = batch_size // attn.heads
+        cross_attention_probs = cross_attention_probs.reshape((batch_size, attn.heads, dims, channels))  # B, D, HW, T
+
+        cross_attention_probs = cross_attention_probs.mean(dim=1)  # B, HW, T
+        cross_attention_probs = cross_attention_probs[..., self.token_idx].sum(dim=-1)  # B, HW
+        cross_attention_probs = cross_attention_probs.reshape((batch_size,) + shape)  # , B, H, W
+
+        gaussian_smoothing = GaussianSmoothing(channels=1, kernel_size=3, sigma=0.5, dim=2).to(
+            self_attention_output_hidden_states.device
+        )
+        cross_attention_probs = gaussian_smoothing(cross_attention_probs[:, None])[:, 0]  # optional smoothing B, H, W
+
+        # Median normalization
+        cross_attention_probs = cross_attention_probs.reshape(batch_size, -1)  # B, HW
+        cross_attention_probs = (
+            cross_attention_probs - cross_attention_probs.median(dim=-1, keepdim=True).values
+        ) / cross_attention_probs.max(dim=-1, keepdim=True).values
+        cross_attention_probs = cross_attention_probs.clip(0, 1)
+
+        c = (1 - m) * cross_attention_probs.reshape(batch_size, 1, -1) + m
+        c = c.repeat_interleave(attn.heads, 0)  # BD, HW
+        if self.do_classifier_free_guidance:
+            c = torch.cat([c, c])  # 2BD, HW
+
+        # Rescaling the original self-attention matrix
+        self_attention_scores_rescaled = self_attention_scores * c
+        self_attention_probs_rescaled = self_attention_scores_rescaled.softmax(dim=-1)
+
+        # Continuing the self attention normally using the new matrix
+        hidden_states = torch.bmm(self_attention_probs_rescaled, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + input_hidden_states
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class StableDiffusionHDPainterPipeline(StableDiffusionInpaintPipeline):
+    """
+    Inpainting pipeline that extends `StableDiffusionInpaintPipeline` with the two HD-Painter components used
+    in this file: PAIntA (`PAIntAAttnProcessor`, installed on the self-attention layers) and RASG
+    (`RASGAttnProcessor`, installed on the cross-attention layers, plus a gradient-derived `variance_noise`
+    that is handed to the scheduler step).
+    """
+
+    def get_tokenized_prompt(self, prompt):
+        """
+        Tokenize `prompt` and decode every entry of the resulting `input_ids` on its own.
+
+        Args:
+            prompt: Text passed unchanged to `self.tokenizer`.
+
+        Returns:
+            `List[str]`: one decoded string per entry of `input_ids`.
+        """
+        out = self.tokenizer(prompt)
+        return [self.tokenizer.decode(x) for x in out["input_ids"]]
+
+    def init_attn_processors(
+        self,
+        mask,
+        token_idx,
+        use_painta=True,
+        use_rasg=True,
+        painta_scale_factors=[2, 4],  # 64x64 -> [16x16, 32x32]
+        rasg_scale_factor=4,  # 64x64 -> 16x16
+        self_attention_layer_name="attn1",
+        cross_attention_layer_name="attn2",
+        list_of_painta_layer_names=None,
+        list_of_rasg_layer_names=None,
+    ):
+        """
+        Build one attention processor per UNet attention layer and install them with `set_attn_processor`.
+
+        Args:
+            mask: Mask tensor forwarded to the PAIntA / RASG processors.
+            token_idx: Prompt token indices forwarded to the PAIntA / RASG processors.
+            use_painta (`bool`): If `True`, PAIntA layers get a `PAIntAAttnProcessor`, otherwise `AttnProcessor`.
+            use_rasg (`bool`): If `True`, RASG layers get a `RASGAttnProcessor`, otherwise `AttnProcessor`.
+            painta_scale_factors (`List[int]`): Multiplied by `self.vae_scale_factor` before being passed on.
+            rasg_scale_factor (`int`): Multiplied by `self.vae_scale_factor` before being passed on.
+            self_attention_layer_name (`str`):
+                Substring selecting the PAIntA layers when `list_of_painta_layer_names` is `None`.
+            cross_attention_layer_name (`str`):
+                Substring selecting the RASG layers when `list_of_rasg_layer_names` is `None`.
+            list_of_painta_layer_names (`List[str]`, *optional*): Exact processor keys to treat as PAIntA layers.
+            list_of_rasg_layer_names (`List[str]`, *optional*): Exact processor keys to treat as RASG layers.
+        """
+        default_processor = AttnProcessor()
+
+        painta_scale_factors = [x * self.vae_scale_factor for x in painta_scale_factors]
+        rasg_scale_factor = self.vae_scale_factor * rasg_scale_factor
+
+        attn_processors = {}
+        for x in self.unet.attn_processors:
+            if (list_of_painta_layer_names is None and self_attention_layer_name in x) or (
+                list_of_painta_layer_names is not None and x in list_of_painta_layer_names
+            ):
+                if use_painta:
+                    # Processor keys end in "<attention layer>.processor"; dropping the last two components
+                    # yields the enclosing transformer block whatever the attention layer is called.
+                    transformer_block = self.unet.get_submodule(x.rsplit(".", 2)[0])
+                    attn_processors[x] = PAIntAAttnProcessor(
+                        transformer_block, mask, token_idx, self.do_classifier_free_guidance, painta_scale_factors
+                    )
+                else:
+                    attn_processors[x] = default_processor
+            elif (list_of_rasg_layer_names is None and cross_attention_layer_name in x) or (
+                list_of_rasg_layer_names is not None and x in list_of_rasg_layer_names
+            ):
+                if use_rasg:
+                    attn_processors[x] = RASGAttnProcessor(mask, token_idx, rasg_scale_factor)
+                else:
+                    attn_processors[x] = default_processor
+            else:
+                # Layers selected by neither rule still need an entry: the UNet's `set_attn_processor`
+                # expects the dict to cover every attention layer.
+                attn_processors[x] = default_processor
+
+        self.unet.set_attn_processor(attn_processors)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: PipelineImageInput = None,
+        mask_image: PipelineImageInput = None,
+        masked_image_latents: torch.FloatTensor = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        padding_mask_crop: Optional[int] = None,
+        strength: float = 1.0,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        guidance_scale: float = 7.5,
+        positive_prompt: Optional[str] = "",
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.01,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        clip_skip: int = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        use_painta=True,
+        use_rasg=True,
+        self_attention_layer_name=".attn1",
+        cross_attention_layer_name=".attn2",
+        painta_scale_factors=[2, 4],  # 16 x 16 and 32 x 32
+        rasg_scale_factor=4,  # 16x16 by default
+        list_of_painta_layer_names=None,
+        list_of_rasg_layer_names=None,
+        **kwargs,
+    ):
+        """
+        Run inpainting with the HD-Painter attention processors installed on the UNet.
+
+        Only the arguments handled specially by this method are described here; the remaining ones are
+        forwarded to the inherited helpers (`check_inputs`, `encode_prompt`, `prepare_latents`, ...).
+
+        Args:
+            prompt (`str` or `List[str]`):
+                Prompt text. Required: the guided token indices are derived from it.
+            positive_prompt (`str`, *optional*, defaults to `""`):
+                Text concatenated directly (no separator is inserted) to the end of every prompt. `None` is
+                treated as an empty string.
+            use_painta (`bool`):
+                Install `PAIntAAttnProcessor` on the selected self-attention layers. Whatever the value, the
+                first step whose timestep is below 500 re-installs the processors with PAIntA disabled.
+            use_rasg (`bool`):
+                Install `RASGAttnProcessor` on the selected cross-attention layers, compute `variance_noise`
+                from the gradient of the attention score with respect to the latents, and set the scheduler
+                step's `generator` argument to `None`.
+            self_attention_layer_name, cross_attention_layer_name, painta_scale_factors, rasg_scale_factor,
+            list_of_painta_layer_names, list_of_rasg_layer_names:
+                Forwarded to `init_attn_processors`.
+            callback, callback_steps (via `**kwargs`):
+                Deprecated; a deprecation message is issued when either is passed.
+
+        Returns:
+            `StableDiffusionPipelineOutput` when `return_dict` is `True`, otherwise the tuple
+            `(image, has_nsfw_concept)`.
+
+        Raises:
+            ValueError: If `prompt` is `None`, if the strength-adjusted number of inference steps is below 1,
+                or if the UNet input channel configuration does not match the prepared inputs.
+        """
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # The prompt text itself is needed: the guided token indices are derived from it in step 7.5.
+        if prompt is None:
+            raise ValueError(
+                "`prompt` must be provided: the HD-Painter token indices are computed from the prompt text."
+            )
+        # `positive_prompt` is declared Optional; treat `None` as "nothing to append".
+        positive_prompt = positive_prompt or ""
+
+        # Keep the prompt without the positive words: only its tokens are selected for guidance.
+        prompt_no_positives = prompt
+        if isinstance(prompt, list):
+            prompt = [x + positive_prompt for x in prompt]
+        else:
+            prompt = prompt + positive_prompt
+
+        # 1. Check inputs
+        self.check_inputs(
+            prompt,
+            image,
+            mask_image,
+            height,
+            width,
+            strength,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            callback_on_step_end_tensor_inputs,
+            padding_mask_crop,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # assert batch_size == 1, "Does not work with batch size > 1 currently"
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+        )
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=self.clip_skip,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        if ip_adapter_image is not None:
+            output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
+            image_embeds, negative_image_embeds = self.encode_image(
+                ip_adapter_image, device, num_images_per_prompt, output_hidden_state
+            )
+            if self.do_classifier_free_guidance:
+                image_embeds = torch.cat([negative_image_embeds, image_embeds])
+
+        # 4. set timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+        timesteps, num_inference_steps = self.get_timesteps(
+            num_inference_steps=num_inference_steps, strength=strength, device=device
+        )
+        # check that number of inference steps is not < 1 - as this doesn't make sense
+        if num_inference_steps < 1:
+            raise ValueError(
+                f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline"
+                f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
+            )
+        # at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+        # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
+        is_strength_max = strength == 1.0
+
+        # 5. Preprocess mask and image
+
+        if padding_mask_crop is not None:
+            crops_coords = self.mask_processor.get_crop_region(mask_image, width, height, pad=padding_mask_crop)
+            resize_mode = "fill"
+        else:
+            crops_coords = None
+            resize_mode = "default"
+
+        original_image = image
+        init_image = self.image_processor.preprocess(
+            image, height=height, width=width, crops_coords=crops_coords, resize_mode=resize_mode
+        )
+        init_image = init_image.to(dtype=torch.float32)
+
+        # 6. Prepare latent variables
+        num_channels_latents = self.vae.config.latent_channels
+        num_channels_unet = self.unet.config.in_channels
+        return_image_latents = num_channels_unet == 4
+
+        latents_outputs = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+            image=init_image,
+            timestep=latent_timestep,
+            is_strength_max=is_strength_max,
+            return_noise=True,
+            return_image_latents=return_image_latents,
+        )
+
+        if return_image_latents:
+            latents, noise, image_latents = latents_outputs
+        else:
+            latents, noise = latents_outputs
+
+        # 7. Prepare mask latent variables
+        mask_condition = self.mask_processor.preprocess(
+            mask_image, height=height, width=width, resize_mode=resize_mode, crops_coords=crops_coords
+        )
+
+        if masked_image_latents is None:
+            masked_image = init_image * (mask_condition < 0.5)
+        else:
+            masked_image = masked_image_latents
+
+        mask, masked_image_latents = self.prepare_mask_latents(
+            mask_condition,
+            masked_image,
+            batch_size * num_images_per_prompt,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            self.do_classifier_free_guidance,
+        )
+
+        # 7.5 Setting up HD-Painter
+
+        # Get the indices of the tokens to be modified by both RASG and PAIntA:
+        # every token of the prompt without positive words, plus the end-of-text position of the full prompt
+        token_idx = list(range(1, self.get_tokenized_prompt(prompt_no_positives).index("<|endoftext|>"))) + [
+            self.get_tokenized_prompt(prompt).index("<|endoftext|>")
+        ]
+
+        # Setting up the attention processors
+        self.init_attn_processors(
+            mask_condition,
+            token_idx,
+            use_painta,
+            use_rasg,
+            painta_scale_factors=painta_scale_factors,
+            rasg_scale_factor=rasg_scale_factor,
+            self_attention_layer_name=self_attention_layer_name,
+            cross_attention_layer_name=cross_attention_layer_name,
+            list_of_painta_layer_names=list_of_painta_layer_names,
+            list_of_rasg_layer_names=list_of_rasg_layer_names,
+        )
+
+        # 8. Check that sizes of mask, masked image and latents match
+        if num_channels_unet == 9:
+            # default case for runwayml/stable-diffusion-inpainting
+            num_channels_mask = mask.shape[1]
+            num_channels_masked_image = masked_image_latents.shape[1]
+            if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels:
+                raise ValueError(
+                    f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
+                    f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
+                    f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
+                    f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
+                    " `pipeline.unet` or your `mask_image` or `image` input."
+                )
+        elif num_channels_unet != 4:
+            raise ValueError(
+                f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {self.unet.config.in_channels}."
+            )
+
+        # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # With RASG the noise for the step is supplied through `variance_noise`, so no generator is passed
+        if use_rasg:
+            extra_step_kwargs["generator"] = None
+
+        # 9.1 Add image embeds for IP-Adapter
+        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
+
+        # 9.2 Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        # 10. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+        painta_active = True
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # PAIntA is only applied while t >= 500; afterwards the processors are re-installed without it
+                if t < 500 and painta_active:
+                    self.init_attn_processors(
+                        mask_condition,
+                        token_idx,
+                        False,
+                        use_rasg,
+                        painta_scale_factors=painta_scale_factors,
+                        rasg_scale_factor=rasg_scale_factor,
+                        self_attention_layer_name=self_attention_layer_name,
+                        cross_attention_layer_name=cross_attention_layer_name,
+                        list_of_painta_layer_names=list_of_painta_layer_names,
+                        list_of_rasg_layer_names=list_of_rasg_layer_names,
+                    )
+                    painta_active = False
+
+                # The method runs under `torch.no_grad()`; gradients are re-enabled here because RASG
+                # back-propagates the attention score to the latents
+                with torch.enable_grad():
+                    self.unet.zero_grad()
+                    latents = latents.detach()
+                    latents.requires_grad = True
+
+                    # expand the latents if we are doing classifier free guidance
+                    latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+
+                    # concat latents, mask, masked_image_latents in the channel dimension
+                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                    if num_channels_unet == 9:
+                        latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
+
+                    # Expose the current latents / prompt embeddings to the scheduler and the processors
+                    self.scheduler.latents = latents
+                    self.encoder_hidden_states = prompt_embeds
+                    for attn_processor in self.unet.attn_processors.values():
+                        attn_processor.encoder_hidden_states = prompt_embeds
+
+                    # predict the noise residual
+                    noise_pred = self.unet(
+                        latent_model_input,
+                        t,
+                        encoder_hidden_states=prompt_embeds,
+                        timestep_cond=timestep_cond,
+                        cross_attention_kwargs=self.cross_attention_kwargs,
+                        added_cond_kwargs=added_cond_kwargs,
+                        return_dict=False,
+                    )[0]
+
+                    # perform guidance
+                    if self.do_classifier_free_guidance:
+                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                        noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                    if use_rasg:
+                        # Perform RASG
+                        _, _, height, width = mask_condition.shape  # 512 x 512
+                        scale_factor = self.vae_scale_factor * rasg_scale_factor  # 8 * 4 = 32
+
+                        # TODO: Fix for > 1 batch_size
+                        rasg_mask = F.interpolate(
+                            mask_condition, (height // scale_factor, width // scale_factor), mode="bicubic"
+                        )[0, 0]  # mode is nearest by default, B, H, W
+
+                        # Aggregate the saved attention maps
+                        attn_map = []
+                        for processor in self.unet.attn_processors.values():
+                            if hasattr(processor, "attention_scores") and processor.attention_scores is not None:
+                                if self.do_classifier_free_guidance:
+                                    attn_map.append(processor.attention_scores.chunk(2)[1])  # (B/2) x H, 256, 77
+                                else:
+                                    attn_map.append(processor.attention_scores)  # B x H, 256, 77 ?
+
+                        attn_map = (
+                            torch.cat(attn_map)
+                            .mean(0)
+                            .permute(1, 0)
+                            .reshape((-1, height // scale_factor, width // scale_factor))
+                        )  # 77, 16, 16
+
+                        # Compute the attention score
+                        attn_score = -sum(
+                            [
+                                F.binary_cross_entropy_with_logits(x - 1.0, rasg_mask.to(device))
+                                for x in attn_map[token_idx]
+                            ]
+                        )
+
+                        # Backward the score and compute the gradients
+                        attn_score.backward()
+
+                        # Normalize the gradients and compute the noise component
+                        variance_noise = latents.grad.detach()
+                        # print("VARIANCE SHAPE", variance_noise.shape)
+                        variance_noise -= torch.mean(variance_noise, [1, 2, 3], keepdim=True)
+                        variance_noise /= torch.std(variance_noise, [1, 2, 3], keepdim=True)
+                    else:
+                        variance_noise = None
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(
+                    noise_pred, t, latents, **extra_step_kwargs, return_dict=False, variance_noise=variance_noise
+                )[0]
+
+                if num_channels_unet == 4:
+                    init_latents_proper = image_latents
+                    if self.do_classifier_free_guidance:
+                        init_mask, _ = mask.chunk(2)
+                    else:
+                        init_mask = mask
+
+                    if i < len(timesteps) - 1:
+                        noise_timestep = timesteps[i + 1]
+                        init_latents_proper = self.scheduler.add_noise(
+                            init_latents_proper, noise, torch.tensor([noise_timestep])
+                        )
+
+                    latents = (1 - init_mask) * init_latents_proper + init_mask * latents
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    mask = callback_outputs.pop("mask", mask)
+                    masked_image_latents = callback_outputs.pop("masked_image_latents", masked_image_latents)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            condition_kwargs = {}
+            if isinstance(self.vae, AsymmetricAutoencoderKL):
+                init_image = init_image.to(device=device, dtype=masked_image_latents.dtype)
+                init_image_condition = init_image.clone()
+                init_image = self._encode_vae_image(init_image, generator=generator)
+                mask_condition = mask_condition.to(device=device, dtype=masked_image_latents.dtype)
+                condition_kwargs = {"image": init_image_condition, "mask": mask_condition}
+            image = self.vae.decode(
+                latents / self.vae.config.scaling_factor, return_dict=False, generator=generator, **condition_kwargs
+            )[0]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        if padding_mask_crop is not None:
+            image = [self.image_processor.apply_overlay(mask_image, original_image, i, crops_coords) for i in image]
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+
+
+# ============= Utility Functions ============== #
+
+
+class GaussianSmoothing(nn.Module):
+    """
+    Apply gaussian smoothing on a
+    1d, 2d or 3d tensor. Filtering is performed separately for each channel
+    in the input using a depthwise convolution.
+    Arguments:
+        channels (int, sequence): Number of channels of the input tensors. Output will
+            have this number of channels as well.
+        kernel_size (int, sequence): Size of the gaussian kernel.
+        sigma (float, sequence): Width parameter of the gaussian kernel. NOTE: each axis profile is
+            computed as `exp(-((x - mean) / (2 * sigma)) ** 2)`, which is the shape of a gaussian whose
+            standard deviation is `sigma * sqrt(2)`. This is kept as is so results do not change.
+        dim (int, optional): The number of dimensions of the data.
+            Default value is 2 (spatial).
+    Raises:
+        RuntimeError: If `dim` is not 1, 2 or 3.
+    """
+
+    def __init__(self, channels, kernel_size, sigma, dim=2):
+        super().__init__()
+
+        # Fail fast on an unsupported dimensionality, before any kernel is built.
+        conv_by_dim = {1: F.conv1d, 2: F.conv2d, 3: F.conv3d}
+        if dim not in conv_by_dim:
+            raise RuntimeError("Only 1, 2 and 3 dimensions are supported. Received {}.".format(dim))
+        self.conv = conv_by_dim[dim]
+
+        # A scalar size / sigma applies to every axis.
+        if isinstance(kernel_size, numbers.Number):
+            kernel_size = [kernel_size] * dim
+        if isinstance(sigma, numbers.Number):
+            sigma = [sigma] * dim
+
+        # The gaussian kernel is the product of the
+        # gaussian function of each dimension.
+        kernel = 1
+        # "ij" is the layout `torch.meshgrid` already used by default; passing it explicitly avoids the
+        # deprecation warning emitted when `indexing` is omitted.
+        meshgrids = torch.meshgrid(
+            [torch.arange(size, dtype=torch.float32) for size in kernel_size], indexing="ij"
+        )
+        for size, std, mgrid in zip(kernel_size, sigma, meshgrids):
+            mean = (size - 1) / 2
+            kernel *= 1 / (std * math.sqrt(2 * math.pi)) * torch.exp(-(((mgrid - mean) / (2 * std)) ** 2))
+
+        # Make sure sum of values in gaussian kernel equals 1.
+        kernel = kernel / torch.sum(kernel)
+
+        # Reshape to depthwise convolutional weight: (channels, 1, *kernel_size)
+        kernel = kernel.view(1, 1, *kernel.size())
+        kernel = kernel.repeat(channels, *[1] * (kernel.dim() - 1))
+
+        self.register_buffer("weight", kernel)
+        # One group per channel, so every channel is filtered independently.
+        self.groups = channels
+
+    def forward(self, input):
+        """
+        Apply gaussian filter to input.
+        Arguments:
+            input (torch.Tensor): Input to apply gaussian filter on.
+        Returns:
+            filtered (torch.Tensor): Filtered output; `padding="same"` is used for the convolution.
+        """
+        return self.conv(input, weight=self.weight.to(input.dtype), groups=self.groups, padding="same")
+
+
+def get_attention_scores(
+    self, query: torch.Tensor, key: torch.Tensor, attention_mask: torch.Tensor = None
+) -> torch.Tensor:
+    r"""
+    Compute the scaled query-key attention scores. No softmax is applied here.
+
+    Args:
+        query (`torch.Tensor`): The query tensor.
+        key (`torch.Tensor`): The key tensor.
+        attention_mask (`torch.Tensor`, *optional*):
+            Tensor added to the scaled scores. If `None`, nothing is added.
+
+    Returns:
+        `torch.Tensor`: `self.scale * (query @ key^T)`, plus `attention_mask` when one is given. The result is
+        cast to float when `self.upcast_softmax` is set.
+    """
+    if self.upcast_attention:
+        query, key = query.float(), key.float()
+
+    mask_given = attention_mask is not None
+    if mask_given:
+        bias = attention_mask
+    else:
+        # Uninitialised placeholder: with `beta=0` below, `baddbmm` ignores its contents.
+        bias = torch.empty(
+            query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device
+        )
+
+    scores = torch.baddbmm(
+        bias,
+        query,
+        key.transpose(-1, -2),
+        beta=int(mask_given),
+        alpha=self.scale,
+    )
+    # Drop the local reference to the placeholder / mask before any upcast.
+    del bias
+
+    return scores.float() if self.upcast_softmax else scores
@@ -1,7 +1,8 @@
 """
-    modeled after the textual_inversion.py / train_dreambooth.py and the work
-    of justinpinkney here: https://github.com/justinpinkney/stable-diffusion/blob/main/notebooks/imagic.ipynb
+modeled after the textual_inversion.py / train_dreambooth.py and the work
+of justinpinkney here: https://github.com/justinpinkney/stable-diffusion/blob/main/notebooks/imagic.ipynb
 """
+
 import inspect
 import warnings
 from typing import List, Optional, Union
@@ -440,7 +440,7 @@ def betas_for_alpha_bar(
            return math.exp(t * -12.0)

    else:
-        raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")

    betas = []
    for i in range(num_diffusion_timesteps):
@@ -726,7 +726,7 @@ class LatentConsistencyModelWalkPipeline(
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
-                `._callback_tensor_inputs` attribute of your pipeine class.
+                `._callback_tensor_inputs` attribute of your pipeline class.
            embedding_interpolation_type (`str`, *optional*, defaults to `"lerp"`):
                The type of interpolation to use for interpolating between text embeddings. Choose between `"lerp"` and `"slerp"`.
            latent_interpolation_type (`str`, *optional*, defaults to `"slerp"`):
@@ -779,7 +779,7 @@ class LatentConsistencyModelWalkPipeline(
        else:
            batch_size = prompt_embeds.shape[0]
        if batch_size < 2:
-            raise ValueError(f"`prompt` must have length of atleast 2 but found {batch_size}")
+            raise ValueError(f"`prompt` must have length of at least 2 but found {batch_size}")
        if num_images_per_prompt != 1:
            raise ValueError("`num_images_per_prompt` must be `1` as no other value is supported yet")
        if prompt_embeds is not None:
@@ -883,7 +883,7 @@ class LatentConsistencyModelWalkPipeline(
                ) as batch_progress_bar:
                    for batch_index in range(0, bs, process_batch_size):
                        batch_inference_latents = inference_latents[batch_index : batch_index + process_batch_size]
-                        batch_inference_embedddings = inference_embeddings[
+                        batch_inference_embeddings = inference_embeddings[
                            batch_index : batch_index + process_batch_size
                        ]

@@ -892,7 +892,7 @@ class LatentConsistencyModelWalkPipeline(
                        )
                        timesteps = self.scheduler.timesteps

-                        current_bs = batch_inference_embedddings.shape[0]
+                        current_bs = batch_inference_embeddings.shape[0]
                        w = torch.tensor(self.guidance_scale - 1).repeat(current_bs)
                        w_embedding = self.get_guidance_scale_embedding(
                            w, embedding_dim=self.unet.config.time_cond_proj_dim
@@ -901,14 +901,14 @@ class LatentConsistencyModelWalkPipeline(
                        # 10. Perform inference for current batch
                        with self.progress_bar(total=num_inference_steps) as progress_bar:
                            for index, t in enumerate(timesteps):
-                                batch_inference_latents = batch_inference_latents.to(batch_inference_embedddings.dtype)
+                                batch_inference_latents = batch_inference_latents.to(batch_inference_embeddings.dtype)

                                # model prediction (v-prediction, eps, x)
                                model_pred = self.unet(
                                    batch_inference_latents,
                                    t,
                                    timestep_cond=w_embedding,
-                                    encoder_hidden_states=batch_inference_embedddings,
+                                    encoder_hidden_states=batch_inference_embeddings,
                                    cross_attention_kwargs=self.cross_attention_kwargs,
                                    return_dict=False,
                                )[0]
@@ -924,8 +924,8 @@ class LatentConsistencyModelWalkPipeline(
                                    callback_outputs = callback_on_step_end(self, index, t, callback_kwargs)

                                    batch_inference_latents = callback_outputs.pop("latents", batch_inference_latents)
-                                    batch_inference_embedddings = callback_outputs.pop(
-                                        "prompt_embeds", batch_inference_embedddings
+                                    batch_inference_embeddings = callback_outputs.pop(
+                                        "prompt_embeds", batch_inference_embeddings
                                    )
                                    w_embedding = callback_outputs.pop("w_embedding", w_embedding)
                                    denoised = callback_outputs.pop("denoised", denoised)
@@ -939,7 +939,7 @@ class LatentConsistencyModelWalkPipeline(
                                        step_idx = index // getattr(self.scheduler, "order", 1)
                                        callback(step_idx, t, batch_inference_latents)

-                        denoised = denoised.to(batch_inference_embedddings.dtype)
+                        denoised = denoised.to(batch_inference_embeddings.dtype)

                        # Note: This is not supported because you would get black images in your latent walk if
                        #       NSFW concept is detected
@@ -348,7 +348,7 @@ def betas_for_alpha_bar(
            return math.exp(t * -12.0)

    else:
-        raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")

    betas = []
    for i in range(num_diffusion_timesteps):
@@ -530,7 +530,7 @@ class LLMGroundedDiffusionPipeline(
                )

        if len(phrases) != len(boxes):
-            ValueError(
+            raise ValueError(
                "length of `phrases` and `boxes` has to be same, but"
                f" got: `phrases` {len(phrases)} != `boxes` {len(boxes)}"
            )
@@ -439,7 +439,9 @@ class StableDiffusionLongPromptWeightingPipeline(
            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
    """

+    model_cpu_offload_seq = "text_encoder-->unet->vae"
    _optional_components = ["safety_checker", "feature_extractor"]
+    _exclude_from_cpu_offload = ["safety_checker"]

    def __init__(
        self,
@@ -164,7 +164,7 @@ def get_prompts_tokens_with_weights(clip_tokenizer: CLIPTokenizer, prompt: str):
        text_tokens (list)
            A list contains token ids
        text_weight (list)
-            A list contains the correspodent weight of token ids
+            A list contains the correspondent weight of token ids

    Example:
        import torch
@@ -1028,7 +1028,7 @@ class SDXLLongPromptWeightingPipeline(
                # because `num_inference_steps` might be even given that every timestep
                # (except the highest one) is duplicated. If `num_inference_steps` is even it would
                # mean that we cut the timesteps in the middle of the denoising step
-                # (between 1st and 2nd devirative) which leads to incorrect results. By adding 1
+                # (between 1st and 2nd derivative) which leads to incorrect results. By adding 1
                # we ensure that the denoising process always ends after the 2nd derivate step of the scheduler
                num_inference_steps = num_inference_steps + 1

@@ -1531,7 +1531,7 @@ class SDXLLongPromptWeightingPipeline(
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
-                `._callback_tensor_inputs` attribute of your pipeine class.
+                `._callback_tensor_inputs` attribute of your pipeline class.

        Examples:

@@ -2131,7 +2131,7 @@ class SDXLLongPromptWeightingPipeline(
            **kwargs,
        )

-    # Overrride to properly handle the loading and unloading of the additional text encoder.
+    # Override to properly handle the loading and unloading of the additional text encoder.
    def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs):
        # We could have accessed the unet config from `lora_state_dict()` too. We pass
        # it here explicitly to be able to tell that it's coming from an SDXL
@@ -18,6 +18,7 @@
 # --------------------------------------------------------------------------


+import logging
 import math
 from typing import Dict, Union

@@ -25,6 +26,7 @@ import matplotlib
 import numpy as np
 import torch
 from PIL import Image
+from PIL.Image import Resampling
 from scipy.optimize import minimize
 from torch.utils.data import DataLoader, TensorDataset
 from tqdm.auto import tqdm
@@ -34,13 +36,14 @@ from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
    DiffusionPipeline,
+    LCMScheduler,
    UNet2DConditionModel,
 )
 from diffusers.utils import BaseOutput, check_min_version


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.28.0.dev0")
+check_min_version("0.25.0")


 class MarigoldDepthOutput(BaseOutput):
@@ -61,6 +64,19 @@ class MarigoldDepthOutput(BaseOutput):
    uncertainty: Union[None, np.ndarray]


+def get_pil_resample_method(method_str: str) -> Resampling:
+    """Map `bilinear`/`bicubic`/`nearest` to its `PIL.Image.Resampling` member; raise `ValueError` otherwise."""
+    resample_method_dic = {
+        "bilinear": Resampling.BILINEAR,
+        "bicubic": Resampling.BICUBIC,
+        "nearest": Resampling.NEAREST,
+    }
+    if method_str not in resample_method_dic:
+        # Report the name the caller passed; the failed lookup result is always `None`.
+        raise ValueError(f"Unknown resampling method: {method_str}")
+    return resample_method_dic[method_str]
+
+
 class MarigoldPipeline(DiffusionPipeline):
    """
    Pipeline for monocular depth estimation using Marigold: https://marigoldmonodepth.github.io.
@@ -113,7 +129,9 @@ class MarigoldPipeline(DiffusionPipeline):
        ensemble_size: int = 10,
        processing_res: int = 768,
        match_input_res: bool = True,
+        resample_method: str = "bilinear",
        batch_size: int = 0,
+        seed: Union[int, None] = None,
        color_map: str = "Spectral",
        show_progress_bar: bool = True,
        ensemble_kwargs: Dict = None,
@@ -129,7 +147,9 @@ class MarigoldPipeline(DiffusionPipeline):
                If set to 0: will not resize at all.
            match_input_res (`bool`, *optional*, defaults to `True`):
                Resize depth prediction to match input resolution.
-                Only valid if `limit_input_res` is not None.
+                Only valid if `processing_res` > 0.
+            resample_method (`str`, *optional*, defaults to `bilinear`):
+                Resampling method used to resize images and depth predictions. This can be one of `bilinear`, `bicubic` or `nearest`, defaults to: `bilinear`.
            denoising_steps (`int`, *optional*, defaults to `10`):
                Number of diffusion denoising steps (DDIM) during inference.
            ensemble_size (`int`, *optional*, defaults to `10`):
@@ -137,6 +157,8 @@ class MarigoldPipeline(DiffusionPipeline):
            batch_size (`int`, *optional*, defaults to `0`):
                Inference batch size, no bigger than `num_ensemble`.
                If set to 0, the script will automatically decide the proper batch size.
+            seed (`int`, *optional*, defaults to `None`):
+                Reproducibility seed.
            show_progress_bar (`bool`, *optional*, defaults to `True`):
                Display a progress bar of diffusion denoising.
            color_map (`str`, *optional*, defaults to `"Spectral"`, pass `None` to skip colorized depth map generation):
@@ -146,8 +168,7 @@ class MarigoldPipeline(DiffusionPipeline):
        Returns:
            `MarigoldDepthOutput`: Output class for Marigold monocular depth prediction pipeline, including:
            - **depth_np** (`np.ndarray`) Predicted depth map, with depth values in the range of [0, 1]
-            - **depth_colored** (`None` or `PIL.Image.Image`) Colorized depth map, with the shape of [3, H, W] and
-                    values in [0, 1]. None if `color_map` is `None`
+            - **depth_colored** (`PIL.Image.Image`) Colorized depth map, with the shape of [3, H, W] and values in [0, 1], None if `color_map` is `None`
            - **uncertainty** (`None` or `np.ndarray`) Uncalibrated uncertainty(MAD, median absolute deviation)
                    coming from ensembling. None if `ensemble_size = 1`
        """
@@ -158,13 +179,21 @@ class MarigoldPipeline(DiffusionPipeline):
        if not match_input_res:
            assert processing_res is not None, "Value error: `resize_output_back` is only valid with "
        assert processing_res >= 0
-        assert denoising_steps >= 1
        assert ensemble_size >= 1

+        # Check if denoising step is reasonable
+        self._check_inference_step(denoising_steps)
+
+        resample_method: Resampling = get_pil_resample_method(resample_method)
+
        # ----------------- Image Preprocess -----------------
        # Resize image
        if processing_res > 0:
-            input_image = self.resize_max_res(input_image, max_edge_resolution=processing_res)
+            input_image = self.resize_max_res(
+                input_image,
+                max_edge_resolution=processing_res,
+                resample_method=resample_method,
+            )
        # Convert the image to RGB, to 1.remove the alpha channel 2.convert B&W to 3-channel
        input_image = input_image.convert("RGB")
        image = np.asarray(input_image)
@@ -203,9 +232,10 @@ class MarigoldPipeline(DiffusionPipeline):
                rgb_in=batched_img,
                num_inference_steps=denoising_steps,
                show_pbar=show_progress_bar,
+                seed=seed,
            )
-            depth_pred_ls.append(depth_pred_raw.detach().clone())
-        depth_preds = torch.concat(depth_pred_ls, axis=0).squeeze()
+            depth_pred_ls.append(depth_pred_raw.detach())
+        depth_preds = torch.concat(depth_pred_ls, dim=0).squeeze()
        torch.cuda.empty_cache()  # clear vram cache for ensembling

        # ----------------- Test-time ensembling -----------------
@@ -227,7 +257,7 @@ class MarigoldPipeline(DiffusionPipeline):
        # Resize back to original resolution
        if match_input_res:
            pred_img = Image.fromarray(depth_pred)
-            pred_img = pred_img.resize(input_size)
+            pred_img = pred_img.resize(input_size, resample=resample_method)
            depth_pred = np.asarray(pred_img)

        # Clip output range
@@ -243,12 +273,32 @@ class MarigoldPipeline(DiffusionPipeline):
            depth_colored_img = Image.fromarray(depth_colored_hwc)
        else:
            depth_colored_img = None
+
        return MarigoldDepthOutput(
            depth_np=depth_pred,
            depth_colored=depth_colored_img,
            uncertainty=pred_uncert,
        )

+    def _check_inference_step(self, n_step: int):
+        """
+        Validate the number of denoising steps and warn when it suits the active scheduler poorly.
+        Args:
+            n_step (`int`): denoising steps
+        """
+        assert n_step >= 1
+        scheduler = self.scheduler
+        if not isinstance(scheduler, (DDIMScheduler, LCMScheduler)):
+            raise RuntimeError(f"Unsupported scheduler type: {type(scheduler)}")
+
+        if isinstance(scheduler, DDIMScheduler):
+            if n_step < 10:
+                logging.warning(
+                    f"Too few denoising steps: {n_step}. Recommended to use the LCM checkpoint for few-step inference."
+                )
+        elif not 1 <= n_step <= 4:
+            logging.warning(f"Non-optimal setting of denoising steps: {n_step}. Recommended setting is 1-4 steps.")
+
    def _encode_empty_text(self):
        """
        Encode text embedding for empty prompt.
@@ -265,7 +315,13 @@ class MarigoldPipeline(DiffusionPipeline):
        self.empty_text_embed = self.text_encoder(text_input_ids)[0].to(self.dtype)

    @torch.no_grad()
-    def single_infer(self, rgb_in: torch.Tensor, num_inference_steps: int, show_pbar: bool) -> torch.Tensor:
+    def single_infer(
+        self,
+        rgb_in: torch.Tensor,
+        num_inference_steps: int,
+        seed: Union[int, None],
+        show_pbar: bool,
+    ) -> torch.Tensor:
        """
        Perform an individual depth prediction without ensembling.

@@ -286,10 +342,20 @@ class MarigoldPipeline(DiffusionPipeline):
        timesteps = self.scheduler.timesteps  # [T]

        # Encode image
-        rgb_latent = self._encode_rgb(rgb_in)
+        rgb_latent = self.encode_rgb(rgb_in)

        # Initial depth map (noise)
-        depth_latent = torch.randn(rgb_latent.shape, device=device, dtype=self.dtype)  # [B, 4, h, w]
+        if seed is None:
+            rand_num_generator = None
+        else:
+            rand_num_generator = torch.Generator(device=device)
+            rand_num_generator.manual_seed(seed)
+        depth_latent = torch.randn(
+            rgb_latent.shape,
+            device=device,
+            dtype=self.dtype,
+            generator=rand_num_generator,
+        )  # [B, 4, h, w]

        # Batched empty text embedding
        if self.empty_text_embed is None:
@@ -314,9 +380,9 @@ class MarigoldPipeline(DiffusionPipeline):
            noise_pred = self.unet(unet_input, t, encoder_hidden_states=batch_empty_text_embed).sample  # [B, 4, h, w]

            # compute the previous noisy sample x_t -> x_t-1
-            depth_latent = self.scheduler.step(noise_pred, t, depth_latent).prev_sample
-        torch.cuda.empty_cache()
-        depth = self._decode_depth(depth_latent)
+            depth_latent = self.scheduler.step(noise_pred, t, depth_latent, generator=rand_num_generator).prev_sample
+
+        depth = self.decode_depth(depth_latent)

        # clip prediction
        depth = torch.clip(depth, -1.0, 1.0)
@@ -325,7 +391,7 @@ class MarigoldPipeline(DiffusionPipeline):

        return depth

-    def _encode_rgb(self, rgb_in: torch.Tensor) -> torch.Tensor:
+    def encode_rgb(self, rgb_in: torch.Tensor) -> torch.Tensor:
        """
        Encode RGB image into latent.

@@ -344,7 +410,7 @@ class MarigoldPipeline(DiffusionPipeline):
        rgb_latent = mean * self.rgb_latent_scale_factor
        return rgb_latent

-    def _decode_depth(self, depth_latent: torch.Tensor) -> torch.Tensor:
+    def decode_depth(self, depth_latent: torch.Tensor) -> torch.Tensor:
        """
        Decode depth latent into depth map.

@@ -365,7 +431,7 @@ class MarigoldPipeline(DiffusionPipeline):
        return depth_mean

    @staticmethod
-    def resize_max_res(img: Image.Image, max_edge_resolution: int) -> Image.Image:
+    def resize_max_res(img: Image.Image, max_edge_resolution: int, resample_method=Resampling.BILINEAR) -> Image.Image:
        """
        Resize image to limit maximum edge length while keeping aspect ratio.

@@ -374,6 +440,8 @@ class MarigoldPipeline(DiffusionPipeline):
                Image to be resized.
            max_edge_resolution (`int`):
                Maximum edge length (pixel).
+            resample_method (`PIL.Image.Resampling`):
+                Resampling method used to resize images.

        Returns:
            `Image.Image`: Resized image.
@@ -384,7 +452,7 @@ class MarigoldPipeline(DiffusionPipeline):
        new_width = int(original_width * downscale_factor)
        new_height = int(original_height * downscale_factor)

-        resized_img = img.resize((new_width, new_height))
+        resized_img = img.resize((new_width, new_height), resample=resample_method)
        return resized_img

    @staticmethod
@@ -196,7 +196,7 @@ class StableDiffusionTilingPipeline(DiffusionPipeline, StableDiffusionExtrasMixi
            guidance_scale_tiles: specific weights for classifier-free guidance in each tile.
            guidance_scale_tiles: specific weights for classifier-free guidance in each tile. If None, the value provided in guidance_scale will be used.
            seed_tiles: specific seeds for the initialization latents in each tile. These will override the latents generated for the whole canvas using the standard seed parameter.
-            seed_tiles_mode: either "full" "exclusive". If "full", all the latents affected by the tile be overriden. If "exclusive", only the latents that are affected exclusively by this tile (and no other tiles) will be overrriden.
+            seed_tiles_mode: either "full" or "exclusive". If "full", all the latents affected by the tile will be overridden. If "exclusive", only the latents that are affected exclusively by this tile (and no other tiles) will be overridden.
            seed_reroll_regions: a list of tuples in the form (start row, end row, start column, end column, seed) defining regions in pixel space for which the latents will be overriden using the given seed. Takes priority over seed_tiles.
            cpu_vae: the decoder from latent space to pixel space can require too mucho GPU RAM for large images. If you find out of memory errors at the end of the generation process, try setting this parameter to True to run the decoder in CPU. Slower, but should run without memory issues.

@@ -325,7 +325,7 @@ class StableDiffusionTilingPipeline(DiffusionPipeline, StableDiffusionExtrasMixi
        if accepts_eta:
            extra_step_kwargs["eta"] = eta

-        # Mask for tile weights strenght
+        # Mask for tile weights strength
        tile_weights = self._gaussian_weights(tile_width, tile_height, batch_size)

        # Diffusion timesteps
@@ -832,7 +832,7 @@ class AnimateDiffControlNetPipeline(
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
-            allback_on_step_end (`Callable`, *optional*):
+            callback_on_step_end (`Callable`, *optional*):
                A function that calls at the end of each denoising steps during the inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
@@ -840,7 +840,7 @@ class AnimateDiffControlNetPipeline(
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
-                `._callback_tensor_inputs` attribute of your pipeine class.
+                `._callback_tensor_inputs` attribute of your pipeline class.

        Examples:

@@ -1280,7 +1280,7 @@ class DemoFusionSDXLPipeline(

        return output_images

-    # Overrride to properly handle the loading and unloading of the additional text encoder.
+    # Override to properly handle the loading and unloading of the additional text encoder.
    def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs):
        # We could have accessed the unet config from `lora_state_dict()` too. We pass
        # it here explicitly to be able to tell that it's coming from an SDXL
@@ -887,7 +887,7 @@ class StyleAlignedSDXLPipeline(
                # because `num_inference_steps` might be even given that every timestep
                # (except the highest one) is duplicated. If `num_inference_steps` is even it would
                # mean that we cut the timesteps in the middle of the denoising step
-                # (between 1st and 2nd devirative) which leads to incorrect results. By adding 1
+                # (between 1st and 2nd derivative) which leads to incorrect results. By adding 1
                # we ensure that the denoising process always ends after the 2nd derivate step of the scheduler
                num_inference_steps = num_inference_steps + 1

@@ -26,7 +26,7 @@ from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInver
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.models.lora import adjust_lora_scale_text_encoder
 from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
-from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_ldm3d import LDM3DPipelineOutput
+from diffusers.pipelines.stable_diffusion_ldm3d.pipeline_stable_diffusion_ldm3d import LDM3DPipelineOutput
 from diffusers.schedulers import DDPMScheduler, KarrasDiffusionSchedulers
 from diffusers.utils import (
    USE_PEFT_BACKEND,
@@ -206,7 +206,7 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image: bool
            dimensions: ``batch x channels x height x width``.
    """

-    # checkpoint. TOD(Yiyi) - need to clean this up later
+    # checkpoint. #TODO(Yiyi) - need to clean this up later
    if image is None:
        raise ValueError("`image` input cannot be undefined.")

@@ -277,7 +277,7 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image: bool
        # images are in latent space and thus can't
        # be masked set masked_image to None
        # we assume that the checkpoint is not an inpainting
-        # checkpoint. TOD(Yiyi) - need to clean this up later
+        # checkpoint. #TODO(Yiyi) - need to clean this up later
        masked_image = None
    else:
        masked_image = image * (mask < 0.5)
@@ -1073,7 +1073,7 @@ class StableDiffusionXLControlNetAdapterInpaintPipeline(
                # because `num_inference_steps` might be even given that every timestep
                # (except the highest one) is duplicated. If `num_inference_steps` is even it would
                # mean that we cut the timesteps in the middle of the denoising step
-                # (between 1st and 2nd devirative) which leads to incorrect results. By adding 1
+                # (between 1st and 2nd derivative) which leads to incorrect results. By adding 1
                # we ensure that the denoising process always ends after the 2nd derivate step of the scheduler
                num_inference_steps = num_inference_steps + 1

@@ -46,6 +46,11 @@ except Exception:

 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

+logger.warning(  # module-level statement: emitted unconditionally when this file is imported
+    "To use instant id pipelines, please make sure you have the `insightface` library installed: `pip install insightface`."
+    " Please refer to: https://huggingface.co/InstantX/InstantID for further instructions regarding inference"
+)
+

 def FeedForward(dim, mult=4):
    inner_dim = int(dim * mult)
@@ -701,7 +706,7 @@ class StableDiffusionXLInstantIDPipeline(StableDiffusionXLControlNetPipeline):
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
-                `._callback_tensor_inputs` attribute of your pipeine class.
+                `._callback_tensor_inputs` attribute of your pipeline class.

        Examples:

@@ -81,7 +81,7 @@ def betas_for_alpha_bar(
            return math.exp(t * -12.0)

    else:
-        raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")

    betas = []
    for i in range(num_diffusion_timesteps):
@@ -1,6 +1,7 @@
 """
-    modified based on diffusion library from Huggingface: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+modified based on diffusion library from Huggingface: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
 """
+
 import inspect
 from typing import Callable, List, Optional, Union

@@ -224,7 +224,7 @@ class StableDiffusionIPEXPipeline(
        # 5. Prepare latent variables
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
-            self.unet.in_channels,
+            self.unet.config.in_channels,
            height,
            width,
            prompt_embeds.dtype,
@@ -679,7 +679,7 @@ class StableDiffusionIPEXPipeline(
        timesteps = self.scheduler.timesteps

        # 5. Prepare latent variables
-        num_channels_latents = self.unet.in_channels
+        num_channels_latents = self.unet.config.in_channels
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
@@ -917,7 +917,7 @@ class TensorRTStableDiffusionPipeline(StableDiffusionPipeline):
            text_embeddings = self.__encode_prompt(prompt, negative_prompt)

            # Pre-initialize latents
-            num_channels_latents = self.unet.in_channels
+            num_channels_latents = self.unet.config.in_channels
            latents = self.prepare_latents(
                batch_size,
                num_channels_latents,
@@ -35,7 +35,6 @@ def slerp(val, low, high):


 class UnCLIPTextInterpolationPipeline(DiffusionPipeline):
-
    """
    Pipeline for prompt-to-prompt interpolation on CLIP text embeddings and using the UnCLIP / Dall-E to decode them to images.

@@ -49,7 +48,7 @@ class UnCLIPTextInterpolationPipeline(DiffusionPipeline):
            Tokenizer of class
            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
        prior ([`PriorTransformer`]):
-            The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
        text_proj ([`UnCLIPTextProjModel`]):
            Utility class to prepare and combine the embeddings before they are passed to the decoder.
        decoder ([`UNet2DConditionModel`]):
@@ -23,6 +23,7 @@ import math
 import os
 import random
 import shutil
+from contextlib import nullcontext
 from pathlib import Path
 from typing import List, Union

@@ -238,6 +239,10 @@ class SDText2ImageDataset:

 def log_validation(vae, unet, args, accelerator, weight_dtype, step):
    logger.info("Running validation... ")
+    if torch.backends.mps.is_available():
+        autocast_ctx = nullcontext()
+    else:
+        autocast_ctx = torch.autocast(accelerator.device.type, dtype=weight_dtype)

    unet = accelerator.unwrap_model(unet)
    pipeline = StableDiffusionPipeline.from_pretrained(
@@ -274,7 +279,7 @@ def log_validation(vae, unet, args, accelerator, weight_dtype, step):

    for _, prompt in enumerate(validation_prompts):
        images = []
-        with torch.autocast("cuda", dtype=weight_dtype):
+        with autocast_ctx:
            images = pipeline(
                prompt=prompt,
                num_inference_steps=4,
@@ -1172,6 +1177,11 @@ def main(args):
    ).input_ids.to(accelerator.device)
    uncond_prompt_embeds = text_encoder(uncond_input_ids)[0]

+    if torch.backends.mps.is_available():
+        autocast_ctx = nullcontext()
+    else:
+        autocast_ctx = torch.autocast(accelerator.device.type)
+
    # 16. Train!
    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

@@ -1300,7 +1310,7 @@ def main(args):
                # estimates to predict the data point in the augmented PF-ODE trajectory corresponding to the next ODE
                # solver timestep.
                with torch.no_grad():
-                    with torch.autocast("cuda"):
+                    with autocast_ctx:
                        # 1. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and conditional embedding c
                        cond_teacher_output = teacher_unet(
                            noisy_model_input.to(weight_dtype),
@@ -1359,7 +1369,7 @@ def main(args):
                # 9. Get target LCM prediction on x_prev, w, c, t_n (timesteps)
                # Note that we do not use a separate target network for LCM-LoRA distillation.
                with torch.no_grad():
-                    with torch.autocast("cuda", dtype=weight_dtype):
+                    with autocast_ctx:
                        target_noise_pred = unet(
                            x_prev.float(),
                            timesteps,
@@ -22,6 +22,7 @@ import math
 import os
 import random
 import shutil
+from contextlib import nullcontext
 from pathlib import Path

 import accelerate
@@ -146,7 +147,12 @@ def log_validation(vae, args, accelerator, weight_dtype, step, unet=None, is_fin

    for _, prompt in enumerate(validation_prompts):
        images = []
-        with torch.autocast("cuda", dtype=weight_dtype):
+        if torch.backends.mps.is_available():
+            autocast_ctx = nullcontext()
+        else:
+            autocast_ctx = torch.autocast(accelerator.device.type, dtype=weight_dtype)
+
+        with autocast_ctx:
            images = pipeline(
                prompt=prompt,
                num_inference_steps=4,
@@ -24,6 +24,7 @@ import math
 import os
 import random
 import shutil
+from contextlib import nullcontext
 from pathlib import Path
 from typing import List, Union

@@ -256,6 +257,10 @@ class SDXLText2ImageDataset:

 def log_validation(vae, unet, args, accelerator, weight_dtype, step):
    logger.info("Running validation... ")
+    if torch.backends.mps.is_available():
+        autocast_ctx = nullcontext()
+    else:
+        autocast_ctx = torch.autocast(accelerator.device.type, dtype=weight_dtype)

    unet = accelerator.unwrap_model(unet)
    pipeline = StableDiffusionXLPipeline.from_pretrained(
@@ -291,7 +296,7 @@ def log_validation(vae, unet, args, accelerator, weight_dtype, step):

    for _, prompt in enumerate(validation_prompts):
        images = []
-        with torch.autocast("cuda", dtype=weight_dtype):
+        with autocast_ctx:
            images = pipeline(
                prompt=prompt,
                num_inference_steps=4,
@@ -1353,7 +1358,12 @@ def main(args):
                # estimates to predict the data point in the augmented PF-ODE trajectory corresponding to the next ODE
                # solver timestep.
                with torch.no_grad():
-                    with torch.autocast("cuda"):
+                    if torch.backends.mps.is_available() or "playground" in args.pretrained_model_name_or_path:
+                        autocast_ctx = nullcontext()
+                    else:
+                        autocast_ctx = torch.autocast(accelerator.device.type)
+
+                    with autocast_ctx:
                        # 1. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and conditional embedding c
                        cond_teacher_output = teacher_unet(
                            noisy_model_input.to(weight_dtype),
@@ -1416,7 +1426,12 @@ def main(args):
                # 9. Get target LCM prediction on x_prev, w, c, t_n (timesteps)
                # Note that we do not use a separate target network for LCM-LoRA distillation.
                with torch.no_grad():
-                    with torch.autocast("cuda", enabled=True, dtype=weight_dtype):
+                    if torch.backends.mps.is_available():
+                        autocast_ctx = nullcontext()
+                    else:
+                        autocast_ctx = torch.autocast(accelerator.device.type, dtype=weight_dtype)
+
+                    with autocast_ctx:
                        target_noise_pred = unet(
                            x_prev.float(),
                            timesteps,
@@ -23,6 +23,7 @@ import math
 import os
 import random
 import shutil
+from contextlib import nullcontext
 from pathlib import Path
 from typing import List, Union

@@ -252,7 +253,12 @@ def log_validation(vae, unet, args, accelerator, weight_dtype, step, name="targe

    for _, prompt in enumerate(validation_prompts):
        images = []
-        with torch.autocast("cuda"):
+        if torch.backends.mps.is_available():
+            autocast_ctx = nullcontext()
+        else:
+            autocast_ctx = torch.autocast(accelerator.device.type)
+
+        with autocast_ctx:
            images = pipeline(
                prompt=prompt,
                num_inference_steps=4,
@@ -939,7 +945,7 @@ def main(args):

    # 8. Create target student U-Net. This will be updated via EMA updates (polyak averaging).
    # Initialize from (online) unet
-    target_unet = UNet2DConditionModel(**teacher_unet.config)
+    target_unet = UNet2DConditionModel.from_config(unet.config)
    target_unet.load_state_dict(unet.state_dict())
    target_unet.train()
    target_unet.requires_grad_(False)
@@ -1257,7 +1263,12 @@ def main(args):
                # estimates to predict the data point in the augmented PF-ODE trajectory corresponding to the next ODE
                # solver timestep.
                with torch.no_grad():
-                    with torch.autocast("cuda"):
+                    if torch.backends.mps.is_available():
+                        autocast_ctx = nullcontext()
+                    else:
+                        autocast_ctx = torch.autocast(accelerator.device.type)
+
+                    with autocast_ctx:
                        # 1. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and conditional embedding c
                        cond_teacher_output = teacher_unet(
                            noisy_model_input.to(weight_dtype),
@@ -1315,7 +1326,12 @@ def main(args):

                # 9. Get target LCM prediction on x_prev, w, c, t_n (timesteps)
                with torch.no_grad():
-                    with torch.autocast("cuda", dtype=weight_dtype):
+                    if torch.backends.mps.is_available():
+                        autocast_ctx = nullcontext()
+                    else:
+                        autocast_ctx = torch.autocast(accelerator.device.type, dtype=weight_dtype)
+
+                    with autocast_ctx:
                        target_noise_pred = target_unet(
                            x_prev.float(),
                            timesteps,
@@ -24,6 +24,7 @@ import math
 import os
 import random
 import shutil
+from contextlib import nullcontext
 from pathlib import Path
 from typing import List, Union

@@ -270,7 +271,12 @@ def log_validation(vae, unet, args, accelerator, weight_dtype, step, name="targe

    for _, prompt in enumerate(validation_prompts):
        images = []
-        with torch.autocast("cuda"):
+        if torch.backends.mps.is_available():
+            autocast_ctx = nullcontext()
+        else:
+            autocast_ctx = torch.autocast(accelerator.device.type)
+
+        with autocast_ctx:
            images = pipeline(
                prompt=prompt,
                num_inference_steps=4,
@@ -998,7 +1004,7 @@ def main(args):

    # 8. Create target student U-Net. This will be updated via EMA updates (polyak averaging).
    # Initialize from (online) unet
-    target_unet = UNet2DConditionModel(**teacher_unet.config)
+    target_unet = UNet2DConditionModel.from_config(unet.config)
    target_unet.load_state_dict(unet.state_dict())
    target_unet.train()
    target_unet.requires_grad_(False)
@@ -1355,7 +1361,12 @@ def main(args):
                # estimates to predict the data point in the augmented PF-ODE trajectory corresponding to the next ODE
                # solver timestep.
                with torch.no_grad():
-                    with torch.autocast("cuda"):
+                    if torch.backends.mps.is_available():
+                        autocast_ctx = nullcontext()
+                    else:
+                        autocast_ctx = torch.autocast(accelerator.device.type)
+
+                    with autocast_ctx:
                        # 1. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and conditional embedding c
                        cond_teacher_output = teacher_unet(
                            noisy_model_input.to(weight_dtype),
@@ -1417,7 +1428,12 @@ def main(args):

                # 9. Get target LCM prediction on x_prev, w, c, t_n (timesteps)
                with torch.no_grad():
-                    with torch.autocast("cuda", dtype=weight_dtype):
+                    if torch.backends.mps.is_available():
+                        autocast_ctx = nullcontext()
+                    else:
+                        autocast_ctx = torch.autocast(accelerator.device.type, dtype=weight_dtype)
+
+                    with autocast_ctx:
                        target_noise_pred = target_unet(
                            x_prev.float(),
                            timesteps,
@@ -752,6 +752,10 @@ def main(args):
        project_config=accelerator_project_config,
    )

+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -14,7 +14,6 @@
 # See the License for the specific language governing permissions and

 import argparse
-import contextlib
 import functools
 import gc
 import logging
@@ -22,6 +21,7 @@ import math
 import os
 import random
 import shutil
+from contextlib import nullcontext
 from pathlib import Path

 import accelerate
@@ -125,7 +125,10 @@ def log_validation(vae, unet, controlnet, args, accelerator, weight_dtype, step,
        )

    image_logs = []
-    inference_ctx = contextlib.nullcontext() if is_final_validation else torch.autocast("cuda")
+    if is_final_validation or torch.backends.mps.is_available():
+        autocast_ctx = nullcontext()
+    else:
+        autocast_ctx = torch.autocast(accelerator.device.type)

    for validation_prompt, validation_image in zip(validation_prompts, validation_images):
        validation_image = Image.open(validation_image).convert("RGB")
@@ -134,7 +137,7 @@ def log_validation(vae, unet, controlnet, args, accelerator, weight_dtype, step,
        images = []

        for _ in range(args.num_validation_images):
-            with inference_ctx:
+            with autocast_ctx:
                image = pipeline(
                    prompt=validation_prompt, image=validation_image, num_inference_steps=20, generator=generator
                ).images[0]
@@ -792,6 +795,12 @@ def main(args):

    logging_dir = Path(args.output_dir, args.logging_dir)

+    if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
+        # due to pytorch#99272, MPS does not yet support bfloat16.
+        raise ValueError(
+            "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
+        )
+
    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)

    accelerator = Accelerator(
@@ -801,6 +810,10 @@ def main(args):
        project_config=accelerator_project_config,
    )

+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -676,6 +676,10 @@ def main(args):
        project_config=accelerator_project_config,
    )

+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
    if args.report_to == "wandb":
        if not is_wandb_available():
            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
@@ -259,13 +259,17 @@ The authors found that by using DoRA, both the learning capacity and training st
 > This is also aligned with some of the quantitative analysis shown in the paper. 

 **Usage**
-1. To use DoRA you need to install `peft` from main: 
+1. To use DoRA you need to upgrade the installation of `peft`: 
 ```bash
-pip install git+https://github.com/huggingface/peft.git
+pip install -U peft
 ```
 2. Enable DoRA training by adding this flag
 ```bash
 --use_dora
 ```
 **Inference** 
-The inference is the same as if you train a regular LoRA 🤗
+The inference is the same as if you train a regular LoRA 🤗
+
+## Format compatibility
+
+You can pass `--output_kohya_format` to additionally generate a state dictionary which should be compatible with other platforms and tools such as Automatic 1111, Comfy, Kohya, etc. The `output_dir` will contain a file named "pytorch_lora_weights_kohya.safetensors".
@@ -821,6 +821,10 @@ def main(args):
        project_config=accelerator_project_config,
    )

+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
    if args.report_to == "wandb":
        if not is_wandb_available():
            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
@@ -749,6 +749,10 @@ def main(args):
        project_config=accelerator_project_config,
    )

+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
    if args.report_to == "wandb":
        if not is_wandb_available():
            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
@@ -14,7 +14,6 @@
 # See the License for the specific language governing permissions and

 import argparse
-import contextlib
 import gc
 import itertools
 import json
@@ -24,6 +23,7 @@ import os
 import random
 import shutil
 import warnings
+from contextlib import nullcontext
 from pathlib import Path

 import numpy as np
@@ -41,6 +41,7 @@ from peft import LoraConfig, set_peft_model_state_dict
 from peft.utils import get_peft_model_state_dict
 from PIL import Image
 from PIL.ImageOps import exif_transpose
+from safetensors.torch import load_file, save_file
 from torch.utils.data import Dataset
 from torchvision import transforms
 from torchvision.transforms.functional import crop
@@ -62,7 +63,9 @@ from diffusers.optimization import get_scheduler
 from diffusers.training_utils import _set_state_dict_into_text_encoder, cast_training_params, compute_snr
 from diffusers.utils import (
    check_min_version,
+    convert_all_state_dict_to_peft,
    convert_state_dict_to_diffusers,
+    convert_state_dict_to_kohya,
    convert_unet_state_dict_to_peft,
    is_wandb_available,
 )
@@ -205,11 +208,12 @@ def log_validation(
    generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
    # Currently the context determination is a bit hand-wavy. We can improve it in the future if there's a better
    # way to condition it. Reference: https://github.com/huggingface/diffusers/pull/7126#issuecomment-1968523051
-    inference_ctx = (
-        contextlib.nullcontext() if "playground" in args.pretrained_model_name_or_path else torch.cuda.amp.autocast()
-    )
+    if torch.backends.mps.is_available() or "playground" in args.pretrained_model_name_or_path:
+        autocast_ctx = nullcontext()
+    else:
+        autocast_ctx = torch.autocast(accelerator.device.type)

-    with inference_ctx:
+    with autocast_ctx:
        images = [pipeline(**pipeline_args, generator=generator).images[0] for _ in range(args.num_validation_images)]

    for tracker in accelerator.trackers:
@@ -227,7 +231,8 @@ def log_validation(
            )

    del pipeline
-    torch.cuda.empty_cache()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()

    return images

@@ -396,6 +401,11 @@ def parse_args(input_args=None):
        default="lora-dreambooth-model",
        help="The output directory where the model predictions and checkpoints will be written.",
    )
+    parser.add_argument(
+        "--output_kohya_format",
+        action="store_true",
+        help="Flag to additionally generate final state dict in the Kohya format so that it becomes compatible with A111, Comfy, Kohya, etc.",
+    )
    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
    parser.add_argument(
        "--resolution",
@@ -959,6 +969,12 @@ def main(args):
    if args.do_edm_style_training and args.snr_gamma is not None:
        raise ValueError("Min-SNR formulation is not supported when conducting EDM-style training.")

+    if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
+        # due to pytorch#99272, MPS does not yet support bfloat16.
+        raise ValueError(
+            "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
+        )
+
    logging_dir = Path(args.output_dir, args.logging_dir)

    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
@@ -971,6 +987,10 @@ def main(args):
        kwargs_handlers=[kwargs],
    )

+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
    if args.report_to == "wandb":
        if not is_wandb_available():
            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
@@ -1001,7 +1021,8 @@ def main(args):
        cur_class_images = len(list(class_images_dir.iterdir()))

        if cur_class_images < args.num_class_images:
-            torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32
+            has_supported_fp16_accelerator = torch.cuda.is_available() or torch.backends.mps.is_available()
+            torch_dtype = torch.float16 if has_supported_fp16_accelerator else torch.float32
            if args.prior_generation_precision == "fp32":
                torch_dtype = torch.float32
            elif args.prior_generation_precision == "fp16":
@@ -1126,6 +1147,12 @@ def main(args):
    elif accelerator.mixed_precision == "bf16":
        weight_dtype = torch.bfloat16

+    if torch.backends.mps.is_available() and weight_dtype == torch.bfloat16:
+        # due to pytorch#99272, MPS does not yet support bfloat16.
+        raise ValueError(
+            "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
+        )
+
    # Move unet, vae and text_encoder to device and cast to weight_dtype
    unet.to(accelerator.device, dtype=weight_dtype)

@@ -1270,7 +1297,7 @@ def main(args):

    # Enable TF32 for faster training on Ampere GPUs,
    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
-    if args.allow_tf32:
+    if args.allow_tf32 and torch.cuda.is_available():
        torch.backends.cuda.matmul.allow_tf32 = True

    if args.scale_lr:
@@ -1447,7 +1474,8 @@ def main(args):
    if not args.train_text_encoder and not train_dataset.custom_instance_prompts:
        del tokenizers, text_encoders
        gc.collect()
-        torch.cuda.empty_cache()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()

    # If custom instance prompts are NOT provided (i.e. the instance prompt is used for all images),
    # pack the statically computed variables appropriately here. This is so that we don't
@@ -1890,6 +1918,11 @@ def main(args):
            text_encoder_lora_layers=text_encoder_lora_layers,
            text_encoder_2_lora_layers=text_encoder_2_lora_layers,
        )
+        if args.output_kohya_format:
+            lora_state_dict = load_file(f"{args.output_dir}/pytorch_lora_weights.safetensors")
+            peft_state_dict = convert_all_state_dict_to_peft(lora_state_dict)
+            kohya_state_dict = convert_state_dict_to_kohya(peft_state_dict)
+            save_file(kohya_state_dict, f"{args.output_dir}/pytorch_lora_weights_kohya.safetensors")

        # Final inference
        # Load previous pipeline
@@ -21,6 +21,7 @@ import logging
 import math
 import os
 import shutil
+from contextlib import nullcontext
 from pathlib import Path

 import accelerate
@@ -52,6 +53,9 @@ from diffusers.utils.import_utils import is_xformers_available
 from diffusers.utils.torch_utils import is_compiled_module


+if is_wandb_available():
+    import wandb
+
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
 check_min_version("0.28.0.dev0")

@@ -63,6 +67,48 @@ DATASET_NAME_MAPPING = {
 WANDB_TABLE_COL_NAMES = ["original_image", "edited_image", "edit_prompt"]


+def log_validation(
+    pipeline,
+    args,
+    accelerator,
+    generator,
+):
+    logger.info(
+        f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+        f" {args.validation_prompt}."
+    )
+    pipeline = pipeline.to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+
+    # run inference
+    original_image = download_image(args.val_image_url)
+    edited_images = []
+    if torch.backends.mps.is_available():
+        autocast_ctx = nullcontext()
+    else:
+        autocast_ctx = torch.autocast(accelerator.device.type)
+
+    with autocast_ctx:
+        for _ in range(args.num_validation_images):
+            edited_images.append(
+                pipeline(
+                    args.validation_prompt,
+                    image=original_image,
+                    num_inference_steps=20,
+                    image_guidance_scale=1.5,
+                    guidance_scale=7,
+                    generator=generator,
+                ).images[0]
+            )
+
+    for tracker in accelerator.trackers:
+        if tracker.name == "wandb":
+            wandb_table = wandb.Table(columns=WANDB_TABLE_COL_NAMES)
+            for edited_image in edited_images:
+                wandb_table.add_data(wandb.Image(original_image), wandb.Image(edited_image), args.validation_prompt)
+            tracker.log({"validation": wandb_table})
+
+
 def parse_args():
    parser = argparse.ArgumentParser(description="Simple example of a training script for InstructPix2Pix.")
    parser.add_argument(
@@ -404,12 +450,11 @@ def main():
        project_config=accelerator_project_config,
    )

-    generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False

-    if args.report_to == "wandb":
-        if not is_wandb_available():
-            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
-        import wandb
+    generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)

    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
@@ -512,7 +557,8 @@ def main():
                    model.save_pretrained(os.path.join(output_dir, "unet"))

                    # make sure to pop weight so that corresponding model is not saved again
-                    weights.pop()
+                    if weights:
+                        weights.pop()

        def load_model_hook(models, input_dir):
            if args.use_ema:
@@ -918,11 +964,6 @@ def main():
                and (args.validation_prompt is not None)
                and (epoch % args.validation_epochs == 0)
            ):
-                logger.info(
-                    f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
-                    f" {args.validation_prompt}."
-                )
-                # create pipeline
                if args.use_ema:
                    # Store the UNet parameters temporarily and load the EMA parameters to perform inference.
                    ema_unet.store(unet.parameters())
@@ -937,35 +978,14 @@ def main():
                    variant=args.variant,
                    torch_dtype=weight_dtype,
                )
-                pipeline = pipeline.to(accelerator.device)
-                pipeline.set_progress_bar_config(disable=True)

-                # run inference
-                original_image = download_image(args.val_image_url)
-                edited_images = []
-                with torch.autocast(
-                    str(accelerator.device).replace(":0", ""), enabled=accelerator.mixed_precision == "fp16"
-                ):
-                    for _ in range(args.num_validation_images):
-                        edited_images.append(
-                            pipeline(
-                                args.validation_prompt,
-                                image=original_image,
-                                num_inference_steps=20,
-                                image_guidance_scale=1.5,
-                                guidance_scale=7,
-                                generator=generator,
-                            ).images[0]
-                        )
+                log_validation(
+                    pipeline,
+                    args,
+                    accelerator,
+                    generator,
+                )

-                for tracker in accelerator.trackers:
-                    if tracker.name == "wandb":
-                        wandb_table = wandb.Table(columns=WANDB_TABLE_COL_NAMES)
-                        for edited_image in edited_images:
-                            wandb_table.add_data(
-                                wandb.Image(original_image), wandb.Image(edited_image), args.validation_prompt
-                            )
-                        tracker.log({"validation": wandb_table})
                if args.use_ema:
                    # Switch back to the original UNet parameters.
                    ema_unet.restore(unet.parameters())
@@ -976,7 +996,6 @@ def main():
    # Create the pipeline using the trained modules and save it.
    accelerator.wait_for_everyone()
    if accelerator.is_main_process:
-        unet = unwrap_model(unet)
        if args.use_ema:
            ema_unet.copy_to(unet.parameters())

@@ -984,7 +1003,7 @@ def main():
            args.pretrained_model_name_or_path,
            text_encoder=unwrap_model(text_encoder),
            vae=unwrap_model(vae),
-            unet=unet,
+            unet=unwrap_model(unet),
            revision=args.revision,
            variant=args.variant,
        )
@@ -998,31 +1017,13 @@ def main():
                ignore_patterns=["step_*", "epoch_*"],
            )

-        if args.validation_prompt is not None:
-            edited_images = []
-            pipeline = pipeline.to(accelerator.device)
-            with torch.autocast(str(accelerator.device).replace(":0", "")):
-                for _ in range(args.num_validation_images):
-                    edited_images.append(
-                        pipeline(
-                            args.validation_prompt,
-                            image=original_image,
-                            num_inference_steps=20,
-                            image_guidance_scale=1.5,
-                            guidance_scale=7,
-                            generator=generator,
-                        ).images[0]
-                    )
-
-            for tracker in accelerator.trackers:
-                if tracker.name == "wandb":
-                    wandb_table = wandb.Table(columns=WANDB_TABLE_COL_NAMES)
-                    for edited_image in edited_images:
-                        wandb_table.add_data(
-                            wandb.Image(original_image), wandb.Image(edited_image), args.validation_prompt
-                        )
-                    tracker.log({"test": wandb_table})
-
+        if (args.val_image_url is not None) and (args.validation_prompt is not None):
+            log_validation(
+                pipeline,
+                args,
+                accelerator,
+                generator,
+            )
    accelerator.end_training()


@@ -20,6 +20,7 @@ import math
 import os
 import shutil
 import warnings
+from contextlib import nullcontext
 from pathlib import Path
 from urllib.parse import urlparse

@@ -70,14 +71,7 @@ WANDB_TABLE_COL_NAMES = ["file_name", "edited_image", "edit_prompt"]
 TORCH_DTYPE_MAPPING = {"fp32": torch.float32, "fp16": torch.float16, "bf16": torch.bfloat16}


-def log_validation(
-    pipeline,
-    args,
-    accelerator,
-    generator,
-    global_step,
-    is_final_validation=False,
-):
+def log_validation(pipeline, args, accelerator, generator, global_step, is_final_validation=False):
    logger.info(
        f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
        f" {args.validation_prompt}."
@@ -96,7 +90,12 @@ def log_validation(
        else Image.open(image_url_or_path).convert("RGB")
    )(args.val_image_url_or_path)

-    with torch.autocast(str(accelerator.device).replace(":0", ""), enabled=accelerator.mixed_precision == "fp16"):
+    if torch.backends.mps.is_available():
+        autocast_ctx = nullcontext()
+    else:
+        autocast_ctx = torch.autocast(accelerator.device.type)
+
+    with autocast_ctx:
        edited_images = []
        # Run inference
        for val_img_idx in range(args.num_validation_images):
@@ -497,6 +496,13 @@ def main():
            ),
        )
    logging_dir = os.path.join(args.output_dir, args.logging_dir)
+
+    if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
+        # due to pytorch#99272, MPS does not yet support bfloat16.
+        raise ValueError(
+            "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
+        )
+
    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
    accelerator = Accelerator(
        gradient_accumulation_steps=args.gradient_accumulation_steps,
@@ -505,6 +511,10 @@ def main():
        project_config=accelerator_project_config,
    )

+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
    generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)

    # Make one log on every process with the configuration for debugging.
@@ -458,6 +458,10 @@ def main():
        project_config=accelerator_project_config,
    )

+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -343,6 +343,11 @@ def main():
        log_with=args.report_to,
        project_config=accelerator_project_config,
    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
    if args.report_to == "wandb":
        if not is_wandb_available():
            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
@@ -356,6 +356,11 @@ def main():
        log_with=args.report_to,
        project_config=accelerator_project_config,
    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
    if args.report_to == "wandb":
        if not is_wandb_available():
            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
@@ -459,6 +459,10 @@ def main():
        project_config=accelerator_project_config,
    )

+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -916,6 +916,10 @@ def main(args):
        project_config=accelerator_project_config,
    )

+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -1,58 +0,0 @@
-# !pip install opencv-python transformers accelerate
-import argparse
-
-import cv2
-import numpy as np
-import torch
-from controlnetxs import ControlNetXSModel
-from PIL import Image
-from pipeline_controlnet_xs import StableDiffusionControlNetXSPipeline
-
-from diffusers.utils import load_image
-
-
-parser = argparse.ArgumentParser()
-parser.add_argument(
-    "--prompt", type=str, default="aerial view, a futuristic research complex in a bright foggy jungle, hard lighting"
-)
-parser.add_argument("--negative_prompt", type=str, default="low quality, bad quality, sketches")
-parser.add_argument("--controlnet_conditioning_scale", type=float, default=0.7)
-parser.add_argument(
-    "--image_path",
-    type=str,
-    default="https://hf.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png",
-)
-parser.add_argument("--num_inference_steps", type=int, default=50)
-
-args = parser.parse_args()
-
-prompt = args.prompt
-negative_prompt = args.negative_prompt
-# download an image
-image = load_image(args.image_path)
-
-# initialize the models and pipeline
-controlnet_conditioning_scale = args.controlnet_conditioning_scale
-controlnet = ControlNetXSModel.from_pretrained("UmerHA/ConrolNetXS-SD2.1-canny", torch_dtype=torch.float16)
-pipe = StableDiffusionControlNetXSPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-2-1", controlnet=controlnet, torch_dtype=torch.float16
-)
-pipe.enable_model_cpu_offload()
-
-# get canny image
-image = np.array(image)
-image = cv2.Canny(image, 100, 200)
-image = image[:, :, None]
-image = np.concatenate([image, image, image], axis=2)
-canny_image = Image.fromarray(image)
-
-num_inference_steps = args.num_inference_steps
-
-# generate image
-image = pipe(
-    prompt,
-    controlnet_conditioning_scale=controlnet_conditioning_scale,
-    image=canny_image,
-    num_inference_steps=num_inference_steps,
-).images[0]
-image.save("cnxs_sd.canny.png")
@@ -1,57 +0,0 @@
-# !pip install opencv-python transformers accelerate
-import argparse
-
-import cv2
-import numpy as np
-import torch
-from controlnetxs import ControlNetXSModel
-from PIL import Image
-from pipeline_controlnet_xs import StableDiffusionControlNetXSPipeline
-
-from diffusers.utils import load_image
-
-
-parser = argparse.ArgumentParser()
-parser.add_argument(
-    "--prompt", type=str, default="aerial view, a futuristic research complex in a bright foggy jungle, hard lighting"
-)
-parser.add_argument("--negative_prompt", type=str, default="low quality, bad quality, sketches")
-parser.add_argument("--controlnet_conditioning_scale", type=float, default=0.7)
-parser.add_argument(
-    "--image_path",
-    type=str,
-    default="https://hf.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png",
-)
-parser.add_argument("--num_inference_steps", type=int, default=50)
-
-args = parser.parse_args()
-
-prompt = args.prompt
-negative_prompt = args.negative_prompt
-# download an image
-image = load_image(args.image_path)
-# initialize the models and pipeline
-controlnet_conditioning_scale = args.controlnet_conditioning_scale
-controlnet = ControlNetXSModel.from_pretrained("UmerHA/ConrolNetXS-SDXL-canny", torch_dtype=torch.float16)
-pipe = StableDiffusionControlNetXSPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16
-)
-pipe.enable_model_cpu_offload()
-
-# get canny image
-image = np.array(image)
-image = cv2.Canny(image, 100, 200)
-image = image[:, :, None]
-image = np.concatenate([image, image, image], axis=2)
-canny_image = Image.fromarray(image)
-
-num_inference_steps = args.num_inference_steps
-
-# generate image
-image = pipe(
-    prompt,
-    controlnet_conditioning_scale=controlnet_conditioning_scale,
-    image=canny_image,
-    num_inference_steps=num_inference_steps,
-).images[0]
-image.save("cnxs_sdxl.canny.png")
--- a/Show More
+++ b/Show More