Release: v0.30.2

update runway repo for single_file (#9323 )
update to a place holder
2024-08-30 23:28:03 +00:00 · 2024-08-30 23:26:24 +00:00 · 2024-08-30 23:26:01 +00:00 · 2024-08-30 23:22:40 +00:00 · 2024-08-23 15:24:29 -10:00 · 2024-08-23 15:19:50 -10:00
336 changed files with 32840 additions and 5358 deletions
@@ -13,13 +13,13 @@ env:

 jobs:
  torch_pipelines_cuda_benchmark_tests:
-    env: 
+    env:
      SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL_BENCHMARK }}
    name: Torch Core Pipelines CUDA Benchmarking Tests
    strategy:
      fail-fast: false
      max-parallel: 1
-    runs-on: 
+    runs-on:
      group: aws-g6-4xlarge-plus
    container:
      image: diffusers/diffusers-pytorch-compile-cuda
@@ -59,7 +59,7 @@ jobs:
        if: ${{ success() }}
        run: |
          pip install requests && python utils/notify_benchmarking_status.py --status=success
-        
+
      - name: Report failure status
        if: ${{ failure() }}
        run: |
@@ -20,7 +20,8 @@ env:

 jobs:
  test-build-docker-images:
-    runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
+    runs-on:
+      group: aws-general-8-plus
    if: github.event_name == 'pull_request'
    steps:
      - name: Set up Docker Buildx
@@ -50,7 +51,8 @@ jobs:
        if: steps.file_changes.outputs.all != ''

  build-and-push-docker-images:
-    runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
+    runs-on:
+      group: aws-general-8-plus
    if: github.event_name != 'pull_request'

    permissions:
@@ -98,4 +100,4 @@ jobs:
          slack_channel: ${{ env.CI_SLACK_CHANNEL }}
          title: "🤗 Results of the ${{ matrix.image-name }} Docker Image build"
          status: ${{ job.status }}
-          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
+          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
@@ -24,7 +24,7 @@ jobs:
  mirror_community_pipeline:
    env:
      SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL_COMMUNITY_MIRROR }}
-    
+
    runs-on: ubuntu-latest
    steps:
      # Checkout to correct ref
@@ -95,7 +95,7 @@ jobs:
        if: ${{ success() }}
        run: |
          pip install requests && python utils/notify_community_pipelines_mirror.py --status=success
-      
+
      - name: Report failure status
        if: ${{ failure() }}
        run: |
@@ -7,7 +7,7 @@ on:

 env:
  DIFFUSERS_IS_CI: yes
-  HF_HOME: /mnt/cache
+  HF_HUB_ENABLE_HF_TRANSFER: 1
  OMP_NUM_THREADS: 8
  MKL_NUM_THREADS: 8
  PYTEST_TIMEOUT: 600
@@ -18,8 +18,11 @@ env:

 jobs:
  setup_torch_cuda_pipeline_matrix:
-    name: Setup Torch Pipelines Matrix
-    runs-on: diffusers/diffusers-pytorch-cpu
+    name: Setup Torch Pipelines CUDA Slow Tests Matrix
+    runs-on:
+      group: aws-general-8-plus
+    container:
+      image: diffusers/diffusers-pytorch-cpu
    outputs:
      pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
    steps:
@@ -27,13 +30,9 @@ jobs:
        uses: actions/checkout@v3
        with:
          fetch-depth: 2
-      - name: Set up Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: "3.8"
      - name: Install dependencies
        run: |
-          pip install -e .
+          pip install -e .[test]
          pip install huggingface_hub
      - name: Fetch Pipeline Matrix
        id: fetch_pipeline_matrix
@@ -50,16 +49,18 @@ jobs:
          path: reports

  run_nightly_tests_for_torch_pipelines:
-    name: Torch Pipelines CUDA Nightly Tests
+    name: Nightly Torch Pipelines CUDA Tests
    needs: setup_torch_cuda_pipeline_matrix
    strategy:
      fail-fast: false
+      max-parallel: 8
      matrix:
        module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on:
+      group: aws-g4dn-2xlarge
    container:
      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0
+      options: --shm-size "16gb" --ipc host --gpus 0
    steps:
      - name: Checkout diffusers
        uses: actions/checkout@v3
@@ -67,19 +68,16 @@ jobs:
          fetch-depth: 2
      - name: NVIDIA-SMI
        run: nvidia-smi
-
      - name: Install dependencies
        run: |
          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
          python -m uv pip install -e [quality,test]
          python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
          python -m uv pip install pytest-reportlog
-
      - name: Environment
        run: |
          python utils/print_env.py
-
-      - name: Nightly PyTorch CUDA checkpoint (pipelines) tests
+      - name: Pipeline CUDA Test
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
@@ -90,38 +88,37 @@ jobs:
            --make-reports=tests_pipeline_${{ matrix.module }}_cuda \
            --report-log=tests_pipeline_${{ matrix.module }}_cuda.log \
            tests/pipelines/${{ matrix.module }}
-
      - name: Failure short reports
        if: ${{ failure() }}
        run: |
          cat reports/tests_pipeline_${{ matrix.module }}_cuda_stats.txt
          cat reports/tests_pipeline_${{ matrix.module }}_cuda_failures_short.txt
-
      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: pipeline_${{ matrix.module }}_test_reports
          path: reports
-
      - name: Generate Report and Notify Channel
        if: always()
        run: |
          pip install slack_sdk tabulate
-          python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
+          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY

  run_nightly_tests_for_other_torch_modules:
-    name: Torch Non-Pipelines CUDA Nightly Tests
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    name: Nightly Torch CUDA Tests
+    runs-on:
+      group: aws-g4dn-2xlarge
    container:
      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
+      options: --shm-size "16gb" --ipc host --gpus 0
    defaults:
      run:
        shell: bash
    strategy:
+      max-parallel: 2
      matrix:
-        module: [models, schedulers, others, examples]
+        module: [models, schedulers, lora, others, single_file, examples]
    steps:
    - name: Checkout diffusers
      uses: actions/checkout@v3
@@ -133,8 +130,8 @@ jobs:
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
+        python -m uv pip install peft@git+https://github.com/huggingface/peft.git
        python -m uv pip install pytest-reportlog
-
    - name: Environment
      run: python utils/print_env.py

@@ -158,7 +155,6 @@ jobs:
        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
        CUBLAS_WORKSPACE_CONFIG: :16:8
      run: |
-        python -m uv pip install peft@git+https://github.com/huggingface/peft.git
        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
          -s -v --make-reports=examples_torch_cuda \
          --report-log=examples_torch_cuda.log \
@@ -181,64 +177,7 @@ jobs:
      if: always()
      run: |
        pip install slack_sdk tabulate
-        python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
-
-  run_lora_nightly_tests:
-    name: Nightly LoRA Tests with PEFT and TORCH
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
-    container:
-      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
-    defaults:
-      run:
-        shell: bash
-    steps:
-    - name: Checkout diffusers
-      uses: actions/checkout@v3
-      with:
-        fetch-depth: 2
-
-    - name: Install dependencies
-      run: |
-        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m uv pip install -e [quality,test]
-        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
-        python -m uv pip install peft@git+https://github.com/huggingface/peft.git
-        python -m uv pip install pytest-reportlog
-
-    - name: Environment
-      run: python utils/print_env.py
-
-    - name: Run nightly LoRA tests with PEFT and Torch
-      env:
-        HF_TOKEN: ${{ secrets.HF_TOKEN }}
-        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
-        CUBLAS_WORKSPACE_CONFIG: :16:8
-      run: |
-        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-          -s -v -k "not Flax and not Onnx" \
-          --make-reports=tests_torch_lora_cuda \
-          --report-log=tests_torch_lora_cuda.log \
-          tests/lora
-
-    - name: Failure short reports
-      if: ${{ failure() }}
-      run: |
-        cat reports/tests_torch_lora_cuda_stats.txt
-        cat reports/tests_torch_lora_cuda_failures_short.txt
-
-    - name: Test suite reports artifacts
-      if: ${{ always() }}
-      uses: actions/upload-artifact@v2
-      with:
-        name: torch_lora_cuda_test_reports
-        path: reports
-
-    - name: Generate Report and Notify Channel
-      if: always()
-      run: |
-        pip install slack_sdk tabulate
-        python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
+        python utils/log_reports.py >> $GITHUB_STEP_SUMMARY

  run_flax_tpu_tests:
    name: Nightly Flax TPU Tests
@@ -294,14 +233,15 @@ jobs:
      if: always()
      run: |
        pip install slack_sdk tabulate
-        python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
+        python utils/log_reports.py >> $GITHUB_STEP_SUMMARY

  run_nightly_onnx_tests:
    name: Nightly ONNXRuntime CUDA tests on Ubuntu
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on:
+      group: aws-g4dn-2xlarge
    container:
      image: diffusers/diffusers-onnxruntime-cuda
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
+      options: --gpus 0 --shm-size "16gb" --ipc host

    steps:
    - name: Checkout diffusers
@@ -318,11 +258,10 @@ jobs:
        python -m uv pip install -e [quality,test]
        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
        python -m uv pip install pytest-reportlog
-
    - name: Environment
      run: python utils/print_env.py

-    - name: Run nightly ONNXRuntime CUDA tests
+    - name: Run Nightly ONNXRuntime CUDA tests
      env:
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      run: |
@@ -349,7 +288,7 @@ jobs:
      if: always()
      run: |
        pip install slack_sdk tabulate
-        python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
+        python utils/log_reports.py >> $GITHUB_STEP_SUMMARY

  run_nightly_tests_apple_m1:
    name: Nightly PyTorch MPS tests on MacOS
@@ -411,4 +350,4 @@ jobs:
        if: always()
        run: |
          pip install slack_sdk tabulate
-          python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
+          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
@@ -15,7 +15,8 @@ concurrency:
 jobs:
  setup_pr_tests:
    name: Setup PR Tests
-    runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
+    runs-on:
+      group: aws-general-8-plus
    container:
      image: diffusers/diffusers-pytorch-cpu
      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
@@ -73,7 +74,8 @@ jobs:
      max-parallel: 2
      matrix:
        modules: ${{ fromJson(needs.setup_pr_tests.outputs.matrix) }}
-    runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
+    runs-on:
+      group: aws-general-8-plus
    container:
      image: diffusers/diffusers-pytorch-cpu
      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
@@ -123,12 +125,13 @@ jobs:
        config:
          - name: Hub tests for models, schedulers, and pipelines
            framework: hub_tests_pytorch
-            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
+            runner: aws-general-8-plus
            image: diffusers/diffusers-pytorch-cpu
            report: torch_hub

    name: ${{ matrix.config.name }}
-    runs-on: ${{ matrix.config.runner }}
+    runs-on:
+      group: ${{ matrix.config.runner }}
    container:
      image: ${{ matrix.config.image }}
      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
@@ -71,7 +71,8 @@ jobs:

    name: LoRA - ${{ matrix.lib-versions }}

-    runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
+    runs-on:
+      group: aws-general-8-plus

    container:
      image: diffusers/diffusers-pytorch-cpu
@@ -128,4 +129,4 @@ jobs:
      uses: actions/upload-artifact@v2
      with:
        name: pr_${{ matrix.config.report }}_test_reports
-        path: reports
+        path: reports
@@ -77,28 +77,29 @@ jobs:
        config:
          - name: Fast PyTorch Pipeline CPU tests
            framework: pytorch_pipelines
-            runner: [ self-hosted, intel-cpu, 32-cpu, 256-ram, ci ]
+            runner: aws-highmemory-32-plus
            image: diffusers/diffusers-pytorch-cpu
            report: torch_cpu_pipelines
          - name: Fast PyTorch Models & Schedulers CPU tests
            framework: pytorch_models
-            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
+            runner: aws-general-8-plus
            image: diffusers/diffusers-pytorch-cpu
            report: torch_cpu_models_schedulers
          - name: Fast Flax CPU tests
            framework: flax
-            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
+            runner: aws-general-8-plus
            image: diffusers/diffusers-flax-cpu
            report: flax_cpu
          - name: PyTorch Example CPU tests
            framework: pytorch_examples
-            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
+            runner: aws-general-8-plus
            image: diffusers/diffusers-pytorch-cpu
            report: torch_example_cpu

    name: ${{ matrix.config.name }}

-    runs-on: ${{ matrix.config.runner }}
+    runs-on:
+      group: ${{ matrix.config.runner }}

    container:
      image: ${{ matrix.config.image }}
@@ -180,7 +181,8 @@ jobs:
        config:
          - name: Hub tests for models, schedulers, and pipelines
            framework: hub_tests_pytorch
-            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
+            runner:
+              group: aws-general-8-plus
            image: diffusers/diffusers-pytorch-cpu
            report: torch_hub

@@ -11,17 +11,16 @@ on:

 env:
  DIFFUSERS_IS_CI: yes
-  HF_HOME: /mnt/cache
  OMP_NUM_THREADS: 8
  MKL_NUM_THREADS: 8
  PYTEST_TIMEOUT: 600
-  RUN_SLOW: yes
  PIPELINE_USAGE_CUTOFF: 50000

 jobs:
  setup_torch_cuda_pipeline_matrix:
    name: Setup Torch Pipelines CUDA Slow Tests Matrix
-    runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
+    runs-on:
+      group: aws-general-8-plus
    container:
      image: diffusers/diffusers-pytorch-cpu
    outputs:
@@ -52,17 +51,18 @@ jobs:
          path: reports

  torch_pipelines_cuda_tests:
-    name: Torch Pipelines CUDA Slow Tests
+    name: Torch Pipelines CUDA Tests
    needs: setup_torch_cuda_pipeline_matrix
    strategy:
      fail-fast: false
      max-parallel: 8
      matrix:
        module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on:
+      group: aws-g4dn-2xlarge
    container:
      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0
+      options: --shm-size "16gb" --ipc host --gpus 0
    steps:
      - name: Checkout diffusers
        uses: actions/checkout@v3
@@ -103,10 +103,11 @@ jobs:

  torch_cuda_tests:
    name: Torch CUDA Tests
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on:
+      group: aws-g4dn-2xlarge
    container:
      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0
+      options: --shm-size "16gb" --ipc host --gpus 0
    defaults:
      run:
        shell: bash
@@ -124,12 +125,13 @@ jobs:
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
+        python -m uv pip install peft@git+https://github.com/huggingface/peft.git

    - name: Environment
      run: |
        python utils/print_env.py

-    - name: Run slow PyTorch CUDA tests
+    - name: Run PyTorch CUDA tests
      env:
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
@@ -153,61 +155,6 @@ jobs:
        name: torch_cuda_test_reports
        path: reports

-  peft_cuda_tests:
-    name: PEFT CUDA Tests
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
-    container:
-      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0
-    defaults:
-      run:
-        shell: bash
-    steps:
-    - name: Checkout diffusers
-      uses: actions/checkout@v3
-      with:
-        fetch-depth: 2
-
-    - name: Install dependencies
-      run: |
-        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m uv pip install -e [quality,test]
-        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
-        python -m pip install -U peft@git+https://github.com/huggingface/peft.git
-
-    - name: Environment
-      run: |
-        python utils/print_env.py
-
-    - name: Run slow PEFT CUDA tests
-      env:
-        HF_TOKEN: ${{ secrets.HF_TOKEN }}
-        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
-        CUBLAS_WORKSPACE_CONFIG: :16:8
-      run: |
-        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-          -s -v -k "not Flax and not Onnx and not PEFTLoRALoading" \
-          --make-reports=tests_peft_cuda \
-          tests/lora/
-        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-          -s -v -k "lora and not Flax and not Onnx and not PEFTLoRALoading" \
-          --make-reports=tests_peft_cuda_models_lora \
-          tests/models/
-
-    - name: Failure short reports
-      if: ${{ failure() }}
-      run: |
-        cat reports/tests_peft_cuda_stats.txt
-        cat reports/tests_peft_cuda_failures_short.txt
-        cat reports/tests_peft_cuda_models_lora_failures_short.txt
-
-    - name: Test suite reports artifacts
-      if: ${{ always() }}
-      uses: actions/upload-artifact@v2
-      with:
-        name: torch_peft_test_reports
-        path: reports
-
  flax_tpu_tests:
    name: Flax TPU Tests
    runs-on: docker-tpu
@@ -257,7 +204,8 @@ jobs:

  onnx_cuda_tests:
    name: ONNX CUDA Tests
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on:
+      group: aws-g4dn-2xlarge
    container:
      image: diffusers/diffusers-onnxruntime-cuda
      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --gpus 0
@@ -305,11 +253,12 @@ jobs:
  run_torch_compile_tests:
    name: PyTorch Compile CUDA tests

-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on:
+      group: aws-g4dn-2xlarge

    container:
      image: diffusers/diffusers-pytorch-compile-cuda
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --gpus 0 --shm-size "16gb" --ipc host

    steps:
    - name: Checkout diffusers
@@ -347,11 +296,12 @@ jobs:
  run_xformers_tests:
    name: PyTorch xformers CUDA tests

-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on:
+      group: aws-g4dn-2xlarge

    container:
      image: diffusers/diffusers-pytorch-xformers-cuda
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --gpus 0 --shm-size "16gb" --ipc host

    steps:
    - name: Checkout diffusers
@@ -388,11 +338,12 @@ jobs:
  run_examples_tests:
    name: Examples PyTorch CUDA tests on Ubuntu

-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on:
+      group: aws-g4dn-2xlarge

    container:
      image: diffusers/diffusers-pytorch-cuda
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --gpus 0 --shm-size "16gb" --ipc host

    steps:
    - name: Checkout diffusers
@@ -29,28 +29,29 @@ jobs:
        config:
          - name: Fast PyTorch CPU tests on Ubuntu
            framework: pytorch
-            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
+            runner: aws-general-8-plus
            image: diffusers/diffusers-pytorch-cpu
            report: torch_cpu
          - name: Fast Flax CPU tests on Ubuntu
            framework: flax
-            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
+            runner: aws-general-8-plus
            image: diffusers/diffusers-flax-cpu
            report: flax_cpu
          - name: Fast ONNXRuntime CPU tests on Ubuntu
            framework: onnxruntime
-            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
+            runner: aws-general-8-plus
            image: diffusers/diffusers-onnxruntime-cpu
            report: onnx_cpu
          - name: PyTorch Example CPU tests on Ubuntu
            framework: pytorch_examples
-            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
+            runner: aws-general-8-plus
            image: diffusers/diffusers-pytorch-cpu
            report: torch_example_cpu

    name: ${{ matrix.config.name }}

-    runs-on: ${{ matrix.config.runner }}
+    runs-on:
+      group: ${{ matrix.config.runner }}

    container:
      image: ${{ matrix.config.image }}
@@ -26,7 +26,8 @@ env:
 jobs:
  run_tests:
    name: "Run a test on our runner from a PR"
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on:
+      group: aws-g4dn-2xlarge
    container:
      image: ${{ github.event.inputs.docker_image }}
      options: --gpus 0 --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -70,4 +71,4 @@ jobs:
        env:
            PY_TEST: ${{ github.event.inputs.test }}
        run: |
-          pytest "$PY_TEST"
+          pytest "$PY_TEST"
@@ -19,7 +19,8 @@ env:
 jobs:
  ssh_runner:
    name: "SSH"
-    runs-on: [self-hosted, intel-cpu, 32-cpu, 256-ram, ci]
+    runs-on:
+      group: aws-highmemory-32-plus
    container:
      image: ${{ github.event.inputs.docker_image }}
      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --privileged
@@ -22,7 +22,8 @@ env:
 jobs:
  ssh_runner:
    name: "SSH"
-    runs-on: [single-gpu, nvidia-gpu, "${{ github.event.inputs.runner_type }}", ci]
+    runs-on:
+      group: "${{ github.event.inputs.runner_type }}"
    container:
      image: ${{ github.event.inputs.docker_image }}
      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0 --privileged
@@ -63,7 +63,7 @@ In the same spirit, you are of immense help to the community by answering such q

 **Please** keep in mind that the more effort you put into asking or answering a question, the higher
 the quality of the publicly documented knowledge. In the same way, well-posed and well-answered questions create a high-quality knowledge database accessible to everybody, while badly posed questions or answers reduce the overall quality of the public knowledge database.
-In short, a high quality question or answer is *precise*, *concise*, *relevant*, *easy-to-understand*, *accessible*, and *well-formated/well-posed*. For more information, please have a look through the [How to write a good issue](#how-to-write-a-good-issue) section.
+In short, a high quality question or answer is *precise*, *concise*, *relevant*, *easy-to-understand*, *accessible*, and *well-formatted/well-posed*. For more information, please have a look through the [How to write a good issue](#how-to-write-a-good-issue) section.

 **NOTE about channels**:
 [*The forum*](https://discuss.huggingface.co/c/discussion-related-to-httpsgithubcomhuggingfacediffusers/63) is much better indexed by search engines, such as Google. Posts are ranked by popularity rather than chronologically. Hence, it's easier to look up questions and answers that we posted some time ago.
@@ -67,7 +67,7 @@ Please refer to the [How to use Stable Diffusion in Apple Silicon](https://huggi

 ## Quickstart

-Generating outputs is super easy with 🤗 Diffusers. To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 27.000+ checkpoints):
+Generating outputs is super easy with 🤗 Diffusers. To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 30,000+ checkpoints):

 ```python
 from diffusers import DiffusionPipeline
@@ -209,7 +209,7 @@ Also, say 👋 in our public Discord channel <a href="https://discord.gg/G7tWnz9
 - https://github.com/deep-floyd/IF
 - https://github.com/bentoml/BentoML
 - https://github.com/bmaltais/kohya_ss
- +12.000 other amazing GitHub repositories 💪
+- +14,000 other amazing GitHub repositories 💪

 Thank you for using us ❤️.

@@ -38,6 +38,7 @@ RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
        datasets \
        hf-doc-builder \
        huggingface-hub \
+        hf_transfer \
        Jinja2 \
        librosa \
        numpy==1.26.4 \
@@ -38,6 +38,7 @@ RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
    datasets \
    hf-doc-builder \
    huggingface-hub \
+    hf_transfer \
    Jinja2 \
    librosa \
    numpy==1.26.4 \
@@ -38,6 +38,7 @@ RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
    datasets \
    hf-doc-builder \
    huggingface-hub \
+    hf_transfer \
    Jinja2 \
    librosa \
    numpy==1.26.4 \
@@ -38,6 +38,7 @@ RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
        datasets \
        hf-doc-builder \
        huggingface-hub \
+        hf_transfer \
        Jinja2 \
        librosa \
        numpy==1.26.4 \
@@ -190,6 +190,10 @@
  - local: conceptual/evaluation
    title: Evaluating Diffusion Models
  title: Conceptual Guides
+- sections:
+  - local: community_projects
+    title: Projects built with Diffusers
+  title: Community Projects
 - sections:
  - isExpanded: false
    sections:
@@ -235,10 +239,16 @@
      title: VQModel
    - local: api/models/autoencoderkl
      title: AutoencoderKL
+    - local: api/models/autoencoderkl_cogvideox
+      title: AutoencoderKLCogVideoX
    - local: api/models/asymmetricautoencoderkl
      title: AsymmetricAutoencoderKL
+    - local: api/models/stable_cascade_unet
+      title: StableCascadeUNet
    - local: api/models/autoencoder_tiny
      title: Tiny AutoEncoder
+    - local: api/models/autoencoder_oobleck
+      title: Oobleck AutoEncoder
    - local: api/models/consistency_decoder_vae
      title: ConsistencyDecoderVAE
    - local: api/models/transformer2d
@@ -251,14 +261,20 @@
      title: HunyuanDiT2DModel
    - local: api/models/aura_flow_transformer2d
      title: AuraFlowTransformer2DModel
+    - local: api/models/flux_transformer
+      title: FluxTransformer2DModel
    - local: api/models/latte_transformer3d
      title: LatteTransformer3DModel
+    - local: api/models/cogvideox_transformer3d
+      title: CogVideoXTransformer3DModel
    - local: api/models/lumina_nextdit2d
      title: LuminaNextDiT2DModel
    - local: api/models/transformer_temporal
      title: TransformerTemporalModel
    - local: api/models/sd3_transformer2d
      title: SD3Transformer2DModel
+    - local: api/models/stable_audio_transformer
+      title: StableAudioDiTModel
    - local: api/models/prior_transformer
      title: PriorTransformer
    - local: api/models/controlnet
@@ -267,6 +283,8 @@
      title: HunyuanDiT2DControlNetModel
    - local: api/models/controlnet_sd3
      title: SD3ControlNetModel
+    - local: api/models/controlnet_sparsectrl
+      title: SparseControlNetModel
    title: Models
  - isExpanded: false
    sections:
@@ -288,6 +306,8 @@
      title: AutoPipeline
    - local: api/pipelines/blip_diffusion
      title: BLIP-Diffusion
+    - local: api/pipelines/cogvideox
+      title: CogVideoX
    - local: api/pipelines/consistency_models
      title: Consistency Models
    - local: api/pipelines/controlnet
@@ -314,6 +334,8 @@
      title: DiffEdit
    - local: api/pipelines/dit
      title: DiT
+    - local: api/pipelines/flux
+      title: Flux
    - local: api/pipelines/hunyuandit
      title: Hunyuan-DiT
    - local: api/pipelines/i2vgenxl
@@ -360,6 +382,8 @@
      title: Semantic Guidance
    - local: api/pipelines/shap_e
      title: Shap-E
+    - local: api/pipelines/stable_audio
+      title: Stable Audio
    - local: api/pipelines/stable_cascade
      title: Stable Cascade
    - sections:
@@ -423,6 +447,8 @@
      title: CMStochasticIterativeScheduler
    - local: api/schedulers/consistency_decoder
      title: ConsistencyDecoderScheduler
+    - local: api/schedulers/cosine_dpm
+      title: CosineDPMSolverMultistepScheduler
    - local: api/schedulers/ddim_inverse
      title: DDIMInverseScheduler
    - local: api/schedulers/ddim
@@ -12,10 +12,13 @@ specific language governing permissions and limitations under the License.

 # LoRA

-LoRA is a fast and lightweight training method that inserts and trains a significantly smaller number of parameters instead of all the model parameters. This produces a smaller file (~100 MBs) and makes it easier to quickly train a model to learn a new concept. LoRA weights are typically loaded into the UNet, text encoder or both. There are two classes for loading LoRA weights:
+LoRA is a fast and lightweight training method that inserts and trains a significantly smaller number of parameters instead of all the model parameters. This produces a smaller file (~100 MBs) and makes it easier to quickly train a model to learn a new concept. LoRA weights are typically loaded into the denoiser, text encoder or both. The denoiser usually corresponds to a UNet ([`UNet2DConditionModel`], for example) or a Transformer ([`SD3Transformer2DModel`], for example). There are several classes for loading LoRA weights:

- [`LoraLoaderMixin`] provides functions for loading and unloading, fusing and unfusing, enabling and disabling, and more functions for managing LoRA weights. This class can be used with any model.
- [`StableDiffusionXLLoraLoaderMixin`] is a [Stable Diffusion (SDXL)](../../api/pipelines/stable_diffusion/stable_diffusion_xl) version of the [`LoraLoaderMixin`] class for loading and saving LoRA weights. It can only be used with the SDXL model.
+- [`StableDiffusionLoraLoaderMixin`] provides functions for loading and unloading, fusing and unfusing, enabling and disabling, and more functions for managing LoRA weights. This class can be used with any model.
+- [`StableDiffusionXLLoraLoaderMixin`] is a [Stable Diffusion (SDXL)](../../api/pipelines/stable_diffusion/stable_diffusion_xl) version of the [`StableDiffusionLoraLoaderMixin`] class for loading and saving LoRA weights. It can only be used with the SDXL model.
+- [`SD3LoraLoaderMixin`] provides similar functions for [Stable Diffusion 3](https://huggingface.co/blog/sd3).
+- [`AmusedLoraLoaderMixin`] is for the [`AmusedPipeline`].
+- [`LoraBaseMixin`] provides a base class with several utility methods to fuse, unfuse, unload, LoRAs and more.

 <Tip>

@@ -23,10 +26,22 @@ To learn more about how to load LoRA weights, see the [LoRA](../../using-diffuse

 </Tip>

-## LoraLoaderMixin
+## StableDiffusionLoraLoaderMixin

-[[autodoc]] loaders.lora.LoraLoaderMixin
+[[autodoc]] loaders.lora_pipeline.StableDiffusionLoraLoaderMixin

 ## StableDiffusionXLLoraLoaderMixin

-[[autodoc]] loaders.lora.StableDiffusionXLLoraLoaderMixin
+[[autodoc]] loaders.lora_pipeline.StableDiffusionXLLoraLoaderMixin
+
+## SD3LoraLoaderMixin
+
+[[autodoc]] loaders.lora_pipeline.SD3LoraLoaderMixin
+
+## AmusedLoraLoaderMixin
+
+[[autodoc]] loaders.lora_pipeline.AmusedLoraLoaderMixin
+
+## LoraBaseMixin
+
+[[autodoc]] loaders.lora_base.LoraBaseMixin
@@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.

 # PEFT

-Diffusers supports loading adapters such as [LoRA](../../using-diffusers/loading_adapters) with the [PEFT](https://huggingface.co/docs/peft/index) library with the [`~loaders.peft.PeftAdapterMixin`] class. This allows modeling classes in Diffusers like [`UNet2DConditionModel`] to load an adapter.
+Diffusers supports loading adapters such as [LoRA](../../using-diffusers/loading_adapters) with the [PEFT](https://huggingface.co/docs/peft/index) library with the [`~loaders.peft.PeftAdapterMixin`] class. This allows modeling classes in Diffusers like [`UNet2DConditionModel`], [`SD3Transformer2DModel`] to operate with an adapter.

 <Tip>

@@ -22,6 +22,7 @@ The [`~loaders.FromSingleFileMixin.from_single_file`] method allows you to load:

 ## Supported pipelines

+- [`CogVideoXPipeline`]
 - [`StableDiffusionPipeline`]
 - [`StableDiffusionImg2ImgPipeline`]
 - [`StableDiffusionInpaintPipeline`]
@@ -49,8 +50,10 @@ The [`~loaders.FromSingleFileMixin.from_single_file`] method allows you to load:
 - [`UNet2DConditionModel`]
 - [`StableCascadeUNet`]
 - [`AutoencoderKL`]
+- [`AutoencoderKLCogVideoX`]
 - [`ControlNetModel`]
 - [`SD3Transformer2DModel`]
+- [`FluxTransformer2DModel`]

 ## FromSingleFileMixin

@@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.

 # UNet

-Some training methods - like LoRA and Custom Diffusion - typically target the UNet's attention layers, but these training methods can also target other non-attention layers. Instead of training all of a model's parameters, only a subset of the parameters are trained, which is faster and more efficient. This class is useful if you're *only* loading weights into a UNet. If you need to load weights into the text encoder or a text encoder and UNet, try using the [`~loaders.LoraLoaderMixin.load_lora_weights`] function instead.
+Some training methods - like LoRA and Custom Diffusion - typically target the UNet's attention layers, but these training methods can also target other non-attention layers. Instead of training all of a model's parameters, only a subset of the parameters are trained, which is faster and more efficient. This class is useful if you're *only* loading weights into a UNet. If you need to load weights into the text encoder or a text encoder and UNet, try using the [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] function instead.

 The [`UNet2DConditionLoadersMixin`] class provides functions for loading and saving weights, fusing and unfusing LoRAs, disabling and enabling LoRAs, and setting and deleting adapters.

@@ -0,0 +1,38 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# AutoencoderOobleck
+
+The Oobleck variational autoencoder (VAE) model with KL loss was introduced in [Stability-AI/stable-audio-tools](https://github.com/Stability-AI/stable-audio-tools) and [Stable Audio Open](https://huggingface.co/papers/2407.14358) by Stability AI. The model is used in 🤗 Diffusers to encode audio waveforms into latents and to decode latent representations into audio waveforms.
+
+The abstract from the paper is:
+
+*Open generative models are vitally important for the community, allowing for fine-tunes and serving as baselines when presenting new models. However, most current text-to-audio models are private and not accessible for artists and researchers to build upon. Here we describe the architecture and training process of a new open-weights text-to-audio model trained with Creative Commons data. Our evaluation shows that the model's performance is competitive with the state-of-the-art across various metrics. Notably, the reported FDopenl3 results (measuring the realism of the generations) showcase its potential for high-quality stereo sound synthesis at 44.1kHz.*
+
+## AutoencoderOobleck
+
+[[autodoc]] AutoencoderOobleck
+    - decode
+    - encode
+    - all
+
+## OobleckDecoderOutput
+
+[[autodoc]] models.autoencoders.autoencoder_oobleck.OobleckDecoderOutput
+
+## OobleckDecoderOutput
+
+[[autodoc]] models.autoencoders.autoencoder_oobleck.OobleckDecoderOutput
+
+## AutoencoderOobleckOutput
+
+[[autodoc]] models.autoencoders.autoencoder_oobleck.AutoencoderOobleckOutput
@@ -0,0 +1,37 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License. -->
+
+# AutoencoderKLCogVideoX
+
+The 3D variational autoencoder (VAE) model with KL loss used in [CogVideoX](https://github.com/THUDM/CogVideo) was introduced in [CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer](https://github.com/THUDM/CogVideo/blob/main/resources/CogVideoX.pdf) by Tsinghua University & ZhipuAI.
+
+The model can be loaded with the following code snippet.
+
+```python
+from diffusers import AutoencoderKLCogVideoX
+
+vae = AutoencoderKLCogVideoX.from_pretrained("THUDM/CogVideoX-2b", subfolder="vae", torch_dtype=torch.float16).to("cuda")
+```
+
+## AutoencoderKLCogVideoX
+
+[[autodoc]] AutoencoderKLCogVideoX
+    - decode
+    - encode
+    - all
+
+## AutoencoderKLOutput
+
+[[autodoc]] models.autoencoders.autoencoder_kl.AutoencoderKLOutput
+
+## DecoderOutput
+
+[[autodoc]] models.autoencoders.vae.DecoderOutput
@@ -0,0 +1,30 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License. -->
+
+# CogVideoXTransformer3DModel
+
+A Diffusion Transformer model for 3D data from [CogVideoX](https://github.com/THUDM/CogVideo) was introduced in [CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer](https://github.com/THUDM/CogVideo/blob/main/resources/CogVideoX.pdf) by Tsinghua University & ZhipuAI.
+
+The model can be loaded with the following code snippet.
+
+```python
+from diffusers import CogVideoXTransformer3DModel
+
+vae = CogVideoXTransformer3DModel.from_pretrained("THUDM/CogVideoX-2b", subfolder="transformer", torch_dtype=torch.float16).to("cuda")
+```
+
+## CogVideoXTransformer3DModel
+
+[[autodoc]] CogVideoXTransformer3DModel
+
+## Transformer2DModelOutput
+
+[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
@@ -0,0 +1,46 @@
+<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License. -->
+
+# SparseControlNetModel
+
+SparseControlNetModel is an implementation of ControlNet for [AnimateDiff](https://arxiv.org/abs/2307.04725).
+
+ControlNet was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) by Lvmin Zhang, Anyi Rao, and Maneesh Agrawala.
+
+The SparseCtrl version of ControlNet was introduced in [SparseCtrl: Adding Sparse Controls to Text-to-Video Diffusion Models](https://arxiv.org/abs/2311.16933) for achieving controlled generation in text-to-video diffusion models by Yuwei Guo, Ceyuan Yang, Anyi Rao, Maneesh Agrawala, Dahua Lin, and Bo Dai.
+
+The abstract from the paper is:
+
+*The development of text-to-video (T2V), i.e., generating videos with a given text prompt, has been significantly advanced in recent years. However, relying solely on text prompts often results in ambiguous frame composition due to spatial uncertainty. The research community thus leverages the dense structure signals, e.g., per-frame depth/edge sequences, to enhance controllability, whose collection accordingly increases the burden of inference. In this work, we present SparseCtrl to enable flexible structure control with temporally sparse signals, requiring only one or a few inputs, as shown in Figure 1. It incorporates an additional condition encoder to process these sparse signals while leaving the pre-trained T2V model untouched. The proposed approach is compatible with various modalities, including sketches, depth maps, and RGB images, providing more practical control for video generation and promoting applications such as storyboarding, depth rendering, keyframe animation, and interpolation. Extensive experiments demonstrate the generalization of SparseCtrl on both original and personalized T2V generators. Codes and models will be publicly available at [this https URL](https://guoyww.github.io/projects/SparseCtrl).*
+
+## Example for loading SparseControlNetModel
+
+```python
+import torch
+from diffusers import SparseControlNetModel
+
+# fp32 variant in float16
+# 1. Scribble checkpoint
+controlnet = SparseControlNetModel.from_pretrained("guoyww/animatediff-sparsectrl-scribble", torch_dtype=torch.float16)
+
+# 2. RGB checkpoint
+controlnet = SparseControlNetModel.from_pretrained("guoyww/animatediff-sparsectrl-rgb", torch_dtype=torch.float16)
+
+# For loading fp16 variant, pass `variant="fp16"` as an additional parameter
+```
+
+## SparseControlNetModel
+
+[[autodoc]] SparseControlNetModel
+
+## SparseControlNetOutput
+
+[[autodoc]] models.controlnet_sparsectrl.SparseControlNetOutput
@@ -0,0 +1,19 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# FluxTransformer2DModel
+
+A Transformer model for image-like data from [Flux](https://blackforestlabs.ai/announcing-black-forest-labs/).
+
+## FluxTransformer2DModel
+
+[[autodoc]] FluxTransformer2DModel
@@ -0,0 +1,19 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# StableAudioDiTModel
+
+A Transformer model for audio waveforms from [Stable Audio Open](https://huggingface.co/papers/2407.14358).
+
+## StableAudioDiTModel
+
+[[autodoc]] StableAudioDiTModel
@@ -0,0 +1,19 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# StableCascadeUNet
+
+A UNet model from the [Stable Cascade pipeline](../pipelines/stable_cascade.md).
+
+## StableCascadeUNet
+
+[[autodoc]] models.unets.unet_stable_cascade.StableCascadeUNet
@@ -25,6 +25,9 @@ The abstract of the paper is the following:
 | Pipeline | Tasks | Demo
 |---|---|:---:|
 | [AnimateDiffPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/animatediff/pipeline_animatediff.py) | *Text-to-Video Generation with AnimateDiff* |
+| [AnimateDiffControlNetPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py) | *Controlled Video-to-Video Generation with AnimateDiff using ControlNet* |
+| [AnimateDiffSparseControlNetPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py) | *Controlled Video-to-Video Generation with AnimateDiff using SparseCtrl* |
+| [AnimateDiffSDXLPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py) | *Video-to-Video Generation with AnimateDiff* |
 | [AnimateDiffVideoToVideoPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py) | *Video-to-Video Generation with AnimateDiff* |

 ## Available checkpoints
@@ -100,6 +103,266 @@ AnimateDiff tends to work better with finetuned Stable Diffusion models. If you

 </Tip>

+### AnimateDiffControlNetPipeline
+
+AnimateDiff can also be used with ControlNets ControlNet was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) by Lvmin Zhang, Anyi Rao, and Maneesh Agrawala. With a ControlNet model, you can provide an additional control image to condition and control Stable Diffusion generation. For example, if you provide depth maps, the ControlNet model generates a video that'll preserve the spatial information from the depth maps. It is a more flexible and accurate way to control the video generation process.
+
+```python
+import torch
+from diffusers import AnimateDiffControlNetPipeline, AutoencoderKL, ControlNetModel, MotionAdapter, LCMScheduler
+from diffusers.utils import export_to_gif, load_video
+
+# Additionally, you will need a preprocess videos before they can be used with the ControlNet
+# HF maintains just the right package for it: `pip install controlnet_aux`
+from controlnet_aux.processor import ZoeDetector
+
+# Download controlnets from https://huggingface.co/lllyasviel/ControlNet-v1-1 to use .from_single_file
+# Download Diffusers-format controlnets, such as https://huggingface.co/lllyasviel/sd-controlnet-depth, to use .from_pretrained()
+controlnet = ControlNetModel.from_single_file("control_v11f1p_sd15_depth.pth", torch_dtype=torch.float16)
+
+# We use AnimateLCM for this example but one can use the original motion adapters as well (for example, https://huggingface.co/guoyww/animatediff-motion-adapter-v1-5-3)
+motion_adapter = MotionAdapter.from_pretrained("wangfuyun/AnimateLCM")
+
+vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16)
+pipe: AnimateDiffControlNetPipeline = AnimateDiffControlNetPipeline.from_pretrained(
+    "SG161222/Realistic_Vision_V5.1_noVAE",
+    motion_adapter=motion_adapter,
+    controlnet=controlnet,
+    vae=vae,
+).to(device="cuda", dtype=torch.float16)
+pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config, beta_schedule="linear")
+pipe.load_lora_weights("wangfuyun/AnimateLCM", weight_name="AnimateLCM_sd15_t2v_lora.safetensors", adapter_name="lcm-lora")
+pipe.set_adapters(["lcm-lora"], [0.8])
+
+depth_detector = ZoeDetector.from_pretrained("lllyasviel/Annotators").to("cuda")
+video = load_video("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-vid2vid-input-1.gif")
+conditioning_frames = []
+
+with pipe.progress_bar(total=len(video)) as progress_bar:
+    for frame in video:
+        conditioning_frames.append(depth_detector(frame))
+        progress_bar.update()
+
+prompt = "a panda, playing a guitar, sitting in a pink boat, in the ocean, mountains in background, realistic, high quality"
+negative_prompt = "bad quality, worst quality"
+
+video = pipe(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    num_frames=len(video),
+    num_inference_steps=10,
+    guidance_scale=2.0,
+    conditioning_frames=conditioning_frames,
+    generator=torch.Generator().manual_seed(42),
+).frames[0]
+
+export_to_gif(video, "animatediff_controlnet.gif", fps=8)
+```
+
+Here are some sample outputs:
+
+<table align="center">
+    <tr>
+      <th align="center">Source Video</th>
+      <th align="center">Output Video</th>
+    </tr>
+    <tr>
+        <td align="center">
+          raccoon playing a guitar
+          <br />
+          <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-vid2vid-input-1.gif" alt="racoon playing a guitar" />
+        </td>
+        <td align="center">
+          a panda, playing a guitar, sitting in a pink boat, in the ocean, mountains in background, realistic, high quality
+          <br/>
+          <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-controlnet-output.gif" alt="a panda, playing a guitar, sitting in a pink boat, in the ocean, mountains in background, realistic, high quality" />
+        </td>
+    </tr>
+</table>
+
+### AnimateDiffSparseControlNetPipeline
+
+[SparseCtrl: Adding Sparse Controls to Text-to-Video Diffusion Models](https://arxiv.org/abs/2311.16933) for achieving controlled generation in text-to-video diffusion models by Yuwei Guo, Ceyuan Yang, Anyi Rao, Maneesh Agrawala, Dahua Lin, and Bo Dai.
+
+The abstract from the paper is:
+
+*The development of text-to-video (T2V), i.e., generating videos with a given text prompt, has been significantly advanced in recent years. However, relying solely on text prompts often results in ambiguous frame composition due to spatial uncertainty. The research community thus leverages the dense structure signals, e.g., per-frame depth/edge sequences, to enhance controllability, whose collection accordingly increases the burden of inference. In this work, we present SparseCtrl to enable flexible structure control with temporally sparse signals, requiring only one or a few inputs, as shown in Figure 1. It incorporates an additional condition encoder to process these sparse signals while leaving the pre-trained T2V model untouched. The proposed approach is compatible with various modalities, including sketches, depth maps, and RGB images, providing more practical control for video generation and promoting applications such as storyboarding, depth rendering, keyframe animation, and interpolation. Extensive experiments demonstrate the generalization of SparseCtrl on both original and personalized T2V generators. Codes and models will be publicly available at [this https URL](https://guoyww.github.io/projects/SparseCtrl).*
+
+SparseCtrl introduces the following checkpoints for controlled text-to-video generation:
+
+- [SparseCtrl Scribble](https://huggingface.co/guoyww/animatediff-sparsectrl-scribble)
+- [SparseCtrl RGB](https://huggingface.co/guoyww/animatediff-sparsectrl-rgb)
+
+#### Using SparseCtrl Scribble
+
+```python
+import torch
+
+from diffusers import AnimateDiffSparseControlNetPipeline
+from diffusers.models import AutoencoderKL, MotionAdapter, SparseControlNetModel
+from diffusers.schedulers import DPMSolverMultistepScheduler
+from diffusers.utils import export_to_gif, load_image
+
+
+model_id = "SG161222/Realistic_Vision_V5.1_noVAE"
+motion_adapter_id = "guoyww/animatediff-motion-adapter-v1-5-3"
+controlnet_id = "guoyww/animatediff-sparsectrl-scribble"
+lora_adapter_id = "guoyww/animatediff-motion-lora-v1-5-3"
+vae_id = "stabilityai/sd-vae-ft-mse"
+device = "cuda"
+
+motion_adapter = MotionAdapter.from_pretrained(motion_adapter_id, torch_dtype=torch.float16).to(device)
+controlnet = SparseControlNetModel.from_pretrained(controlnet_id, torch_dtype=torch.float16).to(device)
+vae = AutoencoderKL.from_pretrained(vae_id, torch_dtype=torch.float16).to(device)
+scheduler = DPMSolverMultistepScheduler.from_pretrained(
+    model_id,
+    subfolder="scheduler",
+    beta_schedule="linear",
+    algorithm_type="dpmsolver++",
+    use_karras_sigmas=True,
+)
+pipe = AnimateDiffSparseControlNetPipeline.from_pretrained(
+    model_id,
+    motion_adapter=motion_adapter,
+    controlnet=controlnet,
+    vae=vae,
+    scheduler=scheduler,
+    torch_dtype=torch.float16,
+).to(device)
+pipe.load_lora_weights(lora_adapter_id, adapter_name="motion_lora")
+pipe.fuse_lora(lora_scale=1.0)
+
+prompt = "an aerial view of a cyberpunk city, night time, neon lights, masterpiece, high quality"
+negative_prompt = "low quality, worst quality, letterboxed"
+
+image_files = [
+    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-1.png",
+    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-2.png",
+    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-3.png"
+]
+condition_frame_indices = [0, 8, 15]
+conditioning_frames = [load_image(img_file) for img_file in image_files]
+
+video = pipe(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    num_inference_steps=25,
+    conditioning_frames=conditioning_frames,
+    controlnet_conditioning_scale=1.0,
+    controlnet_frame_indices=condition_frame_indices,
+    generator=torch.Generator().manual_seed(1337),
+).frames[0]
+export_to_gif(video, "output.gif")
+```
+
+Here are some sample outputs:
+
+<table align="center">
+    <tr>
+        <center>
+          <b>an aerial view of a cyberpunk city, night time, neon lights, masterpiece, high quality</b>
+        </center>
+    </tr>
+    <tr>
+        <td>
+          <center>
+            <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-1.png" alt="scribble-1" />
+          </center>
+        </td>
+        <td>
+          <center>
+            <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-2.png" alt="scribble-2" />
+          </center>
+        </td>
+        <td>
+          <center>
+            <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-3.png" alt="scribble-3" />
+          </center>
+        </td>
+    </tr>
+    <tr>
+        <td colspan=3>
+          <center>
+            <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-sparsectrl-scribble-results.gif" alt="an aerial view of a cyberpunk city, night time, neon lights, masterpiece, high quality" />
+          </center>
+        </td>
+    </tr>
+</table>
+
+#### Using SparseCtrl RGB
+
+```python
+import torch
+
+from diffusers import AnimateDiffSparseControlNetPipeline
+from diffusers.models import AutoencoderKL, MotionAdapter, SparseControlNetModel
+from diffusers.schedulers import DPMSolverMultistepScheduler
+from diffusers.utils import export_to_gif, load_image
+
+
+model_id = "SG161222/Realistic_Vision_V5.1_noVAE"
+motion_adapter_id = "guoyww/animatediff-motion-adapter-v1-5-3"
+controlnet_id = "guoyww/animatediff-sparsectrl-rgb"
+lora_adapter_id = "guoyww/animatediff-motion-lora-v1-5-3"
+vae_id = "stabilityai/sd-vae-ft-mse"
+device = "cuda"
+
+motion_adapter = MotionAdapter.from_pretrained(motion_adapter_id, torch_dtype=torch.float16).to(device)
+controlnet = SparseControlNetModel.from_pretrained(controlnet_id, torch_dtype=torch.float16).to(device)
+vae = AutoencoderKL.from_pretrained(vae_id, torch_dtype=torch.float16).to(device)
+scheduler = DPMSolverMultistepScheduler.from_pretrained(
+    model_id,
+    subfolder="scheduler",
+    beta_schedule="linear",
+    algorithm_type="dpmsolver++",
+    use_karras_sigmas=True,
+)
+pipe = AnimateDiffSparseControlNetPipeline.from_pretrained(
+    model_id,
+    motion_adapter=motion_adapter,
+    controlnet=controlnet,
+    vae=vae,
+    scheduler=scheduler,
+    torch_dtype=torch.float16,
+).to(device)
+pipe.load_lora_weights(lora_adapter_id, adapter_name="motion_lora")
+
+image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-firework.png")
+
+video = pipe(
+    prompt="closeup face photo of man in black clothes, night city street, bokeh, fireworks in background",
+    negative_prompt="low quality, worst quality",
+    num_inference_steps=25,
+    conditioning_frames=image,
+    controlnet_frame_indices=[0],
+    controlnet_conditioning_scale=1.0,
+    generator=torch.Generator().manual_seed(42),
+).frames[0]
+export_to_gif(video, "output.gif")
+```
+
+Here are some sample outputs:
+
+<table align="center">
+    <tr>
+        <center>
+          <b>closeup face photo of man in black clothes, night city street, bokeh, fireworks in background</b>
+        </center>
+    </tr>
+    <tr>
+        <td>
+          <center>
+            <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-firework.png" alt="closeup face photo of man in black clothes, night city street, bokeh, fireworks in background" />
+          </center>
+        </td>
+        <td>
+          <center>
+            <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-sparsectrl-rgb-result.gif" alt="closeup face photo of man in black clothes, night city street, bokeh, fireworks in background" />
+          </center>
+        </td>
+    </tr>
+</table>
+
 ### AnimateDiffSDXLPipeline

 AnimateDiff can also be used with SDXL models. This is currently an experimental feature as only a beta release of the motion adapter checkpoint is available.
@@ -571,7 +834,6 @@ ckpt_path = "https://huggingface.co/Lightricks/LongAnimateDiff/blob/main/lt_long

 adapter = MotionAdapter.from_single_file(ckpt_path, torch_dtype=torch.float16)
 pipe = AnimateDiffPipeline.from_pretrained("emilianJR/epiCRealism", motion_adapter=adapter)
-
 ```

 ## AnimateDiffPipeline
@@ -580,6 +842,18 @@ pipe = AnimateDiffPipeline.from_pretrained("emilianJR/epiCRealism", motion_adapt
  - all
  - __call__

+## AnimateDiffControlNetPipeline
+
+[[autodoc]] AnimateDiffControlNetPipeline
+  - all
+  - __call__
+
+## AnimateDiffSparseControlNetPipeline
+
+[[autodoc]] AnimateDiffSparseControlNetPipeline
+  - all
+  - __call__
+
 ## AnimateDiffSDXLPipeline

 [[autodoc]] AnimateDiffSDXLPipeline
@@ -18,7 +18,7 @@ It was developed by the Fal team and more details about it can be found in [this

 <Tip>

-AuraFlow can be quite expensive to run on consumer hardware devices. However, you can perform a suite of optimizations to run it faster and in a more memory-friendly manner. Check out [this section](https://huggingface.co/blog/sd3#memory-optimizations-for-sd3) for more details. 
+AuraFlow can be quite expensive to run on consumer hardware devices. However, you can perform a suite of optimizations to run it faster and in a more memory-friendly manner. Check out [this section](https://huggingface.co/blog/sd3#memory-optimizations-for-sd3) for more details.

 </Tip>

@@ -0,0 +1,92 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+-->
+
+# CogVideoX
+
+[CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer](https://arxiv.org/abs/2408.06072) from Tsinghua University & ZhipuAI, by Zhuoyi Yang, Jiayan Teng, Wendi Zheng, Ming Ding, Shiyu Huang, Jiazheng Xu, Yuanming Yang, Wenyi Hong, Xiaohan Zhang, Guanyu Feng, Da Yin, Xiaotao Gu, Yuxuan Zhang, Weihan Wang, Yean Cheng, Ting Liu, Bin Xu, Yuxiao Dong, Jie Tang.
+
+The abstract from the paper is:
+
+*We introduce CogVideoX, a large-scale diffusion transformer model designed for generating videos based on text prompts. To efficently model video data, we propose to levearge a 3D Variational Autoencoder (VAE) to compresses videos along both spatial and temporal dimensions. To improve the text-video alignment, we propose an expert transformer with the expert adaptive LayerNorm to facilitate the deep fusion between the two modalities. By employing a progressive training technique, CogVideoX is adept at producing coherent, long-duration videos characterized by significant motion. In addition, we develop an effectively text-video data processing pipeline that includes various data preprocessing strategies and a video captioning method. It significantly helps enhance the performance of CogVideoX, improving both generation quality and semantic alignment. Results show that CogVideoX demonstrates state-of-the-art performance across both multiple machine metrics and human evaluations. The model weight of CogVideoX-2B is publicly available at https://github.com/THUDM/CogVideo.*
+
+<Tip>
+
+Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
+
+</Tip>
+
+This pipeline was contributed by [zRzRzRzRzRzRzR](https://github.com/zRzRzRzRzRzRzR). The original codebase can be found [here](https://huggingface.co/THUDM). The original weights can be found under [hf.co/THUDM](https://huggingface.co/THUDM).
+
+There are two models available that can be used with the CogVideoX pipeline:
+- [`THUDM/CogVideoX-2b`](https://huggingface.co/THUDM/CogVideoX-2b)
+- [`THUDM/CogVideoX-5b`](https://huggingface.co/THUDM/CogVideoX-5b)
+
+## Inference
+
+Use [`torch.compile`](https://huggingface.co/docs/diffusers/main/en/tutorials/fast_diffusion#torchcompile) to reduce the inference latency.
+
+First, load the pipeline:
+
+```python
+import torch
+from diffusers import CogVideoXPipeline
+from diffusers.utils import export_to_video
+
+pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b").to("cuda")
+```
+
+Then change the memory layout of the pipelines `transformer` component to `torch.channels_last`:
+
+```python
+pipe.transformer.to(memory_format=torch.channels_last)
+```
+
+Finally, compile the components and run inference:
+
+```python
+pipe.transformer = torch.compile(pipeline.transformer, mode="max-autotune", fullgraph=True)
+
+# CogVideoX works well with long and well-described prompts
+prompt = "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other pandas gather, watching curiously and some clapping in rhythm. Sunlight filters through the tall bamboo, casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical atmosphere of this unique musical performance."
+video = pipe(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
+```
+
+The [benchmark](https://gist.github.com/a-r-r-o-w/5183d75e452a368fd17448fcc810bd3f) results on an 80GB A100 machine are:
+
+```
+Without torch.compile(): Average inference time: 96.89 seconds.
+With torch.compile(): Average inference time: 76.27 seconds.
+```
+
+### Memory optimization
+
+CogVideoX-2b requires about 19 GB of GPU memory to decode 49 frames (6 seconds of video at 8 FPS) with output resolution 720x480 (W x H), which makes it not possible to run on consumer GPUs or free-tier T4 Colab. The following memory optimizations could be used to reduce the memory footprint. For replication, you can refer to [this](https://gist.github.com/a-r-r-o-w/3959a03f15be5c9bd1fe545b09dfcc93) script.
+
+- `pipe.enable_model_cpu_offload()`:
+  - Without enabling cpu offloading, memory usage is `33 GB`
+  - With enabling cpu offloading, memory usage is `19 GB`
+- `pipe.vae.enable_tiling()`:
+  - With enabling cpu offloading and tiling, memory usage is `11 GB`
+- `pipe.vae.enable_slicing()`
+
+## CogVideoXPipeline
+
+[[autodoc]] CogVideoXPipeline
+  - all
+  - __call__
+
+## CogVideoXPipelineOutput
+
+[[autodoc]] pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipelineOutput
@@ -0,0 +1,165 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Flux
+
+Flux is a series of text-to-image generation models based on diffusion transformers. To know more about Flux, check out the original [blog post](https://blackforestlabs.ai/announcing-black-forest-labs/) by the creators of Flux, Black Forest Labs.
+
+Original model checkpoints for Flux can be found [here](https://huggingface.co/black-forest-labs). Original inference code can be found [here](https://github.com/black-forest-labs/flux).
+
+<Tip>
+
+Flux can be quite expensive to run on consumer hardware devices. However, you can perform a suite of optimizations to run it faster and in a more memory-friendly manner. Check out [this section](https://huggingface.co/blog/sd3#memory-optimizations-for-sd3) for more details. Additionally, Flux can benefit from quantization for memory efficiency with a trade-off in inference latency. Refer to [this blog post](https://huggingface.co/blog/quanto-diffusers) to learn more.  For an exhaustive list of resources, check out [this gist](https://gist.github.com/sayakpaul/b664605caf0aa3bf8585ab109dd5ac9c).
+
+</Tip>
+
+Flux comes in two variants:
+
+* Timestep-distilled (`black-forest-labs/FLUX.1-schnell`)
+* Guidance-distilled (`black-forest-labs/FLUX.1-dev`)
+
+Both checkpoints have slightly difference usage which we detail below.
+
+### Timestep-distilled
+
+* `max_sequence_length` cannot be more than 256.
+* `guidance_scale` needs to be 0.
+* As this is a timestep-distilled model, it benefits from fewer sampling steps.
+
+```python
+import torch
+from diffusers import FluxPipeline
+
+pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
+pipe.enable_model_cpu_offload()
+
+prompt = "A cat holding a sign that says hello world"
+out = pipe(
+    prompt=prompt,
+    guidance_scale=0.,
+    height=768,
+    width=1360,
+    num_inference_steps=4,
+    max_sequence_length=256,
+).images[0]
+out.save("image.png")
+```
+
+### Guidance-distilled
+
+* The guidance-distilled variant takes about 50 sampling steps for good-quality generation.
+* It doesn't have any limitations around the `max_sequence_length`.
+
+```python
+import torch
+from diffusers import FluxPipeline
+
+pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)
+pipe.enable_model_cpu_offload()
+
+prompt = "a tiny astronaut hatching from an egg on the moon"
+out = pipe(
+    prompt=prompt,
+    guidance_scale=3.5,
+    height=768,
+    width=1360,
+    num_inference_steps=50,
+).images[0]
+out.save("image.png")
+```
+
+## Running FP16 inference
+Flux can generate high-quality images with FP16 (i.e. to accelerate inference on Turing/Volta GPUs) but produces different outputs compared to FP32/BF16. The issue is that some activations in the text encoders have to be clipped when running in FP16, which affects the overall image. Forcing text encoders to run with FP32 inference thus removes this output difference. See [here](https://github.com/huggingface/diffusers/pull/9097#issuecomment-2272292516) for details.
+
+FP16 inference code:
+```python
+import torch
+from diffusers import FluxPipeline
+
+pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16) # can replace schnell with dev
+# to run on low vram GPUs (i.e. between 4 and 32 GB VRAM)
+pipe.enable_sequential_cpu_offload()
+pipe.vae.enable_slicing()
+pipe.vae.enable_tiling()
+
+pipe.to(torch.float16) # casting here instead of in the pipeline constructor because doing so in the constructor loads all models into CPU memory at once
+
+prompt = "A cat holding a sign that says hello world"
+out = pipe(
+    prompt=prompt,
+    guidance_scale=0.,
+    height=768,
+    width=1360,
+    num_inference_steps=4,
+    max_sequence_length=256,
+).images[0]
+out.save("image.png")
+```
+
+## Single File Loading for the `FluxTransformer2DModel`
+
+The `FluxTransformer2DModel` supports loading checkpoints in the original format shipped by Black Forest Labs. This is also useful when trying to load finetunes or quantized versions of the models that have been published by the community.
+
+<Tip>
+`FP8` inference can be brittle depending on the GPU type, CUDA version, and `torch` version that you are using. It is recommended that you use the `optimum-quanto` library in order to run FP8 inference on your machine.
+</Tip>
+
+The following example demonstrates how to run Flux with less than 16GB of VRAM.
+
+First install `optimum-quanto`
+
+```shell
+pip install optimum-quanto
+```
+
+Then run the following example
+
+```python
+import torch
+from diffusers import FluxTransformer2DModel, FluxPipeline
+from transformers import T5EncoderModel, CLIPTextModel
+from optimum.quanto import freeze, qfloat8, quantize
+
+bfl_repo = "black-forest-labs/FLUX.1-dev"
+dtype = torch.bfloat16
+
+transformer = FluxTransformer2DModel.from_single_file("https://huggingface.co/Kijai/flux-fp8/blob/main/flux1-dev-fp8.safetensors", torch_dtype=dtype)
+quantize(transformer, weights=qfloat8)
+freeze(transformer)
+
+text_encoder_2 = T5EncoderModel.from_pretrained(bfl_repo, subfolder="text_encoder_2", torch_dtype=dtype)
+quantize(text_encoder_2, weights=qfloat8)
+freeze(text_encoder_2)
+
+pipe = FluxPipeline.from_pretrained(bfl_repo, transformer=None, text_encoder_2=None, torch_dtype=dtype)
+pipe.transformer = transformer
+pipe.text_encoder_2 = text_encoder_2
+
+pipe.enable_model_cpu_offload()
+
+prompt = "A cat holding a sign that says hello world"
+image = pipe(
+    prompt,
+    guidance_scale=3.5,
+    output_type="pil",
+    num_inference_steps=20,
+    generator=torch.Generator("cpu").manual_seed(0)
+).images[0]
+
+image.save("flux-fp8-dev.png")
+```
+
+## FluxPipeline
+
+[[autodoc]] FluxPipeline
+	- all
+	- __call__
@@ -41,6 +41,64 @@ image = pipe(
 image.save("kolors_sample.png")
 ```

+### IP Adapter
+
+Kolors needs a different IP Adapter to work, and it uses [Openai-CLIP-336](https://huggingface.co/openai/clip-vit-large-patch14-336) as an image encoder.
+
+<Tip>
+
+Using an IP Adapter with Kolors requires more than 24GB of VRAM. To use it, we recommend using [`~DiffusionPipeline.enable_model_cpu_offload`] on consumer GPUs.
+
+</Tip>
+
+<Tip>
+
+While Kolors is integrated in Diffusers, you need to load the image encoder from a revision to use the safetensor files. You can still use the main branch of the original repository if you're comfortable loading pickle checkpoints.
+
+</Tip>
+
+```python
+import torch
+from transformers import CLIPVisionModelWithProjection
+
+from diffusers import DPMSolverMultistepScheduler, KolorsPipeline
+from diffusers.utils import load_image
+
+image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+    "Kwai-Kolors/Kolors-IP-Adapter-Plus",
+    subfolder="image_encoder",
+    low_cpu_mem_usage=True,
+    torch_dtype=torch.float16,
+    revision="refs/pr/4",
+)
+
+pipe = KolorsPipeline.from_pretrained(
+    "Kwai-Kolors/Kolors-diffusers", image_encoder=image_encoder, torch_dtype=torch.float16, variant="fp16"
+).to("cuda")
+pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config, use_karras_sigmas=True)
+
+pipe.load_ip_adapter(
+    "Kwai-Kolors/Kolors-IP-Adapter-Plus",
+    subfolder="",
+    weight_name="ip_adapter_plus_general.safetensors",
+    revision="refs/pr/4",
+    image_encoder_folder=None,
+)
+pipe.enable_model_cpu_offload()
+
+ipa_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/kolors/cat_square.png")
+
+image = pipe(
+    prompt="best quality, high quality",
+    negative_prompt="",
+    guidance_scale=6.5,
+    num_inference_steps=25,
+    ip_adapter_image=ipa_image,
+).images[0]
+
+image.save("kolors_ipa_sample.png")
+```
+
 ## KolorsPipeline

 [[autodoc]] KolorsPipeline
@@ -24,6 +24,8 @@ The abstract from the paper is:

 **Highlights**: Latte is a latent diffusion transformer proposed as a backbone for modeling different modalities (trained for text-to-video generation here). It achieves state-of-the-art performance across four standard video benchmarks - [FaceForensics](https://arxiv.org/abs/1803.09179), [SkyTimelapse](https://arxiv.org/abs/1709.07592), [UCF101](https://arxiv.org/abs/1212.0402) and [Taichi-HD](https://arxiv.org/abs/2003.00196). To prepare and download the datasets for evaluation, please refer to [this https URL](https://github.com/Vchitect/Latte/blob/main/docs/datasets_evaluation.md).

+This pipeline was contributed by [maxin-cn](https://github.com/maxin-cn). The original codebase can be found [here](https://github.com/Vchitect/Latte). The original weights can be found under [hf.co/maxin-cn](https://huggingface.co/maxin-cn).
+
 <Tip>

 Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
@@ -43,6 +43,8 @@ Lumina-T2X has the following components:
 * It uses a Flow-based Large Diffusion Transformer as the backbone
 * It supports different any modalities with one backbone and corresponding encoder, decoder.

+This pipeline was contributed by [PommesPeter](https://github.com/PommesPeter). The original codebase can be found [here](https://github.com/Alpha-VLLM/Lumina-T2X). The original weights can be found under [hf.co/Alpha-VLLM](https://huggingface.co/Alpha-VLLM).
+
 <Tip>

 Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
@@ -57,7 +59,7 @@ First, load the pipeline:

 ```python
 from diffusers import LuminaText2ImgPipeline
-import torch 
+import torch

 pipeline = LuminaText2ImgPipeline.from_pretrained(
 	"Alpha-VLLM/Lumina-Next-SFT-diffusers", torch_dtype=torch.bfloat16
@@ -85,4 +87,4 @@ image = pipeline(prompt="Upper body of a young woman in a Victorian-era outfit w
 [[autodoc]] LuminaText2ImgPipeline
 	- all
 	- __call__
-	
+
@@ -71,6 +71,7 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
 | [Semantic Guidance](semantic_stable_diffusion) | text2image |
 | [Shap-E](shap_e) | text-to-3D, image-to-3D |
 | [Spectrogram Diffusion](spectrogram_diffusion) |  |
+| [Stable Audio](stable_audio) | text2audio |
 | [Stable Diffusion](stable_diffusion/overview) | text2image, image2image, depth2image, inpainting, image variation, latent upscaler, super-resolution |
 | [Stable Diffusion Model Editing](model_editing) | model editing |
 | [Stable Diffusion XL](stable_diffusion/stable_diffusion_xl) | text2image, image2image, inpainting |
@@ -20,6 +20,34 @@ The abstract from the paper is:

 *Recent studies have demonstrated that diffusion models are capable of generating high-quality samples, but their quality heavily depends on sampling guidance techniques, such as classifier guidance (CG) and classifier-free guidance (CFG). These techniques are often not applicable in unconditional generation or in various downstream tasks such as image restoration. In this paper, we propose a novel sampling guidance, called Perturbed-Attention Guidance (PAG), which improves diffusion sample quality across both unconditional and conditional settings, achieving this without requiring additional training or the integration of external modules. PAG is designed to progressively enhance the structure of samples throughout the denoising process. It involves generating intermediate samples with degraded structure by substituting selected self-attention maps in diffusion U-Net with an identity matrix, by considering the self-attention mechanisms' ability to capture structural information, and guiding the denoising process away from these degraded samples. In both ADM and Stable Diffusion, PAG surprisingly improves sample quality in conditional and even unconditional scenarios. Moreover, PAG significantly improves the baseline performance in various downstream tasks where existing guidances such as CG or CFG cannot be fully utilized, including ControlNet with empty prompts and image restoration such as inpainting and deblurring.*

+PAG can be used by specifying the `pag_applied_layers` as a parameter when instantiating a PAG pipeline. It can be a single string or a list of strings. Each string can be a unique layer identifier or a regular expression to identify one or more layers. 
+
+- Full identifier as a normal string: `down_blocks.2.attentions.0.transformer_blocks.0.attn1.processor`
+- Full identifier as a RegEx: `down_blocks.2.(attentions|motion_modules).0.transformer_blocks.0.attn1.processor`
+- Partial identifier as a RegEx: `down_blocks.2`, or `attn1`
+- List of identifiers (can be combo of strings and ReGex): `["blocks.1", "blocks.(14|20)", r"down_blocks\.(2,3)"]`
+
+<Tip warning={true}>
+
+Since RegEx is supported as a way for matching layer identifiers, it is crucial to use it correctly otherwise there might be unexpected behaviour. The recommended way to use PAG is by specifying layers as `blocks.{layer_index}` and `blocks.({layer_index_1|layer_index_2|...})`. Using it in any other way, while doable, may bypass our basic validation checks and give you unexpected results.
+
+</Tip>
+
+## AnimateDiffPAGPipeline
+[[autodoc]] AnimateDiffPAGPipeline
+  - all
+  - __call__
+
+## HunyuanDiTPAGPipeline
+[[autodoc]] HunyuanDiTPAGPipeline
+  - all
+  - __call__
+
+## KolorsPAGPipeline
+[[autodoc]] KolorsPAGPipeline
+  - all
+  - __call__  
+
 ## StableDiffusionPAGPipeline
 [[autodoc]] StableDiffusionPAGPipeline
 	- all
@@ -49,3 +77,15 @@ The abstract from the paper is:
 [[autodoc]] StableDiffusionXLControlNetPAGPipeline
 	- all
 	- __call__
+
+
+## StableDiffusion3PAGPipeline
+[[autodoc]] StableDiffusion3PAGPipeline
+	- all
+	- __call__
+
+
+## PixArtSigmaPAGPipeline
+[[autodoc]] PixArtSigmaPAGPipeline
+	- all
+	- __call__
@@ -0,0 +1,42 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Stable Audio
+
+Stable Audio was proposed in [Stable Audio Open](https://arxiv.org/abs/2407.14358) by Zach Evans et al. . it takes a text prompt as input and predicts the corresponding sound or music sample.
+
+Stable Audio Open generates variable-length (up to 47s) stereo audio at 44.1kHz from text prompts. It comprises three components: an autoencoder that compresses waveforms into a manageable sequence length, a T5-based text embedding for text conditioning, and a transformer-based diffusion (DiT) model that operates in the latent space of the autoencoder.
+
+Stable Audio is trained on a corpus of around 48k audio recordings, where around 47k are from Freesound and the rest are from the Free Music Archive (FMA). All audio files are licensed under CC0, CC BY, or CC Sampling+. This data is used to train the autoencoder and the DiT.
+
+The abstract of the paper is the following:
+*Open generative models are vitally important for the community, allowing for fine-tunes and serving as baselines when presenting new models. However, most current text-to-audio models are private and not accessible for artists and researchers to build upon. Here we describe the architecture and training process of a new open-weights text-to-audio model trained with Creative Commons data. Our evaluation shows that the model's performance is competitive with the state-of-the-art across various metrics. Notably, the reported FDopenl3 results (measuring the realism of the generations) showcase its potential for high-quality stereo sound synthesis at 44.1kHz.*
+
+This pipeline was contributed by [Yoach Lacombe](https://huggingface.co/ylacombe). The original codebase can be found at [Stability-AI/stable-audio-tool](https://github.com/Stability-AI/stable-audio-tool).
+
+## Tips
+
+When constructing a prompt, keep in mind:
+
+* Descriptive prompt inputs work best; use adjectives to describe the sound (for example, "high quality" or "clear") and make the prompt context specific where possible (e.g. "melodic techno with a fast beat and synths" works better than "techno").
+* Using a *negative prompt* can significantly improve the quality of the generated audio. Try using a negative prompt of "low quality, average quality".
+
+During inference:
+
+* The _quality_ of the generated audio sample can be controlled by the `num_inference_steps` argument; higher steps give higher quality audio at the expense of slower inference.
+* Multiple waveforms can be generated in one go: set `num_waveforms_per_prompt` to a value greater than 1 to enable. Automatic scoring will be performed between the generated waveforms and prompt text, and the audios ranked from best to worst accordingly.
+
+
+## StableAudioPipeline
+[[autodoc]] StableAudioPipeline
+	- all
+	- __call__
@@ -0,0 +1,24 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# CosineDPMSolverMultistepScheduler
+
+The [`CosineDPMSolverMultistepScheduler`] is a variant of [`DPMSolverMultistepScheduler`] with cosine schedule, proposed by Nichol and Dhariwal (2021).
+It is being used in the [Stable Audio Open](https://arxiv.org/abs/2407.14358) paper and the [Stability-AI/stable-audio-tool](https://github.com/Stability-AI/stable-audio-tool) codebase.
+
+This scheduler was contributed by [Yoach Lacombe](https://huggingface.co/ylacombe).
+
+## CosineDPMSolverMultistepScheduler
+[[autodoc]] CosineDPMSolverMultistepScheduler
+
+## SchedulerOutput
+[[autodoc]] schedulers.scheduling_utils.SchedulerOutput
@@ -0,0 +1,78 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Community Projects
+
+Welcome to Community Projects. This space is dedicated to showcasing the incredible work and innovative applications created by our vibrant community using the `diffusers` library.
+
+This section aims to:
+
+- Highlight diverse and inspiring projects built with `diffusers`
+- Foster knowledge sharing within our community
+- Provide real-world examples of how `diffusers` can be leveraged
+
+Happy exploring, and thank you for being part of the Diffusers community!
+
+<table>
+    <tr>
+        <th>Project Name</th>
+        <th>Description</th>
+    </tr>
+  <tr style="border-top: 2px solid black">
+    <td><a href="https://github.com/carson-katri/dream-textures"> dream-textures </a></td>
+    <td>Stable Diffusion built-in to Blender</td>
+  </tr>
+  <tr style="border-top: 2px solid black">
+    <td><a href="https://github.com/megvii-research/HiDiffusion"> HiDiffusion </a></td>
+    <td>Increases the resolution and speed of your diffusion model by only adding a single line of code</td>
+  </tr>
+  <tr style="border-top: 2px solid black">
+    <td><a href="https://github.com/lllyasviel/IC-Light"> IC-Light </a></td>
+    <td>IC-Light is a project to manipulate the illumination of images</td>
+  </tr>
+  <tr style="border-top: 2px solid black">
+    <td><a href="https://github.com/InstantID/InstantID"> InstantID </a></td>
+    <td>InstantID : Zero-shot Identity-Preserving Generation in Seconds</td>
+  </tr>
+  <tr style="border-top: 2px solid black">
+    <td><a href="https://github.com/Sanster/IOPaint"> IOPaint </a></td>
+    <td>Image inpainting tool powered by SOTA AI Model. Remove any unwanted object, defect, people from your pictures or erase and replace(powered by stable diffusion) any thing on your pictures.</td>
+  </tr>
+  <tr style="border-top: 2px solid black">
+    <td><a href="https://github.com/bmaltais/kohya_ss"> Kohya </a></td>
+    <td>Gradio GUI for Kohya's Stable Diffusion trainers</td>
+  </tr>
+  <tr style="border-top: 2px solid black">
+    <td><a href="https://github.com/magic-research/magic-animate"> MagicAnimate </a></td>
+    <td>MagicAnimate: Temporally Consistent Human Image Animation using Diffusion Model</td>
+  </tr>
+  <tr style="border-top: 2px solid black">
+    <td><a href="https://github.com/levihsu/OOTDiffusion"> OOTDiffusion </a></td>
+    <td>Outfitting Fusion based Latent Diffusion for Controllable Virtual Try-on</td>
+  </tr>
+  <tr style="border-top: 2px solid black">
+    <td><a href="https://github.com/vladmandic/automatic"> SD.Next </a></td>
+    <td>SD.Next: Advanced Implementation of Stable Diffusion and other Diffusion-based generative image models</td>
+  </tr>
+  <tr style="border-top: 2px solid black">
+    <td><a href="https://github.com/ashawkey/stable-dreamfusion"> stable-dreamfusion </a></td>
+    <td>Text-to-3D & Image-to-3D & Mesh Exportation with NeRF + Diffusion</td>
+  </tr>
+  <tr style="border-top: 2px solid black">
+    <td><a href="https://github.com/HVision-NKU/StoryDiffusion"> StoryDiffusion </a></td>
+    <td>StoryDiffusion can create a magic story by generating consistent images and videos.</td>
+  </tr>
+  <tr style="border-top: 2px solid black">
+    <td><a href="https://github.com/cumulo-autumn/StreamDiffusion"> StreamDiffusion </a></td>
+    <td>A Pipeline-Level Solution for Real-Time Interactive Generation</td>
+  </tr>
+</table>
@@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License.

 [InstructPix2Pix](https://hf.co/papers/2211.09800) is a Stable Diffusion model trained to edit images from human-provided instructions. For example, your prompt can be "turn the clouds rainy" and the model will edit the input image accordingly. This model is conditioned on the text prompt (or editing instruction) and the input image.

-This guide will explore the [train_instruct_pix2pix.py](https://github.com/huggingface/diffusers/blob/main/examples/instruct_pix2pix/train_instruct_pix2pix.py) training script to help you become familiar with it, and how you can adapt it for your own use-case.
+This guide will explore the [train_instruct_pix2pix.py](https://github.com/huggingface/diffusers/blob/main/examples/instruct_pix2pix/train_instruct_pix2pix.py) training script to help you become familiar with it, and how you can adapt it for your own use case.

 Before running the script, make sure you install the library from source:

@@ -117,7 +117,7 @@ optimizer = optimizer_cls(
 )
 ```

-Next, the edited images and and edit instructions are [preprocessed](https://github.com/huggingface/diffusers/blob/64603389da01082055a901f2883c4810d1144edb/examples/instruct_pix2pix/train_instruct_pix2pix.py#L624) and [tokenized](https://github.com/huggingface/diffusers/blob/64603389da01082055a901f2883c4810d1144edb/examples/instruct_pix2pix/train_instruct_pix2pix.py#L610C24-L610C24). It is important the same image transformations are applied to the original and edited images.
+Next, the edited images and edit instructions are [preprocessed](https://github.com/huggingface/diffusers/blob/64603389da01082055a901f2883c4810d1144edb/examples/instruct_pix2pix/train_instruct_pix2pix.py#L624) and [tokenized](https://github.com/huggingface/diffusers/blob/64603389da01082055a901f2883c4810d1144edb/examples/instruct_pix2pix/train_instruct_pix2pix.py#L610C24-L610C24). It is important the same image transformations are applied to the original and edited images.

 ```py
 def preprocess_train(examples):
@@ -249,4 +249,4 @@ The SDXL training script is discussed in more detail in the [SDXL training](sdxl

 Congratulations on training your own InstructPix2Pix model! 🥳 To learn more about the model, it may be helpful to:

- Read the [Instruction-tuning Stable Diffusion with InstructPix2Pix](https://huggingface.co/blog/instruction-tuning-sd) blog post to learn more about some experiments we've done with InstructPix2Pix, dataset preparation, and results for different instructions.
+- Read the [Instruction-tuning Stable Diffusion with InstructPix2Pix](https://huggingface.co/blog/instruction-tuning-sd) blog post to learn more about some experiments we've done with InstructPix2Pix, dataset preparation, and results for different instructions.
@@ -340,8 +340,8 @@ Now you can wrap all these components together in a training loop with 🤗 Acce
 ...                 loss = F.mse_loss(noise_pred, noise)
 ...                 accelerator.backward(loss)

-...             if (step + 1) % config.gradient_accumulation_steps == 0:
-...                 accelerator.clip_grad_norm_(model.parameters(), 1.0)
+...                 if accelerator.sync_gradients:
+...                     accelerator.clip_grad_norm_(model.parameters(), 1.0)
 ...                 optimizer.step()
 ...                 lr_scheduler.step()
 ...                 optimizer.zero_grad()
@@ -35,7 +35,7 @@ pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu
 ```

 > [!TIP]
-> The results reported below are from a 80GB 400W A100 with its clock rate set to the maximum. 
+> The results reported below are from a 80GB 400W A100 with its clock rate set to the maximum.
 > If you're interested in the full benchmarking code, take a look at [huggingface/diffusion-fast](https://github.com/huggingface/diffusion-fast).


@@ -168,7 +168,7 @@ Using SDPA attention and compiling both the UNet and VAE cuts the latency from 3
 </div>

 > [!TIP]
-> From PyTorch 2.3.1, you can control the caching behavior of `torch.compile()`. This is particularly beneficial for compilation modes like `"max-autotune"` which performs a grid-search over several compilation flags to find the optimal configuration. Learn more in the [Compile Time Caching in torch.compile](https://pytorch.org/tutorials/recipes/torch_compile_caching_tutorial.html) tutorial. 
+> From PyTorch 2.3.1, you can control the caching behavior of `torch.compile()`. This is particularly beneficial for compilation modes like `"max-autotune"` which performs a grid-search over several compilation flags to find the optimal configuration. Learn more in the [Compile Time Caching in torch.compile](https://pytorch.org/tutorials/recipes/torch_compile_caching_tutorial.html) tutorial.

 ### Prevent graph breaks

@@ -18,13 +18,13 @@ A modern diffusion model, like [Stable Diffusion XL (SDXL)](../using-diffusers/s
 * Two text encoders
 * A UNet for denoising

-Usually, the text encoders and the denoiser are much larger compared to the VAE. 
+Usually, the text encoders and the denoiser are much larger compared to the VAE.

 As models get bigger and better, it’s possible your model is so big that even a single copy won’t fit in memory. But that doesn’t mean it can’t be loaded. If you have more than one GPU, there is more memory available to store your model. In this case, it’s better to split your model checkpoint into several smaller *checkpoint shards*.

 When a text encoder checkpoint has multiple shards, like [T5-xxl for SD3](https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers/tree/main/text_encoder_3), it is automatically handled by the [Transformers](https://huggingface.co/docs/transformers/index) library as it is a required dependency of Diffusers when using the [`StableDiffusion3Pipeline`]. More specifically, Transformers will automatically handle the loading of multiple shards within the requested model class and get it ready so that inference can be performed.

-The denoiser checkpoint can also have multiple shards and supports inference thanks to the [Accelerate](https://huggingface.co/docs/accelerate/index) library. 
+The denoiser checkpoint can also have multiple shards and supports inference thanks to the [Accelerate](https://huggingface.co/docs/accelerate/index) library.

 > [!TIP]
 > Refer to the [Handling big models for inference](https://huggingface.co/docs/accelerate/main/en/concept_guides/big_model_inference) guide for general guidance when working with big models that are hard to fit into memory.
@@ -43,7 +43,7 @@ unet.save_pretrained("sdxl-unet-sharded", max_shard_size="5GB")
 The size of the fp32 variant of the SDXL UNet checkpoint is ~10.4GB. Set the `max_shard_size` parameter to 5GB to create 3 shards. After saving, you can load them in [`StableDiffusionXLPipeline`]:

 ```python
-from diffusers import UNet2DConditionModel, StableDiffusionXLPipeline 
+from diffusers import UNet2DConditionModel, StableDiffusionXLPipeline
 import torch

 unet = UNet2DConditionModel.from_pretrained(
@@ -57,14 +57,14 @@ image = pipeline("a cute dog running on the grass", num_inference_steps=30).imag
 image.save("dog.png")
 ```

-If placing all the model-level components on the GPU at once is not feasible, use [`~DiffusionPipeline.enable_model_cpu_offload`] to help you: 
+If placing all the model-level components on the GPU at once is not feasible, use [`~DiffusionPipeline.enable_model_cpu_offload`] to help you:

 ```diff
 - pipeline.to("cuda")
 + pipeline.enable_model_cpu_offload()
 ```

-In general, we recommend sharding when a checkpoint is more than 5GB (in fp32). 
+In general, we recommend sharding when a checkpoint is more than 5GB (in fp32).

 ## Device placement

@@ -34,7 +34,7 @@ pipe_id = "stabilityai/stable-diffusion-xl-base-1.0"
 pipe = DiffusionPipeline.from_pretrained(pipe_id, torch_dtype=torch.float16).to("cuda")
 ```

-Next, load a [CiroN2022/toy-face](https://huggingface.co/CiroN2022/toy-face) adapter with the [`~diffusers.loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] method. With the 🤗 PEFT integration, you can assign a specific `adapter_name` to the checkpoint, which let's you easily switch between different LoRA checkpoints. Let's call this adapter `"toy"`.
+Next, load a [CiroN2022/toy-face](https://huggingface.co/CiroN2022/toy-face) adapter with the [`~diffusers.loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] method. With the 🤗 PEFT integration, you can assign a specific `adapter_name` to the checkpoint, which lets you easily switch between different LoRA checkpoints. Let's call this adapter `"toy"`.

 ```python
 pipe.load_lora_weights("CiroN2022/toy-face", weight_name="toy_face_sdxl.safetensors", adapter_name="toy")
@@ -191,7 +191,7 @@ image

 ## Manage active adapters

-You have attached multiple adapters in this tutorial, and if you're feeling a bit lost on what adapters have been attached to the pipeline's components, use the [`~diffusers.loaders.LoraLoaderMixin.get_active_adapters`] method to check the list of active adapters:
+You have attached multiple adapters in this tutorial, and if you're feeling a bit lost on what adapters have been attached to the pipeline's components, use the [`~diffusers.loaders.StableDiffusionLoraLoaderMixin.get_active_adapters`] method to check the list of active adapters:

 ```py
 active_adapters = pipe.get_active_adapters()
@@ -199,7 +199,7 @@ active_adapters
 ["toy", "pixel"]
 ```

-You can also get the active adapters of each pipeline component with [`~diffusers.loaders.LoraLoaderMixin.get_list_adapters`]:
+You can also get the active adapters of each pipeline component with [`~diffusers.loaders.StableDiffusionLoraLoaderMixin.get_list_adapters`]:

 ```py
 list_adapters_component_wise = pipe.get_list_adapters()
@@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.

 # Pipeline callbacks

-The denoising loop of a pipeline can be modified with custom defined functions using the `callback_on_step_end` parameter. The callback function is executed at the end of each step, and modifies the pipeline attributes and variables for the next step. This is really useful for *dynamically* adjusting certain pipeline attributes or modifying tensor variables. This versatility allows for interesting use-cases such as changing the prompt embeddings at each timestep, assigning different weights to the prompt embeddings, and editing the guidance scale. With callbacks, you can implement new features without modifying the underlying code!
+The denoising loop of a pipeline can be modified with custom defined functions using the `callback_on_step_end` parameter. The callback function is executed at the end of each step, and modifies the pipeline attributes and variables for the next step. This is really useful for *dynamically* adjusting certain pipeline attributes or modifying tensor variables. This versatility allows for interesting use cases such as changing the prompt embeddings at each timestep, assigning different weights to the prompt embeddings, and editing the guidance scale. With callbacks, you can implement new features without modifying the underlying code!

 > [!TIP]
 > 🤗 Diffusers currently only supports `callback_on_step_end`, but feel free to open a [feature request](https://github.com/huggingface/diffusers/issues/new/choose) if you have a cool use-case and require a callback function with a different execution point!
@@ -75,7 +75,7 @@ out.images[0].save("official_callback.png")
    <figcaption class="mt-2 text-center text-sm text-gray-500">without SDXLCFGCutoffCallback</figcaption>
  </div>
  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/with_cfg_callback.png" alt="generated image of a a sports car at the road with cfg callback" />
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/with_cfg_callback.png" alt="generated image of a sports car at the road with cfg callback" />
    <figcaption class="mt-2 text-center text-sm text-gray-500">with SDXLCFGCutoffCallback</figcaption>
  </div>
 </div>
@@ -256,7 +256,7 @@ make_image_grid([init_image, mask_image, output], rows=1, cols=3)

 ## Guess mode

-[Guess mode](https://github.com/lllyasviel/ControlNet/discussions/188) does not require supplying a prompt to a ControlNet at all! This forces the ControlNet encoder to do it's best to "guess" the contents of the input control map (depth map, pose estimation, canny edge, etc.).
+[Guess mode](https://github.com/lllyasviel/ControlNet/discussions/188) does not require supplying a prompt to a ControlNet at all! This forces the ControlNet encoder to do its best to "guess" the contents of the input control map (depth map, pose estimation, canny edge, etc.).

 Guess mode adjusts the scale of the output residuals from a ControlNet by a fixed ratio depending on the block depth. The shallowest `DownBlock` corresponds to 0.1, and as the blocks get deeper, the scale increases exponentially such that the scale of the `MidBlock` output becomes 1.0.

@@ -289,9 +289,9 @@ scheduler = DPMSolverMultistepScheduler.from_pretrained(pipe_id, subfolder="sche
 3. Load an image processor:

 ```python
-from transformers import CLIPFeatureExtractor
+from transformers import CLIPImageProcessor

-feature_extractor = CLIPFeatureExtractor.from_pretrained(pipe_id, subfolder="feature_extractor")
+feature_extractor = CLIPImageProcessor.from_pretrained(pipe_id, subfolder="feature_extractor")
 ```

 <Tip warning={true}>
@@ -64,7 +64,7 @@ image
 </hfoption>
 <hfoption id="LCM-LoRA">

-To use LCM-LoRAs, you need to replace the scheduler with the [`LCMScheduler`] and load the LCM-LoRA weights with the [`~loaders.LoraLoaderMixin.load_lora_weights`] method. Then you can use the pipeline as usual, and pass a text prompt to generate an image in just 4 steps.
+To use LCM-LoRAs, you need to replace the scheduler with the [`LCMScheduler`] and load the LCM-LoRA weights with the [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method. Then you can use the pipeline as usual, and pass a text prompt to generate an image in just 4 steps.

 A couple of notes to keep in mind when using LCM-LoRAs are:

@@ -156,7 +156,7 @@ image
 </hfoption>
 <hfoption id="LCM-LoRA">

-To use LCM-LoRAs for image-to-image, you need to replace the scheduler with the [`LCMScheduler`] and load the LCM-LoRA weights with the [`~loaders.LoraLoaderMixin.load_lora_weights`] method. Then you can use the pipeline as usual, and pass a text prompt and initial image to generate an image in just 4 steps.
+To use LCM-LoRAs for image-to-image, you need to replace the scheduler with the [`LCMScheduler`] and load the LCM-LoRA weights with the [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method. Then you can use the pipeline as usual, and pass a text prompt and initial image to generate an image in just 4 steps.

 > [!TIP]
 > Experiment with different values for `num_inference_steps`, `strength`, and `guidance_scale` to get the best results.
@@ -207,7 +207,7 @@ image

 ## Inpainting

-To use LCM-LoRAs for inpainting, you need to replace the scheduler with the [`LCMScheduler`] and load the LCM-LoRA weights with the [`~loaders.LoraLoaderMixin.load_lora_weights`] method. Then you can use the pipeline as usual, and pass a text prompt, initial image, and mask image to generate an image in just 4 steps.
+To use LCM-LoRAs for inpainting, you need to replace the scheduler with the [`LCMScheduler`] and load the LCM-LoRA weights with the [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method. Then you can use the pipeline as usual, and pass a text prompt, initial image, and mask image to generate an image in just 4 steps.

 ```py
 import torch
@@ -262,7 +262,7 @@ LCMs are compatible with adapters like LoRA, ControlNet, T2I-Adapter, and Animat
 <hfoptions id="lcm-lora">
 <hfoption id="LCM">

-Load the LCM checkpoint for your supported model into [`UNet2DConditionModel`] and replace the scheduler with the [`LCMScheduler`]. Then you can use the [`~loaders.LoraLoaderMixin.load_lora_weights`] method to load the LoRA weights into the LCM and generate a styled image in a few steps.
+Load the LCM checkpoint for your supported model into [`UNet2DConditionModel`] and replace the scheduler with the [`LCMScheduler`]. Then you can use the [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method to load the LoRA weights into the LCM and generate a styled image in a few steps.

 ```python
 from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel, LCMScheduler
@@ -294,7 +294,7 @@ image
 </hfoption>
 <hfoption id="LCM-LoRA">

-Replace the scheduler with the [`LCMScheduler`]. Then you can use the [`~loaders.LoraLoaderMixin.load_lora_weights`] method to load the LCM-LoRA weights and the style LoRA you want to use. Combine both LoRA adapters with the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] method and generate a styled image in a few steps.
+Replace the scheduler with the [`LCMScheduler`]. Then you can use the [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method to load the LCM-LoRA weights and the style LoRA you want to use. Combine both LoRA adapters with the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] method and generate a styled image in a few steps.

 ```py
 import torch
@@ -389,7 +389,7 @@ make_image_grid([canny_image, image], rows=1, cols=2)
 </hfoption>
 <hfoption id="LCM-LoRA">

-Load a ControlNet model trained on canny images and pass it to the [`ControlNetModel`]. Then you can load a Stable Diffusion v1.5 model into [`StableDiffusionControlNetPipeline`] and replace the scheduler with the [`LCMScheduler`]. Use the [`~loaders.LoraLoaderMixin.load_lora_weights`] method to load the LCM-LoRA weights, and pass the canny image to the pipeline and generate an image.
+Load a ControlNet model trained on canny images and pass it to the [`ControlNetModel`]. Then you can load a Stable Diffusion v1.5 model into [`StableDiffusionControlNetPipeline`] and replace the scheduler with the [`LCMScheduler`]. Use the [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method to load the LCM-LoRA weights, and pass the canny image to the pipeline and generate an image.

 > [!TIP]
 > Experiment with different values for `num_inference_steps`, `controlnet_conditioning_scale`, `cross_attention_kwargs`, and `guidance_scale` to get the best results.
@@ -525,7 +525,7 @@ image = pipe(
 </hfoption>
 <hfoption id="LCM-LoRA">

-Load a T2IAdapter trained on canny images and pass it to the [`StableDiffusionXLAdapterPipeline`]. Replace the scheduler with the [`LCMScheduler`], and use the [`~loaders.LoraLoaderMixin.load_lora_weights`] method to load the LCM-LoRA weights. Pass the canny image to the pipeline and generate an image.
+Load a T2IAdapter trained on canny images and pass it to the [`StableDiffusionXLAdapterPipeline`]. Replace the scheduler with the [`LCMScheduler`], and use the [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method to load the LCM-LoRA weights. Pass the canny image to the pipeline and generate an image.

 ```py
 import torch
@@ -212,14 +212,14 @@ TCD-LoRA is very versatile, and it can be combined with other adapter types like
 import torch
 import numpy as np
 from PIL import Image
-from transformers import DPTFeatureExtractor, DPTForDepthEstimation
+from transformers import DPTImageProcessor, DPTForDepthEstimation
 from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline
 from diffusers.utils import load_image, make_image_grid
 from scheduling_tcd import TCDScheduler

 device = "cuda"
 depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to(device)
-feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-hybrid-midas")
+feature_extractor = DPTImageProcessor.from_pretrained("Intel/dpt-hybrid-midas")

 def get_depth_map(image):
    image = feature_extractor(images=image, return_tensors="pt").pixel_values.to(device)
@@ -116,7 +116,7 @@ import torch
 pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
 ```

-Then use the [`~loaders.LoraLoaderMixin.load_lora_weights`] method to load the [ostris/super-cereal-sdxl-lora](https://huggingface.co/ostris/super-cereal-sdxl-lora) weights and specify the weights filename from the repository:
+Then use the [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method to load the [ostris/super-cereal-sdxl-lora](https://huggingface.co/ostris/super-cereal-sdxl-lora) weights and specify the weights filename from the repository:

 ```py
 pipeline.load_lora_weights("ostris/super-cereal-sdxl-lora", weight_name="cereal_box_sdxl_v1.safetensors")
@@ -129,7 +129,7 @@ image
    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_lora.png" />
 </div>

-The [`~loaders.LoraLoaderMixin.load_lora_weights`] method loads LoRA weights into both the UNet and text encoder. It is the preferred way for loading LoRAs because it can handle cases where:
+The [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method loads LoRA weights into both the UNet and text encoder. It is the preferred way for loading LoRAs because it can handle cases where:

 - the LoRA weights don't have separate identifiers for the UNet and text encoder
 - the LoRA weights have separate identifiers for the UNet and text encoder
@@ -153,7 +153,7 @@ image
    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_attn_proc.png" />
 </div>

-To unload the LoRA weights, use the [`~loaders.LoraLoaderMixin.unload_lora_weights`] method to discard the LoRA weights and restore the model to its original weights:
+To unload the LoRA weights, use the [`~loaders.StableDiffusionLoraLoaderMixin.unload_lora_weights`] method to discard the LoRA weights and restore the model to its original weights:

 ```py
 pipeline.unload_lora_weights()
@@ -161,9 +161,9 @@ pipeline.unload_lora_weights()

 ### Adjust LoRA weight scale

-For both [`~loaders.LoraLoaderMixin.load_lora_weights`] and [`~loaders.UNet2DConditionLoadersMixin.load_attn_procs`], you can pass the `cross_attention_kwargs={"scale": 0.5}` parameter to adjust how much of the LoRA weights to use. A value of `0` is the same as only using the base model weights, and a value of `1` is equivalent to using the fully finetuned LoRA.
+For both [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] and [`~loaders.UNet2DConditionLoadersMixin.load_attn_procs`], you can pass the `cross_attention_kwargs={"scale": 0.5}` parameter to adjust how much of the LoRA weights to use. A value of `0` is the same as only using the base model weights, and a value of `1` is equivalent to using the fully finetuned LoRA.

-For more granular control on the amount of LoRA weights used per layer, you can use [`~loaders.LoraLoaderMixin.set_adapters`] and pass a dictionary specifying by how much to scale the weights in each layer by.
+For more granular control on the amount of LoRA weights used per layer, you can use [`~loaders.StableDiffusionLoraLoaderMixin.set_adapters`] and pass a dictionary specifying by how much to scale the weights in each layer by.
 ```python
 pipe = ... # create pipeline
 pipe.load_lora_weights(..., adapter_name="my_adapter")
@@ -186,7 +186,7 @@ This also works with multiple adapters - see [this guide](https://huggingface.co

 <Tip warning={true}>

-Currently, [`~loaders.LoraLoaderMixin.set_adapters`] only supports scaling attention weights. If a LoRA has other parts (e.g., resnets or down-/upsamplers), they will keep a scale of 1.0.
+Currently, [`~loaders.StableDiffusionLoraLoaderMixin.set_adapters`] only supports scaling attention weights. If a LoRA has other parts (e.g., resnets or down-/upsamplers), they will keep a scale of 1.0.

 </Tip>

@@ -203,7 +203,7 @@ To load a Kohya LoRA, let's download the [Blueprintify SD XL 1.0](https://civita
 !wget https://civitai.com/api/download/models/168776 -O blueprintify-sd-xl-10.safetensors
 ```

-Load the LoRA checkpoint with the [`~loaders.LoraLoaderMixin.load_lora_weights`] method, and specify the filename in the `weight_name` parameter:
+Load the LoRA checkpoint with the [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method, and specify the filename in the `weight_name` parameter:

 ```py
 from diffusers import AutoPipelineForText2Image
@@ -227,7 +227,7 @@ image
 Some limitations of using Kohya LoRAs with 🤗 Diffusers include:

 - Images may not look like those generated by UIs - like ComfyUI - for multiple reasons, which are explained [here](https://github.com/huggingface/diffusers/pull/4287/#issuecomment-1655110736).
- [LyCORIS checkpoints](https://github.com/KohakuBlueleaf/LyCORIS) aren't fully supported. The [`~loaders.LoraLoaderMixin.load_lora_weights`] method loads LyCORIS checkpoints with LoRA and LoCon modules, but Hada and LoKR are not supported.
+- [LyCORIS checkpoints](https://github.com/KohakuBlueleaf/LyCORIS) aren't fully supported. The [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method loads LyCORIS checkpoints with LoRA and LoCon modules, but Hada and LoKR are not supported.

 </Tip>

@@ -14,9 +14,9 @@ specific language governing permissions and limitations under the License.

 It can be fun and creative to use multiple [LoRAs]((https://huggingface.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora)) together to generate something entirely new and unique. This works by merging multiple LoRA weights together to produce images that are a blend of different styles. Diffusers provides a few methods to merge LoRAs depending on *how* you want to merge their weights, which can affect image quality.

-This guide will show you how to merge LoRAs using the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] and [`~peft.LoraModel.add_weighted_adapter`] methods. To improve inference speed and reduce memory-usage of merged LoRAs, you'll also see how to use the [`~loaders.LoraLoaderMixin.fuse_lora`] method to fuse the LoRA weights with the original weights of the underlying model.
+This guide will show you how to merge LoRAs using the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] and [`~peft.LoraModel.add_weighted_adapter`] methods. To improve inference speed and reduce memory-usage of merged LoRAs, you'll also see how to use the [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] method to fuse the LoRA weights with the original weights of the underlying model.

-For this guide, load a Stable Diffusion XL (SDXL) checkpoint and the [KappaNeuro/studio-ghibli-style]() and [Norod78/sdxl-chalkboarddrawing-lora]() LoRAs with the [`~loaders.LoraLoaderMixin.load_lora_weights`] method. You'll need to assign each LoRA an `adapter_name` to combine them later.
+For this guide, load a Stable Diffusion XL (SDXL) checkpoint and the [KappaNeuro/studio-ghibli-style]() and [Norod78/sdxl-chalkboarddrawing-lora]() LoRAs with the [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method. You'll need to assign each LoRA an `adapter_name` to combine them later.

 ```py
 from diffusers import DiffusionPipeline
@@ -182,9 +182,9 @@ image

 ## fuse_lora

-Both the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] and [`~peft.LoraModel.add_weighted_adapter`] methods require loading the base model and the LoRA adapters separately which incurs some overhead. The [`~loaders.LoraLoaderMixin.fuse_lora`] method allows you to fuse the LoRA weights directly with the original weights of the underlying model. This way, you're only loading the model once which can increase inference and lower memory-usage.
+Both the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] and [`~peft.LoraModel.add_weighted_adapter`] methods require loading the base model and the LoRA adapters separately which incurs some overhead. The [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] method allows you to fuse the LoRA weights directly with the original weights of the underlying model. This way, you're only loading the model once which can increase inference and lower memory-usage.

-You can use PEFT to easily fuse/unfuse multiple adapters directly into the model weights (both UNet and text encoder) using the [`~loaders.LoraLoaderMixin.fuse_lora`] method, which can lead to a speed-up in inference and lower VRAM usage.
+You can use PEFT to easily fuse/unfuse multiple adapters directly into the model weights (both UNet and text encoder) using the [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] method, which can lead to a speed-up in inference and lower VRAM usage.

 For example, if you have a base model and adapters loaded and set as active with the following adapter weights:

@@ -199,13 +199,13 @@ pipeline.load_lora_weights("lordjia/by-feng-zikai", weight_name="fengzikai_v1.0_
 pipeline.set_adapters(["ikea", "feng"], adapter_weights=[0.7, 0.8])
 ```

-Fuse these LoRAs into the UNet with the [`~loaders.LoraLoaderMixin.fuse_lora`] method. The `lora_scale` parameter controls how much to scale the output by with the LoRA weights. It is important to make the `lora_scale` adjustments in the [`~loaders.LoraLoaderMixin.fuse_lora`] method because it won’t work if you try to pass `scale` to the `cross_attention_kwargs` in the pipeline.
+Fuse these LoRAs into the UNet with the [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] method. The `lora_scale` parameter controls how much to scale the output by with the LoRA weights. It is important to make the `lora_scale` adjustments in the [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] method because it won’t work if you try to pass `scale` to the `cross_attention_kwargs` in the pipeline.

 ```py
 pipeline.fuse_lora(adapter_names=["ikea", "feng"], lora_scale=1.0)
 ```

-Then you should use [`~loaders.LoraLoaderMixin.unload_lora_weights`] to unload the LoRA weights since they've already been fused with the underlying base model. Finally, call [`~DiffusionPipeline.save_pretrained`] to save the fused pipeline locally or you could call [`~DiffusionPipeline.push_to_hub`] to push the fused pipeline to the Hub.
+Then you should use [`~loaders.StableDiffusionLoraLoaderMixin.unload_lora_weights`] to unload the LoRA weights since they've already been fused with the underlying base model. Finally, call [`~DiffusionPipeline.save_pretrained`] to save the fused pipeline locally or you could call [`~DiffusionPipeline.push_to_hub`] to push the fused pipeline to the Hub.

 ```py
 pipeline.unload_lora_weights()
@@ -226,7 +226,7 @@ image = pipeline("A bowl of ramen shaped like a cute kawaii bear, by Feng Zikai"
 image
 ```

-You can call [`~loaders.LoraLoaderMixin.unfuse_lora`] to restore the original model's weights (for example, if you want to use a different `lora_scale` value). However, this only works if you've only fused one LoRA adapter to the original model. If you've fused multiple LoRAs, you'll need to reload the model.
+You can call [`~loaders.StableDiffusionLoraLoaderMixin.unfuse_lora`] to restore the original model's weights (for example, if you want to use a different `lora_scale` value). However, this only works if you've only fused one LoRA adapter to the original model. If you've fused multiple LoRAs, you'll need to reload the model.

 ```py
 pipeline.unfuse_lora()
@@ -74,7 +74,7 @@ pipeline = StableDiffusionPipeline.from_single_file(

 [LoRA](https://hf.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora) is a lightweight adapter that is fast and easy to train, making them especially popular for generating images in a certain way or style. These adapters are commonly stored in a safetensors file, and are widely popular on model sharing platforms like [civitai](https://civitai.com/).

-LoRAs are loaded into a base model with the [`~loaders.LoraLoaderMixin.load_lora_weights`] method.
+LoRAs are loaded into a base model with the [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method.

 ```py
 from diffusers import StableDiffusionXLPipeline
@@ -22,7 +22,7 @@ This guide will show you how to use PAG for various tasks and use cases.
 You can apply PAG to the [`StableDiffusionXLPipeline`] for tasks such as text-to-image, image-to-image, and inpainting. To enable PAG for a specific task, load the pipeline using the [AutoPipeline](../api/pipelines/auto_pipeline) API with the `enable_pag=True` flag and the `pag_applied_layers` argument.

 > [!TIP]
-> 🤗 Diffusers currently only supports using PAG with selected SDXL pipelines, but feel free to open a [feature request](https://github.com/huggingface/diffusers/issues/new/choose) if you want to add PAG support to a new pipeline!
+> 🤗 Diffusers currently only supports using PAG with selected SDXL pipelines and [`PixArtSigmaPAGPipeline`]. But feel free to open a [feature request](https://github.com/huggingface/diffusers/issues/new/choose) if you want to add PAG support to a new pipeline!

 <hfoptions id="tasks">
 <hfoption id="Text-to-image">
@@ -130,10 +130,10 @@ prompt = "a dog catching a frisbee in the jungle"

 generator = torch.Generator(device="cpu").manual_seed(0)
 image = pipeline(
-    prompt, 
-    image=init_image, 
-    strength=0.8, 
-    guidance_scale=guidance_scale, 
+    prompt,
+    image=init_image,
+    strength=0.8,
+    guidance_scale=guidance_scale,
    pag_scale=pag_scale,
    generator=generator).images[0]
 ```
@@ -161,14 +161,14 @@ pipeline_inpaint = AutoPipelineForInpaiting.from_pretrained("stabilityai/stable-
 pipeline = AutoPipelineForInpaiting.from_pipe(pipeline_inpaint, enable_pag=True)
 ```

-This still works when your pipeline has a different task: 
+This still works when your pipeline has a different task:

 ```py
 pipeline_t2i = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16)
 pipeline = AutoPipelineForInpaiting.from_pipe(pipeline_t2i, enable_pag=True)
 ```

-Let's generate an image! 
+Let's generate an image!

 ```py
 img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
@@ -258,7 +258,7 @@ for pag_scale in [0.0, 3.0]:
  </div>
 </div>

-## PAG with IP-Adapter 
+## PAG with IP-Adapter

 [IP-Adapter](https://hf.co/papers/2308.06721) is a popular model that can be plugged into diffusion models to enable image prompting without any changes to the underlying model. You can enable PAG on a pipeline with IP-Adapter loaded.

@@ -317,7 +317,7 @@ PAG reduces artifacts and improves the overall compposition.
 </div>


-## Configure parameters 
+## Configure parameters

 ### pag_applied_layers

@@ -10,7 +10,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# 철학 [[philosophy]] 
+# 철학 [[philosophy]]

 🧨 Diffusers는 다양한 모달리티에서 **최신의** 사전 훈련된 diffusion 모델을 제공합니다.
 그 목적은 추론과 훈련을 위한 **모듈식 툴박스**로 사용되는 것입니다.
@@ -307,7 +307,7 @@ print(pipeline)

 위의 코드 출력 결과를 확인해보면, `pipeline`은 [`StableDiffusionPipeline`]의 인스턴스이며, 다음과 같이 총 7개의 컴포넌트로 구성된다는 것을 알 수 있습니다.

- `"feature_extractor"`: [`~transformers.CLIPFeatureExtractor`]의 인스턴스
+- `"feature_extractor"`: [`~transformers.CLIPImageProcessor`]의 인스턴스
 - `"safety_checker"`: 유해한 컨텐츠를 스크리닝하기 위한 [컴포넌트](https://github.com/huggingface/diffusers/blob/e55687e1e15407f60f32242027b7bb8170e58266/src/diffusers/pipelines/stable_diffusion/safety_checker.py#L32)
 - `"scheduler"`: [`PNDMScheduler`]의 인스턴스
 - `"text_encoder"`: [`~transformers.CLIPTextModel`]의 인스턴스
@@ -127,7 +127,7 @@ image = pipeline(prompt, num_inference_steps=50).images[0]

 [Automatic1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui) (A1111)은 Stable Diffusion을 위해 널리 사용되는 웹 UI로, [Civitai](https://civitai.com/) 와 같은 모델 공유 플랫폼을 지원합니다. 특히 LoRA 기법으로 학습된 모델은 학습 속도가 빠르고 완전히 파인튜닝된 모델보다 파일 크기가 훨씬 작기 때문에 인기가 높습니다.

-🤗 Diffusers는 [`~loaders.LoraLoaderMixin.load_lora_weights`]:를 사용하여 A1111 LoRA 체크포인트 불러오기를 지원합니다:
+🤗 Diffusers는 [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]:를 사용하여 A1111 LoRA 체크포인트 불러오기를 지원합니다:

 ```py
 from diffusers import DiffusionPipeline, UniPCMultistepScheduler
@@ -52,7 +52,7 @@ pipeline = pipeline.to("cuda")

 Text-to-image의 경우 텍스트 프롬프트를 전달합니다. 기본적으로 SDXL Turbo는 512x512 이미지를 생성하며, 이 해상도에서 최상의 결과를 제공합니다. `height` 및 `width` 매개 변수를 768x768 또는 1024x1024로 설정할 수 있지만 이 경우 품질 저하를 예상할 수 있습니다.

-모델이 `guidance_scale` 없이 학습되었으므로 이를 0.0으로 설정해 비활성화해야 합니다. 단일 추론 스텝만으로도 고품질 이미지를 생성할 수 있습니다. 
+모델이 `guidance_scale` 없이 학습되었으므로 이를 0.0으로 설정해 비활성화해야 합니다. 단일 추론 스텝만으로도 고품질 이미지를 생성할 수 있습니다.
 스텝 수를 2, 3 또는 4로 늘리면 이미지 품질이 향상됩니다.

 ```py
@@ -74,7 +74,7 @@ image

 ## Image-to-image

-Image-to-image 생성의 경우 `num_inference_steps * strength`가 1보다 크거나 같은지 확인하세요. 
+Image-to-image 생성의 경우 `num_inference_steps * strength`가 1보다 크거나 같은지 확인하세요.
 Image-to-image 파이프라인은 아래 예제에서 `0.5 * 2.0 = 1` 스텝과 같이 `int(num_inference_steps * strength)` 스텝으로 실행됩니다.

 ```py
@@ -21,7 +21,7 @@ specific language governing permissions and limitations under the License.
 시작하기 전에 다음 라이브러리가 설치되어 있는지 확인하세요:

 ```py
-!pip install -q -U diffusers transformers accelerate 
+!pip install -q -U diffusers transformers accelerate
 ```

 이 모델에는 [SVD](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid)와 [SVD-XT](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt) 두 가지 종류가 있습니다. SVD 체크포인트는 14개의 프레임을 생성하도록 학습되었고, SVD-XT 체크포인트는 25개의 프레임을 생성하도록 파인튜닝되었습니다.
@@ -24,7 +24,7 @@ import PIL
 from PIL import Image

 from diffusers import StableDiffusionPipeline
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer


 def image_grid(imgs, rows, cols):
@@ -57,7 +57,7 @@ from diffusers import (
    StableDiffusionPipeline,
    UNet2DConditionModel,
 )
-from diffusers.loaders import LoraLoaderMixin
+from diffusers.loaders import StableDiffusionLoraLoaderMixin
 from diffusers.optimization import get_scheduler
 from diffusers.training_utils import compute_snr
 from diffusers.utils import (
@@ -71,7 +71,7 @@ from diffusers.utils.import_utils import is_xformers_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.30.0.dev0")
+check_min_version("0.30.0")

 logger = get_logger(__name__)

@@ -1318,11 +1318,11 @@ def main(args):
            else:
                raise ValueError(f"unexpected save model: {model.__class__}")

-        lora_state_dict, network_alphas = LoraLoaderMixin.lora_state_dict(input_dir)
-        LoraLoaderMixin.load_lora_into_unet(lora_state_dict, network_alphas=network_alphas, unet=unet_)
+        lora_state_dict, network_alphas = StableDiffusionLoraLoaderMixin.lora_state_dict(input_dir)
+        StableDiffusionLoraLoaderMixin.load_lora_into_unet(lora_state_dict, network_alphas=network_alphas, unet=unet_)

        text_encoder_state_dict = {k: v for k, v in lora_state_dict.items() if "text_encoder." in k}
-        LoraLoaderMixin.load_lora_into_text_encoder(
+        StableDiffusionLoraLoaderMixin.load_lora_into_text_encoder(
            text_encoder_state_dict, network_alphas=network_alphas, text_encoder=text_encoder_one_
        )

@@ -60,7 +60,7 @@ from diffusers import (
    StableDiffusionXLPipeline,
    UNet2DConditionModel,
 )
-from diffusers.loaders import LoraLoaderMixin
+from diffusers.loaders import StableDiffusionLoraLoaderMixin
 from diffusers.optimization import get_scheduler
 from diffusers.training_utils import _set_state_dict_into_text_encoder, cast_training_params, compute_snr
 from diffusers.utils import (
@@ -79,7 +79,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.30.0.dev0")
+check_min_version("0.30.0")

 logger = get_logger(__name__)

@@ -1646,7 +1646,7 @@ def main(args):
            else:
                raise ValueError(f"unexpected save model: {model.__class__}")

-        lora_state_dict, network_alphas = LoraLoaderMixin.lora_state_dict(input_dir)
+        lora_state_dict, network_alphas = StableDiffusionLoraLoaderMixin.lora_state_dict(input_dir)

        unet_state_dict = {f'{k.replace("unet.", "")}': v for k, v in lora_state_dict.items() if k.startswith("unet.")}
        unet_state_dict = convert_unet_state_dict_to_peft(unet_state_dict)
@@ -41,7 +41,7 @@ from transformers import (

 import diffusers.optimization
 from diffusers import AmusedPipeline, AmusedScheduler, EMAModel, UVit2DModel, VQModel
-from diffusers.loaders import LoraLoaderMixin
+from diffusers.loaders import AmusedLoraLoaderMixin
 from diffusers.utils import is_wandb_available


@@ -532,7 +532,7 @@ def main(args):
                weights.pop()

            if transformer_lora_layers_to_save is not None or text_encoder_lora_layers_to_save is not None:
-                LoraLoaderMixin.save_lora_weights(
+                AmusedLoraLoaderMixin.save_lora_weights(
                    output_dir,
                    transformer_lora_layers=transformer_lora_layers_to_save,
                    text_encoder_lora_layers=text_encoder_lora_layers_to_save,
@@ -566,11 +566,11 @@ def main(args):
                raise ValueError(f"unexpected save model: {model.__class__}")

        if transformer is not None or text_encoder_ is not None:
-            lora_state_dict, network_alphas = LoraLoaderMixin.lora_state_dict(input_dir)
-            LoraLoaderMixin.load_lora_into_text_encoder(
+            lora_state_dict, network_alphas = AmusedLoraLoaderMixin.lora_state_dict(input_dir)
+            AmusedLoraLoaderMixin.load_lora_into_text_encoder(
                lora_state_dict, network_alphas=network_alphas, text_encoder=text_encoder_
            )
-            LoraLoaderMixin.load_lora_into_transformer(
+            AmusedLoraLoaderMixin.load_lora_into_transformer(
                lora_state_dict, network_alphas=network_alphas, transformer=transformer
            )

@@ -1435,9 +1435,9 @@ import requests
 import torch
 from diffusers import DiffusionPipeline
 from PIL import Image
-from transformers import CLIPFeatureExtractor, CLIPModel
+from transformers import CLIPImageProcessor, CLIPModel

-feature_extractor = CLIPFeatureExtractor.from_pretrained(
+feature_extractor = CLIPImageProcessor.from_pretrained(
    "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
 )
 clip_model = CLIPModel.from_pretrained(
@@ -1487,17 +1487,16 @@ NOTE: The ONNX conversions and TensorRT engine build may take up to 30 minutes.
 ```python
 import torch
 from diffusers import DDIMScheduler
-from diffusers.pipelines.stable_diffusion import StableDiffusionPipeline
+from diffusers.pipelines import DiffusionPipeline

 # Use the DDIMScheduler scheduler here instead
-scheduler = DDIMScheduler.from_pretrained("stabilityai/stable-diffusion-2-1",
-                                            subfolder="scheduler")
+scheduler = DDIMScheduler.from_pretrained("stabilityai/stable-diffusion-2-1", subfolder="scheduler")

-pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1",
-                                                custom_pipeline="stable_diffusion_tensorrt_txt2img",
-                                                variant='fp16',
-                                                torch_dtype=torch.float16,
-                                                scheduler=scheduler,)
+pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1",
+    custom_pipeline="stable_diffusion_tensorrt_txt2img",
+    variant='fp16',
+    torch_dtype=torch.float16,
+    scheduler=scheduler,)

 # re-use cached folder to save ONNX models and TensorRT Engines
 pipe.set_cached_folder("stabilityai/stable-diffusion-2-1", variant='fp16',)
@@ -1641,18 +1640,18 @@ from io import BytesIO
 from PIL import Image
 import torch
 from diffusers import DDIMScheduler
-from diffusers.pipelines.stable_diffusion import StableDiffusionImg2ImgPipeline
+from diffusers import DiffusionPipeline

 # Use the DDIMScheduler scheduler here instead
 scheduler = DDIMScheduler.from_pretrained("stabilityai/stable-diffusion-2-1",
                                            subfolder="scheduler")


-pipe = StableDiffusionImg2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-2-1",
-                                                custom_pipeline="stable_diffusion_tensorrt_img2img",
-                                                variant='fp16',
-                                                torch_dtype=torch.float16,
-                                                scheduler=scheduler,)
+pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1",
+                                            custom_pipeline="stable_diffusion_tensorrt_img2img",
+                                            variant='fp16',
+                                            torch_dtype=torch.float16,
+                                            scheduler=scheduler,)

 # re-use cached folder to save ONNX models and TensorRT Engines
 pipe.set_cached_folder("stabilityai/stable-diffusion-2-1", variant='fp16',)
@@ -2123,7 +2122,7 @@ import torch
 import open_clip
 from open_clip import SimpleTokenizer
 from diffusers import DiffusionPipeline
-from transformers import CLIPFeatureExtractor, CLIPModel
+from transformers import CLIPImageProcessor, CLIPModel


 def download_image(url):
@@ -2131,7 +2130,7 @@ def download_image(url):
    return PIL.Image.open(BytesIO(response.content)).convert("RGB")

 # Loading additional models
-feature_extractor = CLIPFeatureExtractor.from_pretrained(
+feature_extractor = CLIPImageProcessor.from_pretrained(
    "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
 )
 clip_model = CLIPModel.from_pretrained(
@@ -2231,12 +2230,12 @@ from io import BytesIO
 from PIL import Image
 import torch
 from diffusers import PNDMScheduler
-from diffusers.pipelines.stable_diffusion import StableDiffusionInpaintPipeline
+from diffusers.pipelines import DiffusionPipeline

 # Use the PNDMScheduler scheduler here instead
 scheduler = PNDMScheduler.from_pretrained("stabilityai/stable-diffusion-2-inpainting", subfolder="scheduler")

-pipe = StableDiffusionInpaintPipeline.from_pretrained("stabilityai/stable-diffusion-2-inpainting",
+pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-inpainting",
    custom_pipeline="stable_diffusion_tensorrt_inpaint",
    variant='fp16',
    torch_dtype=torch.float16,
@@ -7,7 +7,7 @@ import PIL.Image
 import torch
 from torch.nn import functional as F
 from torchvision import transforms
-from transformers import CLIPFeatureExtractor, CLIPModel, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPImageProcessor, CLIPModel, CLIPTextModel, CLIPTokenizer

 from diffusers import (
    AutoencoderKL,
@@ -86,7 +86,7 @@ class CLIPGuidedImagesMixingStableDiffusion(DiffusionPipeline, StableDiffusionMi
        tokenizer: CLIPTokenizer,
        unet: UNet2DConditionModel,
        scheduler: Union[PNDMScheduler, LMSDiscreteScheduler, DDIMScheduler, DPMSolverMultistepScheduler],
-        feature_extractor: CLIPFeatureExtractor,
+        feature_extractor: CLIPImageProcessor,
        coca_model=None,
        coca_tokenizer=None,
        coca_transform=None,
@@ -7,7 +7,7 @@ import torch
 from torch import nn
 from torch.nn import functional as F
 from torchvision import transforms
-from transformers import CLIPFeatureExtractor, CLIPModel, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPImageProcessor, CLIPModel, CLIPTextModel, CLIPTokenizer

 from diffusers import (
    AutoencoderKL,
@@ -32,9 +32,9 @@ EXAMPLE_DOC_STRING = """
        import torch
        from diffusers import DiffusionPipeline
        from PIL import Image
-        from transformers import CLIPFeatureExtractor, CLIPModel
+        from transformers import CLIPImageProcessor, CLIPModel

-        feature_extractor = CLIPFeatureExtractor.from_pretrained(
+        feature_extractor = CLIPImageProcessor.from_pretrained(
            "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
        )
        clip_model = CLIPModel.from_pretrained(
@@ -139,7 +139,7 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline, StableDiffusionMixin):
        tokenizer: CLIPTokenizer,
        unet: UNet2DConditionModel,
        scheduler: Union[PNDMScheduler, LMSDiscreteScheduler, DDIMScheduler, DPMSolverMultistepScheduler],
-        feature_extractor: CLIPFeatureExtractor,
+        feature_extractor: CLIPImageProcessor,
    ):
        super().__init__()
        self.register_modules(
@@ -26,7 +26,7 @@ from gmflow.gmflow import GMFlow
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

 from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
-from diffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from diffusers.models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel
 from diffusers.models.attention_processor import AttnProcessor2_0
 from diffusers.models.lora import adjust_lora_scale_text_encoder
@@ -1252,8 +1252,8 @@ class FrescoV2VPipeline(StableDiffusionControlNetImg2ImgPipeline):

    The pipeline also inherits the following loading methods:
        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
-        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

@@ -1456,7 +1456,7 @@ class FrescoV2VPipeline(StableDiffusionControlNetImg2ImgPipeline):
        """
        # set lora scale so that monkey patched LoRA
        # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
            self._lora_scale = lora_scale

            # dynamically adjust the LoRA scale
@@ -1588,7 +1588,7 @@ class FrescoV2VPipeline(StableDiffusionControlNetImg2ImgPipeline):
            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+        if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
            # Retrieve the original scale by scaling back the LoRA layers
            unscale_lora_layers(self.text_encoder, lora_scale)

@@ -2436,7 +2436,7 @@ class FrescoV2VPipeline(StableDiffusionControlNetImg2ImgPipeline):
                )

                if guess_mode and self.do_classifier_free_guidance:
-                    # Infered ControlNet only for the conditional batch.
+                    # Inferred ControlNet only for the conditional batch.
                    # To apply the output of ControlNet to both the unconditional and conditional batches,
                    # add 0 to the unconditional batch to keep it unchanged.
                    down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
@@ -7,7 +7,7 @@ from transformers import AutoModel, AutoTokenizer, CLIPImageProcessor

 from diffusers import DiffusionPipeline
 from diffusers.image_processor import VaeImageProcessor
-from diffusers.loaders import LoraLoaderMixin
+from diffusers.loaders import StableDiffusionLoraLoaderMixin
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.models.lora import adjust_lora_scale_text_encoder
 from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
@@ -194,7 +194,7 @@ def retrieve_timesteps(
    return timesteps, num_inference_steps


-class GlueGenStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin, LoraLoaderMixin):
+class GlueGenStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffusionLoraLoaderMixin):
    def __init__(
        self,
        vae: AutoencoderKL,
@@ -290,7 +290,7 @@ class GlueGenStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin, Lo
        """
        # set lora scale so that monkey patched LoRA
        # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
            self._lora_scale = lora_scale

            # dynamically adjust the LoRA scale
@@ -424,7 +424,7 @@ class GlueGenStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin, Lo
            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+        if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
            # Retrieve the original scale by scaling back the LoRA layers
            unscale_lora_layers(self.text_encoder, lora_scale)

@@ -21,7 +21,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer

 from diffusers.configuration_utils import FrozenDict
 from diffusers.image_processor import VaeImageProcessor
-from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.loaders import FromSingleFileMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.models.lora import adjust_lora_scale_text_encoder
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
@@ -53,7 +53,11 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):


 class InstaFlowPipeline(
-    DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
+    FromSingleFileMixin,
 ):
    r"""
    Pipeline for text-to-image generation using Rectified Flow and Euler discretization.
@@ -64,8 +68,8 @@ class InstaFlowPipeline(

    The pipeline also inherits the following loading methods:
        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
-        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files

    Args:
@@ -251,7 +255,7 @@ class InstaFlowPipeline(
        """
        # set lora scale so that monkey patched LoRA
        # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
            self._lora_scale = lora_scale

            # dynamically adjust the LoRA scale
@@ -24,7 +24,12 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPV

 from diffusers.configuration_utils import FrozenDict
 from diffusers.image_processor import VaeImageProcessor
-from diffusers.loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.loaders import (
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    StableDiffusionLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.models.attention_processor import (
    AttnProcessor,
@@ -130,7 +135,7 @@ class IPAdapterFaceIDStableDiffusionPipeline(
    DiffusionPipeline,
    StableDiffusionMixin,
    TextualInversionLoaderMixin,
-    LoraLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
    IPAdapterMixin,
    FromSingleFileMixin,
 ):
@@ -142,8 +147,8 @@ class IPAdapterFaceIDStableDiffusionPipeline(

    The pipeline also inherits the following loading methods:
        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
-        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

@@ -518,7 +523,7 @@ class IPAdapterFaceIDStableDiffusionPipeline(
        """
        # set lora scale so that monkey patched LoRA
        # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
            self._lora_scale = lora_scale

            # dynamically adjust the LoRA scale
@@ -650,7 +655,7 @@ class IPAdapterFaceIDStableDiffusionPipeline(
            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+        if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
            # Retrieve the original scale by scaling back the LoRA layers
            unscale_lora_layers(self.text_encoder, lora_scale)

@@ -395,8 +395,8 @@ class StableDiffusionHighResFixPipeline(StableDiffusionPipeline):

    The pipeline also inherits the following loading methods:
        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
-        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

@@ -6,7 +6,7 @@ import torch
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer

 from diffusers.image_processor import VaeImageProcessor
-from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.loaders import FromSingleFileMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.models.lora import adjust_lora_scale_text_encoder
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
@@ -190,7 +190,11 @@ def slerp(


 class LatentConsistencyModelWalkPipeline(
-    DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
+    FromSingleFileMixin,
 ):
    r"""
    Pipeline for text-to-image generation using a latent consistency model.
@@ -200,8 +204,8 @@ class LatentConsistencyModelWalkPipeline(

    The pipeline also inherits the following loading methods:
        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
-        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files

    Args:
@@ -317,7 +321,7 @@ class LatentConsistencyModelWalkPipeline(
        """
        # set lora scale so that monkey patched LoRA
        # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
            self._lora_scale = lora_scale

            # dynamically adjust the LoRA scale
@@ -449,7 +453,7 @@ class LatentConsistencyModelWalkPipeline(
            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+        if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
            # Retrieve the original scale by scaling back the LoRA layers
            unscale_lora_layers(self.text_encoder, lora_scale)

@@ -29,7 +29,12 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPV

 from diffusers.configuration_utils import FrozenDict
 from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
-from diffusers.loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.loaders import (
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    StableDiffusionLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.models.attention import Attention, GatedSelfAttentionDense
 from diffusers.models.attention_processor import AttnProcessor2_0
@@ -271,7 +276,7 @@ class LLMGroundedDiffusionPipeline(
    DiffusionPipeline,
    StableDiffusionMixin,
    TextualInversionLoaderMixin,
-    LoraLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
    IPAdapterMixin,
    FromSingleFileMixin,
 ):
@@ -1263,7 +1268,7 @@ class LLMGroundedDiffusionPipeline(
        """
        # set lora scale so that monkey patched LoRA
        # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
            self._lora_scale = lora_scale

            # dynamically adjust the LoRA scale
@@ -1397,7 +1402,7 @@ class LLMGroundedDiffusionPipeline(
            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+        if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
            # Retrieve the original scale by scaling back the LoRA layers
            unscale_lora_layers(self.text_encoder, lora_scale)

@@ -11,15 +11,19 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
 from diffusers import DiffusionPipeline
 from diffusers.configuration_utils import FrozenDict
 from diffusers.image_processor import VaeImageProcessor
-from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.loaders import FromSingleFileMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.models.lora import adjust_lora_scale_text_encoder
 from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
 from diffusers.schedulers import KarrasDiffusionSchedulers
 from diffusers.utils import (
    PIL_INTERPOLATION,
+    USE_PEFT_BACKEND,
    deprecate,
    logging,
+    scale_lora_layers,
+    unscale_lora_layers,
 )
 from diffusers.utils.torch_utils import randn_tensor

@@ -199,6 +203,7 @@ def get_unweighted_text_embeddings(
    text_input: torch.Tensor,
    chunk_length: int,
    no_boseos_middle: Optional[bool] = True,
+    clip_skip: Optional[int] = None,
 ):
    """
    When the length of tokens is a multiple of the capacity of the text encoder,
@@ -214,7 +219,20 @@ def get_unweighted_text_embeddings(
            # cover the head and the tail by the starting and the ending tokens
            text_input_chunk[:, 0] = text_input[0, 0]
            text_input_chunk[:, -1] = text_input[0, -1]
-            text_embedding = pipe.text_encoder(text_input_chunk)[0]
+            if clip_skip is None:
+                prompt_embeds = pipe.text_encoder(text_input_chunk.to(pipe.device))
+                text_embedding = prompt_embeds[0]
+            else:
+                prompt_embeds = pipe.text_encoder(text_input_chunk.to(pipe.device), output_hidden_states=True)
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                text_embedding = pipe.text_encoder.text_model.final_layer_norm(prompt_embeds)

            if no_boseos_middle:
                if i == 0:
@@ -230,7 +248,10 @@ def get_unweighted_text_embeddings(
            text_embeddings.append(text_embedding)
        text_embeddings = torch.concat(text_embeddings, axis=1)
    else:
-        text_embeddings = pipe.text_encoder(text_input)[0]
+        if clip_skip is None:
+            clip_skip = 0
+        prompt_embeds = pipe.text_encoder(text_input, output_hidden_states=True)[-1][-(clip_skip + 1)]
+        text_embeddings = pipe.text_encoder.text_model.final_layer_norm(prompt_embeds)
    return text_embeddings


@@ -242,6 +263,8 @@ def get_weighted_text_embeddings(
    no_boseos_middle: Optional[bool] = False,
    skip_parsing: Optional[bool] = False,
    skip_weighting: Optional[bool] = False,
+    clip_skip=None,
+    lora_scale=None,
 ):
    r"""
    Prompts can be assigned with local weights using brackets. For example,
@@ -268,6 +291,16 @@ def get_weighted_text_embeddings(
        skip_weighting (`bool`, *optional*, defaults to `False`):
            Skip the weighting. When the parsing is skipped, it is forced True.
    """
+    # set lora scale so that monkey patched LoRA
+    # function of text encoder can correctly access it
+    if lora_scale is not None and isinstance(pipe, StableDiffusionLoraLoaderMixin):
+        pipe._lora_scale = lora_scale
+
+        # dynamically adjust the LoRA scale
+        if not USE_PEFT_BACKEND:
+            adjust_lora_scale_text_encoder(pipe.text_encoder, lora_scale)
+        else:
+            scale_lora_layers(pipe.text_encoder, lora_scale)
    max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
    if isinstance(prompt, str):
        prompt = [prompt]
@@ -334,10 +367,7 @@ def get_weighted_text_embeddings(

    # get the embeddings
    text_embeddings = get_unweighted_text_embeddings(
-        pipe,
-        prompt_tokens,
-        pipe.tokenizer.model_max_length,
-        no_boseos_middle=no_boseos_middle,
+        pipe, prompt_tokens, pipe.tokenizer.model_max_length, no_boseos_middle=no_boseos_middle, clip_skip=clip_skip
    )
    prompt_weights = torch.tensor(prompt_weights, dtype=text_embeddings.dtype, device=text_embeddings.device)
    if uncond_prompt is not None:
@@ -346,6 +376,7 @@ def get_weighted_text_embeddings(
            uncond_tokens,
            pipe.tokenizer.model_max_length,
            no_boseos_middle=no_boseos_middle,
+            clip_skip=clip_skip,
        )
        uncond_weights = torch.tensor(uncond_weights, dtype=uncond_embeddings.dtype, device=uncond_embeddings.device)

@@ -362,6 +393,11 @@ def get_weighted_text_embeddings(
            current_mean = uncond_embeddings.float().mean(axis=[-2, -1]).to(uncond_embeddings.dtype)
            uncond_embeddings *= (previous_mean / current_mean).unsqueeze(-1).unsqueeze(-1)

+    if pipe.text_encoder is not None:
+        if isinstance(pipe, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
+            # Retrieve the original scale by scaling back the LoRA layers
+            unscale_lora_layers(pipe.text_encoder, lora_scale)
+
    if uncond_prompt is not None:
        return text_embeddings, uncond_embeddings
    return text_embeddings, None
@@ -409,7 +445,11 @@ def preprocess_mask(mask, batch_size, scale_factor=8):


 class StableDiffusionLongPromptWeightingPipeline(
-    DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
+    FromSingleFileMixin,
 ):
    r"""
    Pipeline for text-to-image generation using Stable Diffusion without tokens length limit, and support parsing
@@ -545,6 +585,8 @@ class StableDiffusionLongPromptWeightingPipeline(
        max_embeddings_multiples=3,
        prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        clip_skip: Optional[int] = None,
+        lora_scale: Optional[float] = None,
    ):
        r"""
        Encodes the prompt into text encoder hidden states.
@@ -593,6 +635,8 @@ class StableDiffusionLongPromptWeightingPipeline(
                prompt=prompt,
                uncond_prompt=negative_prompt if do_classifier_free_guidance else None,
                max_embeddings_multiples=max_embeddings_multiples,
+                clip_skip=clip_skip,
+                lora_scale=lora_scale,
            )
            if prompt_embeds is None:
                prompt_embeds = prompt_embeds1
@@ -786,6 +830,7 @@ class StableDiffusionLongPromptWeightingPipeline(
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
        is_cancelled_callback: Optional[Callable[[], bool]] = None,
+        clip_skip: Optional[int] = None,
        callback_steps: int = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    ):
@@ -861,6 +906,9 @@ class StableDiffusionLongPromptWeightingPipeline(
            is_cancelled_callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. If the function returns
                `True`, the inference will be cancelled.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.
@@ -899,6 +947,7 @@ class StableDiffusionLongPromptWeightingPipeline(
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0
+        lora_scale = cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None

        # 3. Encode input prompt
        prompt_embeds = self._encode_prompt(
@@ -910,6 +959,8 @@ class StableDiffusionLongPromptWeightingPipeline(
            max_embeddings_multiples,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
+            clip_skip=clip_skip,
+            lora_scale=lora_scale,
        )
        dtype = prompt_embeds.dtype

@@ -1040,6 +1091,7 @@ class StableDiffusionLongPromptWeightingPipeline(
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
        is_cancelled_callback: Optional[Callable[[], bool]] = None,
+        clip_skip=None,
        callback_steps: int = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    ):
@@ -1097,6 +1149,9 @@ class StableDiffusionLongPromptWeightingPipeline(
            is_cancelled_callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. If the function returns
                `True`, the inference will be cancelled.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.
@@ -1131,6 +1186,7 @@ class StableDiffusionLongPromptWeightingPipeline(
            return_dict=return_dict,
            callback=callback,
            is_cancelled_callback=is_cancelled_callback,
+            clip_skip=clip_skip,
            callback_steps=callback_steps,
            cross_attention_kwargs=cross_attention_kwargs,
        )
@@ -22,19 +22,28 @@ from transformers import (

 from diffusers import DiffusionPipeline, StableDiffusionXLPipeline
 from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
-from diffusers.loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.loaders import (
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
 from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel
 from diffusers.models.attention_processor import AttnProcessor2_0, XFormersAttnProcessor
+from diffusers.models.lora import adjust_lora_scale_text_encoder
 from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
 from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
 from diffusers.schedulers import KarrasDiffusionSchedulers
 from diffusers.utils import (
+    USE_PEFT_BACKEND,
    deprecate,
    is_accelerate_available,
    is_accelerate_version,
    is_invisible_watermark_available,
    logging,
    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
 )
 from diffusers.utils.torch_utils import randn_tensor

@@ -256,6 +265,7 @@ def get_weighted_text_embeddings_sdxl(
    num_images_per_prompt: int = 1,
    device: Optional[torch.device] = None,
    clip_skip: Optional[int] = None,
+    lora_scale: Optional[int] = None,
 ):
    """
    This function can process long prompt with weights, no length limitation
@@ -276,6 +286,24 @@ def get_weighted_text_embeddings_sdxl(
    """
    device = device or pipe._execution_device

+    # set lora scale so that monkey patched LoRA
+    # function of text encoder can correctly access it
+    if lora_scale is not None and isinstance(pipe, StableDiffusionXLLoraLoaderMixin):
+        pipe._lora_scale = lora_scale
+
+        # dynamically adjust the LoRA scale
+        if pipe.text_encoder is not None:
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(pipe.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(pipe.text_encoder, lora_scale)
+
+        if pipe.text_encoder_2 is not None:
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(pipe.text_encoder_2, lora_scale)
+            else:
+                scale_lora_layers(pipe.text_encoder_2, lora_scale)
+
    if prompt_2:
        prompt = f"{prompt} {prompt_2}"

@@ -424,6 +452,16 @@ def get_weighted_text_embeddings_sdxl(
        bs_embed * num_images_per_prompt, -1
    )

+    if pipe.text_encoder is not None:
+        if isinstance(pipe, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+            # Retrieve the original scale by scaling back the LoRA layers
+            unscale_lora_layers(pipe.text_encoder, lora_scale)
+
+    if pipe.text_encoder_2 is not None:
+        if isinstance(pipe, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+            # Retrieve the original scale by scaling back the LoRA layers
+            unscale_lora_layers(pipe.text_encoder_2, lora_scale)
+
    return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds


@@ -544,7 +582,7 @@ class SDXLLongPromptWeightingPipeline(
    StableDiffusionMixin,
    FromSingleFileMixin,
    IPAdapterMixin,
-    LoraLoaderMixin,
+    StableDiffusionXLLoraLoaderMixin,
    TextualInversionLoaderMixin,
 ):
    r"""
@@ -556,8 +594,8 @@ class SDXLLongPromptWeightingPipeline(
    The pipeline also inherits the following loading methods:
        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
-        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings

    Args:
@@ -738,7 +776,7 @@ class SDXLLongPromptWeightingPipeline(

        # set lora scale so that monkey patched LoRA
        # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
            self._lora_scale = lora_scale

        if prompt is not None and isinstance(prompt, str):
@@ -1607,7 +1645,9 @@ class SDXLLongPromptWeightingPipeline(
                image_embeds = torch.cat([negative_image_embeds, image_embeds])

        # 3. Encode input prompt
-        (self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None)
+        lora_scale = (
+            self._cross_attention_kwargs.get("scale", None) if self._cross_attention_kwargs is not None else None
+        )

        negative_prompt = negative_prompt if negative_prompt is not None else ""

@@ -1622,6 +1662,7 @@ class SDXLLongPromptWeightingPipeline(
            neg_prompt=negative_prompt,
            num_images_per_prompt=num_images_per_prompt,
            clip_skip=clip_skip,
+            lora_scale=lora_scale,
        )
        dtype = prompt_embeds.dtype

@@ -43,8 +43,7 @@ from diffusers.utils import BaseOutput, check_min_version


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.30.0.dev0")
-
+check_min_version("0.30.0")

 class MarigoldDepthOutput(BaseOutput):
    """
@@ -9,7 +9,7 @@ import torch
 from numpy import exp, pi, sqrt
 from torchvision.transforms.functional import resize
 from tqdm.auto import tqdm
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer

 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
@@ -275,7 +275,7 @@ class StableDiffusionCanvasPipeline(DiffusionPipeline, StableDiffusionMixin):
        unet: UNet2DConditionModel,
        scheduler: Union[DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler],
        safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPFeatureExtractor,
+        feature_extractor: CLIPImageProcessor,
    ):
        super().__init__()
        self.register_modules(
@@ -15,7 +15,7 @@ from diffusers.utils import logging

 try:
    from ligo.segments import segment
-    from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+    from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
 except ImportError:
    raise ImportError("Please install transformers and ligo-segments to use the mixture pipeline")

@@ -144,7 +144,7 @@ class StableDiffusionTilingPipeline(DiffusionPipeline, StableDiffusionExtrasMixi
        unet: UNet2DConditionModel,
        scheduler: Union[DDIMScheduler, PNDMScheduler],
        safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPFeatureExtractor,
+        feature_extractor: CLIPImageProcessor,
    ):
        super().__init__()
        self.register_modules(
@@ -22,7 +22,7 @@ from PIL import Image
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

 from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
-from diffusers.loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.loaders import IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from diffusers.models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel, UNetMotionModel
 from diffusers.models.lora import adjust_lora_scale_text_encoder
 from diffusers.models.unets.unet_motion_model import MotionAdapter
@@ -114,7 +114,11 @@ def tensor2vid(video: torch.Tensor, processor, output_type="np"):


 class AnimateDiffControlNetPipeline(
-    DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    IPAdapterMixin,
+    StableDiffusionLoraLoaderMixin,
 ):
    r"""
    Pipeline for text-to-video generation.
@@ -124,8 +128,8 @@ class AnimateDiffControlNetPipeline(

    The pipeline also inherits the following loading methods:
        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
-        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

    Args:
@@ -234,7 +238,7 @@ class AnimateDiffControlNetPipeline(
        """
        # set lora scale so that monkey patched LoRA
        # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
            self._lora_scale = lora_scale

            # dynamically adjust the LoRA scale
@@ -366,7 +370,7 @@ class AnimateDiffControlNetPipeline(
            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+        if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
            # Retrieve the original scale by scaling back the LoRA layers
            unscale_lora_layers(self.text_encoder, lora_scale)

@@ -27,7 +27,7 @@ import torch
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

 from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
-from diffusers.loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.loaders import IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel
 from diffusers.models.lora import adjust_lora_scale_text_encoder
 from diffusers.models.unet_motion_model import MotionAdapter
@@ -240,7 +240,11 @@ def retrieve_timesteps(


 class AnimateDiffImgToVideoPipeline(
-    DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    IPAdapterMixin,
+    StableDiffusionLoraLoaderMixin,
 ):
    r"""
    Pipeline for image-to-video generation.
@@ -250,8 +254,8 @@ class AnimateDiffImgToVideoPipeline(

    The pipeline also inherits the following loading methods:
        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
-        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

    Args:
@@ -351,7 +355,7 @@ class AnimateDiffImgToVideoPipeline(
        """
        # set lora scale so that monkey patched LoRA
        # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
            self._lora_scale = lora_scale

            # dynamically adjust the LoRA scale
@@ -483,7 +487,7 @@ class AnimateDiffImgToVideoPipeline(
            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+        if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
            # Retrieve the original scale by scaling back the LoRA layers
            unscale_lora_layers(self.text_encoder, lora_scale)

@@ -12,7 +12,7 @@ from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokeniz
 from diffusers.image_processor import VaeImageProcessor
 from diffusers.loaders import (
    FromSingleFileMixin,
-    LoraLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
    TextualInversionLoaderMixin,
 )
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
@@ -89,7 +89,11 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):


 class DemoFusionSDXLPipeline(
-    DiffusionPipeline, StableDiffusionMixin, FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    FromSingleFileMixin,
+    StableDiffusionLoraLoaderMixin,
+    TextualInversionLoaderMixin,
 ):
    r"""
    Pipeline for text-to-image generation using Stable Diffusion XL.
@@ -231,7 +235,7 @@ class DemoFusionSDXLPipeline(

        # set lora scale so that monkey patched LoRA
        # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
            self._lora_scale = lora_scale

            # dynamically adjust the LoRA scale
@@ -21,7 +21,7 @@ from transformers import CLIPTextModel, CLIPTokenizer
 from diffusers import AutoencoderKL, UNet2DConditionModel
 from diffusers.configuration_utils import FrozenDict
 from diffusers.image_processor import VaeImageProcessor
-from diffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from diffusers.models.attention import BasicTransformerBlock
 from diffusers.models.attention_processor import LoRAAttnProcessor
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline
@@ -222,7 +222,7 @@ class FabricPipeline(DiffusionPipeline):
        """
        # set lora scale so that monkey patched LoRA
        # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
            self._lora_scale = lora_scale

        if prompt is not None and isinstance(prompt, str):
@@ -35,7 +35,7 @@ from diffusers.image_processor import VaeImageProcessor
 from diffusers.loaders import (
    FromSingleFileMixin,
    IPAdapterMixin,
-    LoraLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
    TextualInversionLoaderMixin,
 )
 from diffusers.models.attention import Attention
@@ -75,7 +75,7 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
 class Prompt2PromptPipeline(
    DiffusionPipeline,
    TextualInversionLoaderMixin,
-    LoraLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
    IPAdapterMixin,
    FromSingleFileMixin,
 ):
@@ -87,8 +87,8 @@ class Prompt2PromptPipeline(

    The pipeline also inherits the following loading methods:
        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
-        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

@@ -286,7 +286,7 @@ class Prompt2PromptPipeline(
        """
        # set lora scale so that monkey patched LoRA
        # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
            self._lora_scale = lora_scale

            # dynamically adjust the LoRA scale
@@ -420,7 +420,7 @@ class Prompt2PromptPipeline(
            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+        if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
            # Retrieve the original scale by scaling back the LoRA layers
            unscale_lora_layers(self.text_encoder, lora_scale)

@@ -27,7 +27,12 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPV

 from diffusers.configuration_utils import FrozenDict
 from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
-from diffusers.loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.loaders import (
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    StableDiffusionLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
 from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel
 from diffusers.models.attention_processor import Attention, FusedAttnProcessor2_0
 from diffusers.models.lora import adjust_lora_scale_text_encoder
@@ -358,7 +363,7 @@ def retrieve_timesteps(


 class StableDiffusionBoxDiffPipeline(
-    DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin
+    DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin
 ):
    r"""
    Pipeline for text-to-image generation using Stable Diffusion with BoxDiff.
@@ -368,8 +373,8 @@ class StableDiffusionBoxDiffPipeline(

    The pipeline also inherits the following loading methods:
        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
-        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

@@ -594,7 +599,7 @@ class StableDiffusionBoxDiffPipeline(
        """
        # set lora scale so that monkey patched LoRA
        # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
            self._lora_scale = lora_scale

            # dynamically adjust the LoRA scale
@@ -726,7 +731,7 @@ class StableDiffusionBoxDiffPipeline(
            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+        if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
            # Retrieve the original scale by scaling back the LoRA layers
            unscale_lora_layers(self.text_encoder, lora_scale)

@@ -11,7 +11,12 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPV

 from diffusers.configuration_utils import FrozenDict
 from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
-from diffusers.loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.loaders import (
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    StableDiffusionLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
 from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel
 from diffusers.models.attention_processor import Attention, AttnProcessor2_0, FusedAttnProcessor2_0
 from diffusers.models.lora import adjust_lora_scale_text_encoder
@@ -328,7 +333,7 @@ def retrieve_timesteps(


 class StableDiffusionPAGPipeline(
-    DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin
+    DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin
 ):
    r"""
    Pipeline for text-to-image generation using Stable Diffusion.
@@ -336,8 +341,8 @@ class StableDiffusionPAGPipeline(
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
    The pipeline also inherits the following loading methods:
        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
-        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
    Args:
@@ -560,7 +565,7 @@ class StableDiffusionPAGPipeline(
        """
        # set lora scale so that monkey patched LoRA
        # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
            self._lora_scale = lora_scale

            # dynamically adjust the LoRA scale
@@ -692,7 +697,7 @@ class StableDiffusionPAGPipeline(
            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+        if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
            # Retrieve the original scale by scaling back the LoRA layers
            unscale_lora_layers(self.text_encoder, lora_scale)

@@ -22,7 +22,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer

 from diffusers import DiffusionPipeline
 from diffusers.image_processor import PipelineDepthInput, PipelineImageInput, VaeImageProcessorLDM3D
-from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.loaders import FromSingleFileMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.models.lora import adjust_lora_scale_text_encoder
 from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
@@ -69,7 +69,7 @@ EXAMPLE_DOC_STRING = """


 class StableDiffusionUpscaleLDM3DPipeline(
-    DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
+    DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin, FromSingleFileMixin
 ):
    r"""
    Pipeline for text-to-image and 3D generation using LDM3D.
@@ -79,8 +79,8 @@ class StableDiffusionUpscaleLDM3DPipeline(

    The pipeline also inherits the following loading methods:
        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
-        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files

    Args:
@@ -233,7 +233,7 @@ class StableDiffusionUpscaleLDM3DPipeline(
        """
        # set lora scale so that monkey patched LoRA
        # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
            self._lora_scale = lora_scale

            # dynamically adjust the LoRA scale
@@ -365,7 +365,7 @@ class StableDiffusionUpscaleLDM3DPipeline(
            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+        if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
            # Retrieve the original scale by scaling back the LoRA layers
            unscale_lora_layers(self.text_encoder, lora_scale)

@@ -189,7 +189,7 @@ class StableDiffusionXLControlNetAdapterPipeline(
        safety_checker ([`StableDiffusionSafetyChecker`]):
            Classification module that estimates whether generated images could be considered offensive or harmful.
            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
-        feature_extractor ([`CLIPFeatureExtractor`]):
+        feature_extractor ([`CLIPImageProcessor`]):
            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
    """

@@ -33,7 +33,7 @@ from diffusers import DiffusionPipeline
 from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
 from diffusers.loaders import (
    FromSingleFileMixin,
-    LoraLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
    StableDiffusionXLLoraLoaderMixin,
    TextualInversionLoaderMixin,
 )
@@ -300,7 +300,7 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):


 class StableDiffusionXLControlNetAdapterInpaintPipeline(
-    DiffusionPipeline, StableDiffusionMixin, FromSingleFileMixin, LoraLoaderMixin
+    DiffusionPipeline, StableDiffusionMixin, FromSingleFileMixin, StableDiffusionLoraLoaderMixin
 ):
    r"""
    Pipeline for text-to-image generation using Stable Diffusion augmented with T2I-Adapter
@@ -332,7 +332,7 @@ class StableDiffusionXLControlNetAdapterInpaintPipeline(
        safety_checker ([`StableDiffusionSafetyChecker`]):
            Classification module that estimates whether generated images could be considered offensive or harmful.
            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
-        feature_extractor ([`CLIPFeatureExtractor`]):
+        feature_extractor ([`CLIPImageProcessor`]):
            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
        requires_aesthetics_score (`bool`, *optional*, defaults to `"False"`):
            Whether the `unet` requires a aesthetic_score condition to be passed during inference. Also see the config
@@ -178,11 +178,11 @@ class StableDiffusionXLDifferentialImg2ImgPipeline(

    In addition the pipeline inherits the following loading methods:
        - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`]
-        - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`]
+        - *LoRA*: [`loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]
        - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`]

    as well as the following saving methods:
-        - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`]
+        - *LoRA*: [`loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`]

    Args:
        vae ([`AutoencoderKL`]):
@@ -1002,7 +1002,7 @@ class StableDiffusionXLInstantIDImg2ImgPipeline(StableDiffusionXLControlNetImg2I
                )

                if guess_mode and self.do_classifier_free_guidance:
-                    # Infered ControlNet only for the conditional batch.
+                    # Inferred ControlNet only for the conditional batch.
                    # To apply the output of ControlNet to both the unconditional and conditional batches,
                    # add 0 to the unconditional batch to keep it unchanged.
                    down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
@@ -991,7 +991,7 @@ class StableDiffusionXLInstantIDPipeline(StableDiffusionXLControlNetPipeline):
                )

                if guess_mode and self.do_classifier_free_guidance:
-                    # Infered ControlNet only for the conditional batch.
+                    # Inferred ControlNet only for the conditional batch.
                    # To apply the output of ControlNet to both the unconditional and conditional batches,
                    # add 0 to the unconditional batch to keep it unchanged.
                    down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
@@ -9,7 +9,7 @@ import numpy as np
 import PIL.Image
 import torch
 from packaging import version
-from transformers import CLIPFeatureExtractor, CLIPVisionModelWithProjection
+from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection

 # from ...configuration_utils import FrozenDict
 # from ...models import AutoencoderKL, UNet2DConditionModel
@@ -87,7 +87,7 @@ class Zero1to3StableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
        safety_checker ([`StableDiffusionSafetyChecker`]):
            Classification module that estimates whether generated images could be considered offensive or harmful.
            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
-        feature_extractor ([`CLIPFeatureExtractor`]):
+        feature_extractor ([`CLIPImageProcessor`]):
            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
        cc_projection ([`CCProjection`]):
            Projection layer to project the concated CLIP features and pose embeddings to the original CLIP feature size.
@@ -102,7 +102,7 @@ class Zero1to3StableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
        unet: UNet2DConditionModel,
        scheduler: KarrasDiffusionSchedulers,
        safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPFeatureExtractor,
+        feature_extractor: CLIPImageProcessor,
        cc_projection: CCProjection,
        requires_safety_checker: bool = True,
    ):
@@ -3,7 +3,7 @@ from typing import Dict, Optional

 import torch
 import torchvision.transforms.functional as FF
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer

 from diffusers import StableDiffusionPipeline
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
@@ -69,7 +69,7 @@ class RegionalPromptingStableDiffusionPipeline(StableDiffusionPipeline):
        unet: UNet2DConditionModel,
        scheduler: KarrasDiffusionSchedulers,
        safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPFeatureExtractor,
+        feature_extractor: CLIPImageProcessor,
        requires_safety_checker: bool = True,
    ):
        super().__init__(
@@ -864,7 +864,7 @@ class RerenderAVideoPipeline(StableDiffusionControlNetImg2ImgPipeline):
                )

                if guess_mode and do_classifier_free_guidance:
-                    # Infered ControlNet only for the conditional batch.
+                    # Inferred ControlNet only for the conditional batch.
                    # To apply the output of ControlNet to both the unconditional and conditional batches,
                    # add 0 to the unconditional batch to keep it unchanged.
                    down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
@@ -1038,7 +1038,7 @@ class RerenderAVideoPipeline(StableDiffusionControlNetImg2ImgPipeline):
                        )

                        if guess_mode and do_classifier_free_guidance:
-                            # Infered ControlNet only for the conditional batch.
+                            # Inferred ControlNet only for the conditional batch.
                            # To apply the output of ControlNet to both the unconditional and conditional batches,
                            # add 0 to the unconditional batch to keep it unchanged.
                            down_block_res_samples = [
@@ -11,7 +11,7 @@ from tqdm.auto import tqdm
 from transformers import CLIPTextModel, CLIPTokenizer

 from diffusers import AutoencoderKL, DiffusionPipeline, DPMSolverMultistepScheduler, UNet2DConditionModel
-from diffusers.loaders import AttnProcsLayers, LoraLoaderMixin
+from diffusers.loaders import AttnProcsLayers, StableDiffusionLoraLoaderMixin
 from diffusers.models.attention_processor import (
    AttnAddedKVProcessor,
    AttnAddedKVProcessor2_0,
@@ -321,7 +321,7 @@ class SdeDragPipeline(DiffusionPipeline):
            optimizer.zero_grad()

        with tempfile.TemporaryDirectory() as save_lora_dir:
-            LoraLoaderMixin.save_lora_weights(
+            StableDiffusionLoraLoaderMixin.save_lora_weights(
                save_directory=save_lora_dir,
                unet_lora_layers=unet_lora_layers,
                text_encoder_lora_layers=None,
@@ -752,7 +752,7 @@ class StableDiffusionControlNetReferencePipeline(StableDiffusionControlNetPipeli
                )

                if guess_mode and do_classifier_free_guidance:
-                    # Infered ControlNet only for the conditional batch.
+                    # Inferred ControlNet only for the conditional batch.
                    # To apply the output of ControlNet to both the unconditional and conditional batches,
                    # add 0 to the unconditional batch to keep it unchanged.
                    down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
@@ -18,10 +18,10 @@ from typing import Any, Callable, Dict, List, Optional, Union
 import intel_extension_for_pytorch as ipex
 import torch
 from packaging import version
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer

 from diffusers.configuration_utils import FrozenDict
-from diffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
@@ -61,7 +61,7 @@ EXAMPLE_DOC_STRING = """


 class StableDiffusionIPEXPipeline(
-    DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin
+    DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin
 ):
    r"""
    Pipeline for text-to-image generation using Stable Diffusion on IPEX.
@@ -86,7 +86,7 @@ class StableDiffusionIPEXPipeline(
        safety_checker ([`StableDiffusionSafetyChecker`]):
            Classification module that estimates whether generated images could be considered offensive or harmful.
            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
-        feature_extractor ([`CLIPFeatureExtractor`]):
+        feature_extractor ([`CLIPImageProcessor`]):
            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
    """

@@ -100,7 +100,7 @@ class StableDiffusionIPEXPipeline(
        unet: UNet2DConditionModel,
        scheduler: KarrasDiffusionSchedulers,
        safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPFeatureExtractor,
+        feature_extractor: CLIPImageProcessor,
        requires_safety_checker: bool = True,
    ):
        super().__init__()
@@ -11,7 +11,12 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
 from diffusers import AutoencoderKL, DiffusionPipeline, UNet2DConditionModel
 from diffusers.configuration_utils import FrozenDict, deprecate
 from diffusers.image_processor import VaeImageProcessor
-from diffusers.loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.loaders import (
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    StableDiffusionLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
 from diffusers.models.attention import BasicTransformerBlock
 from diffusers.models.lora import adjust_lora_scale_text_encoder
 from diffusers.models.unets.unet_2d_blocks import CrossAttnDownBlock2D, CrossAttnUpBlock2D, DownBlock2D, UpBlock2D
@@ -76,7 +81,7 @@ def torch_dfs(model: torch.nn.Module):


 class StableDiffusionReferencePipeline(
-    DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin
+    DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin
 ):
    r"""
    Pipeline for Stable Diffusion Reference.
@@ -86,8 +91,8 @@ class StableDiffusionReferencePipeline(

    The pipeline also inherits the following loading methods:
    - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
-    - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-    - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+    - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+    - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
    - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
    - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

@@ -443,7 +448,7 @@ class StableDiffusionReferencePipeline(
        """
        # set lora scale so that monkey patched LoRA
        # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
            self._lora_scale = lora_scale

            # dynamically adjust the LoRA scale
@@ -575,7 +580,7 @@ class StableDiffusionReferencePipeline(
            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+        if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
            # Retrieve the original scale by scaling back the LoRA layers
            unscale_lora_layers(self.text_encoder, lora_scale)

--- a/Show More
+++ b/Show More