Merge branch 'main' into fix-test

fix
2024-03-31 22:07:28 -10:00 · 2024-04-01 07:52:45 +00:00
285 changed files with 3974 additions and 21883 deletions
@@ -31,6 +31,7 @@ jobs:
          nvidia-smi
      - name: Install dependencies
        run: |
+          apt-get update && apt-get install libsndfile1-dev libgl1 -y
          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
          python -m uv pip install -e [quality,test]
          python -m uv pip install pandas peft
@@ -20,7 +20,7 @@ env:

 jobs:
  test-build-docker-images:
-    runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
+    runs-on: ubuntu-latest
    if: github.event_name == 'pull_request'
    steps:
      - name: Set up Docker Buildx
@@ -50,7 +50,7 @@ jobs:
        if: steps.file_changes.outputs.all != ''

  build-and-push-docker-images:
-    runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
+    runs-on: ubuntu-latest
    if: github.event_name != 'pull_request'
    
    permissions:
@@ -73,13 +73,13 @@ jobs:
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v1
+
      - name: Login to Docker Hub
        uses: docker/login-action@v2
        with:
          username: ${{ env.REGISTRY }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
      - name: Build and push
        uses: docker/build-push-action@v3
        with:
@@ -1,7 +1,6 @@
-name: Nightly and release tests on main/release branch
+name: Nightly tests on main

 on:
-  workflow_dispatch:
  schedule:
    - cron: "0 0 * * *" # every day at midnight

@@ -70,6 +69,7 @@ jobs:
      
      - name: Install dependencies
        run: |
+          apt-get update && apt-get install libsndfile1-dev libgl1 -y
          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
          python -m uv pip install -e [quality,test]
          python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
@@ -130,6 +130,7 @@ jobs:

    - name: Install dependencies
      run: |
+        apt-get update && apt-get install libsndfile1-dev libgl1 -y
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
@@ -200,6 +201,7 @@ jobs:

    - name: Install dependencies
      run: |
+        apt-get update && apt-get install libsndfile1-dev libgl1 -y
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
@@ -243,8 +245,6 @@ jobs:
  run_flax_tpu_tests:
    name: Nightly Flax TPU Tests
    runs-on: docker-tpu
-    if: github.event_name == 'schedule'
-    
    container:
      image: diffusers/diffusers-flax-tpu
      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --privileged
@@ -259,6 +259,7 @@ jobs:

    - name: Install dependencies
      run: |
+        apt-get update && apt-get install libsndfile1-dev libgl1 -y
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
@@ -354,7 +355,6 @@ jobs:
  run_nightly_tests_apple_m1:
    name: Nightly PyTorch MPS tests on MacOS
    runs-on: [ self-hosted, apple-m1 ]
-    if: github.event_name == 'schedule'

    steps:
      - name: Checkout diffusers
@@ -32,6 +32,7 @@ jobs:
        fetch-depth: 0
    - name: Install dependencies
      run: |
+        apt-get update && apt-get install libsndfile1-dev libgl1 -y
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
    - name: Environment
@@ -88,6 +89,7 @@ jobs:

    - name: Install dependencies
      run: |
+        apt-get update && apt-get install libsndfile1-dev libgl1 -y
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m pip install -e [quality,test]
        python -m pip install accelerate
@@ -145,6 +147,7 @@ jobs:

    - name: Install dependencies
      run: |
+        apt-get update && apt-get install libsndfile1-dev libgl1 -y
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m pip install -e [quality,test]

@@ -32,7 +32,9 @@ jobs:
          python -m pip install --upgrade pip
          pip install .[quality]
      - name: Check quality
-        run: make quality
+        run: |
+          ruff check examples tests src utils scripts
+          ruff format examples tests src utils scripts --check
      - name: Check if failure
        if: ${{ failure() }}
        run: |
@@ -51,7 +53,7 @@ jobs:
        run: |
          python -m pip install --upgrade pip
          pip install .[quality]
-      - name: Check repo consistency
+      - name: Check quality
        run: |
          python utils/check_copies.py
          python utils/check_dummies.py
@@ -71,7 +73,7 @@ jobs:

    name: LoRA - ${{ matrix.lib-versions }}

-    runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
+    runs-on: docker-cpu

    container:
      image: diffusers/diffusers-pytorch-cpu
@@ -89,10 +91,11 @@ jobs:

    - name: Install dependencies
      run: |
+        apt-get update && apt-get install libsndfile1-dev libgl1 -y
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
        if [ "${{ matrix.lib-versions }}" == "main" ]; then
-            python -m pip install -U peft@git+https://github.com/huggingface/peft.git
+            python -m uv pip install -U peft@git+https://github.com/huggingface/peft.git
            python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git
            python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
        else
@@ -107,7 +110,7 @@ jobs:
    - name: Run fast PyTorch LoRA CPU tests with PEFT backend
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
          -s -v \
          --make-reports=tests_${{ matrix.config.report }} \
          tests/lora/
@@ -40,7 +40,9 @@ jobs:
          python -m pip install --upgrade pip
          pip install .[quality]
      - name: Check quality
-        run: make quality
+        run: |
+          ruff check examples tests src utils scripts
+          ruff format examples tests src utils scripts --check
      - name: Check if failure
        if: ${{ failure() }}
        run: |
@@ -59,7 +61,7 @@ jobs:
        run: |
          python -m pip install --upgrade pip
          pip install .[quality]
-      - name: Check repo consistency
+      - name: Check quality
        run: |
          python utils/check_copies.py
          python utils/check_dummies.py
@@ -77,22 +79,22 @@ jobs:
        config:
          - name: Fast PyTorch Pipeline CPU tests
            framework: pytorch_pipelines
-            runner: [ self-hosted, intel-cpu, 32-cpu, 256-ram, ci ]
+            runner: docker-cpu
            image: diffusers/diffusers-pytorch-cpu
            report: torch_cpu_pipelines
          - name: Fast PyTorch Models & Schedulers CPU tests
            framework: pytorch_models
-            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
+            runner: docker-cpu
            image: diffusers/diffusers-pytorch-cpu
            report: torch_cpu_models_schedulers
          - name: Fast Flax CPU tests
            framework: flax
-            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
+            runner: docker-cpu
            image: diffusers/diffusers-flax-cpu
            report: flax_cpu
          - name: PyTorch Example CPU tests
            framework: pytorch_examples
-            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
+            runner: docker-cpu
            image: diffusers/diffusers-pytorch-cpu
            report: torch_example_cpu

@@ -116,6 +118,7 @@ jobs:

    - name: Install dependencies
      run: |
+        apt-get update && apt-get install libsndfile1-dev libgl1 -y
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
        python -m uv pip install accelerate
@@ -129,7 +132,7 @@ jobs:
      if: ${{ matrix.config.framework == 'pytorch_pipelines' }}
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile \
+        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
          -s -v -k "not Flax and not Onnx" \
          --make-reports=tests_${{ matrix.config.report }} \
          tests/pipelines
@@ -138,7 +141,7 @@ jobs:
      if: ${{ matrix.config.framework == 'pytorch_models' }}
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
+        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
          -s -v -k "not Flax and not Onnx and not Dependency" \
          --make-reports=tests_${{ matrix.config.report }} \
          tests/models tests/schedulers tests/others
@@ -147,7 +150,7 @@ jobs:
      if: ${{ matrix.config.framework == 'flax' }}
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
+        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
          -s -v -k "Flax" \
          --make-reports=tests_${{ matrix.config.report }} \
          tests
@@ -157,7 +160,7 @@ jobs:
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install peft
-        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
+        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
          --make-reports=tests_${{ matrix.config.report }} \
          examples

@@ -180,7 +183,7 @@ jobs:
        config:
          - name: Hub tests for models, schedulers, and pipelines
            framework: hub_tests_pytorch
-            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
+            runner: docker-cpu
            image: diffusers/diffusers-pytorch-cpu
            report: torch_hub

@@ -204,6 +207,7 @@ jobs:

    - name: Install dependencies
      run: |
+        apt-get update && apt-get install libsndfile1-dev libgl1 -y
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]

@@ -60,7 +60,7 @@ jobs:
    runs-on: [single-gpu, nvidia-gpu, t4, ci]
    container:
      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0 --privileged
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
    steps:
      - name: Checkout diffusers
        uses: actions/checkout@v3
@@ -69,14 +69,9 @@ jobs:
      - name: NVIDIA-SMI
        run: |
          nvidia-smi
-      - name: Tailscale
-        uses: huggingface/tailscale-action@v1
-        with:
-          authkey: ${{ secrets.TAILSCALE_SSH_AUTHKEY }}
-          slackChannel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}
-          slackToken: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
      - name: Install dependencies
        run: |
+          apt-get update && apt-get install libsndfile1-dev libgl1 -y
          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
          python -m uv pip install -e [quality,test]
          python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
@@ -93,12 +88,6 @@ jobs:
            -s -v -k "not Flax and not Onnx" \
            --make-reports=tests_pipeline_${{ matrix.module }}_cuda \
            tests/pipelines/${{ matrix.module }}
-      - name: Tailscale Wait
-        if: ${{ failure() || runner.debug == '1' }}
-        uses: huggingface/tailscale-action@v1
-        with:
-           waitForSSH: true
-           authkey: ${{ secrets.TAILSCALE_SSH_AUTHKEY }}
      - name: Failure short reports
        if: ${{ failure() }}
        run: |
@@ -132,6 +121,7 @@ jobs:

    - name: Install dependencies
      run: |
+        apt-get update && apt-get install libsndfile1-dev libgl1 -y
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
@@ -181,10 +171,11 @@ jobs:

    - name: Install dependencies
      run: |
+        apt-get update && apt-get install libsndfile1-dev libgl1 -y
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
-        python -m pip install -U peft@git+https://github.com/huggingface/peft.git
+        python -m uv pip install peft@git+https://github.com/huggingface/peft.git

    - name: Environment
      run: |
@@ -231,6 +222,7 @@ jobs:

    - name: Install dependencies
      run: |
+        apt-get update && apt-get install libsndfile1-dev libgl1 -y
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
@@ -278,6 +270,7 @@ jobs:

    - name: Install dependencies
      run: |
+        apt-get update && apt-get install libsndfile1-dev libgl1 -y
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
@@ -437,4 +430,4 @@ jobs:
      uses: actions/upload-artifact@v2
      with:
        name: examples_test_reports
-        path: reports
+        path: reports
@@ -29,22 +29,22 @@ jobs:
        config:
          - name: Fast PyTorch CPU tests on Ubuntu
            framework: pytorch
-            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
+            runner: docker-cpu
            image: diffusers/diffusers-pytorch-cpu
            report: torch_cpu
          - name: Fast Flax CPU tests on Ubuntu
            framework: flax
-            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
+            runner: docker-cpu
            image: diffusers/diffusers-flax-cpu
            report: flax_cpu
          - name: Fast ONNXRuntime CPU tests on Ubuntu
            framework: onnxruntime
-            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
+            runner: docker-cpu
            image: diffusers/diffusers-onnxruntime-cpu
            report: onnx_cpu
          - name: PyTorch Example CPU tests on Ubuntu
            framework: pytorch_examples
-            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
+            runner: docker-cpu
            image: diffusers/diffusers-pytorch-cpu
            report: torch_example_cpu

@@ -68,6 +68,7 @@ jobs:

    - name: Install dependencies
      run: |
+        apt-get update && apt-get install libsndfile1-dev libgl1 -y
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]

@@ -80,7 +81,7 @@ jobs:
      if: ${{ matrix.config.framework == 'pytorch' }}
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
+        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
          -s -v -k "not Flax and not Onnx" \
          --make-reports=tests_${{ matrix.config.report }} \
          tests/
@@ -89,7 +90,7 @@ jobs:
      if: ${{ matrix.config.framework == 'flax' }}
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
+        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
          -s -v -k "Flax" \
          --make-reports=tests_${{ matrix.config.report }} \
          tests/
@@ -98,7 +99,7 @@ jobs:
      if: ${{ matrix.config.framework == 'onnxruntime' }}
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
+        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
          -s -v -k "Onnx" \
          --make-reports=tests_${{ matrix.config.report }} \
          tests/
@@ -108,7 +109,7 @@ jobs:
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install peft
-        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
+        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
          --make-reports=tests_${{ matrix.config.report }} \
          examples

@@ -1,30 +0,0 @@
-name: Update Diffusers metadata
-
-on:
-  workflow_dispatch:
-  push:
-    branches:
-      - main
-      - update_diffusers_metadata*
-
-jobs:
-  update_metadata:
-    runs-on: ubuntu-22.04
-    defaults:
-      run:
-        shell: bash -l {0}
-
-    steps:
-      - uses: actions/checkout@v3
-
-      - name: Setup environment
-        run: |
-          pip install --upgrade pip
-          pip install datasets pandas
-          pip install .[torch]
-
-      - name: Update metadata
-        env:
-          HUGGING_FACE_HUB_TOKEN: ${{ secrets.SAYAK_HF_TOKEN }}
-        run: |
-          python utils/update_metadata.py --commit_sha ${{ github.sha }}
@@ -42,7 +42,6 @@ repo-consistency:
 quality:
 	ruff check $(check_dirs) setup.py
 	ruff format --check $(check_dirs) setup.py
-	doc-builder style src/diffusers docs/source --max_len 119 --check_only
 	python utils/check_doc_toc.py

 # Format source code automatically and check if there are any problems left that need manual fixing
@@ -56,7 +55,6 @@ extra_style_checks:
 style:
 	ruff check $(check_dirs) setup.py --fix
 	ruff format $(check_dirs) setup.py
-	doc-builder style src/diffusers docs/source --max_len 119
 	${MAKE} autogenerate_code
 	${MAKE} extra_style_checks

@@ -12,7 +12,6 @@ RUN apt update && \
                   curl \
                   ca-certificates \
                   libsndfile1-dev \
-                   libgl1 \
                   python3.8 \
                   python3-pip \
                   python3.8-venv && \
@@ -12,7 +12,6 @@ RUN apt update && \
                   curl \
                   ca-certificates \
                   libsndfile1-dev \
-                   libgl1 \
                   python3.8 \
                   python3-pip \
                   python3.8-venv && \
@@ -12,7 +12,6 @@ RUN apt update && \
                   curl \
                   ca-certificates \
                   libsndfile1-dev \
-                   libgl1 \
                   python3.8 \
                   python3-pip \
                   python3.8-venv && \
@@ -12,7 +12,6 @@ RUN apt update && \
                   curl \
                   ca-certificates \
                   libsndfile1-dev \
-                   libgl1 \
                   python3.8 \
                   python3-pip \
                   python3.8-venv && \
@@ -24,12 +24,14 @@
  title: Tutorials
 - sections:
  - sections:
+    - local: using-diffusers/loading_overview
+      title: Overview
    - local: using-diffusers/loading
-      title: Load pipelines
+      title: Load pipelines, models, and schedulers
+    - local: using-diffusers/schedulers
+      title: Load and compare different schedulers
    - local: using-diffusers/custom_pipeline_overview
      title: Load community pipelines and components
-    - local: using-diffusers/schedulers
-      title: Load schedulers and models
    - local: using-diffusers/using_safetensors
      title: Load safetensors
    - local: using-diffusers/other-formats
@@ -69,7 +71,7 @@
    - local: using-diffusers/control_brightness
      title: Control image brightness
    - local: using-diffusers/weighted_prompts
-      title: Prompt techniques
+      title: Prompt weighting
    - local: using-diffusers/freeu
      title: Improve generation quality with FreeU
    title: Techniques
@@ -84,8 +86,6 @@
      title: Kandinsky
    - local: using-diffusers/controlnet
      title: ControlNet
-    - local: using-diffusers/t2i_adapter
-      title: T2I-Adapter
    - local: using-diffusers/shap-e
      title: Shap-E
    - local: using-diffusers/diffedit
@@ -170,8 +170,6 @@
      title: Token merging
    - local: optimization/deepcache
      title: DeepCache
-    - local: optimization/tgate
-      title: TGATE
    title: General optimizations
  - sections:
    - local: using-diffusers/stable_diffusion_jax_how_to
@@ -282,10 +280,6 @@
      title: ControlNet
    - local: api/pipelines/controlnet_sdxl
      title: ControlNet with Stable Diffusion XL
-    - local: api/pipelines/controlnetxs
-      title: ControlNet-XS
-    - local: api/pipelines/controlnetxs_sdxl
-      title: ControlNet-XS with Stable Diffusion XL
    - local: api/pipelines/dance_diffusion
      title: Dance Diffusion
    - local: api/pipelines/ddim
@@ -364,7 +358,7 @@
      - local: api/pipelines/stable_diffusion/ldm3d_diffusion
        title: LDM3D Text-to-(RGB, Depth), Text-to-(RGB-pano, Depth-pano), LDM3D Upscaler
      - local: api/pipelines/stable_diffusion/adapter
-        title: T2I-Adapter
+        title: Stable Diffusion T2I-Adapter
      - local: api/pipelines/stable_diffusion/gligen
        title: GLIGEN (Grounded Language-to-Image Generation)
      title: Stable Diffusion
@@ -20,8 +20,7 @@ The abstract of the paper is the following:

 *Although audio generation shares commonalities across different types of audio, such as speech, music, and sound effects, designing models for each type requires careful consideration of specific objectives and biases that can significantly differ from those of other types. To bring us closer to a unified perspective of audio generation, this paper proposes a framework that utilizes the same learning method for speech, music, and sound effect generation. Our framework introduces a general representation of audio, called "language of audio" (LOA). Any audio can be translated into LOA based on AudioMAE, a self-supervised pre-trained representation learning model. In the generation process, we translate any modalities into LOA by using a GPT-2 model, and we perform self-supervised audio generation learning with a latent diffusion model conditioned on LOA. The proposed framework naturally brings advantages such as in-context learning abilities and reusable self-supervised pretrained AudioMAE and latent diffusion models. Experiments on the major benchmarks of text-to-audio, text-to-music, and text-to-speech demonstrate state-of-the-art or competitive performance against previous approaches. Our code, pretrained model, and demo are available at [this https URL](https://audioldm.github.io/audioldm2).*

-This pipeline was contributed by [sanchit-gandhi](https://huggingface.co/sanchit-gandhi) and [Nguyễn Công Tú Anh](https://github.com/tuanh123789). The original codebase can be 
-found at [haoheliu/audioldm2](https://github.com/haoheliu/audioldm2). 
+This pipeline was contributed by [sanchit-gandhi](https://huggingface.co/sanchit-gandhi). The original codebase can be found at [haoheliu/audioldm2](https://github.com/haoheliu/audioldm2).

 ## Tips

@@ -37,8 +36,6 @@ See table below for details on the three checkpoints:
 | [audioldm2](https://huggingface.co/cvssp/audioldm2)             | Text-to-audio | 350M            | 1.1B             | 1150k             |
 | [audioldm2-large](https://huggingface.co/cvssp/audioldm2-large) | Text-to-audio | 750M            | 1.5B             | 1150k             |
 | [audioldm2-music](https://huggingface.co/cvssp/audioldm2-music) | Text-to-music | 350M            | 1.1B             | 665k              |
-| [audioldm2-gigaspeech](https://huggingface.co/anhnct/audioldm2_gigaspeech) | Text-to-speech | 350M            | 1.1B             |10k              |
-| [audioldm2-ljspeech](https://huggingface.co/anhnct/audioldm2_ljspeech) | Text-to-speech | 350M            | 1.1B             |              |

 ### Constructing a prompt

@@ -56,7 +53,7 @@ See table below for details on the three checkpoints:
 * The quality of the generated waveforms can vary significantly based on the seed. Try generating with different seeds until you find a satisfactory generation.
 * Multiple waveforms can be generated in one go: set `num_waveforms_per_prompt` to a value greater than 1. Automatic scoring will be performed between the generated waveforms and prompt text, and the audios ranked from best to worst accordingly.

-The following example demonstrates how to construct good music and speech generation using the aforementioned tips: [example](https://huggingface.co/docs/diffusers/main/en/api/pipelines/audioldm2#diffusers.AudioLDM2Pipeline.__call__.example).
+The following example demonstrates how to construct good music generation using the aforementioned tips: [example](https://huggingface.co/docs/diffusers/main/en/api/pipelines/audioldm2#diffusers.AudioLDM2Pipeline.__call__.example).

 <Tip>

@@ -10,7 +10,9 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# T2I-Adapter
+# Text-to-Image Generation with Adapter Conditioning
+
+## Overview

 [T2I-Adapter: Learning Adapters to Dig out More Controllable Ability for Text-to-Image Diffusion Models](https://arxiv.org/abs/2302.08453) by Chong Mou, Xintao Wang, Liangbin Xie, Jian Zhang, Zhongang Qi, Ying Shan, Xiaohu Qie.

@@ -22,26 +24,236 @@ The abstract of the paper is the following:

 This model was contributed by the community contributor [HimariO](https://github.com/HimariO) ❤️ .

-## StableDiffusionAdapterPipeline
+## Available pipelines

+| Pipeline | Tasks | Demo |
+|---|---|:---:|
+| [StableDiffusionAdapterPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py) | *Text-to-Image Generation with T2I-Adapter Conditioning* | - |
+| [StableDiffusionXLAdapterPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py) | *Text-to-Image Generation with T2I-Adapter Conditioning on StableDiffusion-XL* | - |
+
+## Usage example with the base model of StableDiffusion-1.4/1.5
+
+In the following we give a simple example of how to use a *T2I-Adapter* checkpoint with Diffusers for inference based on StableDiffusion-1.4/1.5.
+All adapters use the same pipeline.
+
+ 1. Images are first converted into the appropriate *control image* format.
+ 2. The *control image* and *prompt* are passed to the [`StableDiffusionAdapterPipeline`].
+
+Let's have a look at a simple example using the [Color Adapter](https://huggingface.co/TencentARC/t2iadapter_color_sd14v1).
+
+```python
+from diffusers.utils import load_image, make_image_grid
+
+image = load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/color_ref.png")
+```
+
+![img](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/color_ref.png)
+
+
+Then we can create our color palette by simply resizing the image to 8 by 8 pixels and then scaling it back to its original size.
+
+```python
+from PIL import Image
+
+color_palette = image.resize((8, 8))
+color_palette = color_palette.resize((512, 512), resample=Image.Resampling.NEAREST)
+```
+
+Let's take a look at the processed image.
+
+![img](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/color_palette.png)
+
+
+Next, create the adapter pipeline:
+
+```py
+import torch
+from diffusers import StableDiffusionAdapterPipeline, T2IAdapter
+
+adapter = T2IAdapter.from_pretrained("TencentARC/t2iadapter_color_sd14v1", torch_dtype=torch.float16)
+pipe = StableDiffusionAdapterPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    adapter=adapter,
+    torch_dtype=torch.float16,
+)
+pipe.to("cuda")
+```
+
+Finally, pass the prompt and control image to the pipeline:
+
+```py
+# fix the random seed, so you will get the same result as the example
+generator = torch.Generator("cuda").manual_seed(7)
+
+out_image = pipe(
+    "At night, glowing cubes in front of the beach",
+    image=color_palette,
+    generator=generator,
+).images[0]
+make_image_grid([image, color_palette, out_image], rows=1, cols=3)
+```
+
+![img](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/color_output.png)
+
+## Usage example with the base model of StableDiffusion-XL
+
+In the following we give a simple example of how to use a *T2I-Adapter* checkpoint with Diffusers for inference based on StableDiffusion-XL.
+All adapters use the same pipeline.
+
+ 1. Images are first downloaded and converted into the appropriate *control image* format.
+ 2. The *control image* and *prompt* are passed to the [`StableDiffusionXLAdapterPipeline`].
+
+Let's have a look at a simple example using the [Sketch Adapter](https://huggingface.co/Adapter/t2iadapter/tree/main/sketch_sdxl_1.0).
+
+```python
+from diffusers.utils import load_image, make_image_grid
+
+sketch_image = load_image("https://huggingface.co/Adapter/t2iadapter/resolve/main/sketch.png").convert("L")
+```
+
+![img](https://huggingface.co/Adapter/t2iadapter/resolve/main/sketch.png)
+
+Then, create the adapter pipeline:
+
+```py
+import torch
+from diffusers import (
+    T2IAdapter,
+    StableDiffusionXLAdapterPipeline,
+    DDPMScheduler
+)
+
+model_id = "stabilityai/stable-diffusion-xl-base-1.0"
+adapter = T2IAdapter.from_pretrained("Adapter/t2iadapter", subfolder="sketch_sdxl_1.0", torch_dtype=torch.float16, adapter_type="full_adapter_xl")
+scheduler = DDPMScheduler.from_pretrained(model_id, subfolder="scheduler")
+
+pipe = StableDiffusionXLAdapterPipeline.from_pretrained(
+    model_id, adapter=adapter, safety_checker=None, torch_dtype=torch.float16, variant="fp16", scheduler=scheduler
+)
+
+pipe.to("cuda")
+```
+
+Finally, pass the prompt and control image to the pipeline:
+
+```py
+# fix the random seed, so you will get the same result as the example
+generator = torch.Generator().manual_seed(42)
+
+sketch_image_out = pipe(
+    prompt="a photo of a dog in real world, high quality",
+    negative_prompt="extra digit, fewer digits, cropped, worst quality, low quality",
+    image=sketch_image,
+    generator=generator,
+    guidance_scale=7.5
+).images[0]
+make_image_grid([sketch_image, sketch_image_out], rows=1, cols=2)
+```
+
+![img](https://huggingface.co/Adapter/t2iadapter/resolve/main/sketch_output.png)
+
+## Available checkpoints
+
+Non-diffusers checkpoints can be found under [TencentARC/T2I-Adapter](https://huggingface.co/TencentARC/T2I-Adapter/tree/main/models).
+
+### T2I-Adapter with Stable Diffusion 1.4
+
+| Model Name | Control Image Overview| Control Image Example | Generated Image Example |
+|---|---|---|---|
+|[TencentARC/t2iadapter_color_sd14v1](https://huggingface.co/TencentARC/t2iadapter_color_sd14v1)<br/> *Trained with spatial color palette* | An image with 8x8 color palette.|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/color_sample_input.png"><img width="64" style="margin:0;padding:0;" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/color_sample_input.png"/></a>|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/color_sample_output.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/color_sample_output.png"/></a>|
+|[TencentARC/t2iadapter_canny_sd14v1](https://huggingface.co/TencentARC/t2iadapter_canny_sd14v1)<br/> *Trained with canny edge detection* | A monochrome image with white edges on a black background.|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/canny_sample_input.png"><img width="64" style="margin:0;padding:0;" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/canny_sample_input.png"/></a>|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/canny_sample_output.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/canny_sample_output.png"/></a>|
+|[TencentARC/t2iadapter_sketch_sd14v1](https://huggingface.co/TencentARC/t2iadapter_sketch_sd14v1)<br/> *Trained with [PidiNet](https://github.com/zhuoinoulu/pidinet) edge detection* | A hand-drawn monochrome image with white outlines on a black background.|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/sketch_sample_input.png"><img width="64" style="margin:0;padding:0;" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/sketch_sample_input.png"/></a>|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/sketch_sample_output.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/sketch_sample_output.png"/></a>|
+|[TencentARC/t2iadapter_depth_sd14v1](https://huggingface.co/TencentARC/t2iadapter_depth_sd14v1)<br/> *Trained with Midas depth estimation*  | A grayscale image with black representing deep areas and white representing shallow areas.|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/depth_sample_input.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/depth_sample_input.png"/></a>|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/depth_sample_output.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/depth_sample_output.png"/></a>|
+|[TencentARC/t2iadapter_openpose_sd14v1](https://huggingface.co/TencentARC/t2iadapter_openpose_sd14v1)<br/> *Trained with OpenPose bone image*  | An [OpenPose bone](https://github.com/CMU-Perceptual-Computing-Lab/openpose) image.|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/openpose_sample_input.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/openpose_sample_input.png"/></a>|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/openpose_sample_output.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/openpose_sample_output.png"/></a>|
+|[TencentARC/t2iadapter_keypose_sd14v1](https://huggingface.co/TencentARC/t2iadapter_keypose_sd14v1)<br/> *Trained with mmpose skeleton image*  | A [mmpose skeleton](https://github.com/open-mmlab/mmpose) image.|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_sample_input.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_sample_input.png"/></a>|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_sample_output.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_sample_output.png"/></a>|
+|[TencentARC/t2iadapter_seg_sd14v1](https://huggingface.co/TencentARC/t2iadapter_seg_sd14v1)<br/>*Trained with semantic segmentation*  | A [custom](https://github.com/TencentARC/T2I-Adapter/discussions/25) segmentation protocol image.|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/seg_sample_input.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/seg_sample_input.png"/></a>|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/seg_sample_output.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/seg_sample_output.png"/></a> |
+|[TencentARC/t2iadapter_canny_sd15v2](https://huggingface.co/TencentARC/t2iadapter_canny_sd15v2)||
+|[TencentARC/t2iadapter_depth_sd15v2](https://huggingface.co/TencentARC/t2iadapter_depth_sd15v2)||
+|[TencentARC/t2iadapter_sketch_sd15v2](https://huggingface.co/TencentARC/t2iadapter_sketch_sd15v2)||
+|[TencentARC/t2iadapter_zoedepth_sd15v1](https://huggingface.co/TencentARC/t2iadapter_zoedepth_sd15v1)||
+|[Adapter/t2iadapter, subfolder='sketch_sdxl_1.0'](https://huggingface.co/Adapter/t2iadapter/tree/main/sketch_sdxl_1.0)||
+|[Adapter/t2iadapter, subfolder='canny_sdxl_1.0'](https://huggingface.co/Adapter/t2iadapter/tree/main/canny_sdxl_1.0)||
+|[Adapter/t2iadapter, subfolder='openpose_sdxl_1.0'](https://huggingface.co/Adapter/t2iadapter/tree/main/openpose_sdxl_1.0)||
+
+## Combining multiple adapters
+
+[`MultiAdapter`] can be used for applying multiple conditionings at once.
+
+Here we use the keypose adapter for the character posture and the depth adapter for creating the scene.
+
+```py
+from diffusers.utils import load_image, make_image_grid
+
+cond_keypose = load_image(
+    "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_sample_input.png"
+)
+cond_depth = load_image(
+    "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/depth_sample_input.png"
+)
+cond = [cond_keypose, cond_depth]
+
+prompt = ["A man walking in an office room with a nice view"]
+```
+
+The two control images look like this:
+
+![img](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_sample_input.png)
+![img](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/depth_sample_input.png)
+
+
+`MultiAdapter` combines keypose and depth adapters.
+
+`adapter_conditioning_scale` balances the relative influence of the different adapters.
+
+```py
+import torch
+from diffusers import StableDiffusionAdapterPipeline, MultiAdapter, T2IAdapter
+
+adapters = MultiAdapter(
+    [
+        T2IAdapter.from_pretrained("TencentARC/t2iadapter_keypose_sd14v1"),
+        T2IAdapter.from_pretrained("TencentARC/t2iadapter_depth_sd14v1"),
+    ]
+)
+adapters = adapters.to(torch.float16)
+
+pipe = StableDiffusionAdapterPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    torch_dtype=torch.float16,
+    adapter=adapters,
+).to("cuda")
+
+image = pipe(prompt, cond, adapter_conditioning_scale=[0.8, 0.8]).images[0]
+make_image_grid([cond_keypose, cond_depth, image], rows=1, cols=3)
+```
+
+![img](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_depth_sample_output.png)
+
+
+## T2I-Adapter vs ControlNet
+
+T2I-Adapter is similar to [ControlNet](https://huggingface.co/docs/diffusers/main/en/api/pipelines/controlnet).
+T2I-Adapter uses a smaller auxiliary network which is only run once for the entire diffusion process.
+However, T2I-Adapter performs slightly worse than ControlNet.
+
+## StableDiffusionAdapterPipeline
 [[autodoc]] StableDiffusionAdapterPipeline
-    - all
-    - __call__
-    - enable_attention_slicing
-    - disable_attention_slicing
-    - enable_vae_slicing
-    - disable_vae_slicing
-    - enable_xformers_memory_efficient_attention
-    - disable_xformers_memory_efficient_attention
+	- all
+	- __call__
+	- enable_attention_slicing
+	- disable_attention_slicing
+	- enable_vae_slicing
+	- disable_vae_slicing
+	- enable_xformers_memory_efficient_attention
+	- disable_xformers_memory_efficient_attention

 ## StableDiffusionXLAdapterPipeline
-
 [[autodoc]] StableDiffusionXLAdapterPipeline
-    - all
-    - __call__
-    - enable_attention_slicing
-    - disable_attention_slicing
-    - enable_vae_slicing
-    - disable_vae_slicing
-    - enable_xformers_memory_efficient_attention
-    - disable_xformers_memory_efficient_attention
+	- all
+	- __call__
+	- enable_attention_slicing
+	- disable_attention_slicing
+	- enable_vae_slicing
+	- disable_vae_slicing
+	- enable_xformers_memory_efficient_attention
+	- disable_xformers_memory_efficient_attention
@@ -1,182 +0,0 @@
-# T-GATE
-
-[T-GATE](https://github.com/HaozheLiu-ST/T-GATE/tree/main) accelerates inference for [Stable Diffusion](../api/pipelines/stable_diffusion/overview), [PixArt](../api/pipelines/pixart), and [Latency Consistency Model](../api/pipelines/latent_consistency_models.md) pipelines by skipping the cross-attention calculation once it converges. This method doesn't require any additional training and it can speed up inference from 10-50%. T-GATE is also compatible with other optimization methods like [DeepCache](./deepcache).
-
-Before you begin, make sure you install T-GATE.
-
-```bash
-pip install tgate
-pip install -U pytorch diffusers transformers accelerate DeepCache
-```
-
-
-To use T-GATE with a pipeline, you need to use its corresponding loader.
-
-| Pipeline | T-GATE Loader |
-|---|---|
-| PixArt | TgatePixArtLoader |
-| Stable Diffusion XL | TgateSDXLLoader |
-| Stable Diffusion XL + DeepCache | TgateSDXLDeepCacheLoader |
-| Stable Diffusion | TgateSDLoader |
-| Stable Diffusion + DeepCache | TgateSDDeepCacheLoader |
-
-Next, create a `TgateLoader` with a pipeline, the gate step (the time step to stop calculating the cross attention), and the number of inference steps. Then call the `tgate` method on the pipeline with a prompt, gate step, and the number of inference steps.
-
-Let's see how to enable this for several different pipelines.
-
-<hfoptions id="pipelines">
-<hfoption id="PixArt">
-
-Accelerate `PixArtAlphaPipeline` with T-GATE:
-
-```py
-import torch
-from diffusers import PixArtAlphaPipeline
-from tgate import TgatePixArtLoader
-
-pipe = PixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", torch_dtype=torch.float16)
-
-gate_step = 8
-inference_step = 25
-pipe = TgatePixArtLoader(
-       pipe,
-       gate_step=gate_step,
-       num_inference_steps=inference_step,
-).to("cuda")
-
-image = pipe.tgate(
-       "An alpaca made of colorful building blocks, cyberpunk.",
-        gate_step=gate_step,
-       num_inference_steps=inference_step,
-).images[0]
-```
-</hfoption>
-<hfoption id="Stable Diffusion XL"> 
-
-Accelerate `StableDiffusionXLPipeline` with T-GATE:
-
-```py
-import torch
-from diffusers import StableDiffusionXLPipeline
-from diffusers import DPMSolverMultistepScheduler
-from tgate import TgateSDXLLoader
-
-pipe = StableDiffusionXLPipeline.from_pretrained(
-            "stabilityai/stable-diffusion-xl-base-1.0",
-            torch_dtype=torch.float16,
-            variant="fp16",
-            use_safetensors=True,
-)
-pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
-
-gate_step = 10
-inference_step = 25
-pipe = TgateSDXLLoader(
-       pipe,
-       gate_step=gate_step,
-       num_inference_steps=inference_step,
-).to("cuda")
-
-image = pipe.tgate(
-        "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
-        gate_step=gate_step,
-        num_inference_steps=inference_step
-).images[0]
-```
-</hfoption>
-<hfoption id="StableDiffusionXL with DeepCache">
-
-Accelerate `StableDiffusionXLPipeline` with [DeepCache](https://github.com/horseee/DeepCache) and T-GATE:
-
-```py
-import torch
-from diffusers import StableDiffusionXLPipeline
-from diffusers import DPMSolverMultistepScheduler
-from tgate import TgateSDXLDeepCacheLoader
-
-pipe = StableDiffusionXLPipeline.from_pretrained(
-            "stabilityai/stable-diffusion-xl-base-1.0",
-            torch_dtype=torch.float16,
-            variant="fp16",
-            use_safetensors=True,
-)
-pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
-
-gate_step = 10
-inference_step = 25
-pipe = TgateSDXLDeepCacheLoader(
-       pipe,
-       cache_interval=3,
-       cache_branch_id=0,
-).to("cuda")
-
-image = pipe.tgate(
-        "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
-        gate_step=gate_step,
-        num_inference_steps=inference_step
-).images[0]
-```
-</hfoption>
-<hfoption id="Latent Consistency Model">
-
-Accelerate `latent-consistency/lcm-sdxl` with T-GATE:
-
-```py
-import torch
-from diffusers import StableDiffusionXLPipeline
-from diffusers import UNet2DConditionModel, LCMScheduler
-from diffusers import DPMSolverMultistepScheduler
-from tgate import TgateSDXLLoader
-
-unet = UNet2DConditionModel.from_pretrained(
-    "latent-consistency/lcm-sdxl",
-    torch_dtype=torch.float16,
-    variant="fp16",
-)
-pipe = StableDiffusionXLPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    unet=unet,
-    torch_dtype=torch.float16,
-    variant="fp16",
-)
-pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
-
-gate_step = 1
-inference_step = 4
-pipe = TgateSDXLLoader(
-       pipe,
-       gate_step=gate_step,
-       num_inference_steps=inference_step,
-       lcm=True
-).to("cuda")
-
-image = pipe.tgate(
-        "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
-        gate_step=gate_step,
-        num_inference_steps=inference_step
-).images[0]
-```
-</hfoption>
-</hfoptions>
-
-T-GATE also supports [`StableDiffusionPipeline`] and [PixArt-alpha/PixArt-LCM-XL-2-1024-MS](https://hf.co/PixArt-alpha/PixArt-LCM-XL-2-1024-MS).
-
-## Benchmarks
-| Model                 | MACs     | Param     | Latency | Zero-shot 10K-FID on MS-COCO |
-|-----------------------|----------|-----------|---------|---------------------------|
-| SD-1.5                | 16.938T  | 859.520M  | 7.032s  | 23.927                    |
-| SD-1.5 w/ T-GATE       | 9.875T   | 815.557M  | 4.313s  | 20.789                    |
-| SD-2.1                | 38.041T  | 865.785M  | 16.121s | 22.609                    |
-| SD-2.1 w/ T-GATE       | 22.208T  | 815.433 M | 9.878s  | 19.940                    |
-| SD-XL                 | 149.438T | 2.570B    | 53.187s | 24.628                    |
-| SD-XL w/ T-GATE        | 84.438T  | 2.024B    | 27.932s | 22.738                    |
-| Pixart-Alpha          | 107.031T | 611.350M  | 61.502s | 38.669                    |
-| Pixart-Alpha w/ T-GATE | 65.318T  | 462.585M  | 37.867s | 35.825                    |
-| DeepCache (SD-XL)     | 57.888T  | -         | 19.931s | 23.755                    |
-| DeepCache w/ T-GATE    | 43.868T  | -         | 14.666s | 23.999                    |
-| LCM (SD-XL)           | 11.955T  | 2.570B    | 3.805s  | 25.044                    |
-| LCM w/ T-GATE          | 11.171T  | 2.024B    | 3.533s  | 25.028                    |
-| LCM (Pixart-Alpha)    | 8.563T   | 611.350M  | 4.733s  | 36.086                    |
-| LCM w/ T-GATE          | 7.623T   | 462.585M  | 4.543s  | 37.048                    |
-
-The latency is tested on an NVIDIA 1080TI, MACs and Params are calculated with [calflops](https://github.com/MrYxJ/calculate-flops.pytorch), and the FID is calculated with [PytorchFID](https://github.com/mseitzer/pytorch-fid).
@@ -52,76 +52,6 @@ To learn more, take a look at the [Distributed Inference with 🤗 Accelerate](h

 </Tip>

-### Device placement
-
-> [!WARNING]
-> This feature is experimental and its APIs might change in the future. 
-
-With Accelerate, you can use the `device_map` to determine how to distribute the models of a pipeline across multiple devices. This is useful in situations where you have more than one GPU.
-
-For example, if you have two 8GB GPUs, then using [`~DiffusionPipeline.enable_model_cpu_offload`] may not work so well because:
-
-* it only works on a single GPU
-* a single model might not fit on a single GPU ([`~DiffusionPipeline.enable_sequential_cpu_offload`] might work but it will be extremely slow and it is also limited to a single GPU)
-
-To make use of both GPUs, you can use the "balanced" device placement strategy which splits the models across all available GPUs.
-
-> [!WARNING]
-> Only the "balanced" strategy is supported at the moment, and we plan to support additional mapping strategies in the future.
-
-```diff
-from diffusers import DiffusionPipeline
-import torch
-
-pipeline = DiffusionPipeline.from_pretrained(
-    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True,
-+    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True, device_map="balanced"
-)
-image = pipeline("a dog").images[0]
-image
-```
-
-You can also pass a dictionary to enforce the maximum GPU memory that can be used on each device:
-
-```diff
-from diffusers import DiffusionPipeline
-import torch
-
-max_memory = {0:"1GB", 1:"1GB"}
-pipeline = DiffusionPipeline.from_pretrained(
-    "runwayml/stable-diffusion-v1-5",
-    torch_dtype=torch.float16, 
-    use_safetensors=True, 
-    device_map="balanced",
-+   max_memory=max_memory
-)
-image = pipeline("a dog").images[0]
-image
-```
-
-If a device is not present in `max_memory`, then it will be completely ignored and will not participate in the device placement. 
-
-By default, Diffusers uses the maximum memory of all devices. If the models don't fit on the GPUs, they are offloaded to the CPU. If the CPU doesn't have enough memory, then you might see an error. In that case, you could defer to using [`~DiffusionPipeline.enable_sequential_cpu_offload`] and [`~DiffusionPipeline.enable_model_cpu_offload`].
-
-Call [`~DiffusionPipeline.reset_device_map`] to reset the `device_map` of a pipeline. This is also necessary if you want to use methods like `to()`, [`~DiffusionPipeline.enable_sequential_cpu_offload`], and [`~DiffusionPipeline.enable_model_cpu_offload`] on a pipeline that was device-mapped.
-
-```py
-pipeline.reset_device_map()
-```
-
-Once a pipeline has been device-mapped, you can also access its device map via `hf_device_map`:
-
-```py
-print(pipeline.hf_device_map)
-```
-
-An example device map would look like so:
-
-
-```bash
-{'unet': 1, 'vae': 1, 'safety_checker': 0, 'text_encoder': 0}
-```
-
 ## PyTorch Distributed

 PyTorch supports [`DistributedDataParallel`](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) which enables data parallelism.
@@ -148,9 +148,9 @@ pipeline = AutoPipelineForText2Image.from_pretrained(
    use_safetensors=True
 ).to("cuda")

-image = pipeline(
-    prompt="A croissant shaped like a cute bear.",
-    negative_prompt="Deformed, ugly, bad anatomy",
+image = pipeline(
+    prompt="A croissant shaped like a cute bear.",
+    negative_prompt="Deformed, ugly, bad anatomy",
    callback_on_step_end=decode_tensors,
    callback_on_step_end_tensor_inputs=["latents"],
 ).images[0]
@@ -16,19 +16,17 @@ specific language governing permissions and limitations under the License.

 ## Community pipelines

-Community pipelines are any [`DiffusionPipeline`] class that are different from the original paper implementation (for example, the [`StableDiffusionControlNetPipeline`] corresponds to the [Text-to-Image Generation with ControlNet Conditioning](https://arxiv.org/abs/2302.05543) paper). They provide additional functionality or extend the original implementation of a pipeline.
+Community pipelines are any [`DiffusionPipeline`] class that is different from the original implementation as specified in its paper (for example, the [`StableDiffusionControlNetPipeline`] corresponds to the [Text-to-Image Generation with ControlNet Conditioning](https://arxiv.org/abs/2302.05543) paper). They provide additional functionality or extend the original implementation of a pipeline.

-There are many cool community pipelines like [Marigold Depth Estimation](https://github.com/huggingface/diffusers/tree/main/examples/community#marigold-depth-estimation) or [InstantID](https://github.com/huggingface/diffusers/tree/main/examples/community#instantid-pipeline), and you can find all the official community pipelines [here](https://github.com/huggingface/diffusers/tree/main/examples/community).
+There are many cool community pipelines like [Speech to Image](https://github.com/huggingface/diffusers/tree/main/examples/community#speech-to-image) or [Composable Stable Diffusion](https://github.com/huggingface/diffusers/tree/main/examples/community#composable-stable-diffusion), and you can find all the official community pipelines [here](https://github.com/huggingface/diffusers/tree/main/examples/community).

-There are two types of community pipelines, those stored on the Hugging Face Hub and those stored on Diffusers GitHub repository. Hub pipelines are completely customizable (scheduler, models, pipeline code, etc.) while Diffusers GitHub pipelines are only limited to custom pipeline code. Refer to this [table](./contribute_pipeline#share-your-pipeline) for a more detailed comparison of Hub vs GitHub community pipelines.
+To load any community pipeline on the Hub, pass the repository id of the community pipeline to the `custom_pipeline` argument and the model repository where you'd like to load the pipeline weights and components from. For example, the example below loads a dummy pipeline from [`hf-internal-testing/diffusers-dummy-pipeline`](https://huggingface.co/hf-internal-testing/diffusers-dummy-pipeline/blob/main/pipeline.py) and the pipeline weights and components from [`google/ddpm-cifar10-32`](https://huggingface.co/google/ddpm-cifar10-32):

-<hfoptions id="community">
-<hfoption id="Hub pipelines">
+<Tip warning={true}>

-To load a Hugging Face Hub community pipeline, pass the repository id of the community pipeline to the `custom_pipeline` argument and the model repository where you'd like to load the pipeline weights and components from. For example, the example below loads a dummy pipeline from [hf-internal-testing/diffusers-dummy-pipeline](https://huggingface.co/hf-internal-testing/diffusers-dummy-pipeline/blob/main/pipeline.py) and the pipeline weights and components from [google/ddpm-cifar10-32](https://huggingface.co/google/ddpm-cifar10-32):
+🔒 By loading a community pipeline from the Hugging Face Hub, you are trusting that the code you are loading is safe. Make sure to inspect the code online before loading and running it automatically!

-> [!WARNING]
-> By loading a community pipeline from the Hugging Face Hub, you are trusting that the code you are loading is safe. Make sure to inspect the code online before loading and running it automatically!
+</Tip>

 ```py
 from diffusers import DiffusionPipeline
@@ -38,10 +36,7 @@ pipeline = DiffusionPipeline.from_pretrained(
 )
 ```

-</hfoption>
-<hfoption id="GitHub pipelines">
-
-To load a GitHub community pipeline, pass the repository id of the community pipeline to the `custom_pipeline` argument and the model repository where you you'd like to load the pipeline weights and components from. You can also load model components directly. The example below loads the community [CLIP Guided Stable Diffusion](https://github.com/huggingface/diffusers/tree/main/examples/community#clip-guided-stable-diffusion) pipeline and the CLIP model components.
+Loading an official community pipeline is similar, but you can mix loading weights from an official repository id and pass pipeline components directly. The example below loads the community [CLIP Guided Stable Diffusion](https://github.com/huggingface/diffusers/tree/main/examples/community#clip-guided-stable-diffusion) pipeline, and you can pass the CLIP model components directly to it:

 ```py
 from diffusers import DiffusionPipeline
@@ -61,12 +56,9 @@ pipeline = DiffusionPipeline.from_pretrained(
 )
 ```

-</hfoption>
-</hfoptions>
-
 ### Load from a local file

-Community pipelines can also be loaded from a local file if you pass a file path instead. The path to the passed directory must contain a pipeline.py file that contains the pipeline class.
+Community pipelines can also be loaded from a local file if you pass a file path instead. The path to the passed directory must contain a `pipeline.py` file that contains the pipeline class in order to successfully load it.

 ```py
 pipeline = DiffusionPipeline.from_pretrained(
@@ -85,7 +77,7 @@ By default, community pipelines are loaded from the latest stable version of Dif
 <hfoptions id="version">
 <hfoption id="main">

-For example, to load from the main branch:
+For example, to load from the `main` branch:

 ```py
 pipeline = DiffusionPipeline.from_pretrained(
@@ -101,7 +93,7 @@ pipeline = DiffusionPipeline.from_pretrained(
 </hfoption>
 <hfoption id="older version">

-For example, to load from a previous version of Diffusers like v0.25.0:
+For example, to load from a previous version of Diffusers like `v0.25.0`:

 ```py
 pipeline = DiffusionPipeline.from_pretrained(
@@ -117,49 +109,8 @@ pipeline = DiffusionPipeline.from_pretrained(
 </hfoption>
 </hfoptions>

-### Load with from_pipe

-Community pipelines can also be loaded with the [`~DiffusionPipeline.from_pipe`] method which allows you to load and reuse multiple pipelines without any additional memory overhead (learn more in the [Reuse a pipeline](./loading#reuse-a-pipeline) guide). The memory requirement is determined by the largest single pipeline loaded.
-
-For example, let's load a community pipeline that supports [long prompts with weighting](https://github.com/huggingface/diffusers/tree/main/examples/community#long-prompt-weighting-stable-diffusion) from a Stable Diffusion pipeline.
-
-```py
-import torch
-from diffusers import DiffusionPipeline
-
-pipe_sd = DiffusionPipeline.from_pretrained("emilianJR/CyberRealistic_V3", torch_dtype=torch.float16)
-pipe_sd.to("cuda")
-# load long prompt weighting pipeline
-pipe_lpw = DiffusionPipeline.from_pipe(
-    pipe_sd,
-    custom_pipeline="lpw_stable_diffusion",
-).to("cuda")
-
-prompt = "cat, hiding in the leaves, ((rain)), zazie rainyday, beautiful eyes, macro shot, colorful details, natural lighting, amazing composition, subsurface scattering, amazing textures, filmic, soft light, ultra-detailed eyes, intricate details, detailed texture, light source contrast, dramatic shadows, cinematic light, depth of field, film grain, noise, dark background, hyperrealistic dslr film still, dim volumetric cinematic lighting"
-neg_prompt = "(deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers:1.4), (deformed, distorted, disfigured:1.3), poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation"
-generator = torch.Generator(device="cpu").manual_seed(20)
-out_lpw = pipe_lpw(
-    prompt, 
-    negative_prompt=neg_prompt, 
-    width=512,
-    height=512,
-    max_embeddings_multiples=3, 
-    num_inference_steps=50,
-    generator=generator,
-    ).images[0]
-out_lpw
-```
-
-<div class="flex gap-4">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/from_pipe_lpw.png" />
-    <figcaption class="mt-2 text-center text-sm text-gray-500">Stable Diffusion with long prompt weighting</figcaption>
-  </div>
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/from_pipe_non_lpw.png" />
-    <figcaption class="mt-2 text-center text-sm text-gray-500">Stable Diffusion</figcaption>
-  </div>
-</div>
+For more information about community pipelines, take a look at the [Community pipelines](custom_pipeline_examples) guide for how to use them. If you're interested in adding a community pipeline, check out the [How to contribute a community pipeline](contribute_pipeline) guide!

 ## Community components

@@ -167,7 +118,7 @@ Community components allow users to build pipelines that may have customized com

 This section shows how users should use community components to build a community pipeline.

-You'll use the [showlab/show-1-base](https://huggingface.co/showlab/show-1-base) pipeline checkpoint as an example.
+You'll use the [showlab/show-1-base](https://huggingface.co/showlab/show-1-base) pipeline checkpoint as an example. So, let's start loading the components:

 1. Import and load the text encoder from Transformers:

@@ -201,17 +152,17 @@ In steps 4 and 5, the custom [UNet](https://github.com/showlab/Show-1/blob/main/

 </Tip>

-4. Now you'll load a [custom UNet](https://github.com/showlab/Show-1/blob/main/showone/models/unet_3d_condition.py), which in this example, has already been implemented in [showone_unet_3d_condition.py](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/unet/showone_unet_3d_condition.py) for your convenience. You'll notice the [`UNet3DConditionModel`] class name is changed to `ShowOneUNet3DConditionModel` because [`UNet3DConditionModel`] already exists in Diffusers. Any components needed for the `ShowOneUNet3DConditionModel` class should be placed in showone_unet_3d_condition.py.
+4. Now you'll load a [custom UNet](https://github.com/showlab/Show-1/blob/main/showone/models/unet_3d_condition.py), which in this example, has already been implemented in the `showone_unet_3d_condition.py` [script](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/unet/showone_unet_3d_condition.py) for your convenience. You'll notice the `UNet3DConditionModel` class name is changed to `ShowOneUNet3DConditionModel` because [`UNet3DConditionModel`] already exists in Diffusers. Any components needed for the `ShowOneUNet3DConditionModel` class should be placed in the `showone_unet_3d_condition.py` script.

-    Once this is done, you can initialize the UNet:
+Once this is done, you can initialize the UNet:

-    ```python
-    from showone_unet_3d_condition import ShowOneUNet3DConditionModel
+```python
+from showone_unet_3d_condition import ShowOneUNet3DConditionModel

-    unet = ShowOneUNet3DConditionModel.from_pretrained(pipe_id, subfolder="unet")
-    ```
+unet = ShowOneUNet3DConditionModel.from_pretrained(pipe_id, subfolder="unet")
+```

-5. Finally, you'll load the custom pipeline code. For this example, it has already been created for you in [pipeline_t2v_base_pixel.py](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/pipeline_t2v_base_pixel.py). This script contains a custom `TextToVideoIFPipeline` class for generating videos from text. Just like the custom UNet, any code needed for the custom pipeline to work should go in pipeline_t2v_base_pixel.py.
+5. Finally, you'll load the custom pipeline code. For this example, it has already been created for you in the `pipeline_t2v_base_pixel.py` [script](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/pipeline_t2v_base_pixel.py). This script contains a custom `TextToVideoIFPipeline` class for generating videos from text. Just like the custom UNet, any code needed for the custom pipeline to work should go in the `pipeline_t2v_base_pixel.py` script. 

 Once everything is in place, you can initialize the `TextToVideoIFPipeline` with the `ShowOneUNet3DConditionModel`:

@@ -236,16 +187,13 @@ Push the pipeline to the Hub to share with the community!
 pipeline.push_to_hub("custom-t2v-pipeline")
 ```

-After the pipeline is successfully pushed, you need to make a few changes:
+After the pipeline is successfully pushed, you need to make a few changes:

-1. Change the `_class_name` attribute in [model_index.json](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/model_index.json#L2) to `"pipeline_t2v_base_pixel"` and `"TextToVideoIFPipeline"`.
-2. Upload `showone_unet_3d_condition.py` to the [unet](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/unet/showone_unet_3d_condition.py) subfolder.
-3. Upload `pipeline_t2v_base_pixel.py` to the pipeline [repository](https://huggingface.co/sayakpaul/show-1-base-with-code/tree/main).
+1. Change the `_class_name` attribute in [`model_index.json`](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/model_index.json#L2) to `"pipeline_t2v_base_pixel"` and `"TextToVideoIFPipeline"`.
+2. Upload `showone_unet_3d_condition.py` to the `unet` [directory](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/unet/showone_unet_3d_condition.py).
+3. Upload `pipeline_t2v_base_pixel.py` to the pipeline base [directory](https://huggingface.co/sayakpaul/show-1-base-with-code/tree/main).

-To run inference, add the `trust_remote_code` argument while initializing the pipeline to handle all the "magic" behind the scenes.
-
-> [!WARNING]
-> As an additional precaution with `trust_remote_code=True`, we strongly encourage you to pass a commit hash to the `revision` parameter in [`~DiffusionPipeline.from_pretrained`] to make sure the code hasn't been updated with some malicious new lines of code (unless you fully trust the model owners).
+To run inference, simply add the `trust_remote_code` argument while initializing the pipeline to handle all the "magic" behind the scenes.

 ```python
 from diffusers import DiffusionPipeline
@@ -273,9 +221,10 @@ video_frames = pipeline(
 ).frames
 ```

-As an additional reference, take a look at the repository structure of [stabilityai/japanese-stable-diffusion-xl](https://huggingface.co/stabilityai/japanese-stable-diffusion-xl/) which also uses the `trust_remote_code` feature.
+As an additional reference, take a look at the repository structure of [stabilityai/japanese-stable-diffusion-xl](https://huggingface.co/stabilityai/japanese-stable-diffusion-xl/), which also makes use of the `trust_remote_code` feature:

 ```python
+
 from diffusers import DiffusionPipeline
 import torch

@@ -283,4 +232,14 @@ pipeline = DiffusionPipeline.from_pretrained(
    "stabilityai/japanese-stable-diffusion-xl", trust_remote_code=True
 )
 pipeline.to("cuda")
+
+# if using torch < 2.0
+# pipeline.enable_xformers_memory_efficient_attention()
+
+prompt = "柴犬、カラフルアート"
+
+image = pipeline(prompt=prompt).images[0]
 ```
+
+> [!TIP]
+> When using `trust_remote_code=True`, it is also strongly encouraged to pass a commit hash as a `revision` to make sure the author of the models did not update the code with some malicious new lines (unless you fully trust the authors of the models).
@@ -362,12 +362,14 @@ IP-Adapter's image prompting and compatibility with other adapters and models ma

 ### Face model

-Generating accurate faces is challenging because they are complex and nuanced. Diffusers supports two IP-Adapter checkpoints specifically trained to generate faces from the [h94/IP-Adapter](https://huggingface.co/h94/IP-Adapter) repository:
+Generating accurate faces is challenging because they are complex and nuanced. Diffusers supports two IP-Adapter checkpoints specifically trained to generate faces:

 * [ip-adapter-full-face_sd15.safetensors](https://huggingface.co/h94/IP-Adapter/blob/main/models/ip-adapter-full-face_sd15.safetensors) is conditioned with images of cropped faces and removed backgrounds
 * [ip-adapter-plus-face_sd15.safetensors](https://huggingface.co/h94/IP-Adapter/blob/main/models/ip-adapter-plus-face_sd15.safetensors) uses patch embeddings and is conditioned with images of cropped faces

-Additionally, Diffusers supports all IP-Adapter checkpoints trained with face embeddings extracted by `insightface` face models. Supported models are from the [h94/IP-Adapter-FaceID](https://huggingface.co/h94/IP-Adapter-FaceID) repository.
+> [!TIP]
+>
+> [IP-Adapter-FaceID](https://huggingface.co/h94/IP-Adapter-FaceID) is a face-specific IP-Adapter trained with face ID embeddings instead of CLIP image embeddings, allowing you to generate more consistent faces in different contexts and styles. Try out this popular [community pipeline](https://github.com/huggingface/diffusers/tree/main/examples/community#ip-adapter-face-id) and see how it compares to the other face IP-Adapters.

 For face models, use the [h94/IP-Adapter](https://huggingface.co/h94/IP-Adapter) checkpoint. It is also recommended to use [`DDIMScheduler`] or [`EulerDiscreteScheduler`] for face models.

@@ -409,56 +411,6 @@ image
  </div>
 </div>

-To use IP-Adapter FaceID models, first extract face embeddings with `insightface`. Then pass the list of tensors to the pipeline as `ip_adapter_image_embeds`.
-
-```py
-import torch
-from diffusers import StableDiffusionPipeline, DDIMScheduler
-from diffusers.utils import load_image
-from insightface.app import FaceAnalysis
-
-pipeline = StableDiffusionPipeline.from_pretrained(
-    "runwayml/stable-diffusion-v1-5",
-    torch_dtype=torch.float16,
-).to("cuda")
-pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
-pipeline.load_ip_adapter("h94/IP-Adapter-FaceID", subfolder=None, weight_name="ip-adapter-faceid_sd15.bin", image_encoder_folder=None)
-pipeline.set_ip_adapter_scale(0.6)
-
-image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_mask_girl1.png")
-
-ref_images_embeds = []
-app = FaceAnalysis(name="buffalo_l", providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
-app.prepare(ctx_id=0, det_size=(640, 640))
-image = cv2.cvtColor(np.asarray(image), cv2.COLOR_BGR2RGB)
-faces = app.get(image)
-image = torch.from_numpy(faces[0].normed_embedding)
-ref_images_embeds.append(image.unsqueeze(0))
-ref_images_embeds = torch.stack(ref_images_embeds, dim=0).unsqueeze(0)
-neg_ref_images_embeds = torch.zeros_like(ref_images_embeds)
-id_embeds = torch.cat([neg_ref_images_embeds, ref_images_embeds]).to(dtype=torch.float16, device="cuda"))
-
-generator = torch.Generator(device="cpu").manual_seed(42)
-
-images = pipeline(
-    prompt="A photo of a girl",
-    ip_adapter_image_embeds=[id_embeds], 
-    negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality", 
-    num_inference_steps=20, num_images_per_prompt=1,
-    generator=generator
-).images
-```
-
-Both IP-Adapter FaceID Plus and Plus v2 models require CLIP image embeddings. You can prepare face embeddings as shown previously, then you can extract and pass CLIP embeddings to the hidden image projection layers.
-
-```py
-clip_embeds = pipeline.prepare_ip_adapter_image_embeds([ip_adapter_images], None, torch.device("cuda"), num_images, True)[0]
-
-pipeline.unet.encoder_hid_proj.image_projection_layers[0].clip_embeds = clip_embeds.to(dtype=torch.float16)
-pipeline.unet.encoder_hid_proj.image_projection_layers[0].shortcut = False # True if Plus v2
-```
-
-
 ### Multi IP-Adapter

 More than one IP-Adapter can be used at the same time to generate specific images in more diverse styles. For example, you can use IP-Adapter-Face to generate consistent faces and characters, and IP-Adapter Plus to generate those faces in a specific style.
@@ -10,75 +10,57 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# Load pipelines
+# Load pipelines, models, and schedulers

 [[open-in-colab]]

-Diffusion systems consist of multiple components like parameterized models and schedulers that interact in complex ways. That is why we designed the [`DiffusionPipeline`] to wrap the complexity of the entire diffusion system into an easy-to-use API. At the same time, the [`DiffusionPipeline`] is entirely customizable so you can modify each component to build a diffusion system for your use case.
+Having an easy way to use a diffusion system for inference is essential to 🧨 Diffusers. Diffusion systems often consist of multiple components like parameterized models, tokenizers, and schedulers that interact in complex ways. That is why we designed the [`DiffusionPipeline`] to wrap the complexity of the entire diffusion system into an easy-to-use API, while remaining flexible enough to be adapted for other use cases, such as loading each component individually as building blocks to assemble your own diffusion system.
+
+Everything you need for inference or training is accessible with the `from_pretrained()` method.

 This guide will show you how to load:

 - pipelines from the Hub and locally
 - different components into a pipeline
- multiple pipelines without increasing memory usage
 - checkpoint variants such as different floating point types or non-exponential mean averaged (EMA) weights
+- models and schedulers

-## Load a pipeline
+## Diffusion Pipeline

-> [!TIP]
-> Skip to the [DiffusionPipeline explained](#diffusionpipeline-explained) section if you're interested in an explanation about how the [`DiffusionPipeline`] class works.
+<Tip>

-There are two ways to load a pipeline for a task:
+💡 Skip to the [DiffusionPipeline explained](#diffusionpipeline-explained) section if you are interested in learning in more detail about how the [`DiffusionPipeline`] class works.

-1. Load the generic [`DiffusionPipeline`] class and allow it to automatically detect the correct pipeline class from the checkpoint.
-2. Load a specific pipeline class for a specific task.
+</Tip>

-<hfoptions id="pipelines">
-<hfoption id="generic pipeline">
-
-The [`DiffusionPipeline`] class is a simple and generic way to load the latest trending diffusion model from the [Hub](https://huggingface.co/models?library=diffusers&sort=trending). It uses the [`~DiffusionPipeline.from_pretrained`] method to automatically detect the correct pipeline class for a task from the checkpoint, downloads and caches all the required configuration and weight files, and returns a pipeline ready for inference.
+The [`DiffusionPipeline`] class is the simplest and most generic way to load the latest trending diffusion model from the [Hub](https://huggingface.co/models?library=diffusers&sort=trending). The [`DiffusionPipeline.from_pretrained`] method automatically detects the correct pipeline class from the checkpoint, downloads and caches all the required configuration and weight files, and returns a pipeline instance ready for inference.

 ```python
 from diffusers import DiffusionPipeline

-pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_safetensors=True)
+repo_id = "runwayml/stable-diffusion-v1-5"
+pipe = DiffusionPipeline.from_pretrained(repo_id, use_safetensors=True)
 ```

-This same checkpoint can also be used for an image-to-image task. The [`DiffusionPipeline`] class can handle any task as long as you provide the appropriate inputs. For example, for an image-to-image task, you need to pass an initial image to the pipeline.
-
-```py
-from diffusers import DiffusionPipeline
-
-pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_safetensors=True)
-
-init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png")
-prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
-image = pipeline("Astronaut in a jungle, cold color palette, muted colors, detailed, 8k", image=init_image).images[0]
-```
-
-</hfoption>
-<hfoption id="specific pipeline">
-
-Checkpoints can be loaded by their specific pipeline class if you already know it. For example, to load a Stable Diffusion model, use the [`StableDiffusionPipeline`] class.
+You can also load a checkpoint with its specific pipeline class. The example above loaded a Stable Diffusion model; to get the same result, use the [`StableDiffusionPipeline`] class:

 ```python
 from diffusers import StableDiffusionPipeline

-pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_safetensors=True)
+repo_id = "runwayml/stable-diffusion-v1-5"
+pipe = StableDiffusionPipeline.from_pretrained(repo_id, use_safetensors=True)
 ```

-This same checkpoint may also be used for another task like image-to-image. To differentiate what task you want to use the checkpoint for, you have to use the corresponding task-specific pipeline class. For example, to use the same checkpoint for image-to-image, use the [`StableDiffusionImg2ImgPipeline`] class.
+A checkpoint (such as [`CompVis/stable-diffusion-v1-4`](https://huggingface.co/CompVis/stable-diffusion-v1-4) or [`runwayml/stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5)) may also be used for more than one task, like text-to-image or image-to-image. To differentiate what task you want to use the checkpoint for, you have to load it directly with its corresponding task-specific pipeline class:

-```py
+```python
 from diffusers import StableDiffusionImg2ImgPipeline

-pipeline = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_safetensors=True)
+repo_id = "runwayml/stable-diffusion-v1-5"
+pipe = StableDiffusionImg2ImgPipeline.from_pretrained(repo_id)
 ```

-</hfoption>
-</hfoptions>
-
-Use the Space below to gauge a pipeline's memory requirements before you download and load it to see if it runs on your hardware.
+You can use the Space below to gauge the memory requirements of a pipeline before you download and load its checkpoints:

 <div class="block dark:hidden">
 	<iframe 
@@ -97,307 +79,264 @@ Use the Space below to gauge a pipeline's memory requirements before you downloa

 ### Local pipeline

-To load a pipeline locally, use [git-lfs](https://git-lfs.github.com/) to manually download a checkpoint to your local disk.
+To load a diffusion pipeline locally, use [`git-lfs`](https://git-lfs.github.com/) to manually download the checkpoint (in this case, [`runwayml/stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5)) to your local disk. This creates a local folder, `./stable-diffusion-v1-5`, on your disk:

 ```bash
 git-lfs install
 git clone https://huggingface.co/runwayml/stable-diffusion-v1-5
 ```

-This creates a local folder, ./stable-diffusion-v1-5, on your disk and you should pass its path to [`~DiffusionPipeline.from_pretrained`].
+Then pass the local path to [`~DiffusionPipeline.from_pretrained`]:

 ```python
 from diffusers import DiffusionPipeline

-stable_diffusion = DiffusionPipeline.from_pretrained("./stable-diffusion-v1-5", use_safetensors=True)
+repo_id = "./stable-diffusion-v1-5"
+stable_diffusion = DiffusionPipeline.from_pretrained(repo_id, use_safetensors=True)
 ```

-The [`~DiffusionPipeline.from_pretrained`] method won't download files from the Hub when it detects a local path, but this also means it won't download and cache the latest changes to a checkpoint.
+The [`~DiffusionPipeline.from_pretrained`] method won't download any files from the Hub when it detects a local path, but this also means it won't download and cache the latest changes to a checkpoint.

-## Customize a pipeline
+### Swap components in a pipeline

-You can customize a pipeline by loading different components into it. This is important because you can:
+You can customize the default components of any pipeline with another compatible component. Customization is important because:

- change to a scheduler with faster generation speed or higher generation quality depending on your needs (call the `scheduler.compatibles` method on your pipeline to see compatible schedulers)
- change a default pipeline component to a newer and better performing one
+- Changing the scheduler is important for exploring the trade-off between generation speed and quality.
+- Different components of a model are typically trained independently and you can swap out a component with a better-performing one.
+- During finetuning, usually only some components - like the UNet or text encoder - are trained.

-For example, let's customize the default [stabilityai/stable-diffusion-xl-base-1.0](https://hf.co/stabilityai/stable-diffusion-xl-base-1.0) checkpoint with:
-
- The [`HeunDiscreteScheduler`] to generate higher quality images at the expense of slower generation speed. You must pass the `subfolder="scheduler"` parameter in [`~HeunDiscreteScheduler.from_pretrained`] to load the scheduler configuration into the correct [subfolder](https://hf.co/stabilityai/stable-diffusion-xl-base-1.0/tree/main/scheduler) of the pipeline repository.
- A more stable VAE that runs in fp16.
+To find out which schedulers are compatible for customization, you can use the scheduler's `compatibles` attribute:

 ```py
-from diffusers import StableDiffusionXLPipeline, HeunDiscreteScheduler, AutoencoderKL
-import torch
+from diffusers import DiffusionPipeline

-scheduler = HeunDiscreteScheduler.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", subfolder="scheduler")
-vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16, use_safetensors=True)
+repo_id = "runwayml/stable-diffusion-v1-5"
+stable_diffusion = DiffusionPipeline.from_pretrained(repo_id, use_safetensors=True)
+stable_diffusion.scheduler.compatibles
 ```

-Now pass the new scheduler and VAE to the [`StableDiffusionXLPipeline`].
+Let's use the [`SchedulerMixin.from_pretrained`] method to replace the default [`PNDMScheduler`] with a more performant scheduler, [`EulerDiscreteScheduler`]. The `subfolder="scheduler"` argument is required to load the scheduler configuration from the correct [subfolder](https://huggingface.co/runwayml/stable-diffusion-v1-5/tree/main/scheduler) of the pipeline repository.

-```py
-pipeline = StableDiffusionXLPipeline.from_pretrained(
-  "stabilityai/stable-diffusion-xl-base-1.0", 
-  scheduler=scheduler, 
-  vae=vae, 
-  torch_dtype=torch.float16, 
-  variant="fp16", 
-  use_safetensors=True
-).to("cuda")
-```
-
-## Reuse a pipeline
-
-When you load multiple pipelines that share the same model components, it makes sense to reuse the shared components instead of reloading everything into memory again, especially if your hardware is memory-constrained. For example:
-
-1. You generated an image with the [`StableDiffusionPipeline`] but you want to improve its quality with the [`StableDiffusionSAGPipeline`]. Both of these pipelines share the same pretrained model, so it'd be a waste of memory to load the same model twice.
-2. You want to add a model component, like a [`MotionAdapter`](../api/pipelines/animatediff#animatediffpipeline), to [`AnimateDiffPipeline`] which was instantiated from an existing [`StableDiffusionPipeline`]. Again, both pipelines share the same pretrained model, so it'd be a waste of memory to load an entirely new pipeline again.
-
-With the [`DiffusionPipeline.from_pipe`] API, you can switch between multiple pipelines to take advantage of their different features without increasing memory-usage. It is similar to turning on and off a feature in your pipeline. To switch between tasks, use the [`~DiffusionPipeline.from_pipe`] method with the [`AutoPipeline`](../api/pipelines/auto_pipeline) class, which automatically identifies the pipeline class based on the task (learn more in the [AutoPipeline](../tutorials/autopipeline) tutorial).
-
-Let's start with a [`StableDiffusionPipeline`] and then reuse the loaded model components to create a [`StableDiffusionSAGPipeline`] to increase generation quality. You'll use the [`StableDiffusionPipeline`] with an [IP-Adapter](./ip_adapter) to generate a bear eating pizza.
+Then you can pass the new [`EulerDiscreteScheduler`] instance to the `scheduler` argument in [`DiffusionPipeline`]:

 ```python
-from diffusers import DiffusionPipeline, StableDiffusionSAGPipeline
-import torch
-import gc
-from diffusers.utils import load_image
-from accelerate.utils import compute_module_sizes
+from diffusers import DiffusionPipeline, EulerDiscreteScheduler

-image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_neg_embed.png")
-
-pipe_sd = DiffusionPipeline.from_pretrained("SG161222/Realistic_Vision_V6.0_B1_noVAE", torch_dtype=torch.float16)
-pipe_sd.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
-pipe_sd.set_ip_adapter_scale(0.6)
-pipe_sd.to("cuda")
-
-generator = torch.Generator(device="cpu").manual_seed(33)
-out_sd = pipe_sd(
-    prompt="bear eats pizza",
-    negative_prompt="wrong white balance, dark, sketches,worst quality,low quality", 
-    ip_adapter_image=image,
-    num_inference_steps=50,
-    generator=generator,
-).images[0]
-out_sd
+repo_id = "runwayml/stable-diffusion-v1-5"
+scheduler = EulerDiscreteScheduler.from_pretrained(repo_id, subfolder="scheduler")
+stable_diffusion = DiffusionPipeline.from_pretrained(repo_id, scheduler=scheduler, use_safetensors=True)
 ```

-<div class="flex justify-center">
-  <img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/from_pipe_out_sd_0.png"/>
-</div>
+### Safety checker

-For reference, you can check how much memory this process consumed.
-
-```python
-def bytes_to_giga_bytes(bytes):
-    return bytes / 1024 / 1024 / 1024
-print(f"Max memory allocated: {bytes_to_giga_bytes(torch.cuda.max_memory_allocated())} GB")
-"Max memory allocated: 4.406213283538818 GB"
-```
-
-Now, reuse the same pipeline components from [`StableDiffusionPipeline`] in [`StableDiffusionSAGPipeline`] with the [`~DiffusionPipeline.from_pipe`] method.
-
-> [!WARNING]
-> Some pipeline methods may not function properly on new pipelines created with [`~DiffusionPipeline.from_pipe`]. For instance, the [`~DiffusionPipeline.enable_model_cpu_offload`] method installs hooks on the model components based on a unique offloading sequence for each pipeline. If the models are executed in a different order in the new pipeline, the CPU offloading may not work correctly.
->
-> To ensure everything works as expected, we recommend re-applying a pipeline method on a new pipeline created with [`~DiffusionPipeline.from_pipe`].
-
-```python
-pipe_sag = StableDiffusionSAGPipeline.from_pipe(
-    pipe_sd
-)
-
-generator = torch.Generator(device="cpu").manual_seed(33)
-out_sag = pipe_sag(
-    prompt="bear eats pizza",
-    negative_prompt="wrong white balance, dark, sketches,worst quality,low quality",
-    ip_adapter_image=image,
-    num_inference_steps=50,
-    generator=generator,
-    guidance_scale=1.0,
-    sag_scale=0.75
-).images[0]
-out_sag
-```
-
-<div class="flex justify-center">
-  <img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/from_pipe_out_sag_1.png"/>
-</div>
-
-If you check the memory usage, you'll see it remains the same as before because [`StableDiffusionPipeline`] and [`StableDiffusionSAGPipeline`] are sharing the same pipeline components. This allows you to use them interchangeably without any additional memory overhead.
-
-```py
-print(f"Max memory allocated: {bytes_to_giga_bytes(torch.cuda.max_memory_allocated())} GB")
-"Max memory allocated: 4.406213283538818 GB"
-```
-
-Let's animate the image with the [`AnimateDiffPipeline`] and also add a [`MotionAdapter`] module to the pipeline. For the [`AnimateDiffPipeline`], you need to unload the IP-Adapter first and reload it *after* you've created your new pipeline (this only applies to the [`AnimateDiffPipeline`]).
-
-```py
-from diffusers import AnimateDiffPipeline, MotionAdapter, DDIMScheduler
-from diffusers.utils import export_to_gif
-
-pipe_sag.unload_ip_adapter()
-adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16)
-
-pipe_animate = AnimateDiffPipeline.from_pipe(pipe_sd, motion_adapter=adapter)
-pipe_animate.scheduler = DDIMScheduler.from_config(pipe_animate.scheduler.config, beta_schedule="linear")
-# load IP-Adapter and LoRA weights again
-pipe_animate.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
-pipe_animate.load_lora_weights("guoyww/animatediff-motion-lora-zoom-out", adapter_name="zoom-out")
-pipe_animate.to("cuda")
-
-generator = torch.Generator(device="cpu").manual_seed(33)
-pipe_animate.set_adapters("zoom-out", adapter_weights=0.75)
-out = pipe_animate(
-    prompt="bear eats pizza",
-    num_frames=16,
-    num_inference_steps=50,
-    ip_adapter_image=image,
-    generator=generator,
-).frames[0]
-export_to_gif(out, "out_animate.gif")
-```
-
-<div class="flex justify-center">
-  <img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/from_pipe_out_animate_3.gif"/>
-</div>
-
-The [`AnimateDiffPipeline`] is more memory-intensive and consumes 15GB of memory (see the [Memory-usage of from_pipe](#memory-usage-of-from_pipe) section to learn what this means for your memory-usage).
-
-```py
-print(f"Max memory allocated: {bytes_to_giga_bytes(torch.cuda.max_memory_allocated())} GB")
-"Max memory allocated: 15.178664207458496 GB"
-```
-
-### Modify from_pipe components
-
-Pipelines loaded with [`~DiffusionPipeline.from_pipe`] can be customized with different model components or methods. However, whenever you modify the *state* of the model components, it affects all the other pipelines that share the same components. For example, if you call [`~diffusers.loaders.IPAdapterMixin.unload_ip_adapter`] on the [`StableDiffusionSAGPipeline`], you won't be able to use IP-Adapter with the [`StableDiffusionPipeline`] because it's been removed from their shared components.
-
-```py
-pipe.sag_unload_ip_adapter()
-
-generator = torch.Generator(device="cpu").manual_seed(33)
-out_sd = pipe_sd(
-    prompt="bear eats pizza",
-    negative_prompt="wrong white balance, dark, sketches,worst quality,low quality", 
-    ip_adapter_image=image,
-    num_inference_steps=50,
-    generator=generator,
-).images[0]
-"AttributeError: 'NoneType' object has no attribute 'image_projection_layers'"
-```
-
-### Memory usage of from_pipe
-
-The memory requirement of loading multiple pipelines with [`~DiffusionPipeline.from_pipe`] is determined by the pipeline with the highest memory-usage regardless of the number of pipelines you create.
-
-| Pipeline | Memory usage (GB) |
-|---|---|
-| StableDiffusionPipeline | 4.400 |
-| StableDiffusionSAGPipeline | 4.400 |
-| AnimateDiffPipeline | 15.178 |
-
-The [`AnimateDiffPipeline`] has the highest memory requirement, so the *total memory-usage* is based only on the [`AnimateDiffPipeline`]. Your memory-usage will not increase if you create additional pipelines as long as their memory requirements doesn't exceed that of the [`AnimateDiffPipeline`]. Each pipeline can be used interchangeably without any additional memory overhead.
-
-## Safety checker
-
-Diffusers implements a [safety checker](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/safety_checker.py) for Stable Diffusion models which can generate harmful content. The safety checker screens the generated output against known hardcoded not-safe-for-work (NSFW) content. If for whatever reason you'd like to disable the safety checker, pass `safety_checker=None` to the [`~DiffusionPipeline.from_pretrained`] method.
+Diffusion models like Stable Diffusion can generate harmful content, which is why 🧨 Diffusers has a [safety checker](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/safety_checker.py) to check generated outputs against known hardcoded NSFW content. If you'd like to disable the safety checker for whatever reason, pass `None` to the `safety_checker` argument:

 ```python
 from diffusers import DiffusionPipeline

-pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", safety_checker=None, use_safetensors=True)
+repo_id = "runwayml/stable-diffusion-v1-5"
+stable_diffusion = DiffusionPipeline.from_pretrained(repo_id, safety_checker=None, use_safetensors=True)
 """
 You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide by the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend keeping the safety filter enabled in all public-facing circumstances, disabling it only for use cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .
 """
 ```

+### Reuse components across pipelines
+
+You can also reuse the same components in multiple pipelines to avoid loading the weights into RAM twice. Use the [`~DiffusionPipeline.components`] attribute to save the components:
+
+```python
+from diffusers import StableDiffusionPipeline, StableDiffusionImg2ImgPipeline
+
+model_id = "runwayml/stable-diffusion-v1-5"
+stable_diffusion_txt2img = StableDiffusionPipeline.from_pretrained(model_id, use_safetensors=True)
+
+components = stable_diffusion_txt2img.components
+```
+
+Then you can pass the `components` to another pipeline without reloading the weights into RAM:
+
+```py
+stable_diffusion_img2img = StableDiffusionImg2ImgPipeline(**components)
+```
+
+You can also pass the components individually to the pipeline if you want more flexibility over which components to reuse or disable. For example, to reuse the text-to-image pipeline's components in the image-to-image pipeline, except for the safety checker and feature extractor:
+
+```py
+from diffusers import StableDiffusionPipeline, StableDiffusionImg2ImgPipeline
+
+model_id = "runwayml/stable-diffusion-v1-5"
+stable_diffusion_txt2img = StableDiffusionPipeline.from_pretrained(model_id, use_safetensors=True)
+stable_diffusion_img2img = StableDiffusionImg2ImgPipeline(
+    vae=stable_diffusion_txt2img.vae,
+    text_encoder=stable_diffusion_txt2img.text_encoder,
+    tokenizer=stable_diffusion_txt2img.tokenizer,
+    unet=stable_diffusion_txt2img.unet,
+    scheduler=stable_diffusion_txt2img.scheduler,
+    safety_checker=None,
+    feature_extractor=None,
+    requires_safety_checker=False,
+)
+```
+
 ## Checkpoint variants

 A checkpoint variant is usually a checkpoint whose weights are:

- Stored in a different floating point type, such as [torch.float16](https://pytorch.org/docs/stable/tensors.html#data-types), because it only requires half the bandwidth and storage to download. You can't use this variant if you're continuing training or using a CPU.
- Non-exponential mean averaged (EMA) weights which shouldn't be used for inference. You should use this variant to continue finetuning a model.
+- Stored in a lower-precision floating point type, such as [`torch.float16`](https://pytorch.org/docs/stable/tensors.html#data-types), because it only requires half the bandwidth and storage to download. You can't use this variant if you're continuing training or using a CPU.
+- Non-exponential mean averaged (EMA) weights, which shouldn't be used for inference. You should use these to continue fine-tuning a model.

-> [!TIP]
-> When the checkpoints have identical model structures, but they were trained on different datasets and with a different training setup, they should be stored in separate repositories. For example, [stabilityai/stable-diffusion-2](https://hf.co/stabilityai/stable-diffusion-2) and [stabilityai/stable-diffusion-2-1](https://hf.co/stabilityai/stable-diffusion-2-1) are stored in separate repositories.
+<Tip>

-Otherwise, a variant is **identical** to the original checkpoint. They have exactly the same serialization format (like [safetensors](./using_safetensors)), model structure, and their weights have identical tensor shapes.
+💡 When the checkpoints have identical model structures, but they were trained on different datasets and with a different training setup, they should be stored in separate repositories instead of as variants (for example, [`stable-diffusion-v1-4`] and [`stable-diffusion-v1-5`]).

-| **checkpoint type** | **weight name**                             | **argument for loading weights** |
-|---------------------|---------------------------------------------|----------------------------------|
-| original            | diffusion_pytorch_model.safetensors         |                                  |
-| floating point      | diffusion_pytorch_model.fp16.safetensors    | `variant`, `torch_dtype`         |
-| non-EMA             | diffusion_pytorch_model.non_ema.safetensors | `variant`                        |
+</Tip>

-There are two important arguments for loading variants:
+Otherwise, a variant is **identical** to the original checkpoint. They have exactly the same serialization format (like [Safetensors](./using_safetensors)), model structure, and weights that have identical tensor shapes.

- `torch_dtype` specifies the floating point precision of the loaded checkpoint. For example, if you want to save bandwidth by loading a fp16 variant, you should set `variant="fp16"` and `torch_dtype=torch.float16` to *convert the weights* to fp16. Otherwise, the fp16 weights are converted to the default fp32 precision.
+| **checkpoint type** | **weight name**                     | **argument for loading weights** |
+|---------------------|-------------------------------------|----------------------------------|
+| original            | diffusion_pytorch_model.bin         |                                  |
+| floating point      | diffusion_pytorch_model.fp16.bin    | `variant`, `torch_dtype`         |
+| non-EMA             | diffusion_pytorch_model.non_ema.bin | `variant`                        |

-  If you only set `torch_dtype=torch.float16`, the default fp32 weights are downloaded first and then converted to fp16.
+There are two important arguments to know for loading variants:

- `variant` specifies which files should be loaded from the repository. For example, if you want to load a non-EMA variant of a UNet from [runwayml/stable-diffusion-v1-5](https://hf.co/runwayml/stable-diffusion-v1-5/tree/main/unet), set `variant="non_ema"` to download the `non_ema` file.
+- `torch_dtype` defines the floating point precision of the loaded checkpoints. For example, if you want to save bandwidth by loading a `fp16` variant, you should specify `torch_dtype=torch.float16` to *convert the weights* to `fp16`. Otherwise, the `fp16` weights are converted to the default `fp32` precision. You can also load the original checkpoint without defining the `variant` argument, and convert it to `fp16` with `torch_dtype=torch.float16`. In this case, the default `fp32` weights are downloaded first, and then they're converted to `fp16` after loading.

-<hfoptions id="variants">
-<hfoption id="fp16">
+- `variant` defines which files should be loaded from the repository. For example, if you want to load a `non_ema` variant from the [`diffusers/stable-diffusion-variants`](https://huggingface.co/diffusers/stable-diffusion-variants/tree/main/unet) repository, you should specify `variant="non_ema"` to download the `non_ema` files.

-```py
+```python
 from diffusers import DiffusionPipeline
 import torch

-pipeline = DiffusionPipeline.from_pretrained(
+# load fp16 variant
+stable_diffusion = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", variant="fp16", torch_dtype=torch.float16, use_safetensors=True
 )
-```
-
-</hfoption>
-<hfoption id="non-EMA">
-
-```py
-pipeline = DiffusionPipeline.from_pretrained(
+# load non_ema variant
+stable_diffusion = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", variant="non_ema", use_safetensors=True
 )
 ```

-</hfoption>
-</hfoptions>
-
-Use the `variant` parameter in the [`DiffusionPipeline.save_pretrained`] method to save a checkpoint as a different floating point type or as a non-EMA variant. You should try save a variant to the same folder as the original checkpoint, so you have the option of loading both from the same folder.
-
-<hfoptions id="save">
-<hfoption id="fp16">
+To save a checkpoint stored in a different floating-point type or as a non-EMA variant, use the [`DiffusionPipeline.save_pretrained`] method and specify the `variant` argument. You should try to save a variant to the same folder as the original checkpoint, so you can load both from the same folder:

 ```python
 from diffusers import DiffusionPipeline

-pipeline.save_pretrained("runwayml/stable-diffusion-v1-5", variant="fp16")
+# save as fp16 variant
+stable_diffusion.save_pretrained("runwayml/stable-diffusion-v1-5", variant="fp16")
+# save as non-ema variant
+stable_diffusion.save_pretrained("runwayml/stable-diffusion-v1-5", variant="non_ema")
 ```

-</hfoption>
-<hfoption id="non_ema">
-
-```py
-pipeline.save_pretrained("runwayml/stable-diffusion-v1-5", variant="non_ema")
-```
-
-</hfoption>
-</hfoptions>
-
-If you don't save the variant to an existing folder, you must specify the `variant` argument otherwise it'll throw an `Exception` because it can't find the original checkpoint.
+If you don't save the variant to an existing folder, you must specify the `variant` argument; otherwise, it'll throw an `Exception` because it can't find the original checkpoint:

 ```python
 # 👎 this won't work
-pipeline = DiffusionPipeline.from_pretrained(
+stable_diffusion = DiffusionPipeline.from_pretrained(
    "./stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
 )
 # 👍 this works
-pipeline = DiffusionPipeline.from_pretrained(
+stable_diffusion = DiffusionPipeline.from_pretrained(
    "./stable-diffusion-v1-5", variant="fp16", torch_dtype=torch.float16, use_safetensors=True
 )
 ```

+<!--
+TODO(Patrick) - Make sure to uncomment this part as soon as things are deprecated.
+
+#### Using `revision` to load pipeline variants is deprecated
+
+Previously the `revision` argument of [`DiffusionPipeline.from_pretrained`] was heavily used to
+load model variants, e.g.:
+
+```python
+from diffusers import DiffusionPipeline
+
+pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", revision="fp16", use_safetensors=True)
+```
+
+However, this behavior is now deprecated because the "revision" argument should (just as in GitHub) be used to load model checkpoints from a specific commit or branch in development.
+
+The above example is therefore deprecated and won't be supported anymore for `diffusers >= 1.0.0`.
+
+<Tip warning={true}>
+
+If you load diffusers pipelines or models with `revision="fp16"` or `revision="non_ema"`,
+please make sure to update the code and use `variant="fp16"` or `variant="non_ema"` respectively
+instead.
+
+</Tip>
+-->
+
+## Models
+
+Models are loaded from the [`ModelMixin.from_pretrained`] method, which downloads and caches the latest version of the model weights and configurations. If the latest files are available in the local cache, [`~ModelMixin.from_pretrained`] reuses files in the cache instead of re-downloading them.
+
+Models can be loaded from a subfolder with the `subfolder` argument. For example, the model weights for `runwayml/stable-diffusion-v1-5` are stored in the [`unet`](https://huggingface.co/runwayml/stable-diffusion-v1-5/tree/main/unet) subfolder:
+
+```python
+from diffusers import UNet2DConditionModel
+
+repo_id = "runwayml/stable-diffusion-v1-5"
+model = UNet2DConditionModel.from_pretrained(repo_id, subfolder="unet", use_safetensors=True)
+```
+
+Or directly from a repository's [directory](https://huggingface.co/google/ddpm-cifar10-32/tree/main):
+
+```python
+from diffusers import UNet2DModel
+
+repo_id = "google/ddpm-cifar10-32"
+model = UNet2DModel.from_pretrained(repo_id, use_safetensors=True)
+```
+
+You can also load and save model variants by specifying the `variant` argument in [`ModelMixin.from_pretrained`] and [`ModelMixin.save_pretrained`]:
+
+```python
+from diffusers import UNet2DConditionModel
+
+model = UNet2DConditionModel.from_pretrained(
+    "runwayml/stable-diffusion-v1-5", subfolder="unet", variant="non_ema", use_safetensors=True
+)
+model.save_pretrained("./local-unet", variant="non_ema")
+```
+
+## Schedulers
+
+Schedulers are loaded from the [`SchedulerMixin.from_pretrained`] method, and unlike models, schedulers are **not parameterized** or **trained**; they are defined by a configuration file.
+
+Loading schedulers does not consume any significant amount of memory and the same configuration file can be used for a variety of different schedulers.
+For example, the following schedulers are compatible with [`StableDiffusionPipeline`], which means you can load the same scheduler configuration file in any of these classes:
+
+```python
+from diffusers import StableDiffusionPipeline
+from diffusers import (
+    DDPMScheduler,
+    DDIMScheduler,
+    PNDMScheduler,
+    LMSDiscreteScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    DPMSolverMultistepScheduler,
+)
+
+repo_id = "runwayml/stable-diffusion-v1-5"
+
+ddpm = DDPMScheduler.from_pretrained(repo_id, subfolder="scheduler")
+ddim = DDIMScheduler.from_pretrained(repo_id, subfolder="scheduler")
+pndm = PNDMScheduler.from_pretrained(repo_id, subfolder="scheduler")
+lms = LMSDiscreteScheduler.from_pretrained(repo_id, subfolder="scheduler")
+euler_anc = EulerAncestralDiscreteScheduler.from_pretrained(repo_id, subfolder="scheduler")
+euler = EulerDiscreteScheduler.from_pretrained(repo_id, subfolder="scheduler")
+dpm = DPMSolverMultistepScheduler.from_pretrained(repo_id, subfolder="scheduler")
+
+# replace `dpm` with any of `ddpm`, `ddim`, `pndm`, `lms`, `euler_anc`, `euler`
+pipeline = StableDiffusionPipeline.from_pretrained(repo_id, scheduler=dpm, use_safetensors=True)
+```
+
 ## DiffusionPipeline explained

 As a class method, [`DiffusionPipeline.from_pretrained`] is responsible for two things:
@@ -320,40 +320,3 @@ pipeline = AutoPipelineForText2Image.from_pretrained(

 pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter-plus_sdxl_vit-h.safetensors")
 ```
-
-### IP-Adapter Face ID models
-
-The IP-Adapter FaceID models are experimental IP Adapters that use image embeddings generated by `insightface` instead of CLIP image embeddings. Some of these models also use LoRA to improve ID consistency.
-You need to install `insightface` and all its requirements to use these models.
-
-<Tip warning={true}>
-As InsightFace pretrained models are available for non-commercial research purposes, IP-Adapter-FaceID models are released exclusively for research purposes and are not intended for commercial use.
-</Tip>
-
-```py
-pipeline = AutoPipelineForText2Image.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    torch_dtype=torch.float16
-).to("cuda")
-
-pipeline.load_ip_adapter("h94/IP-Adapter-FaceID", subfolder=None, weight_name="ip-adapter-faceid_sdxl.bin", image_encoder_folder=None)
-```
-
-If you want to use one of the two IP-Adapter FaceID Plus models, you must also load the CLIP image encoder, as this models use both `insightface` and CLIP image embeddings to achieve better photorealism.
-
-```py
-from transformers import CLIPVisionModelWithProjection
-
-image_encoder = CLIPVisionModelWithProjection.from_pretrained(
-    "laion/CLIP-ViT-H-14-laion2B-s32B-b79K",
-    torch_dtype=torch.float16,
-)
-
-pipeline = AutoPipelineForText2Image.from_pretrained(
-    "runwayml/stable-diffusion-v1-5",
-    image_encoder=image_encoder,
-    torch_dtype=torch.float16
-).to("cuda")
-
-pipeline.load_ip_adapter("h94/IP-Adapter-FaceID", subfolder=None, weight_name="ip-adapter-faceid-plus_sd15.bin")
-```
@@ -0,0 +1,17 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Overview
+
+🧨 Diffusers offers many pipelines, models, and schedulers for generative tasks. To make loading these components as simple as possible, we provide a single and unified method - `from_pretrained()` - that loads any of these components from either the Hugging Face [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) or your local machine. Whenever you load a pipeline or model, the latest files are automatically downloaded and cached so you can quickly reuse them next time without redownloading the files.
+
+This section will show you everything you need to know about loading pipelines, how to load different components in a pipeline, how to load checkpoint variants, and how to load community pipelines. You'll also learn how to load schedulers and compare the speed and quality trade-offs of using different schedulers. Finally, you'll see how to convert and load KerasCV checkpoints so you can use them in PyTorch with 🧨 Diffusers.
@@ -10,27 +10,57 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# Load schedulers and models
+# Schedulers

 [[open-in-colab]]

-Diffusion pipelines are a collection of interchangeable schedulers and models that can be mixed and matched to tailor a pipeline to a specific use case. The scheduler encapsulates the entire denoising process such as the number of denoising steps and the algorithm for finding the denoised sample. A scheduler is not parameterized or trained so they don't take very much memory. The model is usually only concerned with the forward pass of going from a noisy input to a less noisy sample.
+Diffusion pipelines are inherently a collection of diffusion models and schedulers that are partly independent from each other. This means that one is able to switch out parts of the pipeline to better customize
+a pipeline to one's use case. The best example of this is the [Schedulers](../api/schedulers/overview).

-This guide will show you how to load schedulers and models to customize a pipeline. You'll use the [runwayml/stable-diffusion-v1-5](https://hf.co/runwayml/stable-diffusion-v1-5) checkpoint throughout this guide, so let's load it first.
+Whereas diffusion models usually simply define the forward pass from noise to a less noisy sample,
+schedulers define the whole denoising process, *i.e.*:
+- How many denoising steps?
+- Stochastic or deterministic?
+- What algorithm to use to find the denoised sample?

-```py
-import torch
+They can be quite complex and often define a trade-off between **denoising speed** and **denoising quality**.
+It is extremely difficult to measure quantitatively which scheduler works best for a given diffusion pipeline, so it is often recommended to simply try out which works best.
+
+The following paragraphs show how to do so with the 🧨 Diffusers library.
+
+## Load pipeline
+
+Let's start by loading the [`runwayml/stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5) model in the [`DiffusionPipeline`]:
+
+```python
+from huggingface_hub import login
 from diffusers import DiffusionPipeline
+import torch
+
+login()

 pipeline = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
-).to("cuda")
+)
 ```

-You can see what scheduler this pipeline uses with the `pipeline.scheduler` attribute.
+Next, we move it to GPU:

-```py
+```python
+pipeline.to("cuda")
+```
+
+## Access the scheduler
+
+The scheduler is always one of the components of the pipeline and is usually called `"scheduler"`.
+So it can be accessed via the `"scheduler"` property.
+
+```python
 pipeline.scheduler
+```
+
+**Output**:
+```
 PNDMScheduler {
  "_class_name": "PNDMScheduler",
  "_diffusers_version": "0.21.4",
@@ -47,156 +77,235 @@ PNDMScheduler {
 }
 ```

-## Load a scheduler
-
-Schedulers are defined by a configuration file that can be used by a variety of schedulers. Load a scheduler with the [`SchedulerMixin.from_pretrained`] method, and specify the `subfolder` parameter to load the configuration file into the correct subfolder of the pipeline repository.
-
-For example, to load the [`DDIMScheduler`]:
-
-```py
-from diffusers import DDIMScheduler, DiffusionPipeline
-
-ddim = DDIMScheduler.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="scheduler")
-```
-
-Then you can pass the newly loaded scheduler to the pipeline.
+We can see that the scheduler is of type [`PNDMScheduler`].
+Cool, now let's compare the scheduler in its performance to other schedulers.
+First we define a prompt on which we will test all the different schedulers:

 ```python
-pipeline = DiffusionPipeline.from_pretrained(
-    "runwayml/stable-diffusion-v1-5", scheduler=ddim, torch_dtype=torch.float16, use_safetensors=True
-).to("cuda")
+prompt = "A photograph of an astronaut riding a horse on Mars, high resolution, high definition."
 ```

+Next, we create a generator with a fixed seed so that the generated images are reproducible, and then run the pipeline:
+
+```python
+generator = torch.Generator(device="cuda").manual_seed(8)
+image = pipeline(prompt, generator=generator).images[0]
+image
+```
+
+<p align="center">
+    <br>
+    <img src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/diffusers_docs/astronaut_pndm.png" width="400"/>
+    <br>
+</p>
+
+
+## Changing the scheduler
+
+Now we show how easy it is to change the scheduler of a pipeline. Every scheduler has a property [`~SchedulerMixin.compatibles`]
+which defines all compatible schedulers. You can take a look at all available, compatible schedulers for the Stable Diffusion pipeline as follows.
+
+```python
+pipeline.scheduler.compatibles
+```
+
+**Output**:
+```
+[diffusers.utils.dummy_torch_and_torchsde_objects.DPMSolverSDEScheduler,
+ diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler,
+ diffusers.schedulers.scheduling_lms_discrete.LMSDiscreteScheduler,
+ diffusers.schedulers.scheduling_ddim.DDIMScheduler,
+ diffusers.schedulers.scheduling_ddpm.DDPMScheduler,
+ diffusers.schedulers.scheduling_heun_discrete.HeunDiscreteScheduler,
+ diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler,
+ diffusers.schedulers.scheduling_deis_multistep.DEISMultistepScheduler,
+ diffusers.schedulers.scheduling_pndm.PNDMScheduler,
+ diffusers.schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteScheduler,
+ diffusers.schedulers.scheduling_unipc_multistep.UniPCMultistepScheduler,
+ diffusers.schedulers.scheduling_k_dpm_2_discrete.KDPM2DiscreteScheduler,
+ diffusers.schedulers.scheduling_dpmsolver_singlestep.DPMSolverSinglestepScheduler,
+ diffusers.schedulers.scheduling_k_dpm_2_ancestral_discrete.KDPM2AncestralDiscreteScheduler]
+```
+
+Cool, lots of schedulers to look at. Feel free to have a look at their respective class definitions:
+
+- [`EulerDiscreteScheduler`],
+- [`LMSDiscreteScheduler`],
+- [`DDIMScheduler`],
+- [`DDPMScheduler`],
+- [`HeunDiscreteScheduler`],
+- [`DPMSolverMultistepScheduler`],
+- [`DEISMultistepScheduler`],
+- [`PNDMScheduler`],
+- [`EulerAncestralDiscreteScheduler`],
+- [`UniPCMultistepScheduler`],
+- [`KDPM2DiscreteScheduler`],
+- [`DPMSolverSinglestepScheduler`],
+- [`KDPM2AncestralDiscreteScheduler`].
+
+We will now run the same prompt with the other schedulers to compare the results. To change the scheduler of the pipeline you can make use of the
+convenient [`~ConfigMixin.config`] property in combination with the [`~ConfigMixin.from_config`] function.
+
+```python
+pipeline.scheduler.config
+```
+
+returns a dictionary of the configuration of the scheduler:
+
+**Output**:
+```py
+FrozenDict([('num_train_timesteps', 1000),
+            ('beta_start', 0.00085),
+            ('beta_end', 0.012),
+            ('beta_schedule', 'scaled_linear'),
+            ('trained_betas', None),
+            ('skip_prk_steps', True),
+            ('set_alpha_to_one', False),
+            ('prediction_type', 'epsilon'),
+            ('timestep_spacing', 'leading'),
+            ('steps_offset', 1),
+            ('_use_default_values', ['timestep_spacing', 'prediction_type']),
+            ('_class_name', 'PNDMScheduler'),
+            ('_diffusers_version', '0.21.4'),
+            ('clip_sample', False)])
+```
+
+This configuration can then be used to instantiate a scheduler
+of a different class that is compatible with the pipeline. Here,
+we change the scheduler to the [`DDIMScheduler`].
+
+```python
+from diffusers import DDIMScheduler
+
+pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
+```
+
+Cool, now we can run the pipeline again to compare the generation quality.
+
+```python
+generator = torch.Generator(device="cuda").manual_seed(8)
+image = pipeline(prompt, generator=generator).images[0]
+image
+```
+
+<p align="center">
+    <br>
+    <img src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/diffusers_docs/astronaut_ddim.png" width="400"/>
+    <br>
+</p>
+
+If you are a JAX/Flax user, please check [this section](#changing-the-scheduler-in-flax) instead.
+
 ## Compare schedulers

-Schedulers have their own unique strengths and weaknesses, making it difficult to quantitatively compare which scheduler works best for a pipeline. You typically have to make a trade-off between denoising speed and denoising quality. We recommend trying out different schedulers to find one that works best for your use case. Call the `pipeline.scheduler.compatibles` attribute to see what schedulers are compatible with a pipeline.
+So far we have tried running the stable diffusion pipeline with two schedulers: [`PNDMScheduler`] and [`DDIMScheduler`].
+A number of better schedulers have been released that can be run with far fewer steps; let's compare them here:

-Let's compare the [`LMSDiscreteScheduler`], [`EulerDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`], and the [`DPMSolverMultistepScheduler`] on the following prompt and seed.
+[`LMSDiscreteScheduler`] usually leads to better results:

-```py
-import torch
-from diffusers import DiffusionPipeline
-
-pipeline = DiffusionPipeline.from_pretrained(
-    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
-).to("cuda")
-
-prompt = "A photograph of an astronaut riding a horse on Mars, high resolution, high definition."
-generator = torch.Generator(device="cuda").manual_seed(8)
-```
-
-To change the pipelines scheduler, use the [`~ConfigMixin.from_config`] method to load a different scheduler's `pipeline.scheduler.config` into the pipeline.
-
-<hfoptions id="schedulers">
-<hfoption id="LMSDiscreteScheduler">
-
-[`LMSDiscreteScheduler`] typically generates higher quality images than the default scheduler.
-
-```py
+```python
 from diffusers import LMSDiscreteScheduler

 pipeline.scheduler = LMSDiscreteScheduler.from_config(pipeline.scheduler.config)
+
+generator = torch.Generator(device="cuda").manual_seed(8)
 image = pipeline(prompt, generator=generator).images[0]
 image
 ```

-</hfoption>
-<hfoption id="EulerDiscreteScheduler">
+<p align="center">
+    <br>
+    <img src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/diffusers_docs/astronaut_lms.png" width="400"/>
+    <br>
+</p>

-[`EulerDiscreteScheduler`] can generate higher quality images in just 30 steps.

-```py
+[`EulerDiscreteScheduler`] and [`EulerAncestralDiscreteScheduler`] can generate high quality results with as few as 30 steps.
+
+```python
 from diffusers import EulerDiscreteScheduler

 pipeline.scheduler = EulerDiscreteScheduler.from_config(pipeline.scheduler.config)
-image = pipeline(prompt, generator=generator).images[0]
+
+generator = torch.Generator(device="cuda").manual_seed(8)
+image = pipeline(prompt, generator=generator, num_inference_steps=30).images[0]
 image
 ```

-</hfoption>
-<hfoption id="EulerAncestralDiscreteScheduler">
+<p align="center">
+    <br>
+    <img src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/diffusers_docs/astronaut_euler_discrete.png" width="400"/>
+    <br>
+</p>

-[`EulerAncestralDiscreteScheduler`] can generate higher quality images in just 30 steps.

-```py
+and:
+
+```python
 from diffusers import EulerAncestralDiscreteScheduler

 pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config)
-image = pipeline(prompt, generator=generator).images[0]
+
+generator = torch.Generator(device="cuda").manual_seed(8)
+image = pipeline(prompt, generator=generator, num_inference_steps=30).images[0]
 image
 ```

-</hfoption>
-<hfoption id="DPMSolverMultistepScheduler">
+<p align="center">
+    <br>
+    <img src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/diffusers_docs/astronaut_euler_ancestral.png" width="400"/>
+    <br>
+</p>

-[`DPMSolverMultistepScheduler`] provides a balance between speed and quality and can generate higher quality images in just 20 steps.

-```py
+[`DPMSolverMultistepScheduler`] gives a reasonable speed/quality trade-off and can be run with as few as 20 steps.
+
+```python
 from diffusers import DPMSolverMultistepScheduler

 pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
-image = pipeline(prompt, generator=generator).images[0]
+
+generator = torch.Generator(device="cuda").manual_seed(8)
+image = pipeline(prompt, generator=generator, num_inference_steps=20).images[0]
 image
 ```

-</hfoption>
-</hfoptions>
+<p align="center">
+    <br>
+    <img src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/diffusers_docs/astronaut_dpm.png" width="400"/>
+    <br>
+</p>

-<div class="flex gap-4">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/diffusers_docs/astronaut_lms.png" />
-    <figcaption class="mt-2 text-center text-sm text-gray-500">LMSDiscreteScheduler</figcaption>
-  </div>
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/diffusers_docs/astronaut_euler_discrete.png" />
-    <figcaption class="mt-2 text-center text-sm text-gray-500">EulerDiscreteScheduler</figcaption>
-  </div>
-</div>
-<div class="flex gap-4">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/diffusers_docs/astronaut_euler_ancestral.png" />
-    <figcaption class="mt-2 text-center text-sm text-gray-500">EulerAncestralDiscreteScheduler</figcaption>
-  </div>
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/diffusers_docs/astronaut_dpm.png" />
-    <figcaption class="mt-2 text-center text-sm text-gray-500">DPMSolverMultistepScheduler</figcaption>
-  </div>
-</div>
+As you can see, most images look very similar and are arguably of very similar quality. It often really depends on the specific use case which scheduler to choose. A good approach is always to run multiple different
+schedulers to compare results.

-Most images look very similar and are comparable in quality. Again, it often comes down to your specific use case so a good approach is to run multiple different schedulers and compare the results.
+## Changing the Scheduler in Flax

-### Flax schedulers
+If you are a JAX/Flax user, you can also change the default pipeline scheduler. This is a complete example of how to run inference using the Flax Stable Diffusion pipeline and the super-fast [DPM-Solver++ scheduler](../api/schedulers/multistep_dpm_solver):

-To compare Flax schedulers, you need to additionally load the scheduler state into the model parameters. For example, let's change the default scheduler in [`FlaxStableDiffusionPipeline`] to use the super fast [`FlaxDPMSolverMultistepScheduler`].
-
-> [!WARNING]
-> The [`FlaxLMSDiscreteScheduler`] and [`FlaxDDPMScheduler`] are not compatible with the [`FlaxStableDiffusionPipeline`] yet.
-
-```py
+```python
 import jax
 import numpy as np
 from flax.jax_utils import replicate
 from flax.training.common_utils import shard
+
 from diffusers import FlaxStableDiffusionPipeline, FlaxDPMSolverMultistepScheduler

+model_id = "runwayml/stable-diffusion-v1-5"
 scheduler, scheduler_state = FlaxDPMSolverMultistepScheduler.from_pretrained(
-    "runwayml/stable-diffusion-v1-5",
+    model_id,
    subfolder="scheduler"
 )
 pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
-    "runwayml/stable-diffusion-v1-5",
+    model_id,
    scheduler=scheduler,
    revision="bf16",
    dtype=jax.numpy.bfloat16,
 )
 params["scheduler"] = scheduler_state
-```

-Then you can take advantage of Flax's compatibility with TPUs to generate a number of images in parallel. You'll need to make a copy of the model parameters for each available device and then split the inputs across them to generate your desired number of images.
-
-```py
 # Generate 1 image per parallel device (8 on TPUv2-8 or TPUv3-8)
-prompt = "A photograph of an astronaut riding a horse on Mars, high resolution, high definition."
+prompt = "a photo of an astronaut riding a horse on mars"
 num_samples = jax.device_count()
 prompt_ids = pipeline.prepare_inputs([prompt] * num_samples)

@@ -212,33 +321,11 @@ images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).
 images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
 ```

-## Models
+<Tip warning={true}>

-Models are loaded from the [`ModelMixin.from_pretrained`] method, which downloads and caches the latest version of the model weights and configurations. If the latest files are available in the local cache, [`~ModelMixin.from_pretrained`] reuses files in the cache instead of re-downloading them.
+The following Flax schedulers are _not yet compatible_ with the Flax Stable Diffusion Pipeline:

-Models can be loaded from a subfolder with the `subfolder` argument. For example, the model weights for [runwayml/stable-diffusion-v1-5](https://hf.co/runwayml/stable-diffusion-v1-5) are stored in the [unet](https://hf.co/runwayml/stable-diffusion-v1-5/tree/main/unet) subfolder.
+- `FlaxLMSDiscreteScheduler`
+- `FlaxDDPMScheduler`

-```python
-from diffusers import UNet2DConditionModel
-
-unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet", use_safetensors=True)
-```
-
-They can also be directly loaded from a [repository](https://huggingface.co/google/ddpm-cifar10-32/tree/main).
-
-```python
-from diffusers import UNet2DModel
-
-unet = UNet2DModel.from_pretrained("google/ddpm-cifar10-32", use_safetensors=True)
-```
-
-To load and save model variants, specify the `variant` argument in [`ModelMixin.from_pretrained`] and [`ModelMixin.save_pretrained`].
-
-```python
-from diffusers import UNet2DConditionModel
-
-unet = UNet2DConditionModel.from_pretrained(
-    "runwayml/stable-diffusion-v1-5", subfolder="unet", variant="non_ema", use_safetensors=True
-)
-unet.save_pretrained("./local-unet", variant="non_ema")
-```
+</Tip>
@@ -1,219 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# T2I-Adapter
-
-[T2I-Adapter](https://hf.co/papers/2302.08453) is a lightweight adapter for controlling and providing more accurate
-structure guidance for text-to-image models. It works by learning an alignment between the internal knowledge of the
-text-to-image model and an external control signal, such as edge detection or depth estimation.
-
-The T2I-Adapter design is simple: the condition is passed to four feature extraction blocks and three downsample
-blocks. This makes it fast and easy to train different adapters for different conditions which can be plugged into the
-text-to-image model. T2I-Adapter is similar to [ControlNet](controlnet) except it is smaller (~77M parameters) and
-faster because it only runs once during the diffusion process. The downside is that performance may be slightly worse
-than ControlNet.
-
-This guide will show you how to use T2I-Adapter with different Stable Diffusion models and how you can compose multiple
-T2I-Adapters to impose more than one condition.
-
-> [!TIP]
-> There are several T2I-Adapters available for different conditions, such as color palette, depth, sketch, pose, and
-> segmentation. Check out the [TencentARC](https://hf.co/TencentARC) repository to try them out!
-
-Before you begin, make sure you have the following libraries installed.
-
-```py
-# uncomment to install the necessary libraries in Colab
-#!pip install -q diffusers accelerate controlnet-aux==0.0.7
-```
-
-## Text-to-image
-
-Text-to-image models rely on a prompt to generate an image, but sometimes, text alone may not be enough to provide more
-accurate structural guidance. T2I-Adapter allows you to provide an additional control image to guide the generation
-process. For example, you can provide a canny image (a white outline of an image on a black background) to guide the
-model to generate an image with a similar structure.
-
-<hfoptions id="stablediffusion">
-<hfoption id="Stable Diffusion 1.5">
-
-Create a canny image with the [opencv-library](https://github.com/opencv/opencv-python).
-
-```py
-import cv2
-import numpy as np
-from PIL import Image
-from diffusers.utils import load_image
-
-image = load_image("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png")
-image = np.array(image)
-
-low_threshold = 100
-high_threshold = 200
-
-image = cv2.Canny(image, low_threshold, high_threshold)
-image = Image.fromarray(image)
-```
-
-Now load a T2I-Adapter conditioned on [canny images](https://hf.co/TencentARC/t2iadapter_canny_sd15v2) and pass it to
-the [`StableDiffusionAdapterPipeline`].
-
-```py
-import torch
-from diffusers import StableDiffusionAdapterPipeline, T2IAdapter
-
-adapter = T2IAdapter.from_pretrained("TencentARC/t2iadapter_canny_sd15v2", torch_dtype=torch.float16)
-pipeline = StableDiffusionAdapterPipeline.from_pretrained(
-    "runwayml/stable-diffusion-v1-5",
-    adapter=adapter,
-    torch_dtype=torch.float16,
-)
-pipeline.to("cuda")
-```
-
-Finally, pass your prompt and control image to the pipeline.
-
-```py
-generator = torch.Generator("cuda").manual_seed(0)
-
-image = pipeline(
-    prompt="cinematic photo of a plush and soft midcentury style rug on a wooden floor, 35mm photograph, film, professional, 4k, highly detailed",
-    image=image,
-    generator=generator,
-).images[0]
-image
-```
-
-<div class="flex justify-center">
-  <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/t2i-sd1.5.png"/>
-</div>
-
-</hfoption>
-<hfoption id="Stable Diffusion XL">
-
-Create a canny image with the [controlnet-aux](https://github.com/huggingface/controlnet_aux) library.
-
-```py
-from controlnet_aux.canny import CannyDetector
-from diffusers.utils import load_image
-
-canny_detector = CannyDetector()
-
-image = load_image("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png")
-image = canny_detector(image, detect_resolution=384, image_resolution=1024)
-```
-
-Now load a T2I-Adapter conditioned on [canny images](https://hf.co/TencentARC/t2i-adapter-canny-sdxl-1.0) and pass it
-to the [`StableDiffusionXLAdapterPipeline`].
-
-```py
-import torch
-from diffusers import StableDiffusionXLAdapterPipeline, T2IAdapter, EulerAncestralDiscreteScheduler, AutoencoderKL
-
-scheduler = EulerAncestralDiscreteScheduler.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", subfolder="scheduler")
-vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
-adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16)
-pipeline = StableDiffusionXLAdapterPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    adapter=adapter,
-    vae=vae,
-    scheduler=scheduler,
-    torch_dtype=torch.float16,
-    variant="fp16",
-)
-pipeline.to("cuda")
-```
-
-Finally, pass your prompt and control image to the pipeline.
-
-```py
-generator = torch.Generator("cuda").manual_seed(0)
-
-image = pipeline(
-  prompt="cinematic photo of a plush and soft midcentury style rug on a wooden floor, 35mm photograph, film, professional, 4k, highly detailed",
-  image=image,
-  generator=generator,
-).images[0]
-image
-```
-
-<div class="flex justify-center">
-  <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/t2i-sdxl.png"/>
-</div>
-
-</hfoption>
-</hfoptions>
-
-## MultiAdapter
-
-T2I-Adapters are also composable, allowing you to use more than one adapter to impose multiple control conditions on an
-image. For example, you can use a pose map to provide structural control and a depth map for depth control. This is
-enabled by the [`MultiAdapter`] class.
-
-Let's condition a text-to-image model with a pose and depth adapter. Create your depth and pose images and place them in a list.
-
-```py
-from diffusers.utils import load_image
-
-pose_image = load_image(
-    "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_sample_input.png"
-)
-depth_image = load_image(
-    "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/depth_sample_input.png"
-)
-cond = [pose_image, depth_image]
-prompt = ["Santa Claus walking into an office room with a beautiful city view"]
-```
-
-<div class="flex gap-4">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/depth_sample_input.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">depth image</figcaption>
-  </div>
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_sample_input.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">pose image</figcaption>
-  </div>
-</div>
-
-Load the corresponding pose and depth adapters as a list in the [`MultiAdapter`] class.
-
-```py
-import torch
-from diffusers import StableDiffusionAdapterPipeline, MultiAdapter, T2IAdapter
-
-adapters = MultiAdapter(
-    [
-        T2IAdapter.from_pretrained("TencentARC/t2iadapter_keypose_sd14v1"),
-        T2IAdapter.from_pretrained("TencentARC/t2iadapter_depth_sd14v1"),
-    ]
-)
-adapters = adapters.to(torch.float16)
-```
-
-Finally, load a [`StableDiffusionAdapterPipeline`] with the adapters, and pass your prompt and conditioned images to
-it. Use the `adapter_conditioning_scale` parameter to adjust the weight of each adapter on the image.
-
-```py
-pipeline = StableDiffusionAdapterPipeline.from_pretrained(
-    "CompVis/stable-diffusion-v1-4",
-    torch_dtype=torch.float16,
-    adapter=adapters,
-).to("cuda")
-
-image = pipeline(prompt, cond, adapter_conditioning_scale=[0.7, 0.7]).images[0]
-image
-```
-
-<div class="flex justify-center">
-  <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/t2i-multi.png"/>
-</div>
@@ -10,209 +10,10 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# Prompt techniques
+# Prompt weighting

 [[open-in-colab]]

-Prompts are important because they describe what you want a diffusion model to generate. The best prompts are detailed, specific, and well-structured to help the model realize your vision. But crafting a great prompt takes time and effort and sometimes it may not be enough because language and words can be imprecise. This is where you need to boost your prompt with other techniques, such as prompt enhancing and prompt weighting, to get the results you want.
-
-This guide will show you how you can use these prompt techniques to generate high-quality images with lower effort and adjust the weight of certain keywords in a prompt.
-
-## Prompt engineering
-
-> [!TIP]
-> This is not an exhaustive guide on prompt engineering, but it will help you understand the necessary parts of a good prompt. We encourage you to continue experimenting with different prompts and combine them in new ways to see what works best. As you write more prompts, you'll develop an intuition for what works and what doesn't!
-
-New diffusion models do a pretty good job of generating high-quality images from a basic prompt, but it is still important to create a well-written prompt to get the best results. Here are a few tips for writing a good prompt:
-
-1. What is the image *medium*? Is it a photo, a painting, a 3D illustration, or something else?
-2. What is the image *subject*? Is it a person, animal, object, or scene?
-3. What *details* would you like to see in the image? This is where you can get really creative and have a lot of fun experimenting with different words to bring your image to life. For example, what is the lighting like? What is the vibe and aesthetic? What kind of art or illustration style are you looking for? The more specific and precise words you use, the better the model will understand what you want to generate.
-
-<div class="flex gap-4">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/plain-prompt.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">"A photo of a banana-shaped couch in a living room"</figcaption>
-  </div>
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/detail-prompt.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">"A vibrant yellow banana-shaped couch sits in a cozy living room, its curve cradling a pile of colorful cushions. on the wooden floor, a patterned rug adds a touch of eclectic charm, and a potted plant sits in the corner, reaching towards the sunlight filtering through the windows"</figcaption>
-  </div>
-</div>
-
-## Prompt enhancing with GPT2
-
-Prompt enhancing is a technique for quickly improving prompt quality without spending too much effort constructing one. It uses a model like GPT2 pretrained on Stable Diffusion text prompts to automatically enrich a prompt with additional important keywords to generate high-quality images.
-
-The technique works by curating a list of specific keywords and forcing the model to generate those words to enhance the original prompt. This way, your prompt can be "a cat" and GPT2 can enhance the prompt to "cinematic film still of a cat basking in the sun on a roof in Turkey, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain quality sharp focus beautiful detailed intricate stunning amazing epic".
-
-> [!TIP]
-> You should also use an [*offset noise*](https://www.crosslabs.org//blog/diffusion-with-offset-noise) LoRA to improve the contrast in bright and dark images and create better lighting overall. This [LoRA](https://hf.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_offset_example-lora_1.0.safetensors) is available from [stabilityai/stable-diffusion-xl-base-1.0](https://hf.co/stabilityai/stable-diffusion-xl-base-1.0).
-
-Start by defining certain styles and a list of words (you can check out a more comprehensive list of [words](https://hf.co/LykosAI/GPT-Prompt-Expansion-Fooocus-v2/blob/main/positive.txt) and [styles](https://github.com/lllyasviel/Fooocus/tree/main/sdxl_styles) used by Fooocus) to enhance a prompt with.
-
-```py
-import torch
-from transformers import GenerationConfig, GPT2LMHeadModel, GPT2Tokenizer, LogitsProcessor, LogitsProcessorList
-from diffusers import StableDiffusionXLPipeline
-
-styles = {
-    "cinematic": "cinematic film still of {prompt}, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain",
-    "anime": "anime artwork of {prompt}, anime style, key visual, vibrant, studio anime, highly detailed",
-    "photographic": "cinematic photo of {prompt}, 35mm photograph, film, professional, 4k, highly detailed",
-    "comic": "comic of {prompt}, graphic illustration, comic art, graphic novel art, vibrant, highly detailed",
-    "lineart": "line art drawing {prompt}, professional, sleek, modern, minimalist, graphic, line art, vector graphics",
-    "pixelart": " pixel-art {prompt}, low-res, blocky, pixel art style, 8-bit graphics",
-}
-
-words = [
-    "aesthetic", "astonishing", "beautiful", "breathtaking", "composition", "contrasted", "epic", "moody", "enhanced",
-    "exceptional", "fascinating", "flawless", "glamorous", "glorious", "illumination", "impressive", "improved",
-    "inspirational", "magnificent", "majestic", "hyperrealistic", "smooth", "sharp", "focus", "stunning", "detailed",
-    "intricate", "dramatic", "high", "quality", "perfect", "light", "ultra", "highly", "radiant", "satisfying",
-    "soothing", "sophisticated", "stylish", "sublime", "terrific", "touching", "timeless", "wonderful", "unbelievable",
-    "elegant", "awesome", "amazing", "dynamic", "trendy",
-]
-```
-
-You may have noticed in the `words` list, there are certain words that can be paired together to create something more meaningful. For example, the words "high" and "quality" can be combined to create "high quality". Let's pair these words together and remove the words that can't be paired.
-
-```py
-word_pairs = ["highly detailed", "high quality", "enhanced quality", "perfect composition", "dynamic light"]
-
-def find_and_order_pairs(s, pairs):
-    words = s.split()
-    found_pairs = []
-    for pair in pairs:
-        pair_words = pair.split()
-        if pair_words[0] in words and pair_words[1] in words:
-            found_pairs.append(pair)
-            words.remove(pair_words[0])
-            words.remove(pair_words[1])
-
-    for word in words[:]:
-        for pair in pairs:
-            if word in pair.split():
-                words.remove(word)
-                break
-    ordered_pairs = ", ".join(found_pairs)
-    remaining_s = ", ".join(words)
-    return ordered_pairs, remaining_s
-```
-
-Next, implement a custom [`~transformers.LogitsProcessor`] class that assigns tokens in the `words` list a value of 0 and assigns tokens not in the `words` list a negative value so they aren't picked during generation. This way, generation is biased towards words in the `words` list. After a word from the list is used, it is also assigned a negative value so it isn't picked again.
-
-```py
-class CustomLogitsProcessor(LogitsProcessor):
-    def __init__(self, bias):
-        super().__init__()
-        self.bias = bias
-
-    def __call__(self, input_ids, scores):
-        if len(input_ids.shape) == 2:
-            last_token_id = input_ids[0, -1]
-            self.bias[last_token_id] = -1e10
-        return scores + self.bias
-
-word_ids = [tokenizer.encode(word, add_prefix_space=True)[0] for word in words]
-bias = torch.full((tokenizer.vocab_size,), -float("Inf")).to("cuda")
-bias[word_ids] = 0
-processor = CustomLogitsProcessor(bias)
-processor_list = LogitsProcessorList([processor])
-```
-
-Combine the prompt and the `cinematic` style prompt defined in the `styles` dictionary earlier.
-
-```py
-prompt = "a cat basking in the sun on a roof in Turkey"
-style = "cinematic"
-
-prompt = styles[style].format(prompt=prompt)
-prompt
-"cinematic film still of a cat basking in the sun on a roof in Turkey, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain"
-```
-
-Load a GPT2 tokenizer and model from the [Gustavosta/MagicPrompt-Stable-Diffusion](https://huggingface.co/Gustavosta/MagicPrompt-Stable-Diffusion) checkpoint (this specific checkpoint is trained to generate prompts) to enhance the prompt.
-
-```py
-tokenizer = GPT2Tokenizer.from_pretrained("Gustavosta/MagicPrompt-Stable-Diffusion")
-model = GPT2LMHeadModel.from_pretrained("Gustavosta/MagicPrompt-Stable-Diffusion", torch_dtype=torch.float16).to(
-    "cuda"
-)
-model.eval()
-
-inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
-token_count = inputs["input_ids"].shape[1]
-max_new_tokens = 50 - token_count
-
-generation_config = GenerationConfig(
-    penalty_alpha=0.7,
-    top_k=50,
-    eos_token_id=model.config.eos_token_id,
-    pad_token_id=model.config.eos_token_id,
-    pad_token=model.config.pad_token_id,
-    do_sample=True,
-)
-
-with torch.no_grad():
-    generated_ids = model.generate(
-        input_ids=inputs["input_ids"],
-        attention_mask=inputs["attention_mask"],
-        max_new_tokens=max_new_tokens,
-        generation_config=generation_config,
-        logits_processor=proccesor_list,
-    )
-```
-
-Then you can combine the input prompt and the generated prompt. Feel free to take a look at what the generated prompt (`generated_part`) is, the word pairs that were found (`pairs`), and the remaining words (`words`). This is all packed together in the `enhanced_prompt`.
-
-```py
-output_tokens = [tokenizer.decode(generated_id, skip_special_tokens=True) for generated_id in generated_ids]
-input_part, generated_part = output_tokens[0][: len(prompt)], output_tokens[0][len(prompt) :]
-pairs, words = find_and_order_pairs(generated_part, word_pairs)
-formatted_generated_part = pairs + ", " + words
-enhanced_prompt = input_part + ", " + formatted_generated_part
-enhanced_prompt
-["cinematic film still of a cat basking in the sun on a roof in Turkey, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain quality sharp focus beautiful detailed intricate stunning amazing epic"]
-```
-
-Finally, load a pipeline and the offset noise LoRA with a *low weight* to generate an image with the enhanced prompt.
-
-```py
-pipeline = StableDiffusionXLPipeline.from_pretrained(
-    "RunDiffusion/Juggernaut-XL-v9", torch_dtype=torch.float16, variant="fp16"
-).to("cuda")
-
-pipeline.load_lora_weights(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    weight_name="sd_xl_offset_example-lora_1.0.safetensors",
-    adapter_name="offset",
-)
-pipeline.set_adapters(["offset"], adapter_weights=[0.2])
-
-image = pipeline(
-    enhanced_prompt,
-    width=1152,
-    height=896,
-    guidance_scale=7.5,
-    num_inference_steps=25,
-).images[0]
-image
-```
-
-<div class="flex gap-4">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/non-enhanced-prompt.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">"a cat basking in the sun on a roof in Turkey"</figcaption>
-  </div>
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/enhanced-prompt.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">"cinematic film still of a cat basking in the sun on a roof in Turkey, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain"</figcaption>
-  </div>
-</div>
-
-## Prompt weighting
-
 Prompt weighting provides a way to emphasize or de-emphasize certain parts of a prompt, allowing for more control over the generated image. A prompt can include several concepts, which get turned into contextualized text embeddings. The embeddings are used by the model to condition its cross-attention layers to generate an image (read the Stable Diffusion [blog post](https://huggingface.co/blog/stable_diffusion) to learn more about how it works).

 Prompt weighting works by increasing or decreasing the scale of the text embedding vector that corresponds to its concept in the prompt because you may not necessarily want the model to focus on all concepts equally. The easiest way to prepare the prompt-weighted embeddings is to use [Compel](https://github.com/damian0815/compel), a text prompt-weighting and blending library. Once you have the prompt-weighted embeddings, you can pass them to any pipeline that has a [`prompt_embeds`](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline.__call__.prompt_embeds) (and optionally [`negative_prompt_embeds`](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline.__call__.negative_prompt_embeds)) parameter, such as [`StableDiffusionPipeline`], [`StableDiffusionControlNetPipeline`], and [`StableDiffusionXLPipeline`].
@@ -254,7 +55,7 @@ image
  <img class="rounded-xl" src="https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/compel/forest_0.png"/>
 </div>

-### Weighting
+## Weighting

 You'll notice there is no "ball" in the image! Let's use compel to upweight the concept of "ball" in the prompt. Create a [`Compel`](https://github.com/damian0815/compel/blob/main/doc/compel.md#compel-objects) object, and pass it a tokenizer and text encoder:

@@ -322,7 +123,7 @@ image
  <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-pos-neg.png"/>
 </div>

-### Blending
+## Blending

 You can also create a weighted *blend* of prompts by adding `.blend()` to a list of prompts and passing it some weights. Your blend may not always produce the result you expect because it breaks some assumptions about how the text encoder functions, so just have fun and experiment with it!

@@ -338,7 +139,7 @@ image
  <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-blend.png"/>
 </div>

-### Conjunction
+## Conjunction

 A conjunction diffuses each prompt independently and concatenates their results by their weighted sum. Add `.and()` to the end of a list of prompts to create a conjunction:

@@ -354,7 +155,7 @@ image
  <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-conj.png"/>
 </div>

-### Textual inversion
+## Textual inversion

 [Textual inversion](../training/text_inversion) is a technique for learning a specific concept from some images which you can use to generate new images conditioned on that concept.

@@ -394,7 +195,7 @@ image
  <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-text-inversion.png"/>
 </div>

-### DreamBooth
+## DreamBooth

 [DreamBooth](../training/dreambooth) is a technique for generating contextualized images of a subject given just a few images of the subject to train on. It is similar to textual inversion, but DreamBooth trains the full model whereas textual inversion only fine-tunes the text embeddings. This means you should use [`~DiffusionPipeline.from_pretrained`] to load the DreamBooth model (feel free to browse the [Stable Diffusion Dreambooth Concepts Library](https://huggingface.co/sd-dreambooth-library) for 100+ trained models):

@@ -420,7 +221,7 @@ image
  <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-dreambooth.png"/>
 </div>

-### Stable Diffusion XL
+## Stable Diffusion XL

 Stable Diffusion XL (SDXL) has two tokenizers and text encoders so its usage is a bit different. To address this, you should pass both tokenizers and encoders to the `Compel` class:

@@ -23,7 +23,6 @@ import os
 import re
 import shutil
 import warnings
-from contextlib import nullcontext
 from pathlib import Path
 from typing import List, Optional

@@ -1845,12 +1844,7 @@ def main(args):
                generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
                pipeline_args = {"prompt": args.validation_prompt}

-            if torch.backends.mps.is_available():
-                autocast_ctx = nullcontext()
-            else:
-                autocast_ctx = torch.autocast(accelerator.device.type)
-
-                with autocast_ctx:
+                with torch.cuda.amp.autocast():
                    images = [
                        pipeline(**pipeline_args, generator=generator).images[0]
                        for _ in range(args.num_validation_images)
@@ -14,6 +14,7 @@
 # See the License for the specific language governing permissions and

 import argparse
+import contextlib
 import gc
 import hashlib
 import itertools
@@ -25,7 +26,6 @@ import random
 import re
 import shutil
 import warnings
-from contextlib import nullcontext
 from pathlib import Path
 from typing import List, Optional

@@ -2192,12 +2192,13 @@ def main(args):
                # run inference
                generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
                pipeline_args = {"prompt": args.validation_prompt}
-                if torch.backends.mps.is_available() or "playground" in args.pretrained_model_name_or_path:
-                    autocast_ctx = nullcontext()
-                else:
-                    autocast_ctx = torch.autocast(accelerator.device.type)
+                inference_ctx = (
+                    contextlib.nullcontext()
+                    if "playground" in args.pretrained_model_name_or_path
+                    else torch.cuda.amp.autocast()
+                )

-                with autocast_ctx:
+                with inference_ctx:
                    images = [
                        pipeline(**pipeline_args, generator=generator).images[0]
                        for _ in range(args.num_validation_images)
@@ -430,9 +430,6 @@ def main(args):
        log_with=args.report_to,
        project_config=accelerator_project_config,
    )
-    # Disable AMP for MPS.
-    if torch.backends.mps.is_available():
-        accelerator.native_amp = False

    if accelerator.is_main_process:
        os.makedirs(args.output_dir, exist_ok=True)
@@ -10,12 +10,10 @@ Please also check out our [Community Scripts](https://github.com/huggingface/dif

 | Example                                                                                                                               | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              | Code Example                                                                              | Colab                                                                                                                                                                                                              |                                                        Author |
 |:--------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------:|
-|Differential Diffusion|[Differential Diffusion](https://github.com/exx8/differential-diffusion) modifies an image according to a text prompt, and according to a map that specifies the amount of change in each region.|[Differential Diffusion](#differential-diffusion)|[![Hugging Face Space](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-yellow)](https://huggingface.co/spaces/exx8/differential-diffusion) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/exx8/differential-diffusion/blob/main/examples/SD2.ipynb)|[Eran Levin](https://github.com/exx8) and [Ohad Fried](https://www.ohadf.com/)|
-| HD-Painter                                                                                                                            | [HD-Painter](https://github.com/Picsart-AI-Research/HD-Painter) enables prompt-faithfull and high resolution (up to 2k) image inpainting upon any diffusion-based image inpainting method.                                                                                                                                                                                                                                                                                                               | [HD-Painter](#hd-painter)                                                                 | [![Hugging Face Space](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-yellow)](https://huggingface.co/spaces/PAIR/HD-Painter)                                                                              | [Manukyan Hayk](https://github.com/haikmanukyan) and [Sargsyan Andranik](https://github.com/AndranikSargsyan) |
 | Marigold Monocular Depth Estimation                                                                                                   | A universal monocular depth estimator, utilizing Stable Diffusion, delivering sharp predictions in the wild. (See the [project page](https://marigoldmonodepth.github.io) and [full codebase](https://github.com/prs-eth/marigold) for more details.)                                                                                                                                                                                                                                                        | [Marigold Depth Estimation](#marigold-depth-estimation)                                   | [![Hugging Face Space](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-yellow)](https://huggingface.co/spaces/toshas/marigold) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/12G8reD13DdpMie5ZQlaFNo2WCGeNUH-u?usp=sharing) | [Bingxin Ke](https://github.com/markkua) and [Anton Obukhov](https://github.com/toshas) |
 | LLM-grounded Diffusion (LMD+)                                                                                                         | LMD greatly improves the prompt following ability of text-to-image generation models by introducing an LLM as a front-end prompt parser and layout planner. [Project page.](https://llm-grounded-diffusion.github.io/) [See our full codebase (also with diffusers).](https://github.com/TonyLianLong/LLM-groundedDiffusion)                                                                                                                                                                                                                                                                                                                                                                                                                                   | [LLM-grounded Diffusion (LMD+)](#llm-grounded-diffusion)                             | [Huggingface Demo](https://huggingface.co/spaces/longlian/llm-grounded-diffusion) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1SXzMSeAB-LJYISb2yrUOdypLz4OYWUKj) |                [Long (Tony) Lian](https://tonylian.com/) |
 | CLIP Guided Stable Diffusion                                                                                                          | Doing CLIP guidance for text to image generation with Stable Diffusion                                                                                                                                                                                                                                                                                                                                                                                                                                   | [CLIP Guided Stable Diffusion](#clip-guided-stable-diffusion)                             | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/CLIP_Guided_Stable_diffusion_with_diffusers.ipynb) |                [Suraj Patil](https://github.com/patil-suraj/) |
-| One Step U-Net (Dummy)                                                                                                                | Example showcasing of how to use Community Pipelines (see <https://github.com/huggingface/diffusers/issues/841>)                                                                                                                                                                                                                                                                                                                                                                                           | [One Step U-Net](#one-step-unet)                                                          | -                                                                                                                                                                                                                  |    [Patrick von Platen](https://github.com/patrickvonplaten/) |
+| One Step U-Net (Dummy)                                                                                                                | Example showcasing of how to use Community Pipelines (see https://github.com/huggingface/diffusers/issues/841)                                                                                                                                                                                                                                                                                                                                                                                           | [One Step U-Net](#one-step-unet)                                                          | -                                                                                                                                                                                                                  |    [Patrick von Platen](https://github.com/patrickvonplaten/) |
 | Stable Diffusion Interpolation                                                                                                        | Interpolate the latent space of Stable Diffusion between different prompts/seeds                                                                                                                                                                                                                                                                                                                                                                                                                         | [Stable Diffusion Interpolation](#stable-diffusion-interpolation)                         | -                                                                                                                                                                                                                  |                       [Nate Raw](https://github.com/nateraw/) |
 | Stable Diffusion Mega                                                                                                                 | **One** Stable Diffusion Pipeline with all functionalities of [Text2Image](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py), [Image2Image](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py) and [Inpainting](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py) | [Stable Diffusion Mega](#stable-diffusion-mega)                                           | -                                                                                                                                                                                                                  |    [Patrick von Platen](https://github.com/patrickvonplaten/) |
 | Long Prompt Weighting Stable Diffusion                                                                                                | **One** Stable Diffusion Pipeline without tokens length limit, and support parsing weighting in prompt.                                                                                                                                                                                                                                                                                                                                                                                                  | [Long Prompt Weighting Stable Diffusion](#long-prompt-weighting-stable-diffusion)         | -                                                                                                                                                                                                                  |                           [SkyTNT](https://github.com/SkyTNT) |
@@ -46,7 +44,7 @@ Please also check out our [Community Scripts](https://github.com/huggingface/dif
 | CLIP Guided Images Mixing Stable Diffusion Pipeline | Combine images using usual diffusion models. | [CLIP Guided Images Mixing Using Stable Diffusion](#clip-guided-images-mixing-with-stable-diffusion) | - | [Karachev Denis](https://github.com/TheDenk) |
 | TensorRT Stable Diffusion Inpainting Pipeline                                                                                                    | Accelerates the Stable Diffusion Inpainting Pipeline using TensorRT                                                                                                                                                                                                                                                                                                                                                                                                                                      | [TensorRT Stable Diffusion Inpainting Pipeline](#tensorrt-inpainting-stable-diffusion-pipeline)      | - |              [Asfiya Baig](https://github.com/asfiyab-nvidia) |
 |   IADB Pipeline                                                                                                    | Implementation of [Iterative α-(de)Blending: a Minimalist Deterministic Diffusion Model](https://arxiv.org/abs/2305.03486)                                                                                                                                                                                                                                                                                                                                                                                                                                      | [IADB Pipeline](#iadb-pipeline)      | - |              [Thomas Chambon](https://github.com/tchambon)
-|   Zero1to3 Pipeline                                                                                                    | Implementation of [Zero-1-to-3: Zero-shot One Image to 3D Object](https://arxiv.org/abs/2303.11328)                                                                                                                                                                                                                                                                                                                                                                                                                                      | [Zero1to3 Pipeline](#zero1to3-pipeline)      | - |              [Xin Kong](https://github.com/kxhit) |
+|   Zero1to3 Pipeline                                                                                                    | Implementation of [Zero-1-to-3: Zero-shot One Image to 3D Object](https://arxiv.org/abs/2303.11328)                                                                                                                                                                                                                                                                                                                                                                                                                                      | [Zero1to3 Pipeline](#Zero1to3-pipeline)      | - |              [Xin Kong](https://github.com/kxhit) |
 | Stable Diffusion XL Long Weighted Prompt Pipeline | A pipeline that supports prompts and negative prompts of unlimited length, using A1111-style prompt weighting | [Stable Diffusion XL Long Weighted Prompt Pipeline](#stable-diffusion-xl-long-weighted-prompt-pipeline) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1LsqilswLR40XLLcp6XFOl5nKb_wOe26W?usp=sharing) | [Andrew Zhu](https://xhinker.medium.com/) |
 | FABRIC - Stable Diffusion with feedback Pipeline | pipeline supports feedback from liked and disliked images | [Stable Diffusion Fabric Pipeline](#stable-diffusion-fabric-pipeline) | - | [Shauray Singh](https://shauray8.github.io/about_shauray/) |
 | sketch inpaint - Inpainting with non-inpaint Stable Diffusion | sketch inpaint much like in automatic1111 | [Masked Im2Im Stable Diffusion Pipeline](#stable-diffusion-masked-im2im) | - | [Anatoly Belikov](https://github.com/noskill) |
@@ -58,10 +56,10 @@ Please also check out our [Community Scripts](https://github.com/huggingface/dif
 |   Regional Prompting Pipeline                                                                                               | Assign multiple prompts for different regions                                                                                                                                                                                                                                                                                                                                                    |  [Regional Prompting Pipeline](#regional-prompting-pipeline) | - | [hako-mikan](https://github.com/hako-mikan) |
 | LDM3D-sr (LDM3D upscaler)                                                                                                             | Upscale low resolution RGB and depth inputs to high resolution                                                                                                                                                                                                                                                                                                                                                                                                                              | [StableDiffusionUpscaleLDM3D Pipeline](https://github.com/estelleafl/diffusers/tree/ldm3d_upscaler_community/examples/community#stablediffusionupscaleldm3d-pipeline)                                                                             | -                                                                                                                                                                                                             |                                                        [Estelle Aflalo](https://github.com/estelleafl) |
 | AnimateDiff ControlNet Pipeline                                                                                                    | Combines AnimateDiff with precise motion control using ControlNets                                                                                                                                                                                                                                                                                                                                                                                                                                    | [AnimateDiff ControlNet Pipeline](#animatediff-controlnet-pipeline) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1SKboYeGjEQmQPWoFC0aLYpBlYdHXkvAu?usp=sharing) | [Aryan V S](https://github.com/a-r-r-o-w) and [Edoardo Botta](https://github.com/EdoardoBotta) |
-|   DemoFusion Pipeline                                                                                                    | Implementation of [DemoFusion: Democratising High-Resolution Image Generation With No $$$](https://arxiv.org/abs/2311.16973)                                                                                                                                                                                                                                                                                                                                                                                                                                      | [DemoFusion Pipeline](#demofusion)      | - |              [Ruoyi Du](https://github.com/RuoyiDu) |
+|   DemoFusion Pipeline                                                                                                    | Implementation of [DemoFusion: Democratising High-Resolution Image Generation With No $$$](https://arxiv.org/abs/2311.16973)                                                                                                                                                                                                                                                                                                                                                                                                                                      | [DemoFusion Pipeline](#DemoFusion)      | - |              [Ruoyi Du](https://github.com/RuoyiDu) |
 |   Instaflow Pipeline                                                                                                    | Implementation of [InstaFlow! One-Step Stable Diffusion with Rectified Flow](https://arxiv.org/abs/2309.06380)                                                                                                                                                                                                                                                                                                                                                                                                                                      | [Instaflow Pipeline](#instaflow-pipeline)      | - |              [Ayush Mangal](https://github.com/ayushtues) |
 |   Null-Text Inversion Pipeline  | Implement [Null-text Inversion for Editing Real Images using Guided Diffusion Models](https://arxiv.org/abs/2211.09794) as a pipeline.                                                                                                                                                                                                                                                                                                                                                                                                                                      | [Null-Text Inversion](https://github.com/google/prompt-to-prompt/)      | - |              [Junsheng Luan](https://github.com/Junsheng121) |
-|   Rerender A Video Pipeline                                                                                                    | Implementation of [[SIGGRAPH Asia 2023] Rerender A Video: Zero-Shot Text-Guided Video-to-Video Translation](https://arxiv.org/abs/2306.07954)                                                                                                                                                                                                                                                                                                                                                                                                                                      | [Rerender A Video Pipeline](#rerender-a-video)      | - |              [Yifan Zhou](https://github.com/SingleZombie) |
+|   Rerender A Video Pipeline                                                                                                    | Implementation of [[SIGGRAPH Asia 2023] Rerender A Video: Zero-Shot Text-Guided Video-to-Video Translation](https://arxiv.org/abs/2306.07954)                                                                                                                                                                                                                                                                                                                                                                                                                                      | [Rerender A Video Pipeline](#Rerender-A-Video)      | - |              [Yifan Zhou](https://github.com/SingleZombie) |
 | StyleAligned Pipeline                                                                                                    | Implementation of [Style Aligned Image Generation via Shared Attention](https://arxiv.org/abs/2312.02133)                                                                                                                                                                                                                                                                                                                                                                                                                                   | [StyleAligned Pipeline](#stylealigned-pipeline) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://drive.google.com/file/d/15X2E0jFPTajUIjS0FzX50OaHsCbP2lQ0/view?usp=sharing) | [Aryan V S](https://github.com/a-r-r-o-w) |
 | AnimateDiff Image-To-Video Pipeline | Experimental Image-To-Video support for AnimateDiff (open to improvements) | [AnimateDiff Image To Video Pipeline](#animatediff-image-to-video-pipeline) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://drive.google.com/file/d/1TvzCDPHhfFtdcJZe4RLloAwyoLKuttWK/view?usp=sharing) | [Aryan V S](https://github.com/a-r-r-o-w) |
 |   IP Adapter FaceID Stable Diffusion                                                                                               | Stable Diffusion Pipeline that supports IP Adapter Face ID                                                                                                                                                                                                                                                                                                                                                  |  [IP Adapter Face ID](#ip-adapter-face-id) | - | [Fabio Rigano](https://github.com/fabiorigano) |
@@ -77,125 +75,6 @@ pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", custo

 ## Example usages

-### Differential Diffusion
-
-**Eran Levin, Ohad Fried**
-
-**Tel Aviv University, Reichman University**
-
-Diffusion models have revolutionized image generation and editing, producing state-of-the-art results in conditioned and unconditioned image synthesis. While current techniques enable user control over the degree of change in an image edit, the controllability is limited to global changes over an entire edited region. This paper introduces a novel framework that enables customization of the amount of change per pixel or per image region. Our framework can be integrated into any existing diffusion model, enhancing it with this capability. Such granular control on the quantity of change opens up a diverse array of new editing capabilities, such as control of the extent to which individual objects are modified, or the ability to introduce gradual spatial changes. Furthermore, we showcase the framework's effectiveness in soft-inpainting---the completion of portions of an image while subtly adjusting the surrounding areas to ensure seamless integration. Additionally, we introduce a new tool for exploring the effects of different change quantities. Our framework operates solely during inference, requiring no model training or fine-tuning. We demonstrate our method with the current open state-of-the-art models, and validate it via both quantitative and qualitative comparisons, and a user study.
-
-![teaser-img](https://github.com/exx8/differential-diffusion/raw/main/assets/teaser.png)
-
-You can find additional information about Differential Diffusion in the [paper](https://differential-diffusion.github.io/paper.pdf) or in the [project website](https://differential-diffusion.github.io/).
-
-#### Usage example
-
-```python
-import torch
-from torchvision import transforms
-
-from diffusers import DPMSolverMultistepScheduler
-from diffusers.utils import load_image
-from examples.community.pipeline_stable_diffusion_xl_differential_img2img import (
-    StableDiffusionXLDifferentialImg2ImgPipeline,
-)
-
-
-pipeline = StableDiffusionXLDifferentialImg2ImgPipeline.from_pretrained(
-    "SG161222/RealVisXL_V4.0", torch_dtype=torch.float16, variant="fp16"
-).to("cuda")
-pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, use_karras_sigmas=True)
-
-
-def preprocess_image(image):
-    image = image.convert("RGB")
-    image = transforms.CenterCrop((image.size[1] // 64 * 64, image.size[0] // 64 * 64))(image)
-    image = transforms.ToTensor()(image)
-    image = image * 2 - 1
-    image = image.unsqueeze(0).to("cuda")
-    return image
-
-
-def preprocess_map(map):
-    map = map.convert("L")
-    map = transforms.CenterCrop((map.size[1] // 64 * 64, map.size[0] // 64 * 64))(map)
-    map = transforms.ToTensor()(map)
-    map = map.to("cuda")
-    return map
-
-
-image = preprocess_image(
-    load_image(
-        "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/differential/20240329211129_4024911930.png?download=true"
-    )
-)
-
-mask = preprocess_map(
-    load_image(
-        "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/differential/gradient_mask.png?download=true"
-    )
-)
-
-prompt = "a green pear"
-negative_prompt = "blurry"
-
-image = pipeline(
-    prompt=prompt,
-    negative_prompt=negative_prompt,
-    guidance_scale=7.5,
-    num_inference_steps=25,
-    original_image=image,
-    image=image,
-    strength=1.0,
-    map=mask,
-).images[0]
-
-image.save("result.png")
-```
-
-### HD-Painter
-
-Implementation of [HD-Painter: High-Resolution and Prompt-Faithful Text-Guided Image Inpainting with Diffusion Models](https://arxiv.org/abs/2312.14091).
-
-![teaser-img](https://raw.githubusercontent.com/Picsart-AI-Research/HD-Painter/main/__assets__/github/teaser.jpg)
-
-The abstract from the paper is:
-
-Recent progress in text-guided image inpainting, based on the unprecedented success of text-to-image diffusion models, has led to exceptionally realistic and visually plausible results.
-However, there is still significant potential for improvement in current text-to-image inpainting models, particularly in better aligning the inpainted area with user prompts and performing high-resolution inpainting.
-Therefore, in this paper we introduce _HD-Painter_, a completely **training-free** approach that **accurately follows to prompts** and coherently **scales to high-resolution** image inpainting.
-To this end, we design the _Prompt-Aware Introverted Attention (PAIntA)_ layer enhancing self-attention scores by prompt information and resulting in better text alignment generations.
-To further improve the prompt coherence we introduce the _Reweighting Attention Score Guidance (RASG)_ mechanism seamlessly integrating a post-hoc sampling strategy into general form of DDIM to prevent out-of-distribution latent shifts.
-Moreover, HD-Painter allows extension to larger scales by introducing a specialized super-resolution technique customized for inpainting, enabling the completion of missing regions in images of up to 2K resolution.
-Our experiments demonstrate that HD-Painter surpasses existing state-of-the-art approaches qualitatively and quantitatively, achieving an impressive generation accuracy improvement of **61.4** vs **51.9**.
-We will make the codes publicly available.
-
-You can find additional information about HD-Painter in the [paper](https://arxiv.org/abs/2312.14091) or the [original codebase](https://github.com/Picsart-AI-Research/HD-Painter).
-
-#### Usage example
-
-```python
-import torch
-from diffusers import DiffusionPipeline, DDIMScheduler
-from diffusers.utils import load_image, make_image_grid
-
-pipe = DiffusionPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-2-inpainting",
-    custom_pipeline="hd_painter"
-)
-pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
-
-prompt = "wooden boat"
-init_image = load_image("https://raw.githubusercontent.com/Picsart-AI-Research/HD-Painter/main/__assets__/samples/images/2.jpg")
-mask_image = load_image("https://raw.githubusercontent.com/Picsart-AI-Research/HD-Painter/main/__assets__/samples/masks/2.png")
-
-image = pipe (prompt, init_image, mask_image, use_rasg = True, use_painta = True, generator=torch.manual_seed(12345)).images[0]
-
-make_image_grid([init_image, mask_image, image], rows=1, cols=3)
-
-```
-
 ### Marigold Depth Estimation

 Marigold is a universal monocular depth estimator that delivers accurate and sharp predictions in the wild. Based on Stable Diffusion, it is trained exclusively with synthetic depth data and excels in zero-shot adaptation to real-world imagery. This pipeline is an official implementation of the inference process. More details can be found on our [project page](https://marigoldmonodepth.github.io) and [full codebase](https://github.com/prs-eth/marigold) (also implemented with diffusers).
@@ -275,7 +154,6 @@ This pipeline can be used with an LLM or on its own. We provide a parser that pa
 The following code has been tested on 1x RTX 4090, but it should also support GPUs with lower GPU memory.

 #### Use this pipeline with an LLM
-
 ```python
 import torch
 from diffusers import DiffusionPipeline
@@ -311,7 +189,6 @@ images[0].save("./lmd_plus_generation.jpg")
 ```

 #### Use this pipeline on its own for layout generation
-
 ```python
 import torch
 from diffusers import DiffusionPipeline
@@ -407,7 +284,7 @@ pipe = DiffusionPipeline.from_pretrained("google/ddpm-cifar10-32", custom_pipeli
 pipe()
 ```

-**Note**: This community pipeline is not useful as a feature, but rather just serves as an example of how community pipelines can be added (see <https://github.com/huggingface/diffusers/issues/841>).
+**Note**: This community pipeline is not useful as a feature, but rather just serves as an example of how community pipelines can be added (see https://github.com/huggingface/diffusers/issues/841).

 ### Stable Diffusion Interpolation

@@ -441,7 +318,7 @@ frame_filepaths = pipe.walk(

 The output of the `walk(...)` function returns a list of images saved under the folder as defined in `output_dir`. You can use these images to create videos of stable diffusion.

-> **Please have a look at <https://github.com/nateraw/stable-diffusion-videos> for more in-detail information on how to create videos using stable diffusion as well as more feature-complete functionality.**
+> **Please have a look at https://github.com/nateraw/stable-diffusion-videos for more in-detail information on how to create videos using stable diffusion as well as more feature-complete functionality.**

 ### Stable Diffusion Mega

@@ -491,9 +368,7 @@ images = pipe.inpaint(prompt=prompt, image=init_image, mask_image=mask_image, st
 As shown above, this one pipeline can run "text-to-image", "image-to-image", and "inpainting" all in one pipeline.

 ### Long Prompt Weighting Stable Diffusion
-
 Features of this custom pipeline:
-
 - Input a prompt without the 77 token length limit.
 - Includes txt2img, img2img, and inpainting pipelines.
 - Emphasize/weigh part of your prompt with parentheses as so: `a baby deer with (big eyes)`
@@ -501,7 +376,6 @@ Features of this custom pipeline:
 - Precisely weigh part of your prompt as so: `a baby deer with (big eyes:1.3)`

 Prompt weighting equivalents:
-
 - `a baby deer with` == `(a baby deer with:1.0)`
 - `(big eyes)` == `(big eyes:1.1)`
 - `((big eyes))` == `(big eyes:1.21)`
@@ -595,14 +469,12 @@ diffuser_pipeline = diffuser_pipeline.to(device)
 output = diffuser_pipeline(speech_data)
 plt.imshow(output.images[0])
 ```
-
 This example produces the following image:

 ![image](https://user-images.githubusercontent.com/45072645/196901736-77d9c6fc-63ee-4072-90b0-dc8b903d63e3.png)

 ### Wildcard Stable Diffusion
-
-Following the great examples from <https://github.com/jtkelm2/stable-diffusion-webui-1/blob/master/scripts/wildcards.py> and <https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Custom-Scripts#wildcards>, here's a minimal implementation that allows for users to add "wildcards", denoted by `__wildcard__` to prompts that are used as placeholders for randomly sampled values given by either a dictionary or a `.txt` file. For example:
+Following the great examples from https://github.com/jtkelm2/stable-diffusion-webui-1/blob/master/scripts/wildcards.py and https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Custom-Scripts#wildcards, here's a minimal implementation that allows for users to add "wildcards", denoted by `__wildcard__` to prompts that are used as placeholders for randomly sampled values given by either a dictionary or a `.txt` file. For example:

 Say we have a prompt:

@@ -709,7 +581,6 @@ tvu.save_image(grid, f'{prompt}_{args.weights}' + '.png')
 ```

 ### Imagic Stable Diffusion
-
 Allows you to edit an image using stable diffusion.

 ```python
@@ -751,7 +622,6 @@ image.save('./imagic/imagic_image_alpha_2.png')
 ```

 ### Seed Resizing
-
 Test seed resizing. Originally generate an image in 512 by 512, then generate image with same seed at 512 by 592 using seed resizing. Finally, generate 512 by 592 using original stable diffusion pipeline.

 ```python
@@ -902,7 +772,6 @@ This example produces the following images:
 ![image](https://user-images.githubusercontent.com/4313860/198328706-295824a4-9856-4ce5-8e66-278ceb42fd29.png)

 ### GlueGen Stable Diffusion Pipeline
-
 GlueGen is a minimal adapter that allows alignment between any encoder (Text Encoder of different language, Multilingual Roberta, AudioClip) and CLIP text encoder used in standard Stable Diffusion model. This method allows easy language adaptation to available English Stable Diffusion checkpoints without the need of an image captioning dataset as well as long training hours.

 Make sure you downloaded `gluenet_French_clip_overnorm_over3_noln.ckpt` for French (there are also pre-trained weights for Chinese, Italian, Japanese, Spanish or train your own) at [GlueGen's official repo](https://github.com/salesforce/GlueGen/tree/main)
@@ -941,7 +810,6 @@ if __name__ == "__main__":
    image = pipeline(prompt, generator=generator).images[0]
    image.save("gluegen_output_fr.png")
 ```
-
 Which will produce:

 ![output_image](https://github.com/rootonchair/diffusers/assets/23548268/db43ffb6-8667-47c1-8872-26f85dc0a57f)
@@ -1016,8 +884,7 @@ image = pipe(image=image, text=text, prompt=prompt).images[0]
 ```

 ### Bit Diffusion
-
-Based on <https://arxiv.org/abs/2208.04202>, this is used for diffusion on discrete data - e.g., discrete image data, DNA sequence data. An unconditional discrete image can be generated like this:
+Based on https://arxiv.org/abs/2208.04202, this is used for diffusion on discrete data - e.g., discrete image data, DNA sequence data. An unconditional discrete image can be generated like this:

 ```python
 from diffusers import DiffusionPipeline
@@ -1028,7 +895,7 @@ image = pipe().images[0]

 ### Stable Diffusion with K Diffusion

-Make sure you have @crowsonkb's <https://github.com/crowsonkb/k-diffusion> installed:
+Make sure you have @crowsonkb's https://github.com/crowsonkb/k-diffusion installed:

 ```
 pip install k-diffusion
@@ -1053,7 +920,6 @@ image.save("./astronaut_heun_k_diffusion.png")
 To make sure that K Diffusion and `diffusers` yield the same results:

 **Diffusers**:
-
 ```python
 from diffusers import DiffusionPipeline, EulerDiscreteScheduler

@@ -1070,7 +936,6 @@ image = pipe(prompt, generator=generator, num_inference_steps=50).images[0]
 ![diffusers_euler](https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/k_diffusion/astronaut_euler.png)

 **K Diffusion**:
-
 ```python
 from diffusers import DiffusionPipeline, EulerDiscreteScheduler

@@ -1088,14 +953,12 @@ image = pipe(prompt, generator=generator, num_inference_steps=50).images[0]
 ![diffusers_euler](https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/k_diffusion/astronaut_euler_k_diffusion.png)

 ### Checkpoint Merger Pipeline
-
 Based on the AUTOMATIC1111/webui for checkpoint merging. This is a custom pipeline that merges up to 3 pretrained model checkpoints as long as they are in the HuggingFace model_index.json format.

 The checkpoint merging is currently memory intensive as it modifies the weights of a DiffusionPipeline object in place. Expect at least 13GB RAM Usage on Kaggle GPU kernels and
 on colab you might run out of the 12GB memory even while merging two checkpoints.

 Usage:-
-
 ```python
 from diffusers import DiffusionPipeline

@@ -1121,7 +984,6 @@ prompt = "An astronaut riding a horse on Mars"
 image = merged_pipe(prompt).images[0]

 ```
-
 Some examples along with the merge details:

 1. "CompVis/stable-diffusion-v1-4" + "hakurei/waifu-diffusion" ; Sigmoid interpolation; alpha = 0.8
@@ -1132,14 +994,15 @@ Some examples along with the merge details:

 ![Stable plus Waifu Sigmoid 0.8](https://huggingface.co/datasets/NagaSaiAbhinay/CheckpointMergerSamples/resolve/main/waifu_openjourney_inv_sig_0.8.png)

+
 3. "CompVis/stable-diffusion-v1-4" + "hakurei/waifu-diffusion" + "prompthero/openjourney"; Add Difference interpolation; alpha = 0.5

 ![Stable plus Waifu plus openjourney add_diff 0.5](https://huggingface.co/datasets/NagaSaiAbhinay/CheckpointMergerSamples/resolve/main/stable_waifu_openjourney_add_diff_0.5.png)

+
 ### Stable Diffusion Comparisons

 This Community Pipeline enables the comparison between the 4 checkpoints that exist for Stable Diffusion. They can be found through the following links:
-
 1. [Stable Diffusion v1.1](https://huggingface.co/CompVis/stable-diffusion-v1-1)
 2. [Stable Diffusion v1.2](https://huggingface.co/CompVis/stable-diffusion-v1-2)
 3. [Stable Diffusion v1.3](https://huggingface.co/CompVis/stable-diffusion-v1-3)
@@ -1182,7 +1045,6 @@ As a result, you can look at a grid of all 4 generated images being shown togeth
 Implementation of the [MagicMix: Semantic Mixing with Diffusion Models](https://arxiv.org/abs/2210.16056) paper. This is a Diffusion Pipeline for semantic mixing of an image and a text prompt to create a new concept while preserving the spatial layout and geometry of the subject in the image. The pipeline takes an image that provides the layout semantics and a prompt that provides the content semantics for the mixing process.

 There are 3 parameters for the method-
-
 - `mix_factor`: It is the interpolation constant used in the layout generation phase. The greater the value of `mix_factor`, the greater the influence of the prompt on the layout generation process.
 - `kmax` and `kmin`: These determine the range for the layout and content generation process. A higher value of kmax results in loss of more information about the layout of the original image and a higher value of kmin results in more steps for content generation process.

@@ -1208,7 +1070,6 @@ mix_img = pipe(
    )
 mix_img.save('phone_bed_mix.jpg')
 ```
-
 The `mix_img` is a PIL image that can be saved locally or displayed directly in a google colab. Generated image is a mix of the layout semantics of the given image and the content semantics of the prompt.

 E.g. the above script generates the following image:
@@ -1223,6 +1084,7 @@ E.g. the above script generates the following image:

 For more example generations check out this [demo notebook](https://github.com/daspartho/MagicMix/blob/main/demo.ipynb).

+
 ### Stable UnCLIP

 UnCLIPPipeline("kakaobrain/karlo-v1-alpha") provides a prior model that can generate clip image embedding from text.
@@ -1304,8 +1166,10 @@ print(pipeline.prior_scheduler)
 # }
 ```

+
 `shiba-inu.jpg`

+
 ![shiba-inu](https://user-images.githubusercontent.com/16448529/209185639-6e5ec794-ce9d-4883-aa29-bd6852a2abad.jpg)

 ### UnCLIP Text Interpolation Pipeline
@@ -1373,7 +1237,6 @@ output = pipe(image = images ,steps = 6, generator = generator)
 for i,image in enumerate(output.images):
    image.save('starry_to_flowers_%s.jpg' % i)
 ```
-
 The original images:-

 ![starry](https://huggingface.co/datasets/NagaSaiAbhinay/UnCLIPImageInterpolationSamples/resolve/main/starry_night.jpg)
@@ -1389,9 +1252,7 @@ The resulting images in order:-
 ![result5](https://huggingface.co/datasets/NagaSaiAbhinay/UnCLIPImageInterpolationSamples/resolve/main/starry_to_flowers_5.png)

 ### DDIM Noise Comparative Analysis Pipeline
-
 #### **Research question: What visual concepts do the diffusion models learn from each noise level during training?**
-
 The [P2 weighting (CVPR 2022)](https://arxiv.org/abs/2204.00227) paper proposed an approach to answer the above question, which is their second contribution.
 The approach consists of the following steps:

@@ -1521,7 +1382,6 @@ image.save('tensorrt_mt_fuji.png')
 ### EDICT Image Editing Pipeline

 This pipeline implements the text-guided image editing approach from the paper [EDICT: Exact Diffusion Inversion via Coupled Transformations](https://arxiv.org/abs/2211.12446). You have to pass:
-
 - (`PIL`) `image` you want to edit.
 - `base_prompt`: the text prompt describing the current image (before editing).
 - `target_prompt`: the text prompt describing with the edits.
@@ -1681,7 +1541,6 @@ image.save('tensorrt_img2img_new_zealand_hills.png')
 This pipeline uses the Reference Control. Refer to the [sd-webui-controlnet discussion: Reference-only Control](https://github.com/Mikubill/sd-webui-controlnet/discussions/1236) and the [sd-webui-controlnet discussion: Reference-adain Control](https://github.com/Mikubill/sd-webui-controlnet/discussions/1280).

 Based on [this issue](https://github.com/huggingface/diffusers/issues/3566),
-
 - `EulerAncestralDiscreteScheduler` got poor results.

 ```py
@@ -1727,7 +1586,6 @@ Output Image of `reference_attn=True` and `reference_adain=True`
 This pipeline uses the Reference Control with ControlNet. Refer to the [sd-webui-controlnet discussion: Reference-only Control](https://github.com/Mikubill/sd-webui-controlnet/discussions/1236) and the [sd-webui-controlnet discussion: Reference-adain Control](https://github.com/Mikubill/sd-webui-controlnet/discussions/1280).

 Based on [this issue](https://github.com/huggingface/diffusers/issues/3566),
-
 - `EulerAncestralDiscreteScheduler` got poor results.
 - `guess_mode=True` works well for ControlNet v1.1

@@ -1773,12 +1631,12 @@ Output Image

 ![output_image](https://github.com/huggingface/diffusers/assets/24734142/7b9a5830-f173-4b92-b0cf-73d0e9c01d60)

+
 ### Stable Diffusion on IPEX

 This diffusion pipeline aims to accelerate the inference of Stable-Diffusion on Intel Xeon CPUs with BF16/FP32 precision using [IPEX](https://github.com/intel/intel-extension-for-pytorch).

 To use this pipeline, you need to:
-
 1. Install [IPEX](https://github.com/intel/intel-extension-for-pytorch)

 **Note:** For each PyTorch release, there is a corresponding release of the IPEX. Here is the mapping relationship. It is recommended to install Pytorch/IPEX2.0 to get the best performance.
@@ -1789,13 +1647,10 @@ To use this pipeline, you need to:
 |[v1.13.\*](https://github.com/pytorch/pytorch/tree/v1.13.0 "v1.13.0")|[v1.13.\*](https://github.com/intel/intel-extension-for-pytorch/tree/v1.13.100+cpu)|

 You can simply use pip to install IPEX with the latest version.
-
 ```python
 python -m pip install intel_extension_for_pytorch
 ```
-
 **Note:** To install a specific version, run with the following command:
-
 ```
 python -m pip install intel_extension_for_pytorch==<version_name> -f https://developer.intel.com/ipex-whl-stable-cpu
 ```
@@ -1803,7 +1658,6 @@ python -m pip install intel_extension_for_pytorch==<version_name> -f https://dev
 2. After pipeline initialization, `prepare_for_ipex()` should be called to enable IPEX acceleration. Supported inference datatypes are Float32 and BFloat16.

 **Note:** The setting of generated image height/width for `prepare_for_ipex()` should be same as the setting of pipeline inference.
-
 ```python
 pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", custom_pipeline="stable_diffusion_ipex")
 # For Float32
@@ -1813,7 +1667,6 @@ pipe.prepare_for_ipex(prompt, dtype=torch.bfloat16, height=512, width=512) #valu
 ```

 Then you can use the ipex pipeline in a similar way to the default stable diffusion pipeline.
-
 ```python
 # For Float32
 image = pipe(prompt, num_inference_steps=20, height=512, width=512).images[0] #value of image height/width should be consistent with 'prepare_for_ipex()'
@@ -1882,7 +1735,6 @@ print("Latency of StableDiffusionPipeline--fp32",latency)
 This diffusion pipeline aims to accelerate the inference of Stable-Diffusion XL on Intel Xeon CPUs with BF16/FP32 precision using [IPEX](https://github.com/intel/intel-extension-for-pytorch).

 To use this pipeline, you need to:
-
 1. Install [IPEX](https://github.com/intel/intel-extension-for-pytorch)

 **Note:** For each PyTorch release, there is a corresponding release of IPEX. Here is the mapping relationship. It is recommended to install Pytorch/IPEX2.0 to get the best performance.
@@ -1893,13 +1745,10 @@ To use this pipeline, you need to:
 |[v1.13.\*](https://github.com/pytorch/pytorch/tree/v1.13.0 "v1.13.0")|[v1.13.\*](https://github.com/intel/intel-extension-for-pytorch/tree/v1.13.100+cpu)|

 You can simply use pip to install IPEX with the latest version.
-
 ```python
 python -m pip install intel_extension_for_pytorch
 ```
-
 **Note:** To install a specific version, run with the following command:
-
 ```
 python -m pip install intel_extension_for_pytorch==<version_name> -f https://developer.intel.com/ipex-whl-stable-cpu
 ```
@@ -1918,7 +1767,6 @@ pipe.prepare_for_ipex(torch.bfloat16, prompt, height=512, width=512)
 ```

 Then you can use the ipex pipeline in a similar way to the default stable diffusion xl pipeline.
-
 ```python
 # value of image height/width should be consistent with 'prepare_for_ipex()'
 # For Float32
@@ -1995,6 +1843,7 @@ CLIP guided stable diffusion images mixing pipeline allows to combine two images
 This approach is using (optional) CoCa model to avoid writing image description.
 [More code examples](https://github.com/TheDenk/images_mixing)

+
 ### Stable Diffusion XL Long Weighted Prompt Pipeline

 This SDXL pipeline supports unlimited-length prompts and negative prompts, and is compatible with the A1111 prompt weighting style.
@@ -2061,7 +1910,6 @@ In the above code, the `prompt2` is appended to the `prompt`, which is more than
 For more results, checkout [PR #6114](https://github.com/huggingface/diffusers/pull/6114).

 ### Example Images Mixing (with CoCa)
-
 ```python
 import requests
 from io import BytesIO
@@ -2164,7 +2012,6 @@ image = pipeline(
    num_inference_steps=50,
 )["images"][0]
 ```
-
 ![mixture_tiling_results](https://huggingface.co/datasets/kadirnar/diffusers_readme_images/resolve/main/mixture_tiling.png)

 ### TensorRT Inpainting Stable Diffusion Pipeline
@@ -2241,10 +2088,10 @@ output = pipeline(
    seed=5525475061,
 )["images"][0]
 ```
-
 ![Input_Image](https://huggingface.co/datasets/kadirnar/diffusers_readme_images/resolve/main/input_image.png)
 ![mixture_canvas_results](https://huggingface.co/datasets/kadirnar/diffusers_readme_images/resolve/main/canvas.png)

+
 ### IADB pipeline

 This pipeline is the implementation of the [α-(de)Blending: a Minimalist Deterministic Diffusion Model](https://arxiv.org/abs/2305.03486) paper.
@@ -2328,7 +2175,7 @@ pipe = pipe.to("cuda")
 num_images_per_prompt = 4

 # test inference pipeline
-# x y z, Polar angle (vertical rotation in degrees)  Azimuth angle (horizontal rotation in degrees)  Zoom (relative distance from center)
+# x y z, Polar angle (vertical rotation in degrees) 	Azimuth angle (horizontal rotation in degrees) 	Zoom (relative distance from center)
 query_pose1 = [-75.0, 100.0, 0.0]
 query_pose2 = [-20.0, 125.0, 0.0]
 query_pose3 = [-55.0, 90.0, 0.0]
@@ -2387,6 +2234,7 @@ for obj in range(bs):

 This pipeline uses the Reference Control. Refer to the [stable_diffusion_reference](https://github.com/huggingface/diffusers/blob/main/examples/community/README.md#stable-diffusion-reference).

+
 ```py
 import torch
 from PIL import Image
@@ -2431,6 +2279,7 @@ Output Image
 Reference Image
 ![reference_image](https://github.com/huggingface/diffusers/assets/34944964/449bdab6-e744-4fb2-9620-d4068d9a741b)

+
 Output Image

 `prompt: A dog`
@@ -2454,6 +2303,7 @@ FABRIC approach applicable to a wide range of popular diffusion models, which ex
 the self-attention layer present in the most widely used architectures to condition
 the diffusion process on a set of feedback images.

+
 ```python
 import requests
 import torch
@@ -2507,12 +2357,13 @@ image.save("black_to_blue.png")

 The original codebase can be found at [sd-fabric/fabric](https://github.com/sd-fabric/fabric), and available checkpoints are [dreamlike-art/dreamlike-photoreal-2.0](https://huggingface.co/dreamlike-art/dreamlike-photoreal-2.0), [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5), and [stabilityai/stable-diffusion-2-1](https://huggingface.co/stabilityai/stable-diffusion-2-1) (may give unexpected results).

-Let's have a look at the images (_512X512_)
+Let's have a look at the images (*512X512*)

 | Without Feedback            | With Feedback  (1st image)          |
 |---------------------|---------------------|
 | ![Image 1](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/fabric_wo_feedback.jpg) | ![Feedback Image 1](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/fabric_w_feedback.png) |

+
 ### Masked Im2Im Stable Diffusion Pipeline

 This pipeline reimplements sketch inpaint feature from A1111 for non-inpaint models. The following code reads two images, original and one with mask painted over it. It computes mask as a difference of two images and does the inpainting in the area defined by the mask.
@@ -2538,20 +2389,20 @@ result.images[0].save("result.png")

 original image mech.png

-<img src=<https://github.com/noskill/diffusers/assets/733626/10ad972d-d655-43cb-8de1-039e3d79e849> width="25%" >
+<img src=https://github.com/noskill/diffusers/assets/733626/10ad972d-d655-43cb-8de1-039e3d79e849 width="25%" >

 image with mask mech_painted.png

-<img src=<https://github.com/noskill/diffusers/assets/733626/c334466a-67fe-4377-9ff7-f46021b9c224> width="25%" >
+<img src=https://github.com/noskill/diffusers/assets/733626/c334466a-67fe-4377-9ff7-f46021b9c224 width="25%" >

 result:

-<img src=<https://github.com/noskill/diffusers/assets/733626/23a0a71d-51db-471e-926a-107ac62512a8> width="25%" >
+<img src=https://github.com/noskill/diffusers/assets/733626/23a0a71d-51db-471e-926a-107ac62512a8 width="25%" >
+

 ### Prompt2Prompt Pipeline

 Prompt2Prompt allows the following edits:
-
 - ReplaceEdit (change words in prompt)
 - ReplaceEdit with local blend (change words in prompt, keep image part unrelated to changes constant)
 - RefineEdit (add words to prompt)
@@ -2583,7 +2434,6 @@ outputs = pipe(prompt=prompts, height=512, width=512, num_inference_steps=50, cr
 And abbreviated examples for the other edits:

 `ReplaceEdit with local blend`
-
 ```python
 prompts = ["A turtle playing with a ball",
           "A monkey playing with a ball"]
@@ -2597,7 +2447,6 @@ cross_attention_kwargs = {
 ```

 `RefineEdit`
-
 ```python
 prompts = ["A turtle",
           "A turtle in a forest"]
@@ -2610,7 +2459,6 @@ cross_attention_kwargs = {
 ```

 `RefineEdit with local blend`
-
 ```python
 prompts = ["A turtle",
           "A turtle in a forest"]
@@ -2624,7 +2472,6 @@ cross_attention_kwargs = {
 ```

 `ReweightEdit`
-
 ```python
 prompts = ["A smiling turtle"] * 2

@@ -2641,7 +2488,7 @@ Side note: See [this GitHub gist](https://gist.github.com/UmerHA/b65bb5fb9626c9c

 ### Latent Consistency Pipeline

-Latent Consistency Models was proposed in [Latent Consistency Models: Synthesizing High-Resolution Images with Few-Step Inference](https://arxiv.org/abs/2310.04378) by _Simian Luo, Yiqin Tan, Longbo Huang, Jian Li, Hang Zhao_ from Tsinghua University.
+Latent Consistency Models was proposed in [Latent Consistency Models: Synthesizing High-Resolution Images with Few-Step Inference](https://arxiv.org/abs/2310.04378) by *Simian Luo, Yiqin Tan, Longbo Huang, Jian Li, Hang Zhao* from Tsinghua University.

 The abstract of the paper reads as follows:

@@ -2649,7 +2496,7 @@ The abstract of the paper reads as follows:

 The model can be used with `diffusers` as follows:

- *1. Load the model from the community pipeline.*
+ - *1. Load the model from the community pipeline.*

 ```py
 from diffusers import DiffusionPipeline
@@ -2676,6 +2523,8 @@ For any questions or feedback, feel free to reach out to [Simian Luo](https://gi

 You can also try this pipeline directly in the [🚀 official spaces](https://huggingface.co/spaces/SimianLuo/Latent_Consistency_Model).

+
+
 ### Latent Consistency Img2img Pipeline

 This pipeline extends the Latent Consistency Pipeline to allow it to take an input image.
@@ -2706,6 +2555,8 @@ num_inference_steps = 4
 images = pipe(prompt=prompt, image=input_image, strength=strength, num_inference_steps=num_inference_steps, guidance_scale=8.0, lcm_origin_steps=50, output_type="pil").images
 ```

+
+
 ### Latent Consistency Interpolation Pipeline

 This pipeline extends the Latent Consistency Pipeline to allow for interpolation of the latent space between multiple prompts. It is similar to the [Stable Diffusion Interpolate](https://github.com/huggingface/diffusers/blob/main/examples/community/interpolate_stable_diffusion.py) and [unCLIP Interpolate](https://github.com/huggingface/diffusers/blob/main/examples/community/unclip_text_interpolation.py) community pipelines.
@@ -2751,15 +2602,13 @@ images = pipe(
 assert len(images) == (len(prompts) - 1) * num_interpolation_steps
 ```

-### StableDiffusionUpscaleLDM3D Pipeline
-
+###  StableDiffusionUpscaleLDM3D Pipeline
 [LDM3D-VR](https://arxiv.org/pdf/2311.03226.pdf) is an extended version of LDM3D.

 The abstract from the paper is:
 *Latent diffusion models have proven to be state-of-the-art in the creation and manipulation of visual outputs. However, as far as we know, the generation of depth maps jointly with RGB is still limited. We introduce LDM3D-VR, a suite of diffusion models targeting virtual reality development that includes LDM3D-pano and LDM3D-SR. These models enable the generation of panoramic RGBD based on textual prompts and the upscaling of low-resolution inputs to high-resolution RGBD, respectively. Our models are fine-tuned from existing pretrained models on datasets containing panoramic/high-resolution RGB images, depth maps and captions. Both models are evaluated in comparison to existing related methods*

 Two checkpoints are available for use:
-
 - [ldm3d-pano](https://huggingface.co/Intel/ldm3d-pano). This checkpoint enables the generation of panoramic images and requires the StableDiffusionLDM3DPipeline pipeline to be used.
 - [ldm3d-sr](https://huggingface.co/Intel/ldm3d-sr). This checkpoint enables the upscaling of RGB and depth images. Can be used in cascade after the original LDM3D pipeline using the StableDiffusionUpscaleLDM3DPipeline pipeline.

@@ -2769,8 +2618,7 @@ import os
 import torch
 from diffusers import StableDiffusionLDM3DPipeline, DiffusionPipeline

-# Generate a rgb/depth output from LDM3D
-
+#Generate a rgb/depth output from LDM3D
 pipe_ldm3d = StableDiffusionLDM3DPipeline.from_pretrained("Intel/ldm3d-4c")
 pipe_ldm3d.to("cuda")

@@ -2780,8 +2628,8 @@ rgb_image, depth_image = output.rgb, output.depth
 rgb_image[0].save(f"lemons_ldm3d_rgb.jpg")
 depth_image[0].save(f"lemons_ldm3d_depth.png")

-# Upscale the previous output to a resolution of (1024, 1024)

+#Upscale the previous output to a resolution of (1024, 1024)
 pipe_ldm3d_upscale = DiffusionPipeline.from_pretrained("Intel/ldm3d-sr", custom_pipeline="pipeline_stable_diffusion_upscale_ldm3d")

 pipe_ldm3d_upscale.to("cuda")
@@ -2796,7 +2644,6 @@ upscaled_depth.save(f"upscaled_lemons_depth.png")
 ```

 ### ControlNet + T2I Adapter Pipeline
-
 This pipeline combines both ControlNet and T2IAdapter into a single pipeline, where the forward pass is executed once.
 It receives `control_image` and `adapter_image`, as well as `controlnet_conditioning_scale` and `adapter_conditioning_scale`, for the ControlNet and Adapter modules, respectively. Whenever `adapter_conditioning_scale = 0` or `controlnet_conditioning_scale = 0`, it will act as a full ControlNet module or as a full T2IAdapter module, respectively.

@@ -2865,7 +2712,6 @@ images[0].save("controlnet_and_adapter.png")
 ```

 ### ControlNet + T2I Adapter + Inpainting Pipeline
-
 ```py
 import cv2
 import numpy as np
@@ -2936,16 +2782,13 @@ images[0].save("controlnet_and_adapter_inpaint.png")
 ```

 ### Regional Prompting Pipeline
-
 This pipeline is a port of the [Regional Prompter extension](https://github.com/hako-mikan/sd-webui-regional-prompter) for [Stable Diffusion web UI](https://github.com/AUTOMATIC1111/stable-diffusion-webui) to diffusers.
 This code implements a pipeline for the Stable Diffusion model, enabling the division of the canvas into multiple regions, with different prompts applicable to each region. Users can specify regions in two ways: using `Cols` and `Rows` modes for grid-like divisions, or the `Prompt` mode for regions calculated based on prompts.

 ![sample](https://github.com/hako-mikan/sd-webui-regional-prompter/blob/imgs/rp_pipeline1.png)

 ### Usage
-
 ### Sample Code
-
 ```
 from examples.community.regional_prompting_stable_diffusion import RegionalPromptingStableDiffusionPipeline
 pipe = RegionalPromptingStableDiffusionPipeline.from_single_file(model_path, vae=vae)
@@ -2979,14 +2822,11 @@ for image in images:
    fileName = f'img-{time}-{i+1}.png'
    image.save(fileName)
 ```
-
 ### Cols, Rows mode
-
 In the Cols, Rows mode, you can split the screen vertically and horizontally and assign prompts to each region. The split ratio can be specified by 'div', and you can set the division ratio like '3;3;2' or '0.1;0.5'. Furthermore, as will be described later, you can also subdivide the split Cols, Rows to specify more complex regions.

 In this image, the image is divided into three parts, and a separate prompt is applied to each. The prompts are divided by 'BREAK', and each is applied to the respective region.
 ![sample](https://github.com/hako-mikan/sd-webui-regional-prompter/blob/imgs/rp_pipeline2.png)
-
 ```
 green hair twintail BREAK
 red blouse BREAK
@@ -2994,9 +2834,7 @@ blue skirt
 ```

 ### 2-Dimensional division
-
 The prompt consists of instructions separated by the term `BREAK` and is assigned to different regions of a two-dimensional space. The image is initially split in the main splitting direction, which in this case is rows, due to the presence of a single semicolon`;`, dividing the space into an upper and a lower section. Additional sub-splitting is then applied, indicated by commas. The upper row is split into ratios of `2:1:1`, while the lower row is split into a ratio of `4:6`. Rows themselves are split in a `1:2` ratio. According to the reference image, the blue sky is designated as the first region, green hair as the second, the bookshelf as the third, and so on, in a sequence based on their position from the top left. The terrarium is placed on the desk in the fourth region, and the orange dress and sofa are in the fifth region, conforming to their respective splits.
-
 ```
 rp_args = {
    "mode":"rows",
@@ -3011,16 +2849,12 @@ terrarium on desk BREAK
 orange dress and sofa
 """
 ```
-
 ![sample](https://github.com/hako-mikan/sd-webui-regional-prompter/blob/imgs/rp_pipeline4.png)

 ### Prompt Mode
-
 There are limitations to methods of specifying regions in advance. This is because specifying regions can be a hindrance when designating complex shapes or dynamic compositions. In the region specified by the prompt, the region is determined after the image generation has begun. This allows us to accommodate compositions and complex regions.
 For further information, see [here](https://github.com/hako-mikan/sd-webui-regional-prompter/blob/main/prompt_en.md).
-
 ### syntax
-
 ```
 baseprompt target1 target2 BREAK
 effect1, target1 BREAK
@@ -3034,13 +2868,10 @@ target2 baseprompt target1  BREAK
 effect1, target1 BREAK
 effect2 ,target2
 ```
-
 is also effective.

 ### Sample
-
 In this example, masks are calculated for shirt, tie, skirt, and color prompts are specified only for those regions.
-
 ```
 rp_args = {
    "mode":"prompt-ex",
@@ -3055,11 +2886,8 @@ green, tie BREAK
 blue , skirt
 """
 ```
-
 ![sample](https://github.com/hako-mikan/sd-webui-regional-prompter/blob/imgs/rp_pipeline3.png)
-
 ### threshold
-
 The threshold used to determine the mask created by the prompt. This can be set as many times as there are masks, as the range varies widely depending on the target prompt. If multiple regions are used, enter them separated by commas. For example, hair tends to be ambiguous and requires a small value, while face tends to be large and requires a larger value. These should be ordered by BREAK.

 ```
@@ -3067,56 +2895,44 @@ a lady ,hair, face  BREAK
 red, hair BREAK
 tanned ,face
 ```
-
 `threshold : 0.4,0.6`
 If only one input is given for multiple regions, they are all assumed to be the same value.

 ### Prompt and Prompt-EX
-
 The difference is that in Prompt, duplicate regions are added, whereas in Prompt-EX, duplicate regions are overwritten sequentially. Since they are processed in order, setting a TARGET with a large region first makes it easier for the effect of small regions to remain unmuffled.

 ### Accuracy
-
 In the case of a 512 x 512 image, Attention mode reduces the size of the region to about 8 x 8 pixels deep in the U-Net, so that small regions get mixed up; Latent mode calculates 64*64, so that the region is exact.
-
 ```
 girl hair twintail frills,ribbons, dress, face BREAK
 girl, ,face
 ```

 ### Mask
-
 When an image is generated, the generated mask is displayed. It is generated at the same size as the image, but is actually used at a much smaller size.

+
 ### Use common prompt
-
 You can attach the prompt up to ADDCOMM to all prompts by separating it first with ADDCOMM. This is useful when you want to include elements common to all regions. For example, when generating pictures of three people with different appearances, it's necessary to include the instruction of 'three people' in all regions. It's also useful when inserting quality tags and other things. For example, if you write as follows:
-
 ```
 best quality, 3persons in garden, ADDCOMM
 a girl white dress BREAK
 a boy blue shirt BREAK
 an old man red suit
 ```
-
 If common is enabled, this prompt is converted to the following:
-
 ```
 best quality, 3persons in garden, a girl white dress BREAK
 best quality, 3persons in garden, a boy blue shirt BREAK
 best quality, 3persons in garden, an old man red suit
 ```
-
 ### Negative prompt
-
 Negative prompts are equally effective across all regions, but it is possible to set region-specific prompts for negative prompts as well. The number of BREAKs must be the same as the number of prompts. If the number of prompts does not match, the negative prompts will be used without being divided into regions.

 ### Parameters
-
 To activate Regional Prompter, it is necessary to enter settings in rp_args. The items that can be set are as follows. rp_args is a dictionary type.

 ### Input Parameters
-
 Parameters are specified through `rp_args` (dictionary type).

 ```
@@ -3128,22 +2944,20 @@ rp_args = {
 pipe(prompt =prompt, rp_args = rp_args)
 ```

-### Required Parameters

+
+### Required Parameters
 - `mode`: Specifies the method for defining regions. Choose from `Cols`, `Rows`, `Prompt` or `Prompt-Ex`. This parameter is case-insensitive.
 - `divide`: Used in `Cols` and `Rows` modes. Details on how to specify this are provided under the respective `Cols` and `Rows` sections.
 - `th`: Used in `Prompt` mode. The method of specification is detailed under the `Prompt` section.

 ### Optional Parameters
-
 - `save_mask`: In `Prompt` mode, choose whether to output the generated mask along with the image. The default is `False`.

 The Pipeline supports `compel` syntax. Input prompts using the `compel` structure will be automatically applied and processed.

 ### Diffusion Posterior Sampling Pipeline
-
- Reference paper
-
+* Reference paper
    ```
    @article{chung2022diffusion,
    title={Diffusion posterior sampling for general noisy inverse problems},
@@ -3152,12 +2966,9 @@ The Pipeline supports `compel` syntax. Input prompts using the `compel` structur
    year={2022}
    }
    ```
-
- This pipeline allows zero-shot conditional sampling from the posterior distribution $p(x|y)$, given observation on $y$, unconditional generative model $p(x)$ and differentiable operator $y=f(x)$.
-
- For example, $f(.)$ can be downsample operator, then $y$ is a downsampled image, and the pipeline becomes a super-resolution pipeline.
- To use this pipeline, you need to know your operator $f(.)$ and corrupted image $y$, and pass them during the call. For example, as in the main function of dps_pipeline.py, you need to first define the Gaussian blurring operator $f(.)$. The operator should be a callable nn.Module, with all the parameter gradient disabled:
-
+* This pipeline allows zero-shot conditional sampling from the posterior distribution $p(x|y)$, given observation on $y$, unconditional generative model $p(x)$ and differentiable operator $y=f(x)$.
+* For example, $f(.)$ can be a downsampling operator, then $y$ is a downsampled image, and the pipeline becomes a super-resolution pipeline.
+* To use this pipeline, you need to know your operator $f(.)$ and corrupted image $y$, and pass them during the call. For example, as in the main function of dps_pipeline.py, you need to first define the Gaussian blurring operator $f(.)$. The operator should be a callable nn.Module, with all the parameter gradient disabled:
    ```python
    import torch.nn.functional as F
    import scipy
@@ -3227,9 +3038,7 @@ The Pipeline supports `compel` syntax. Input prompts using the `compel` structur
        def get_kernel(self):
            return self.kernel.view(1, 1, self.kernel_size, self.kernel_size)
    ```
-
- Next, you should obtain the corrupted image $y$ by the operator. In this example, we generate $y$ from the source image $x$. However in practice, having the operator $f(.)$ and corrupted image $y$ is enough:
-
+* Next, you should obtain the corrupted image $y$ by the operator. In this example, we generate $y$ from the source image $x$. However in practice, having the operator $f(.)$ and corrupted image $y$ is enough:
    ```python
    # set up source image
    src = Image.open('sample.png')
@@ -3247,23 +3056,18 @@ The Pipeline supports `compel` syntax. Input prompts using the `compel` structur
    save_image((src+1.0)/2.0, "dps_src.png")
    save_image((measurement+1.0)/2.0, "dps_mea.png")
    ```
-
- We provide an example pair of saved source and corrupted images, using the Gaussian blur operator above
-  - Source image:
-  - ![sample](https://github.com/tongdaxu/Images/assets/22267548/4d2a1216-08d1-4aeb-9ce3-7a2d87561d65)
-  - Gaussian blurred image:
-  - ![ddpm_generated_image](https://github.com/tongdaxu/Images/assets/22267548/65076258-344b-4ed8-b704-a04edaade8ae)
-  - You can download those image to run the example on your own.
-
- Next, we need to define a loss function used for diffusion posterior sample. For most of the cases, the RMSE is fine:
-
+* We provide an example pair of saved source and corrupted images, using the Gaussian blur operator above
+    * Source image:
+    * ![sample](https://github.com/tongdaxu/Images/assets/22267548/4d2a1216-08d1-4aeb-9ce3-7a2d87561d65)
+    * Gaussian blurred image:
+    * ![ddpm_generated_image](https://github.com/tongdaxu/Images/assets/22267548/65076258-344b-4ed8-b704-a04edaade8ae)
+    * You can download those images to run the example on your own.
+* Next, we need to define a loss function used for diffusion posterior sample. For most of the cases, the RMSE is fine:
    ```python
    def RMSELoss(yhat, y):
        return torch.sqrt(torch.sum((yhat-y)**2))
    ```
-
- And next, as any other diffusion models, we need the score estimator and scheduler. As we are working with $256x256$ face images, we use ddmp-celebahq-256:
-
+* And next, as with any other diffusion model, we need the score estimator and scheduler. As we are working with $256 \times 256$ face images, we use ddpm-celebahq-256:
    ```python
    # set up scheduler
    scheduler = DDPMScheduler.from_pretrained("google/ddpm-celebahq-256")
@@ -3272,9 +3076,7 @@ The Pipeline supports `compel` syntax. Input prompts using the `compel` structur
    # set up model
    model = UNet2DModel.from_pretrained("google/ddpm-celebahq-256").to("cuda")
    ```
-
- And finally, run the pipeline:
-
+* And finally, run the pipeline:
    ```python
    # finally, the pipeline
    dpspipe = DPSPipeline(model, scheduler)
@@ -3286,17 +3088,15 @@ The Pipeline supports `compel` syntax. Input prompts using the `compel` structur
    ).images[0]
    image.save("dps_generated_image.png")
    ```
-
- The zeta is a hyperparameter that is in range of $[0,1]$. It need to be tuned for best effect. By setting zeta=1, you should be able to have the reconstructed result:
-  - Reconstructed image:
-  - ![sample](https://github.com/tongdaxu/Images/assets/22267548/0ceb5575-d42e-4f0b-99c0-50e69c982209)
-
- The reconstruction is perceptually similar to the source image, but different in details.
- In dps_pipeline.py, we also provide a super-resolution example, which should produce:
-  - Downsampled image:
-  - ![dps_mea](https://github.com/tongdaxu/Images/assets/22267548/ff6a33d6-26f0-42aa-88ce-f8a76ba45a13)
-  - Reconstructed image:
-  - ![dps_generated_image](https://github.com/tongdaxu/Images/assets/22267548/b74f084d-93f4-4845-83d8-44c0fa758a5f)
+* The zeta is a hyperparameter in the range $[0,1]$. It needs to be tuned for best effect. By setting zeta=1, you should be able to obtain the reconstructed result:
+    * Reconstructed image:
+    * ![sample](https://github.com/tongdaxu/Images/assets/22267548/0ceb5575-d42e-4f0b-99c0-50e69c982209)
+* The reconstruction is perceptually similar to the source image, but different in details.
+* In dps_pipeline.py, we also provide a super-resolution example, which should produce:
+    * Downsampled image:
+    * ![dps_mea](https://github.com/tongdaxu/Images/assets/22267548/ff6a33d6-26f0-42aa-88ce-f8a76ba45a13)
+    * Reconstructed image:
+    * ![dps_generated_image](https://github.com/tongdaxu/Images/assets/22267548/b74f084d-93f4-4845-83d8-44c0fa758a5f)

 ### AnimateDiff ControlNet Pipeline

@@ -3440,7 +3240,6 @@ export_to_gif(result.frames[0], "result.gif")

 This pipeline is the official implementation of [DemoFusion: Democratising High-Resolution Image Generation With No $$$](https://arxiv.org/abs/2311.16973).
 The original repo can be found at [repo](https://github.com/PRIS-CV/DemoFusion).
-
 - `view_batch_size` (`int`, defaults to 16):
  The batch size for multiple denoising paths. Typically, a larger batch size can result in higher efficiency but comes with increased GPU memory requirements.

@@ -3464,7 +3263,6 @@ The original repo can be found at [repo](https://github.com/PRIS-CV/DemoFusion).

 - `show_image` (`bool`, defaults to False):
  Determine whether to show intermediate results during generation.
-
 ```py
 from diffusers import DiffusionPipeline

@@ -3496,9 +3294,7 @@ images = pipe(
    show_image=True
 )
 ```
-
 You can display and save the generated images as:
-
 ```py
 def image_grid(imgs, save_path=None):

@@ -3522,7 +3318,6 @@ def image_grid(imgs, save_path=None):

 image_grid(images, save_path="./outputs/")
 ```
-
 ![output_example](https://github.com/PRIS-CV/DemoFusion/blob/main/output_example.png)

 ### SDE Drag pipeline
@@ -3565,7 +3360,6 @@ output_image.save("./output.png")
 ```

 ### Instaflow Pipeline
-
 InstaFlow is an ultra-fast, one-step image generator that achieves image quality close to Stable Diffusion, significantly reducing the demand of computational resources. This efficiency is made possible through a recent [Rectified Flow](https://github.com/gnobitab/RectifiedFlow) technique, which trains probability flows with straight trajectories, hence inherently requiring only a single step for fast inference.

 ```python
@@ -3582,10 +3376,9 @@ images = pipe(prompt=prompt,
            guidance_scale=0.0).images
 images[0].save("./image.png")
 ```
-
 ![image1](https://huggingface.co/datasets/ayushtues/instaflow_images/resolve/main/instaflow_cat.png)

-You can also combine it with LORA out of the box, like <https://huggingface.co/artificialguybr/logo-redmond-1-5v-logo-lora-for-liberteredmond-sd-1-5>, to unlock cool use cases in single step!
+You can also combine it with LORA out of the box, like https://huggingface.co/artificialguybr/logo-redmond-1-5v-logo-lora-for-liberteredmond-sd-1-5, to unlock cool use cases in single step!

 ```python
 from diffusers import DiffusionPipeline
@@ -3601,15 +3394,12 @@ images = pipe(prompt=prompt,
            guidance_scale=0.0).images
 images[0].save("./image.png")
 ```
-
 ![image0](https://huggingface.co/datasets/ayushtues/instaflow_images/resolve/main/instaflow_logo.png)

 ### Null-Text Inversion pipeline

 This pipeline provides null-text inversion for editing real images. It enables null-text optimization, and DDIM reconstruction with or without null-text optimization. No prompt-to-prompt code is implemented, as there is already a Prompt2PromptPipeline.
-
- Reference paper
-
+* Reference paper
    ```@article{hertz2022prompt,
  title={Prompt-to-prompt image editing with cross attention control},
  author={Hertz, Amir and Mokady, Ron and Tenenbaum, Jay and Aberman, Kfir and Pritch, Yael and Cohen-Or, Daniel},
@@ -3819,10 +3609,12 @@ export_to_gif(frames, "animation.gif")
 IP Adapter FaceID is an experimental IP Adapter model that uses image embeddings generated by `insightface`, so no image encoder needs to be loaded.
 You need to install `insightface` and all its requirements to use this model.
 You must pass the image embedding tensor as `image_embeds` to the StableDiffusionPipeline instead of `ip_adapter_image`.
+You have to disable PEFT BACKEND in order to load weights.
 You can find more results [here](https://github.com/huggingface/diffusers/pull/6276).

 ```py
 import diffusers
+diffusers.utils.USE_PEFT_BACKEND = False
 import torch
 from diffusers.utils import load_image
 import cv2
@@ -321,12 +321,7 @@ class ComposableStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin)
            )

    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (
-            batch_size,
-            num_channels_latents,
-            int(height) // self.vae_scale_factor,
-            int(width) // self.vae_scale_factor,
-        )
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
        if latents is None:
            if device.type == "mps":
                # randn does not work reproducibly on mps
@@ -500,12 +500,7 @@ class GlueGenStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin, Lo
                )

    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (
-            batch_size,
-            num_channels_latents,
-            int(height) // self.vae_scale_factor,
-            int(width) // self.vae_scale_factor,
-        )
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -1,994 +0,0 @@
-import math
-import numbers
-from typing import Any, Callable, Dict, List, Optional, Union
-
-import torch
-import torch.nn.functional as F
-from torch import nn
-
-from diffusers.image_processor import PipelineImageInput
-from diffusers.models import AsymmetricAutoencoderKL, ImageProjection
-from diffusers.models.attention_processor import Attention, AttnProcessor
-from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
-from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint import (
-    StableDiffusionInpaintPipeline,
-    retrieve_timesteps,
-)
-from diffusers.utils import deprecate
-
-
-class RASGAttnProcessor:
-    def __init__(self, mask, token_idx, scale_factor):
-        self.attention_scores = None  # Stores the last output of the similarity matrix here. Each layer will get its own RASGAttnProcessor assigned
-        self.mask = mask
-        self.token_idx = token_idx
-        self.scale_factor = scale_factor
-        self.mask_resoltuion = mask.shape[-1] * mask.shape[-2]  # 64 x 64 if the image is 512x512
-
-    def __call__(
-        self,
-        attn: Attention,
-        hidden_states: torch.FloatTensor,
-        encoder_hidden_states: Optional[torch.FloatTensor] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        temb: Optional[torch.FloatTensor] = None,
-        scale: float = 1.0,
-    ) -> torch.Tensor:
-        # Same as the default AttnProcessor up untill the part where similarity matrix gets saved
-        downscale_factor = self.mask_resoltuion // hidden_states.shape[1]
-        residual = hidden_states
-
-        if attn.spatial_norm is not None:
-            hidden_states = attn.spatial_norm(hidden_states, temb)
-
-        input_ndim = hidden_states.ndim
-
-        if input_ndim == 4:
-            batch_size, channel, height, width = hidden_states.shape
-            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
-
-        batch_size, sequence_length, _ = (
-            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
-        )
-        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
-
-        if attn.group_norm is not None:
-            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
-
-        query = attn.to_q(hidden_states)
-
-        if encoder_hidden_states is None:
-            encoder_hidden_states = hidden_states
-        elif attn.norm_cross:
-            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
-
-        key = attn.to_k(encoder_hidden_states)
-        value = attn.to_v(encoder_hidden_states)
-
-        query = attn.head_to_batch_dim(query)
-        key = attn.head_to_batch_dim(key)
-        value = attn.head_to_batch_dim(value)
-
-        # Automatically recognize the resolution and save the attention similarity values
-        # We need to use the values before the softmax function, hence the rewritten get_attention_scores function.
-        if downscale_factor == self.scale_factor**2:
-            self.attention_scores = get_attention_scores(attn, query, key, attention_mask)
-            attention_probs = self.attention_scores.softmax(dim=-1)
-            attention_probs = attention_probs.to(query.dtype)
-        else:
-            attention_probs = attn.get_attention_scores(query, key, attention_mask)  # Original code
-
-        hidden_states = torch.bmm(attention_probs, value)
-        hidden_states = attn.batch_to_head_dim(hidden_states)
-
-        # linear proj
-        hidden_states = attn.to_out[0](hidden_states)
-        # dropout
-        hidden_states = attn.to_out[1](hidden_states)
-
-        if input_ndim == 4:
-            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
-
-        if attn.residual_connection:
-            hidden_states = hidden_states + residual
-
-        hidden_states = hidden_states / attn.rescale_output_factor
-
-        return hidden_states
-
-
-class PAIntAAttnProcessor:
-    def __init__(self, transformer_block, mask, token_idx, do_classifier_free_guidance, scale_factors):
-        self.transformer_block = transformer_block  # Stores the parent transformer block.
-        self.mask = mask
-        self.scale_factors = scale_factors
-        self.do_classifier_free_guidance = do_classifier_free_guidance
-        self.token_idx = token_idx
-        self.shape = mask.shape[2:]
-        self.mask_resoltuion = mask.shape[-1] * mask.shape[-2]  # 64 x 64
-        self.default_processor = AttnProcessor()
-
-    def __call__(
-        self,
-        attn: Attention,
-        hidden_states: torch.FloatTensor,
-        encoder_hidden_states: Optional[torch.FloatTensor] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        temb: Optional[torch.FloatTensor] = None,
-        scale: float = 1.0,
-    ) -> torch.Tensor:
-        # Automatically recognize the resolution of the current attention layer and resize the masks accordingly
-        downscale_factor = self.mask_resoltuion // hidden_states.shape[1]
-
-        mask = None
-        for factor in self.scale_factors:
-            if downscale_factor == factor**2:
-                shape = (self.shape[0] // factor, self.shape[1] // factor)
-                mask = F.interpolate(self.mask, shape, mode="bicubic")  # B, 1, H, W
-                break
-        if mask is None:
-            return self.default_processor(attn, hidden_states, encoder_hidden_states, attention_mask, temb, scale)
-
-        # STARTS HERE
-        residual = hidden_states
-        # Save the input hidden_states for later use
-        input_hidden_states = hidden_states
-
-        # ================================================== #
-        # =============== SELF ATTENTION 1 ================= #
-        # ================================================== #
-
-        if attn.spatial_norm is not None:
-            hidden_states = attn.spatial_norm(hidden_states, temb)
-
-        input_ndim = hidden_states.ndim
-
-        if input_ndim == 4:
-            batch_size, channel, height, width = hidden_states.shape
-            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
-
-        batch_size, sequence_length, _ = (
-            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
-        )
-        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
-
-        if attn.group_norm is not None:
-            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
-
-        query = attn.to_q(hidden_states)
-
-        if encoder_hidden_states is None:
-            encoder_hidden_states = hidden_states
-        elif attn.norm_cross:
-            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
-
-        key = attn.to_k(encoder_hidden_states)
-        value = attn.to_v(encoder_hidden_states)
-
-        query = attn.head_to_batch_dim(query)
-        key = attn.head_to_batch_dim(key)
-        value = attn.head_to_batch_dim(value)
-
-        # self_attention_probs = attn.get_attention_scores(query, key, attention_mask) # We can't use post-softmax attention scores in this case
-        self_attention_scores = get_attention_scores(
-            attn, query, key, attention_mask
-        )  # The custom function returns pre-softmax probabilities
-        self_attention_probs = self_attention_scores.softmax(
-            dim=-1
-        )  # Manually compute the probabilities here, the scores will be reused in the second part of PAIntA
-        self_attention_probs = self_attention_probs.to(query.dtype)
-
-        hidden_states = torch.bmm(self_attention_probs, value)
-        hidden_states = attn.batch_to_head_dim(hidden_states)
-
-        # linear proj
-        hidden_states = attn.to_out[0](hidden_states)
-        # dropout
-        hidden_states = attn.to_out[1](hidden_states)
-
-        # x = x + self.attn1(self.norm1(x))
-
-        if input_ndim == 4:
-            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
-
-        if attn.residual_connection:  # So many residuals everywhere
-            hidden_states = hidden_states + residual
-
-        self_attention_output_hidden_states = hidden_states / attn.rescale_output_factor
-
-        # ================================================== #
-        # ============ BasicTransformerBlock =============== #
-        # ================================================== #
-        # We use a hack by running the code from the BasicTransformerBlock that is between Self and Cross attentions here
-        # The other option would've been modifying the BasicTransformerBlock and adding this functionality here.
-        # I assumed that changing the BasicTransformerBlock would have been a bigger deal and decided to use this hack isntead.
-
-        # The SelfAttention block recieves the normalized latents from the BasicTransformerBlock,
-        # But the residual of the output is the non-normalized version.
-        # Therefore we unnormalize the input hidden state here
-        unnormalized_input_hidden_states = (
-            input_hidden_states + self.transformer_block.norm1.bias
-        ) * self.transformer_block.norm1.weight
-
-        # TODO: return if neccessary
-        # if self.use_ada_layer_norm_zero:
-        #     attn_output = gate_msa.unsqueeze(1) * attn_output
-        # elif self.use_ada_layer_norm_single:
-        #     attn_output = gate_msa * attn_output
-
-        transformer_hidden_states = self_attention_output_hidden_states + unnormalized_input_hidden_states
-        if transformer_hidden_states.ndim == 4:
-            transformer_hidden_states = transformer_hidden_states.squeeze(1)
-
-        # TODO: return if neccessary
-        # 2.5 GLIGEN Control
-        # if gligen_kwargs is not None:
-        #     transformer_hidden_states = self.fuser(transformer_hidden_states, gligen_kwargs["objs"])
-        # NOTE: we experimented with using GLIGEN and HDPainter together, the results were not that great
-
-        # 3. Cross-Attention
-        if self.transformer_block.use_ada_layer_norm:
-            # transformer_norm_hidden_states = self.transformer_block.norm2(transformer_hidden_states, timestep)
-            raise NotImplementedError()
-        elif self.transformer_block.use_ada_layer_norm_zero or self.transformer_block.use_layer_norm:
-            transformer_norm_hidden_states = self.transformer_block.norm2(transformer_hidden_states)
-        elif self.transformer_block.use_ada_layer_norm_single:
-            # For PixArt norm2 isn't applied here:
-            # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
-            transformer_norm_hidden_states = transformer_hidden_states
-        elif self.transformer_block.use_ada_layer_norm_continuous:
-            # transformer_norm_hidden_states = self.transformer_block.norm2(transformer_hidden_states, added_cond_kwargs["pooled_text_emb"])
-            raise NotImplementedError()
-        else:
-            raise ValueError("Incorrect norm")
-
-        if self.transformer_block.pos_embed is not None and self.transformer_block.use_ada_layer_norm_single is False:
-            transformer_norm_hidden_states = self.transformer_block.pos_embed(transformer_norm_hidden_states)
-
-        # ================================================== #
-        # ================= CROSS ATTENTION ================ #
-        # ================================================== #
-
-        # We do an initial pass of the CrossAttention up to obtaining the similarity matrix here.
-        # The similarity matrix is used to obtain scaling coefficients for the attention matrix of the self attention
-        # We reuse the previously computed self-attention matrix, and only repeat the steps after the softmax
-
-        cross_attention_input_hidden_states = (
-            transformer_norm_hidden_states  # Renaming the variable for the sake of readability
-        )
-
-        # TODO: check if classifier_free_guidance is being used before splitting here
-        if self.do_classifier_free_guidance:
-            # Our scaling coefficients depend only on the conditional part, so we split the inputs
-            (
-                _cross_attention_input_hidden_states_unconditional,
-                cross_attention_input_hidden_states_conditional,
-            ) = cross_attention_input_hidden_states.chunk(2)
-
-            # Same split for the encoder_hidden_states i.e. the tokens
-            # Since the SelfAttention processors don't get the encoder states as input, we inject them into the processor in the begining.
-            _encoder_hidden_states_unconditional, encoder_hidden_states_conditional = self.encoder_hidden_states.chunk(
-                2
-            )
-        else:
-            cross_attention_input_hidden_states_conditional = cross_attention_input_hidden_states
-            encoder_hidden_states_conditional = self.encoder_hidden_states.chunk(2)
-
-        # Rename the variables for the sake of readability
-        # The part below is the beginning of the __call__ function of the following CrossAttention layer
-        cross_attention_hidden_states = cross_attention_input_hidden_states_conditional
-        cross_attention_encoder_hidden_states = encoder_hidden_states_conditional
-
-        attn2 = self.transformer_block.attn2
-
-        if attn2.spatial_norm is not None:
-            cross_attention_hidden_states = attn2.spatial_norm(cross_attention_hidden_states, temb)
-
-        input_ndim = cross_attention_hidden_states.ndim
-
-        if input_ndim == 4:
-            batch_size, channel, height, width = cross_attention_hidden_states.shape
-            cross_attention_hidden_states = cross_attention_hidden_states.view(
-                batch_size, channel, height * width
-            ).transpose(1, 2)
-
-        (
-            batch_size,
-            sequence_length,
-            _,
-        ) = cross_attention_hidden_states.shape  # It is definitely a cross attention, so no need for an if block
-        # TODO: change the attention_mask here
-        attention_mask = attn2.prepare_attention_mask(
-            None, sequence_length, batch_size
-        )  # I assume the attention mask is the same...
-
-        if attn2.group_norm is not None:
-            cross_attention_hidden_states = attn2.group_norm(cross_attention_hidden_states.transpose(1, 2)).transpose(
-                1, 2
-            )
-
-        query2 = attn2.to_q(cross_attention_hidden_states)
-
-        if attn2.norm_cross:
-            cross_attention_encoder_hidden_states = attn2.norm_encoder_hidden_states(
-                cross_attention_encoder_hidden_states
-            )
-
-        key2 = attn2.to_k(cross_attention_encoder_hidden_states)
-        query2 = attn2.head_to_batch_dim(query2)
-        key2 = attn2.head_to_batch_dim(key2)
-
-        cross_attention_probs = attn2.get_attention_scores(query2, key2, attention_mask)
-
-        # CrossAttention ends here, the remaining part is not used
-
-        # ================================================== #
-        # ================ SELF ATTENTION 2 ================ #
-        # ================================================== #
-        # DEJA VU!
-
-        mask = (mask > 0.5).to(self_attention_output_hidden_states.dtype)
-        m = mask.to(self_attention_output_hidden_states.device)
-        # m = rearrange(m, 'b c h w -> b (h w) c').contiguous()
-        m = m.permute(0, 2, 3, 1).reshape((m.shape[0], -1, m.shape[1])).contiguous()  # B HW 1
-        m = torch.matmul(m, m.permute(0, 2, 1)) + (1 - m)
-
-        # # Compute scaling coefficients for the similarity matrix
-        # # Select the cross attention values for the correct tokens only!
-        # cross_attention_probs = cross_attention_probs.mean(dim = 0)
-        # cross_attention_probs = cross_attention_probs[:, self.token_idx].sum(dim=1)
-
-        # cross_attention_probs = cross_attention_probs.reshape(shape)
-        # gaussian_smoothing = GaussianSmoothing(channels=1, kernel_size=3, sigma=0.5, dim=2).to(self_attention_output_hidden_states.device)
-        # cross_attention_probs = gaussian_smoothing(cross_attention_probs.unsqueeze(0))[0] # optional smoothing
-        # cross_attention_probs = cross_attention_probs.reshape(-1)
-        # cross_attention_probs = ((cross_attention_probs - torch.median(cross_attention_probs.ravel())) / torch.max(cross_attention_probs.ravel())).clip(0, 1)
-
-        # c = (1 - m) * cross_attention_probs.reshape(1, 1, -1) + m # PAIntA scaling coefficients
-
-        # Compute scaling coefficients for the similarity matrix
-        # Select the cross attention values for the correct tokens only!
-
-        batch_size, dims, channels = cross_attention_probs.shape
-        batch_size = batch_size // attn.heads
-        cross_attention_probs = cross_attention_probs.reshape((batch_size, attn.heads, dims, channels))  # B, D, HW, T
-
-        cross_attention_probs = cross_attention_probs.mean(dim=1)  # B, HW, T
-        cross_attention_probs = cross_attention_probs[..., self.token_idx].sum(dim=-1)  # B, HW
-        cross_attention_probs = cross_attention_probs.reshape((batch_size,) + shape)  # , B, H, W
-
-        gaussian_smoothing = GaussianSmoothing(channels=1, kernel_size=3, sigma=0.5, dim=2).to(
-            self_attention_output_hidden_states.device
-        )
-        cross_attention_probs = gaussian_smoothing(cross_attention_probs[:, None])[:, 0]  # optional smoothing B, H, W
-
-        # Median normalization
-        cross_attention_probs = cross_attention_probs.reshape(batch_size, -1)  # B, HW
-        cross_attention_probs = (
-            cross_attention_probs - cross_attention_probs.median(dim=-1, keepdim=True).values
-        ) / cross_attention_probs.max(dim=-1, keepdim=True).values
-        cross_attention_probs = cross_attention_probs.clip(0, 1)
-
-        c = (1 - m) * cross_attention_probs.reshape(batch_size, 1, -1) + m
-        c = c.repeat_interleave(attn.heads, 0)  # BD, HW
-        if self.do_classifier_free_guidance:
-            c = torch.cat([c, c])  # 2BD, HW
-
-        # Rescaling the original self-attention matrix
-        self_attention_scores_rescaled = self_attention_scores * c
-        self_attention_probs_rescaled = self_attention_scores_rescaled.softmax(dim=-1)
-
-        # Continuing the self attention normally using the new matrix
-        hidden_states = torch.bmm(self_attention_probs_rescaled, value)
-        hidden_states = attn.batch_to_head_dim(hidden_states)
-
-        # linear proj
-        hidden_states = attn.to_out[0](hidden_states)
-        # dropout
-        hidden_states = attn.to_out[1](hidden_states)
-
-        if input_ndim == 4:
-            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
-
-        if attn.residual_connection:
-            hidden_states = hidden_states + input_hidden_states
-
-        hidden_states = hidden_states / attn.rescale_output_factor
-
-        return hidden_states
-
-
-class StableDiffusionHDPainterPipeline(StableDiffusionInpaintPipeline):
-    def get_tokenized_prompt(self, prompt):
-        out = self.tokenizer(prompt)
-        return [self.tokenizer.decode(x) for x in out["input_ids"]]
-
-    def init_attn_processors(
-        self,
-        mask,
-        token_idx,
-        use_painta=True,
-        use_rasg=True,
-        painta_scale_factors=[2, 4],  # 64x64 -> [16x16, 32x32]
-        rasg_scale_factor=4,  # 64x64 -> 16x16
-        self_attention_layer_name="attn1",
-        cross_attention_layer_name="attn2",
-        list_of_painta_layer_names=None,
-        list_of_rasg_layer_names=None,
-    ):
-        default_processor = AttnProcessor()
-        width, height = mask.shape[-2:]
-        width, height = width // self.vae_scale_factor, height // self.vae_scale_factor
-
-        painta_scale_factors = [x * self.vae_scale_factor for x in painta_scale_factors]
-        rasg_scale_factor = self.vae_scale_factor * rasg_scale_factor
-
-        attn_processors = {}
-        for x in self.unet.attn_processors:
-            if (list_of_painta_layer_names is None and self_attention_layer_name in x) or (
-                list_of_painta_layer_names is not None and x in list_of_painta_layer_names
-            ):
-                if use_painta:
-                    transformer_block = self.unet.get_submodule(x.replace(".attn1.processor", ""))
-                    attn_processors[x] = PAIntAAttnProcessor(
-                        transformer_block, mask, token_idx, self.do_classifier_free_guidance, painta_scale_factors
-                    )
-                else:
-                    attn_processors[x] = default_processor
-            elif (list_of_rasg_layer_names is None and cross_attention_layer_name in x) or (
-                list_of_rasg_layer_names is not None and x in list_of_rasg_layer_names
-            ):
-                if use_rasg:
-                    attn_processors[x] = RASGAttnProcessor(mask, token_idx, rasg_scale_factor)
-                else:
-                    attn_processors[x] = default_processor
-
-        self.unet.set_attn_processor(attn_processors)
-        # import json
-        # with open('/home/hayk.manukyan/repos/diffusers/debug.txt', 'a')  as f:
-        #     json.dump({x:str(y) for x,y in self.unet.attn_processors.items()}, f, indent=4)
-
-    @torch.no_grad()
-    def __call__(
-        self,
-        prompt: Union[str, List[str]] = None,
-        image: PipelineImageInput = None,
-        mask_image: PipelineImageInput = None,
-        masked_image_latents: torch.FloatTensor = None,
-        height: Optional[int] = None,
-        width: Optional[int] = None,
-        padding_mask_crop: Optional[int] = None,
-        strength: float = 1.0,
-        num_inference_steps: int = 50,
-        timesteps: List[int] = None,
-        guidance_scale: float = 7.5,
-        positive_prompt: Optional[str] = "",
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        num_images_per_prompt: Optional[int] = 1,
-        eta: float = 0.01,
-        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        ip_adapter_image: Optional[PipelineImageInput] = None,
-        output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
-        clip_skip: int = None,
-        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
-        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
-        use_painta=True,
-        use_rasg=True,
-        self_attention_layer_name=".attn1",
-        cross_attention_layer_name=".attn2",
-        painta_scale_factors=[2, 4],  # 16 x 16 and 32 x 32
-        rasg_scale_factor=4,  # 16x16 by default
-        list_of_painta_layer_names=None,
-        list_of_rasg_layer_names=None,
-        **kwargs,
-    ):
-        callback = kwargs.pop("callback", None)
-        callback_steps = kwargs.pop("callback_steps", None)
-
-        if callback is not None:
-            deprecate(
-                "callback",
-                "1.0.0",
-                "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
-            )
-        if callback_steps is not None:
-            deprecate(
-                "callback_steps",
-                "1.0.0",
-                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
-            )
-
-        # 0. Default height and width to unet
-        height = height or self.unet.config.sample_size * self.vae_scale_factor
-        width = width or self.unet.config.sample_size * self.vae_scale_factor
-
-        #
-        prompt_no_positives = prompt
-        if isinstance(prompt, list):
-            prompt = [x + positive_prompt for x in prompt]
-        else:
-            prompt = prompt + positive_prompt
-
-        # 1. Check inputs
-        self.check_inputs(
-            prompt,
-            image,
-            mask_image,
-            height,
-            width,
-            strength,
-            callback_steps,
-            negative_prompt,
-            prompt_embeds,
-            negative_prompt_embeds,
-            callback_on_step_end_tensor_inputs,
-            padding_mask_crop,
-        )
-
-        self._guidance_scale = guidance_scale
-        self._clip_skip = clip_skip
-        self._cross_attention_kwargs = cross_attention_kwargs
-        self._interrupt = False
-
-        # 2. Define call parameters
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            batch_size = prompt_embeds.shape[0]
-
-        # assert batch_size == 1, "Does not work with batch size > 1 currently"
-
-        device = self._execution_device
-
-        # 3. Encode input prompt
-        text_encoder_lora_scale = (
-            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
-        )
-        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
-            prompt,
-            device,
-            num_images_per_prompt,
-            self.do_classifier_free_guidance,
-            negative_prompt,
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
-            lora_scale=text_encoder_lora_scale,
-            clip_skip=self.clip_skip,
-        )
-        # For classifier free guidance, we need to do two forward passes.
-        # Here we concatenate the unconditional and text embeddings into a single batch
-        # to avoid doing two forward passes
-        if self.do_classifier_free_guidance:
-            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
-
-        if ip_adapter_image is not None:
-            output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
-            image_embeds, negative_image_embeds = self.encode_image(
-                ip_adapter_image, device, num_images_per_prompt, output_hidden_state
-            )
-            if self.do_classifier_free_guidance:
-                image_embeds = torch.cat([negative_image_embeds, image_embeds])
-
-        # 4. set timesteps
-        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
-        timesteps, num_inference_steps = self.get_timesteps(
-            num_inference_steps=num_inference_steps, strength=strength, device=device
-        )
-        # check that number of inference steps is not < 1 - as this doesn't make sense
-        if num_inference_steps < 1:
-            raise ValueError(
-                f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline"
-                f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
-            )
-        # at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
-        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
-        # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
-        is_strength_max = strength == 1.0
-
-        # 5. Preprocess mask and image
-
-        if padding_mask_crop is not None:
-            crops_coords = self.mask_processor.get_crop_region(mask_image, width, height, pad=padding_mask_crop)
-            resize_mode = "fill"
-        else:
-            crops_coords = None
-            resize_mode = "default"
-
-        original_image = image
-        init_image = self.image_processor.preprocess(
-            image, height=height, width=width, crops_coords=crops_coords, resize_mode=resize_mode
-        )
-        init_image = init_image.to(dtype=torch.float32)
-
-        # 6. Prepare latent variables
-        num_channels_latents = self.vae.config.latent_channels
-        num_channels_unet = self.unet.config.in_channels
-        return_image_latents = num_channels_unet == 4
-
-        latents_outputs = self.prepare_latents(
-            batch_size * num_images_per_prompt,
-            num_channels_latents,
-            height,
-            width,
-            prompt_embeds.dtype,
-            device,
-            generator,
-            latents,
-            image=init_image,
-            timestep=latent_timestep,
-            is_strength_max=is_strength_max,
-            return_noise=True,
-            return_image_latents=return_image_latents,
-        )
-
-        if return_image_latents:
-            latents, noise, image_latents = latents_outputs
-        else:
-            latents, noise = latents_outputs
-
-        # 7. Prepare mask latent variables
-        mask_condition = self.mask_processor.preprocess(
-            mask_image, height=height, width=width, resize_mode=resize_mode, crops_coords=crops_coords
-        )
-
-        if masked_image_latents is None:
-            masked_image = init_image * (mask_condition < 0.5)
-        else:
-            masked_image = masked_image_latents
-
-        mask, masked_image_latents = self.prepare_mask_latents(
-            mask_condition,
-            masked_image,
-            batch_size * num_images_per_prompt,
-            height,
-            width,
-            prompt_embeds.dtype,
-            device,
-            generator,
-            self.do_classifier_free_guidance,
-        )
-
-        # 7.5 Setting up HD-Painter
-
-        # Get the indices of the tokens to be modified by both RASG and PAIntA
-        token_idx = list(range(1, self.get_tokenized_prompt(prompt_no_positives).index("<|endoftext|>"))) + [
-            self.get_tokenized_prompt(prompt).index("<|endoftext|>")
-        ]
-
-        # Setting up the attention processors
-        self.init_attn_processors(
-            mask_condition,
-            token_idx,
-            use_painta,
-            use_rasg,
-            painta_scale_factors=painta_scale_factors,
-            rasg_scale_factor=rasg_scale_factor,
-            self_attention_layer_name=self_attention_layer_name,
-            cross_attention_layer_name=cross_attention_layer_name,
-            list_of_painta_layer_names=list_of_painta_layer_names,
-            list_of_rasg_layer_names=list_of_rasg_layer_names,
-        )
-
-        # 8. Check that sizes of mask, masked image and latents match
-        if num_channels_unet == 9:
-            # default case for runwayml/stable-diffusion-inpainting
-            num_channels_mask = mask.shape[1]
-            num_channels_masked_image = masked_image_latents.shape[1]
-            if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels:
-                raise ValueError(
-                    f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
-                    f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
-                    f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
-                    f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
-                    " `pipeline.unet` or your `mask_image` or `image` input."
-                )
-        elif num_channels_unet != 4:
-            raise ValueError(
-                f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {self.unet.config.in_channels}."
-            )
-
-        # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
-        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
-
-        if use_rasg:
-            extra_step_kwargs["generator"] = None
-
-        # 9.1 Add image embeds for IP-Adapter
-        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
-
-        # 9.2 Optionally get Guidance Scale Embedding
-        timestep_cond = None
-        if self.unet.config.time_cond_proj_dim is not None:
-            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
-            timestep_cond = self.get_guidance_scale_embedding(
-                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
-            ).to(device=device, dtype=latents.dtype)
-
-        # 10. Denoising loop
-        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
-        self._num_timesteps = len(timesteps)
-        painta_active = True
-
-        with self.progress_bar(total=num_inference_steps) as progress_bar:
-            for i, t in enumerate(timesteps):
-                if self.interrupt:
-                    continue
-
-                if t < 500 and painta_active:
-                    self.init_attn_processors(
-                        mask_condition,
-                        token_idx,
-                        False,
-                        use_rasg,
-                        painta_scale_factors=painta_scale_factors,
-                        rasg_scale_factor=rasg_scale_factor,
-                        self_attention_layer_name=self_attention_layer_name,
-                        cross_attention_layer_name=cross_attention_layer_name,
-                        list_of_painta_layer_names=list_of_painta_layer_names,
-                        list_of_rasg_layer_names=list_of_rasg_layer_names,
-                    )
-                    painta_active = False
-
-                with torch.enable_grad():
-                    self.unet.zero_grad()
-                    latents = latents.detach()
-                    latents.requires_grad = True
-
-                    # expand the latents if we are doing classifier free guidance
-                    latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
-
-                    # concat latents, mask, masked_image_latents in the channel dimension
-                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-
-                    if num_channels_unet == 9:
-                        latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
-
-                    self.scheduler.latents = latents
-                    self.encoder_hidden_states = prompt_embeds
-                    for attn_processor in self.unet.attn_processors.values():
-                        attn_processor.encoder_hidden_states = prompt_embeds
-
-                    # predict the noise residual
-                    noise_pred = self.unet(
-                        latent_model_input,
-                        t,
-                        encoder_hidden_states=prompt_embeds,
-                        timestep_cond=timestep_cond,
-                        cross_attention_kwargs=self.cross_attention_kwargs,
-                        added_cond_kwargs=added_cond_kwargs,
-                        return_dict=False,
-                    )[0]
-
-                    # perform guidance
-                    if self.do_classifier_free_guidance:
-                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                        noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
-
-                    if use_rasg:
-                        # Perform RASG
-                        _, _, height, width = mask_condition.shape  # 512 x 512
-                        scale_factor = self.vae_scale_factor * rasg_scale_factor  # 8 * 4 = 32
-
-                        # TODO: Fix for > 1 batch_size
-                        rasg_mask = F.interpolate(
-                            mask_condition, (height // scale_factor, width // scale_factor), mode="bicubic"
-                        )[0, 0]  # mode is nearest by default, B, H, W
-
-                        # Aggregate the saved attention maps
-                        attn_map = []
-                        for processor in self.unet.attn_processors.values():
-                            if hasattr(processor, "attention_scores") and processor.attention_scores is not None:
-                                if self.do_classifier_free_guidance:
-                                    attn_map.append(processor.attention_scores.chunk(2)[1])  # (B/2) x H, 256, 77
-                                else:
-                                    attn_map.append(processor.attention_scores)  # B x H, 256, 77 ?
-
-                        attn_map = (
-                            torch.cat(attn_map)
-                            .mean(0)
-                            .permute(1, 0)
-                            .reshape((-1, height // scale_factor, width // scale_factor))
-                        )  # 77, 16, 16
-
-                        # Compute the attention score
-                        attn_score = -sum(
-                            [
-                                F.binary_cross_entropy_with_logits(x - 1.0, rasg_mask.to(device))
-                                for x in attn_map[token_idx]
-                            ]
-                        )
-
-                        # Backward the score and compute the gradients
-                        attn_score.backward()
-
-                        # Normalzie the gradients and compute the noise component
-                        variance_noise = latents.grad.detach()
-                        # print("VARIANCE SHAPE", variance_noise.shape)
-                        variance_noise -= torch.mean(variance_noise, [1, 2, 3], keepdim=True)
-                        variance_noise /= torch.std(variance_noise, [1, 2, 3], keepdim=True)
-                    else:
-                        variance_noise = None
-
-                # compute the previous noisy sample x_t -> x_t-1
-                latents = self.scheduler.step(
-                    noise_pred, t, latents, **extra_step_kwargs, return_dict=False, variance_noise=variance_noise
-                )[0]
-
-                if num_channels_unet == 4:
-                    init_latents_proper = image_latents
-                    if self.do_classifier_free_guidance:
-                        init_mask, _ = mask.chunk(2)
-                    else:
-                        init_mask = mask
-
-                    if i < len(timesteps) - 1:
-                        noise_timestep = timesteps[i + 1]
-                        init_latents_proper = self.scheduler.add_noise(
-                            init_latents_proper, noise, torch.tensor([noise_timestep])
-                        )
-
-                    latents = (1 - init_mask) * init_latents_proper + init_mask * latents
-
-                if callback_on_step_end is not None:
-                    callback_kwargs = {}
-                    for k in callback_on_step_end_tensor_inputs:
-                        callback_kwargs[k] = locals()[k]
-                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
-
-                    latents = callback_outputs.pop("latents", latents)
-                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
-                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
-                    mask = callback_outputs.pop("mask", mask)
-                    masked_image_latents = callback_outputs.pop("masked_image_latents", masked_image_latents)
-
-                # call the callback, if provided
-                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
-                    progress_bar.update()
-                    if callback is not None and i % callback_steps == 0:
-                        step_idx = i // getattr(self.scheduler, "order", 1)
-                        callback(step_idx, t, latents)
-
-        if not output_type == "latent":
-            condition_kwargs = {}
-            if isinstance(self.vae, AsymmetricAutoencoderKL):
-                init_image = init_image.to(device=device, dtype=masked_image_latents.dtype)
-                init_image_condition = init_image.clone()
-                init_image = self._encode_vae_image(init_image, generator=generator)
-                mask_condition = mask_condition.to(device=device, dtype=masked_image_latents.dtype)
-                condition_kwargs = {"image": init_image_condition, "mask": mask_condition}
-            image = self.vae.decode(
-                latents / self.vae.config.scaling_factor, return_dict=False, generator=generator, **condition_kwargs
-            )[0]
-            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
-        else:
-            image = latents
-            has_nsfw_concept = None
-
-        if has_nsfw_concept is None:
-            do_denormalize = [True] * image.shape[0]
-        else:
-            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
-
-        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
-
-        if padding_mask_crop is not None:
-            image = [self.image_processor.apply_overlay(mask_image, original_image, i, crops_coords) for i in image]
-
-        # Offload all models
-        self.maybe_free_model_hooks()
-
-        if not return_dict:
-            return (image, has_nsfw_concept)
-
-        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
-
-
-# ============= Utility Functions ============== #
-
-
-class GaussianSmoothing(nn.Module):
-    """
-    Apply gaussian smoothing on a
-    1d, 2d or 3d tensor. Filtering is performed seperately for each channel
-    in the input using a depthwise convolution.
-    Arguments:
-        channels (int, sequence): Number of channels of the input tensors. Output will
-            have this number of channels as well.
-        kernel_size (int, sequence): Size of the gaussian kernel.
-        sigma (float, sequence): Standard deviation of the gaussian kernel.
-        dim (int, optional): The number of dimensions of the data.
-            Default value is 2 (spatial).
-    """
-
-    def __init__(self, channels, kernel_size, sigma, dim=2):
-        super(GaussianSmoothing, self).__init__()
-        if isinstance(kernel_size, numbers.Number):
-            kernel_size = [kernel_size] * dim
-        if isinstance(sigma, numbers.Number):
-            sigma = [sigma] * dim
-
-        # The gaussian kernel is the product of the
-        # gaussian function of each dimension.
-        kernel = 1
-        meshgrids = torch.meshgrid([torch.arange(size, dtype=torch.float32) for size in kernel_size])
-        for size, std, mgrid in zip(kernel_size, sigma, meshgrids):
-            mean = (size - 1) / 2
-            kernel *= 1 / (std * math.sqrt(2 * math.pi)) * torch.exp(-(((mgrid - mean) / (2 * std)) ** 2))
-
-        # Make sure sum of values in gaussian kernel equals 1.
-        kernel = kernel / torch.sum(kernel)
-
-        # Reshape to depthwise convolutional weight
-        kernel = kernel.view(1, 1, *kernel.size())
-        kernel = kernel.repeat(channels, *[1] * (kernel.dim() - 1))
-
-        self.register_buffer("weight", kernel)
-        self.groups = channels
-
-        if dim == 1:
-            self.conv = F.conv1d
-        elif dim == 2:
-            self.conv = F.conv2d
-        elif dim == 3:
-            self.conv = F.conv3d
-        else:
-            raise RuntimeError("Only 1, 2 and 3 dimensions are supported. Received {}.".format(dim))
-
-    def forward(self, input):
-        """
-        Apply gaussian filter to input.
-        Arguments:
-            input (torch.Tensor): Input to apply gaussian filter on.
-        Returns:
-            filtered (torch.Tensor): Filtered output.
-        """
-        return self.conv(input, weight=self.weight.to(input.dtype), groups=self.groups, padding="same")
-
-
-def get_attention_scores(
-    self, query: torch.Tensor, key: torch.Tensor, attention_mask: torch.Tensor = None
-) -> torch.Tensor:
-    r"""
-    Compute the attention scores.
-
-    Args:
-        query (`torch.Tensor`): The query tensor.
-        key (`torch.Tensor`): The key tensor.
-        attention_mask (`torch.Tensor`, *optional*): The attention mask to use. If `None`, no mask is applied.
-
-    Returns:
-        `torch.Tensor`: The attention probabilities/scores.
-    """
-    if self.upcast_attention:
-        query = query.float()
-        key = key.float()
-
-    if attention_mask is None:
-        baddbmm_input = torch.empty(
-            query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device
-        )
-        beta = 0
-    else:
-        baddbmm_input = attention_mask
-        beta = 1
-
-    attention_scores = torch.baddbmm(
-        baddbmm_input,
-        query,
-        key.transpose(-1, -2),
-        beta=beta,
-        alpha=self.scale,
-    )
-    del baddbmm_input
-
-    if self.upcast_softmax:
-        attention_scores = attention_scores.float()
-
-    return attention_scores
@@ -468,12 +468,7 @@ class InstaFlowPipeline(
                )

    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (
-            batch_size,
-            num_channels_latents,
-            int(height) // self.vae_scale_factor,
-            int(width) // self.vae_scale_factor,
-        )
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -26,14 +26,7 @@ from diffusers.configuration_utils import FrozenDict
 from diffusers.image_processor import VaeImageProcessor
 from diffusers.loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
-from diffusers.models.attention_processor import (
-    AttnProcessor,
-    AttnProcessor2_0,
-    IPAdapterAttnProcessor,
-    IPAdapterAttnProcessor2_0,
-)
-from diffusers.models.embeddings import MultiIPAdapterImageProjection
-from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.models.lora import LoRALinearLayer, adjust_lora_scale_text_encoder
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
 from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
@@ -52,6 +45,300 @@ from diffusers.utils.torch_utils import randn_tensor
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


+class LoRAIPAdapterAttnProcessor(nn.Module):
+    r"""
+    Attention processor for IP-Adapter with LoRA updates on the q/k/v/out projections.
+    Args:
+        hidden_size (`int`):
+            The hidden size of the attention layer.
+        cross_attention_dim (`int`):
+            The number of channels in the `encoder_hidden_states`.
+        rank (`int`, defaults to 4):
+            The dimension of the LoRA update matrices.
+        network_alpha (`int`, *optional*):
+            Equivalent to `alpha` but its usage is specific to Kohya (A1111) style LoRAs.
+        lora_scale (`float`, defaults to 1.0):
+            The weight scale of LoRA.
+        scale (`float`, defaults to 1.0):
+            The weight scale of image prompt.
+        num_tokens (`int`, defaults to 4; should be 16 for ip_adapter_plus):
+            The context length of the image features.
+    """
+
+    def __init__(
+        self,
+        hidden_size,
+        cross_attention_dim=None,
+        rank=4,
+        network_alpha=None,
+        lora_scale=1.0,
+        scale=1.0,
+        num_tokens=4,
+    ):
+        super().__init__()
+
+        self.rank = rank
+        self.lora_scale = lora_scale
+
+        self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
+        self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
+        self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
+        self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
+
+        self.hidden_size = hidden_size
+        self.cross_attention_dim = cross_attention_dim
+        self.scale = scale
+        self.num_tokens = num_tokens
+
+        self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+        self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+
+    def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None, temb=None):
+        r"""
+        Run attention with LoRA-adjusted projections and add the scaled image-prompt attention.
+        `encoder_hidden_states` is either a `(text_states, ip_states)` tuple or (deprecated) a
+        single tensor whose last `num_tokens` positions along dim 1 are the image-prompt tokens.
+        NOTE: `ip_hidden_states` is only bound when `encoder_hidden_states` is not None, so this
+        processor must not be called without it.
+        """
+        residual = hidden_states
+
+        # separate ip_hidden_states from encoder_hidden_states
+        if encoder_hidden_states is not None:
+            if isinstance(encoder_hidden_states, tuple):
+                encoder_hidden_states, ip_hidden_states = encoder_hidden_states
+            else:
+                deprecation_message = (
+                    "You have passed a tensor as `encoder_hidden_states`.This is deprecated and will be removed in a future release."
+                    " Please make sure to update your script to pass `encoder_hidden_states` as a tuple to supress this warning."
+                )
+                deprecate("encoder_hidden_states not a tuple", "1.0.0", deprecation_message, standard_warn=False)
+                # `num_tokens` is a plain int here, and the IP tokens must stay a tensor (not a
+                # one-element list) because they are fed straight into `to_k_ip` / `to_v_ip`.
+                end_pos = encoder_hidden_states.shape[1] - self.num_tokens
+                ip_hidden_states = encoder_hidden_states[:, end_pos:, :]
+                encoder_hidden_states = encoder_hidden_states[:, :end_pos, :]
+
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states) + self.lora_scale * self.to_q_lora(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states) + self.lora_scale * self.to_k_lora(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states) + self.lora_scale * self.to_v_lora(encoder_hidden_states)
+
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # for ip-adapter: attend the same query over the image-prompt tokens (no attention mask)
+        ip_key = self.to_k_ip(ip_hidden_states)
+        ip_value = self.to_v_ip(ip_hidden_states)
+
+        ip_key = attn.head_to_batch_dim(ip_key)
+        ip_value = attn.head_to_batch_dim(ip_value)
+
+        ip_attention_probs = attn.get_attention_scores(query, ip_key, None)
+        ip_hidden_states = torch.bmm(ip_attention_probs, ip_value)
+        ip_hidden_states = attn.batch_to_head_dim(ip_hidden_states)
+
+        hidden_states = hidden_states + self.scale * ip_hidden_states
+
+        # linear proj (base projection plus the scaled LoRA update)
+        hidden_states = attn.to_out[0](hidden_states) + self.lora_scale * self.to_out_lora(hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class LoRAIPAdapterAttnProcessor2_0(nn.Module):
+    r"""
+    Attention processor for IP-Adapter for PyTorch 2.0, with LoRA updates on the q/k/v/out projections.
+    Args:
+        hidden_size (`int`):
+            The hidden size of the attention layer.
+        cross_attention_dim (`int`):
+            The number of channels in the `encoder_hidden_states`.
+        rank (`int`, defaults to 4):
+            The dimension of the LoRA update matrices.
+        network_alpha (`int`, *optional*):
+            Equivalent to `alpha` but its usage is specific to Kohya (A1111) style LoRAs.
+        lora_scale (`float`, defaults to 1.0):
+            The weight scale of LoRA.
+        scale (`float`, defaults to 1.0):
+            The weight scale of image prompt.
+        num_tokens (`int`, defaults to 4; should be 16 for ip_adapter_plus):
+            The context length of the image features.
+    """
+
+    def __init__(
+        self,
+        hidden_size,
+        cross_attention_dim=None,
+        rank=4,
+        network_alpha=None,
+        lora_scale=1.0,
+        scale=1.0,
+        num_tokens=4,
+    ):
+        super().__init__()
+
+        self.rank = rank
+        self.lora_scale = lora_scale
+
+        self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
+        self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
+        self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
+        self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
+
+        self.hidden_size = hidden_size
+        self.cross_attention_dim = cross_attention_dim
+        self.scale = scale
+        self.num_tokens = num_tokens
+
+        self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+        self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+
+    def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None, temb=None):
+        r"""
+        Run SDPA attention with LoRA-adjusted projections and add the scaled image-prompt attention.
+        `encoder_hidden_states` is either a `(text_states, ip_states)` tuple or (deprecated) a
+        single tensor whose last `num_tokens` positions along dim 1 are the image-prompt tokens.
+        NOTE: `ip_hidden_states` is only bound when `encoder_hidden_states` is not None, so this
+        processor must not be called without it.
+        """
+        residual = hidden_states
+
+        # separate ip_hidden_states from encoder_hidden_states
+        if encoder_hidden_states is not None:
+            if isinstance(encoder_hidden_states, tuple):
+                encoder_hidden_states, ip_hidden_states = encoder_hidden_states
+            else:
+                deprecation_message = (
+                    "You have passed a tensor as `encoder_hidden_states`.This is deprecated and will be removed in a future release."
+                    " Please make sure to update your script to pass `encoder_hidden_states` as a tuple to supress this warning."
+                )
+                deprecate("encoder_hidden_states not a tuple", "1.0.0", deprecation_message, standard_warn=False)
+                # `num_tokens` is a plain int here, and the IP tokens must stay a tensor (not a
+                # one-element list) because they are fed straight into `to_k_ip` / `to_v_ip`.
+                end_pos = encoder_hidden_states.shape[1] - self.num_tokens
+                ip_hidden_states = encoder_hidden_states[:, end_pos:, :]
+                encoder_hidden_states = encoder_hidden_states[:, :end_pos, :]
+
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states) + self.lora_scale * self.to_q_lora(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states) + self.lora_scale * self.to_k_lora(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states) + self.lora_scale * self.to_v_lora(encoder_hidden_states)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        # for ip-adapter: attend the same query over the image-prompt tokens (no attention mask)
+        ip_key = self.to_k_ip(ip_hidden_states)
+        ip_value = self.to_v_ip(ip_hidden_states)
+
+        ip_key = ip_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        ip_hidden_states = F.scaled_dot_product_attention(
+            query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False
+        )
+
+        ip_hidden_states = ip_hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        ip_hidden_states = ip_hidden_states.to(query.dtype)
+
+        hidden_states = hidden_states + self.scale * ip_hidden_states
+
+        # linear proj (base projection plus the scaled LoRA update)
+        hidden_states = attn.to_out[0](hidden_states) + self.lora_scale * self.to_out_lora(hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
 class IPAdapterFullImageProjection(nn.Module):
    def __init__(self, image_embed_dim=1024, cross_attention_dim=1024, mult=1, num_tokens=1):
        super().__init__()
@@ -328,13 +615,17 @@ class IPAdapterFaceIDStableDiffusionPipeline(
        return image_projection

    def _load_ip_adapter_weights(self, state_dict):
+        from diffusers.models.attention_processor import (
+            AttnProcessor,
+            AttnProcessor2_0,
+        )
+
        num_image_text_embeds = 4

        self.unet.encoder_hid_proj = None

        # set ip-adapter cross-attention processors & load state_dict
        attn_procs = {}
-        lora_dict = {}
        key_id = 0
        for name in self.unet.attn_processors.keys():
            cross_attention_dim = None if name.endswith("attn1.processor") else self.unet.config.cross_attention_dim
@@ -351,99 +642,94 @@ class IPAdapterFaceIDStableDiffusionPipeline(
                    AttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else AttnProcessor
                )
                attn_procs[name] = attn_processor_class()
+                rank = state_dict["ip_adapter"][f"{key_id}.to_q_lora.down.weight"].shape[0]
+                attn_module = self.unet
+                for n in name.split(".")[:-1]:
+                    attn_module = getattr(attn_module, n)
+                # Set the `lora_layer` attribute of the attention-related matrices.
+                attn_module.to_q.set_lora_layer(
+                    LoRALinearLayer(
+                        in_features=attn_module.to_q.in_features,
+                        out_features=attn_module.to_q.out_features,
+                        rank=rank,
+                    )
+                )
+                attn_module.to_k.set_lora_layer(
+                    LoRALinearLayer(
+                        in_features=attn_module.to_k.in_features,
+                        out_features=attn_module.to_k.out_features,
+                        rank=rank,
+                    )
+                )
+                attn_module.to_v.set_lora_layer(
+                    LoRALinearLayer(
+                        in_features=attn_module.to_v.in_features,
+                        out_features=attn_module.to_v.out_features,
+                        rank=rank,
+                    )
+                )
+                attn_module.to_out[0].set_lora_layer(
+                    LoRALinearLayer(
+                        in_features=attn_module.to_out[0].in_features,
+                        out_features=attn_module.to_out[0].out_features,
+                        rank=rank,
+                    )
+                )

-                lora_dict.update(
-                    {f"unet.{name}.to_k_lora.down.weight": state_dict["ip_adapter"][f"{key_id}.to_k_lora.down.weight"]}
-                )
-                lora_dict.update(
-                    {f"unet.{name}.to_q_lora.down.weight": state_dict["ip_adapter"][f"{key_id}.to_q_lora.down.weight"]}
-                )
-                lora_dict.update(
-                    {f"unet.{name}.to_v_lora.down.weight": state_dict["ip_adapter"][f"{key_id}.to_v_lora.down.weight"]}
-                )
-                lora_dict.update(
-                    {
-                        f"unet.{name}.to_out_lora.down.weight": state_dict["ip_adapter"][
-                            f"{key_id}.to_out_lora.down.weight"
-                        ]
-                    }
-                )
-                lora_dict.update(
-                    {f"unet.{name}.to_k_lora.up.weight": state_dict["ip_adapter"][f"{key_id}.to_k_lora.up.weight"]}
-                )
-                lora_dict.update(
-                    {f"unet.{name}.to_q_lora.up.weight": state_dict["ip_adapter"][f"{key_id}.to_q_lora.up.weight"]}
-                )
-                lora_dict.update(
-                    {f"unet.{name}.to_v_lora.up.weight": state_dict["ip_adapter"][f"{key_id}.to_v_lora.up.weight"]}
-                )
-                lora_dict.update(
-                    {f"unet.{name}.to_out_lora.up.weight": state_dict["ip_adapter"][f"{key_id}.to_out_lora.up.weight"]}
-                )
+                value_dict = {}
+                for k, module in attn_module.named_children():
+                    index = "."
+                    if not hasattr(module, "set_lora_layer"):
+                        index = ".0."
+                        module = module[0]
+                    lora_layer = getattr(module, "lora_layer")
+                    for lora_name, w in lora_layer.state_dict().items():
+                        value_dict.update(
+                            {
+                                f"{k}{index}lora_layer.{lora_name}": state_dict["ip_adapter"][
+                                    f"{key_id}.{k}_lora.{lora_name}"
+                                ]
+                            }
+                        )
+
+                attn_module.load_state_dict(value_dict, strict=False)
+                attn_module.to(dtype=self.dtype, device=self.device)
                key_id += 1
            else:
+                rank = state_dict["ip_adapter"][f"{key_id}.to_q_lora.down.weight"].shape[0]
                attn_processor_class = (
-                    IPAdapterAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else IPAdapterAttnProcessor
+                    LoRAIPAdapterAttnProcessor2_0
+                    if hasattr(F, "scaled_dot_product_attention")
+                    else LoRAIPAdapterAttnProcessor
                )
                attn_procs[name] = attn_processor_class(
                    hidden_size=hidden_size,
                    cross_attention_dim=cross_attention_dim,
                    scale=1.0,
+                    rank=rank,
                    num_tokens=num_image_text_embeds,
                ).to(dtype=self.dtype, device=self.device)

-                lora_dict.update(
-                    {f"unet.{name}.to_k_lora.down.weight": state_dict["ip_adapter"][f"{key_id}.to_k_lora.down.weight"]}
-                )
-                lora_dict.update(
-                    {f"unet.{name}.to_q_lora.down.weight": state_dict["ip_adapter"][f"{key_id}.to_q_lora.down.weight"]}
-                )
-                lora_dict.update(
-                    {f"unet.{name}.to_v_lora.down.weight": state_dict["ip_adapter"][f"{key_id}.to_v_lora.down.weight"]}
-                )
-                lora_dict.update(
-                    {
-                        f"unet.{name}.to_out_lora.down.weight": state_dict["ip_adapter"][
-                            f"{key_id}.to_out_lora.down.weight"
-                        ]
-                    }
-                )
-                lora_dict.update(
-                    {f"unet.{name}.to_k_lora.up.weight": state_dict["ip_adapter"][f"{key_id}.to_k_lora.up.weight"]}
-                )
-                lora_dict.update(
-                    {f"unet.{name}.to_q_lora.up.weight": state_dict["ip_adapter"][f"{key_id}.to_q_lora.up.weight"]}
-                )
-                lora_dict.update(
-                    {f"unet.{name}.to_v_lora.up.weight": state_dict["ip_adapter"][f"{key_id}.to_v_lora.up.weight"]}
-                )
-                lora_dict.update(
-                    {f"unet.{name}.to_out_lora.up.weight": state_dict["ip_adapter"][f"{key_id}.to_out_lora.up.weight"]}
-                )
-
                value_dict = {}
-                value_dict.update({"to_k_ip.0.weight": state_dict["ip_adapter"][f"{key_id}.to_k_ip.weight"]})
-                value_dict.update({"to_v_ip.0.weight": state_dict["ip_adapter"][f"{key_id}.to_v_ip.weight"]})
+                for k, w in attn_procs[name].state_dict().items():
+                    value_dict.update({f"{k}": state_dict["ip_adapter"][f"{key_id}.{k}"]})
+
                attn_procs[name].load_state_dict(value_dict)
                key_id += 1

        self.unet.set_attn_processor(attn_procs)

-        self.load_lora_weights(lora_dict, adapter_name="faceid")
-        self.set_adapters(["faceid"], adapter_weights=[1.0])
-
        # convert IP-Adapter Image Projection layers to diffusers
        image_projection = self.convert_ip_adapter_image_proj_to_diffusers(state_dict["image_proj"])
-        image_projection_layers = [image_projection.to(device=self.device, dtype=self.dtype)]

-        self.unet.encoder_hid_proj = MultiIPAdapterImageProjection(image_projection_layers)
+        self.unet.encoder_hid_proj = image_projection.to(device=self.device, dtype=self.dtype)
        self.unet.config.encoder_hid_dim_type = "ip_image_proj"

    def set_ip_adapter_scale(self, scale):
        unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet
        for attn_processor in unet.attn_processors.values():
-            if isinstance(attn_processor, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)):
-                attn_processor.scale = [scale]
+            if isinstance(attn_processor, (LoRAIPAdapterAttnProcessor, LoRAIPAdapterAttnProcessor2_0)):
+                attn_processor.scale = scale

    def _encode_prompt(
        self,
@@ -753,12 +1039,7 @@ class IPAdapterFaceIDStableDiffusionPipeline(
                )

    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (
-            batch_size,
-            num_channels_latents,
-            int(height) // self.vae_scale_factor,
-            int(width) // self.vae_scale_factor,
-        )
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -1017,7 +1298,7 @@ class IPAdapterFaceIDStableDiffusionPipeline(
            negative_image_embeds = torch.zeros_like(image_embeds)
            if self.do_classifier_free_guidance:
                image_embeds = torch.cat([negative_image_embeds, image_embeds])
-        image_embeds = [image_embeds]
+
        # 4. Prepare timesteps
        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)

@@ -1038,7 +1319,7 @@ class IPAdapterFaceIDStableDiffusionPipeline(
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 6.1 Add image embeds for IP-Adapter
-        added_cond_kwargs = {"image_embeds": image_embeds} if image_embeds is not None else {}
+        added_cond_kwargs = {"image_embeds": image_embeds} if image_embeds is not None else None

        # 6.2 Optionally get Guidance Scale Embedding
        timestep_cond = None
@@ -177,12 +177,7 @@ class LatentConsistencyModelImg2ImgPipeline(DiffusionPipeline):
        latents=None,
        generator=None,
    ):
-        shape = (
-            batch_size,
-            num_channels_latents,
-            int(height) // self.vae_scale_factor,
-            int(width) // self.vae_scale_factor,
-        )
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)

        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
            raise ValueError(
@@ -472,12 +472,7 @@ class LatentConsistencyModelWalkPipeline(

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (
-            batch_size,
-            num_channels_latents,
-            int(height) // self.vae_scale_factor,
-            int(width) // self.vae_scale_factor,
-        )
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -163,12 +163,7 @@ class LatentConsistencyModelPipeline(DiffusionPipeline):
        return image, has_nsfw_concept

    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, latents=None):
-        shape = (
-            batch_size,
-            num_channels_latents,
-            int(height) // self.vae_scale_factor,
-            int(width) // self.vae_scale_factor,
-        )
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
        if latents is None:
            latents = torch.randn(shape, dtype=dtype).to(device)
        else:
@@ -439,9 +439,7 @@ class StableDiffusionLongPromptWeightingPipeline(
            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
    """

-    model_cpu_offload_seq = "text_encoder-->unet->vae"
    _optional_components = ["safety_checker", "feature_extractor"]
-    _exclude_from_cpu_offload = ["safety_checker"]

    def __init__(
        self,
@@ -726,12 +724,7 @@ class StableDiffusionLongPromptWeightingPipeline(
    ):
        if image is None:
            batch_size = batch_size * num_images_per_prompt
-            shape = (
-                batch_size,
-                num_channels_latents,
-                int(height) // self.vae_scale_factor,
-                int(width) // self.vae_scale_factor,
-            )
+            shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
            if isinstance(generator, list) and len(generator) != batch_size:
                raise ValueError(
                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -1060,12 +1060,7 @@ class SDXLLongPromptWeightingPipeline(
        batch_size *= num_images_per_prompt

        if image is None:
-            shape = (
-                batch_size,
-                num_channels_latents,
-                int(height) // self.vae_scale_factor,
-                int(width) // self.vae_scale_factor,
-            )
+            shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
            if isinstance(generator, list) and len(generator) != batch_size:
                raise ValueError(
                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -1145,12 +1140,7 @@ class SDXLLongPromptWeightingPipeline(
            return latents

        else:
-            shape = (
-                batch_size,
-                num_channels_latents,
-                int(height) // self.vae_scale_factor,
-                int(width) // self.vae_scale_factor,
-            )
+            shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
            if isinstance(generator, list) and len(generator) != batch_size:
                raise ValueError(
                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -373,29 +373,18 @@ class AnimateDiffControlNetPipeline(
        return prompt_embeds, negative_prompt_embeds

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
-    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+    def encode_image(self, image, device, num_images_per_prompt):
        dtype = next(self.image_encoder.parameters()).dtype

        if not isinstance(image, torch.Tensor):
            image = self.feature_extractor(image, return_tensors="pt").pixel_values

        image = image.to(device=device, dtype=dtype)
-        if output_hidden_states:
-            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
-            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
-            uncond_image_enc_hidden_states = self.image_encoder(
-                torch.zeros_like(image), output_hidden_states=True
-            ).hidden_states[-2]
-            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
-                num_images_per_prompt, dim=0
-            )
-            return image_enc_hidden_states, uncond_image_enc_hidden_states
-        else:
-            image_embeds = self.image_encoder(image).image_embeds
-            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
-            uncond_image_embeds = torch.zeros_like(image_embeds)
+        image_embeds = self.image_encoder(image).image_embeds
+        image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)

-            return image_embeds, uncond_image_embeds
+        uncond_image_embeds = torch.zeros_like(image_embeds)
+        return image_embeds, uncond_image_embeds

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
    def prepare_ip_adapter_image_embeds(
@@ -477,12 +477,7 @@ class DemoFusionSDXLPipeline(

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (
-            batch_size,
-            num_channels_latents,
-            int(height) // self.vae_scale_factor,
-            int(width) // self.vae_scale_factor,
-        )
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -151,7 +151,7 @@ def concat_first(feat: torch.Tensor, dim: int = 2, scale: float = 1.0) -> torch.
    return torch.cat((feat, feat_style), dim=dim)


-def calc_mean_std(feat: torch.Tensor, eps: float = 1e-5) -> Tuple[torch.Tensor, torch.Tensor]:
+def calc_mean_std(feat: torch.Tensor, eps: float = 1e-5) -> tuple[torch.Tensor, torch.Tensor]:
    feat_std = (feat.var(dim=-2, keepdims=True) + eps).sqrt()
    feat_mean = feat.mean(dim=-2, keepdims=True)
    return feat_mean, feat_std
@@ -919,12 +919,7 @@ class StyleAlignedSDXLPipeline(
        batch_size *= num_images_per_prompt

        if image is None:
-            shape = (
-                batch_size,
-                num_channels_latents,
-                int(height) // self.vae_scale_factor,
-                int(width) // self.vae_scale_factor,
-            )
+            shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
            if isinstance(generator, list) and len(generator) != batch_size:
                raise ValueError(
                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -1004,12 +999,7 @@ class StyleAlignedSDXLPipeline(
            return latents

        else:
-            shape = (
-                batch_size,
-                num_channels_latents,
-                int(height) // self.vae_scale_factor,
-                int(width) // self.vae_scale_factor,
-            )
+            shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
            if isinstance(generator, list) and len(generator) != batch_size:
                raise ValueError(
                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -857,12 +857,7 @@ class StableDiffusionPAGPipeline(
            )

    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (
-            batch_size,
-            num_channels_latents,
-            int(height) // self.vae_scale_factor,
-            int(width) // self.vae_scale_factor,
-        )
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -751,12 +751,7 @@ class StableDiffusionXLControlNetAdapterPipeline(

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (
-            batch_size,
-            num_channels_latents,
-            int(height) // self.vae_scale_factor,
-            int(width) // self.vae_scale_factor,
-        )
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -17,7 +17,7 @@

 import inspect
 from collections.abc import Callable
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, List, Optional, Union

 import numpy as np
 import PIL
@@ -1211,8 +1211,8 @@ class StableDiffusionXLControlNetAdapterInpaintPipeline(
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
        self,
-        prompt: Optional[Union[str, List[str]]] = None,
-        prompt_2: Optional[Union[str, List[str]]] = None,
+        prompt: Optional[Union[str, list[str]]] = None,
+        prompt_2: Optional[Union[str, list[str]]] = None,
        image: Optional[Union[torch.Tensor, PIL.Image.Image]] = None,
        mask_image: Optional[Union[torch.Tensor, PIL.Image.Image]] = None,
        adapter_image: PipelineImageInput = None,
@@ -1224,11 +1224,11 @@ class StableDiffusionXLControlNetAdapterInpaintPipeline(
        denoising_start: Optional[float] = None,
        denoising_end: Optional[float] = None,
        guidance_scale: float = 5.0,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        negative_prompt: Optional[Union[str, list[str]]] = None,
+        negative_prompt_2: Optional[Union[str, list[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
-        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        generator: Optional[Union[torch.Generator, list[torch.Generator]]] = None,
        latents: Optional[Union[torch.FloatTensor]] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
@@ -1238,12 +1238,12 @@ class StableDiffusionXLControlNetAdapterInpaintPipeline(
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: int = 1,
-        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        cross_attention_kwargs: Optional[dict[str, Any]] = None,
        guidance_rescale: float = 0.0,
-        original_size: Optional[Tuple[int, int]] = None,
-        crops_coords_top_left: Optional[Tuple[int, int]] = (0, 0),
-        target_size: Optional[Tuple[int, int]] = None,
-        adapter_conditioning_scale: Optional[Union[float, List[float]]] = 1.0,
+        original_size: Optional[tuple[int, int]] = None,
+        crops_coords_top_left: Optional[tuple[int, int]] = (0, 0),
+        target_size: Optional[tuple[int, int]] = None,
+        adapter_conditioning_scale: Optional[Union[float, list[float]]] = 1.0,
        cond_tau: float = 1.0,
        aesthetic_score: float = 6.0,
        negative_aesthetic_score: float = 2.5,
@@ -614,12 +614,7 @@ class StableDiffusionXLPipelineIpex(

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (
-            batch_size,
-            num_channels_latents,
-            int(height) // self.vae_scale_factor,
-            int(width) // self.vae_scale_factor,
-        )
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -497,12 +497,7 @@ class Zero1to3StableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
            )

    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (
-            batch_size,
-            num_channels_latents,
-            int(height) // self.vae_scale_factor,
-            int(width) // self.vae_scale_factor,
-        )
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -635,12 +635,7 @@ class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline, StableDiffusio
            )

    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (
-            batch_size,
-            num_channels_latents,
-            int(height) // self.vae_scale_factor,
-            int(width) // self.vae_scale_factor,
-        )
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -533,12 +533,7 @@ class StableDiffusionIPEXPipeline(
                )

    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (
-            batch_size,
-            num_channels_latents,
-            int(height) // self.vae_scale_factor,
-            int(width) // self.vae_scale_factor,
-        )
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -609,12 +609,7 @@ class StableDiffusionReferencePipeline(
        Returns:
            torch.Tensor: The prepared latent vectors.
        """
-        shape = (
-            batch_size,
-            num_channels_latents,
-            int(height) // self.vae_scale_factor,
-            int(width) // self.vae_scale_factor,
-        )
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -23,7 +23,6 @@ import math
 import os
 import random
 import shutil
-from contextlib import nullcontext
 from pathlib import Path
 from typing import List, Union

@@ -239,10 +238,6 @@ class SDText2ImageDataset:

 def log_validation(vae, unet, args, accelerator, weight_dtype, step):
    logger.info("Running validation... ")
-    if torch.backends.mps.is_available():
-        autocast_ctx = nullcontext()
-    else:
-        autocast_ctx = torch.autocast(accelerator.device.type, dtype=weight_dtype)

    unet = accelerator.unwrap_model(unet)
    pipeline = StableDiffusionPipeline.from_pretrained(
@@ -279,7 +274,7 @@ def log_validation(vae, unet, args, accelerator, weight_dtype, step):

    for _, prompt in enumerate(validation_prompts):
        images = []
-        with autocast_ctx:
+        with torch.autocast("cuda", dtype=weight_dtype):
            images = pipeline(
                prompt=prompt,
                num_inference_steps=4,
@@ -1177,11 +1172,6 @@ def main(args):
    ).input_ids.to(accelerator.device)
    uncond_prompt_embeds = text_encoder(uncond_input_ids)[0]

-    if torch.backends.mps.is_available():
-        autocast_ctx = nullcontext()
-    else:
-        autocast_ctx = torch.autocast(accelerator.device.type)
-
    # 16. Train!
    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

@@ -1310,7 +1300,7 @@ def main(args):
                # estimates to predict the data point in the augmented PF-ODE trajectory corresponding to the next ODE
                # solver timestep.
                with torch.no_grad():
-                    with autocast_ctx:
+                    with torch.autocast("cuda"):
                        # 1. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and conditional embedding c
                        cond_teacher_output = teacher_unet(
                            noisy_model_input.to(weight_dtype),
@@ -1369,7 +1359,7 @@ def main(args):
                # 9. Get target LCM prediction on x_prev, w, c, t_n (timesteps)
                # Note that we do not use a separate target network for LCM-LoRA distillation.
                with torch.no_grad():
-                    with autocast_ctx:
+                    with torch.autocast("cuda", dtype=weight_dtype):
                        target_noise_pred = unet(
                            x_prev.float(),
                            timesteps,
@@ -22,7 +22,6 @@ import math
 import os
 import random
 import shutil
-from contextlib import nullcontext
 from pathlib import Path

 import accelerate
@@ -147,12 +146,7 @@ def log_validation(vae, args, accelerator, weight_dtype, step, unet=None, is_fin

    for _, prompt in enumerate(validation_prompts):
        images = []
-        if torch.backends.mps.is_available():
-            autocast_ctx = nullcontext()
-        else:
-            autocast_ctx = torch.autocast(accelerator.device.type, dtype=weight_dtype)
-
-        with autocast_ctx:
+        with torch.autocast("cuda", dtype=weight_dtype):
            images = pipeline(
                prompt=prompt,
                num_inference_steps=4,
@@ -24,7 +24,6 @@ import math
 import os
 import random
 import shutil
-from contextlib import nullcontext
 from pathlib import Path
 from typing import List, Union

@@ -257,10 +256,6 @@ class SDXLText2ImageDataset:

 def log_validation(vae, unet, args, accelerator, weight_dtype, step):
    logger.info("Running validation... ")
-    if torch.backends.mps.is_available():
-        autocast_ctx = nullcontext()
-    else:
-        autocast_ctx = torch.autocast(accelerator.device.type, dtype=weight_dtype)

    unet = accelerator.unwrap_model(unet)
    pipeline = StableDiffusionXLPipeline.from_pretrained(
@@ -296,7 +291,7 @@ def log_validation(vae, unet, args, accelerator, weight_dtype, step):

    for _, prompt in enumerate(validation_prompts):
        images = []
-        with autocast_ctx:
+        with torch.autocast("cuda", dtype=weight_dtype):
            images = pipeline(
                prompt=prompt,
                num_inference_steps=4,
@@ -1358,12 +1353,7 @@ def main(args):
                # estimates to predict the data point in the augmented PF-ODE trajectory corresponding to the next ODE
                # solver timestep.
                with torch.no_grad():
-                    if torch.backends.mps.is_available() or "playground" in args.pretrained_model_name_or_path:
-                        autocast_ctx = nullcontext()
-                    else:
-                        autocast_ctx = torch.autocast(accelerator.device.type)
-
-                    with autocast_ctx:
+                    with torch.autocast("cuda"):
                        # 1. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and conditional embedding c
                        cond_teacher_output = teacher_unet(
                            noisy_model_input.to(weight_dtype),
@@ -1426,12 +1416,7 @@ def main(args):
                # 9. Get target LCM prediction on x_prev, w, c, t_n (timesteps)
                # Note that we do not use a separate target network for LCM-LoRA distillation.
                with torch.no_grad():
-                    if torch.backends.mps.is_available():
-                        autocast_ctx = nullcontext()
-                    else:
-                        autocast_ctx = torch.autocast(accelerator.device.type, dtype=weight_dtype)
-
-                    with autocast_ctx:
+                    with torch.autocast("cuda", enabled=True, dtype=weight_dtype):
                        target_noise_pred = unet(
                            x_prev.float(),
                            timesteps,
@@ -23,7 +23,6 @@ import math
 import os
 import random
 import shutil
-from contextlib import nullcontext
 from pathlib import Path
 from typing import List, Union

@@ -253,12 +252,7 @@ def log_validation(vae, unet, args, accelerator, weight_dtype, step, name="targe

    for _, prompt in enumerate(validation_prompts):
        images = []
-        if torch.backends.mps.is_available():
-            autocast_ctx = nullcontext()
-        else:
-            autocast_ctx = torch.autocast(accelerator.device.type)
-
-        with autocast_ctx:
+        with torch.autocast("cuda"):
            images = pipeline(
                prompt=prompt,
                num_inference_steps=4,
@@ -945,7 +939,7 @@ def main(args):

    # 8. Create target student U-Net. This will be updated via EMA updates (polyak averaging).
    # Initialize from (online) unet
-    target_unet = UNet2DConditionModel.from_config(unet.config)
+    target_unet = UNet2DConditionModel(**teacher_unet.config)
    target_unet.load_state_dict(unet.state_dict())
    target_unet.train()
    target_unet.requires_grad_(False)
@@ -1263,12 +1257,7 @@ def main(args):
                # estimates to predict the data point in the augmented PF-ODE trajectory corresponding to the next ODE
                # solver timestep.
                with torch.no_grad():
-                    if torch.backends.mps.is_available():
-                        autocast_ctx = nullcontext()
-                    else:
-                        autocast_ctx = torch.autocast(accelerator.device.type)
-
-                    with autocast_ctx:
+                    with torch.autocast("cuda"):
                        # 1. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and conditional embedding c
                        cond_teacher_output = teacher_unet(
                            noisy_model_input.to(weight_dtype),
@@ -1326,12 +1315,7 @@ def main(args):

                # 9. Get target LCM prediction on x_prev, w, c, t_n (timesteps)
                with torch.no_grad():
-                    if torch.backends.mps.is_available():
-                        autocast_ctx = nullcontext()
-                    else:
-                        autocast_ctx = torch.autocast(accelerator.device.type, dtype=weight_dtype)
-
-                    with autocast_ctx:
+                    with torch.autocast("cuda", dtype=weight_dtype):
                        target_noise_pred = target_unet(
                            x_prev.float(),
                            timesteps,
@@ -24,7 +24,6 @@ import math
 import os
 import random
 import shutil
-from contextlib import nullcontext
 from pathlib import Path
 from typing import List, Union

@@ -271,12 +270,7 @@ def log_validation(vae, unet, args, accelerator, weight_dtype, step, name="targe

    for _, prompt in enumerate(validation_prompts):
        images = []
-        if torch.backends.mps.is_available():
-            autocast_ctx = nullcontext()
-        else:
-            autocast_ctx = torch.autocast(accelerator.device.type)
-
-        with autocast_ctx:
+        with torch.autocast("cuda"):
            images = pipeline(
                prompt=prompt,
                num_inference_steps=4,
@@ -1004,7 +998,7 @@ def main(args):

    # 8. Create target student U-Net. This will be updated via EMA updates (polyak averaging).
    # Initialize from (online) unet
-    target_unet = UNet2DConditionModel.from_config(unet.config)
+    target_unet = UNet2DConditionModel(**teacher_unet.config)
    target_unet.load_state_dict(unet.state_dict())
    target_unet.train()
    target_unet.requires_grad_(False)
@@ -1361,12 +1355,7 @@ def main(args):
                # estimates to predict the data point in the augmented PF-ODE trajectory corresponding to the next ODE
                # solver timestep.
                with torch.no_grad():
-                    if torch.backends.mps.is_available():
-                        autocast_ctx = nullcontext()
-                    else:
-                        autocast_ctx = torch.autocast(accelerator.device.type)
-
-                    with autocast_ctx:
+                    with torch.autocast("cuda"):
                        # 1. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and conditional embedding c
                        cond_teacher_output = teacher_unet(
                            noisy_model_input.to(weight_dtype),
@@ -1428,12 +1417,7 @@ def main(args):

                # 9. Get target LCM prediction on x_prev, w, c, t_n (timesteps)
                with torch.no_grad():
-                    if torch.backends.mps.is_available():
-                        autocast_ctx = nullcontext()
-                    else:
-                        autocast_ctx = torch.autocast(accelerator.device.type, dtype=weight_dtype)
-
-                    with autocast_ctx:
+                    with torch.autocast("cuda", dtype=weight_dtype):
                        target_noise_pred = target_unet(
                            x_prev.float(),
                            timesteps,
@@ -752,10 +752,6 @@ def main(args):
        project_config=accelerator_project_config,
    )

-    # Disable AMP for MPS.
-    if torch.backends.mps.is_available():
-        accelerator.native_amp = False
-
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -14,6 +14,7 @@
 # See the License for the specific language governing permissions and

 import argparse
+import contextlib
 import functools
 import gc
 import logging
@@ -21,7 +22,6 @@ import math
 import os
 import random
 import shutil
-from contextlib import nullcontext
 from pathlib import Path

 import accelerate
@@ -125,10 +125,11 @@ def log_validation(vae, unet, controlnet, args, accelerator, weight_dtype, step,
        )

    image_logs = []
-    if is_final_validation or torch.backends.mps.is_available():
-        autocast_ctx = nullcontext()
-    else:
-        autocast_ctx = torch.autocast(accelerator.device.type)
+    inference_ctx = (
+        contextlib.nullcontext()
+        if (is_final_validation or torch.backends.mps.is_available())
+        else torch.autocast("cuda")
+    )

    for validation_prompt, validation_image in zip(validation_prompts, validation_images):
        validation_image = Image.open(validation_image).convert("RGB")
@@ -137,7 +138,7 @@ def log_validation(vae, unet, controlnet, args, accelerator, weight_dtype, step,
        images = []

        for _ in range(args.num_validation_images):
-            with autocast_ctx:
+            with inference_ctx:
                image = pipeline(
                    prompt=validation_prompt, image=validation_image, num_inference_steps=20, generator=generator
                ).images[0]
@@ -810,10 +811,6 @@ def main(args):
        project_config=accelerator_project_config,
    )

-    # Disable AMP for MPS.
-    if torch.backends.mps.is_available():
-        accelerator.native_amp = False
-
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -676,10 +676,6 @@ def main(args):
        project_config=accelerator_project_config,
    )

-    # Disable AMP for MPS.
-    if torch.backends.mps.is_available():
-        accelerator.native_amp = False
-
    if args.report_to == "wandb":
        if not is_wandb_available():
            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
@@ -821,10 +821,6 @@ def main(args):
        project_config=accelerator_project_config,
    )

-    # Disable AMP for MPS.
-    if torch.backends.mps.is_available():
-        accelerator.native_amp = False
-
    if args.report_to == "wandb":
        if not is_wandb_available():
            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
@@ -749,10 +749,6 @@ def main(args):
        project_config=accelerator_project_config,
    )

-    # Disable AMP for MPS.
-    if torch.backends.mps.is_available():
-        accelerator.native_amp = False
-
    if args.report_to == "wandb":
        if not is_wandb_available():
            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
@@ -23,7 +23,6 @@ import os
 import random
 import shutil
 import warnings
-from contextlib import nullcontext
 from pathlib import Path

 import numpy as np
@@ -208,12 +207,18 @@ def log_validation(
    generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
    # Currently the context determination is a bit hand-wavy. We can improve it in the future if there's a better
    # way to condition it. Reference: https://github.com/huggingface/diffusers/pull/7126#issuecomment-1968523051
-    if torch.backends.mps.is_available() or "playground" in args.pretrained_model_name_or_path:
-        autocast_ctx = nullcontext()
-    else:
-        autocast_ctx = torch.autocast(accelerator.device.type)
+    enable_autocast = True
+    if torch.backends.mps.is_available() or (
+        accelerator.mixed_precision == "fp16" or accelerator.mixed_precision == "bf16"
+    ):
+        enable_autocast = False
+    if "playground" in args.pretrained_model_name_or_path:
+        enable_autocast = False

-    with autocast_ctx:
+    with torch.autocast(
+        accelerator.device.type,
+        enabled=enable_autocast,
+    ):
        images = [pipeline(**pipeline_args, generator=generator).images[0] for _ in range(args.num_validation_images)]

    for tracker in accelerator.trackers:
@@ -987,10 +992,6 @@ def main(args):
        kwargs_handlers=[kwargs],
    )

-    # Disable AMP for MPS.
-    if torch.backends.mps.is_available():
-        accelerator.native_amp = False
-
    if args.report_to == "wandb":
        if not is_wandb_available():
            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
@@ -21,7 +21,6 @@ import logging
 import math
 import os
 import shutil
-from contextlib import nullcontext
 from pathlib import Path

 import accelerate
@@ -53,9 +52,6 @@ from diffusers.utils.import_utils import is_xformers_available
 from diffusers.utils.torch_utils import is_compiled_module


-if is_wandb_available():
-    import wandb
-
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
 check_min_version("0.28.0.dev0")

@@ -67,48 +63,6 @@ DATASET_NAME_MAPPING = {
 WANDB_TABLE_COL_NAMES = ["original_image", "edited_image", "edit_prompt"]


-def log_validation(
-    pipeline,
-    args,
-    accelerator,
-    generator,
-):
-    logger.info(
-        f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
-        f" {args.validation_prompt}."
-    )
-    pipeline = pipeline.to(accelerator.device)
-    pipeline.set_progress_bar_config(disable=True)
-
-    # run inference
-    original_image = download_image(args.val_image_url)
-    edited_images = []
-    if torch.backends.mps.is_available():
-        autocast_ctx = nullcontext()
-    else:
-        autocast_ctx = torch.autocast(accelerator.device.type)
-
-    with autocast_ctx:
-        for _ in range(args.num_validation_images):
-            edited_images.append(
-                pipeline(
-                    args.validation_prompt,
-                    image=original_image,
-                    num_inference_steps=20,
-                    image_guidance_scale=1.5,
-                    guidance_scale=7,
-                    generator=generator,
-                ).images[0]
-            )
-
-    for tracker in accelerator.trackers:
-        if tracker.name == "wandb":
-            wandb_table = wandb.Table(columns=WANDB_TABLE_COL_NAMES)
-            for edited_image in edited_images:
-                wandb_table.add_data(wandb.Image(original_image), wandb.Image(edited_image), args.validation_prompt)
-            tracker.log({"validation": wandb_table})
-
-
 def parse_args():
    parser = argparse.ArgumentParser(description="Simple example of a training script for InstructPix2Pix.")
    parser.add_argument(
@@ -450,12 +404,13 @@ def main():
        project_config=accelerator_project_config,
    )

-    # Disable AMP for MPS.
-    if torch.backends.mps.is_available():
-        accelerator.native_amp = False
-
    generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)

+    if args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+        import wandb
+
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -557,8 +512,7 @@ def main():
                    model.save_pretrained(os.path.join(output_dir, "unet"))

                    # make sure to pop weight so that corresponding model is not saved again
-                    if weights:
-                        weights.pop()
+                    weights.pop()

        def load_model_hook(models, input_dir):
            if args.use_ema:
@@ -964,6 +918,11 @@ def main():
                and (args.validation_prompt is not None)
                and (epoch % args.validation_epochs == 0)
            ):
+                logger.info(
+                    f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+                    f" {args.validation_prompt}."
+                )
+                # create pipeline
                if args.use_ema:
                    # Store the UNet parameters temporarily and load the EMA parameters to perform inference.
                    ema_unet.store(unet.parameters())
@@ -978,14 +937,35 @@ def main():
                    variant=args.variant,
                    torch_dtype=weight_dtype,
                )
+                pipeline = pipeline.to(accelerator.device)
+                pipeline.set_progress_bar_config(disable=True)

-                log_validation(
-                    pipeline,
-                    args,
-                    accelerator,
-                    generator,
-                )
+                # run inference
+                original_image = download_image(args.val_image_url)
+                edited_images = []
+                with torch.autocast(
+                    str(accelerator.device).replace(":0", ""), enabled=accelerator.mixed_precision == "fp16"
+                ):
+                    for _ in range(args.num_validation_images):
+                        edited_images.append(
+                            pipeline(
+                                args.validation_prompt,
+                                image=original_image,
+                                num_inference_steps=20,
+                                image_guidance_scale=1.5,
+                                guidance_scale=7,
+                                generator=generator,
+                            ).images[0]
+                        )

+                for tracker in accelerator.trackers:
+                    if tracker.name == "wandb":
+                        wandb_table = wandb.Table(columns=WANDB_TABLE_COL_NAMES)
+                        for edited_image in edited_images:
+                            wandb_table.add_data(
+                                wandb.Image(original_image), wandb.Image(edited_image), args.validation_prompt
+                            )
+                        tracker.log({"validation": wandb_table})
                if args.use_ema:
                    # Switch back to the original UNet parameters.
                    ema_unet.restore(unet.parameters())
@@ -996,6 +976,7 @@ def main():
    # Create the pipeline using the trained modules and save it.
    accelerator.wait_for_everyone()
    if accelerator.is_main_process:
+        unet = unwrap_model(unet)
        if args.use_ema:
            ema_unet.copy_to(unet.parameters())

@@ -1003,7 +984,7 @@ def main():
            args.pretrained_model_name_or_path,
            text_encoder=unwrap_model(text_encoder),
            vae=unwrap_model(vae),
-            unet=unwrap_model(unet),
+            unet=unet,
            revision=args.revision,
            variant=args.variant,
        )
@@ -1017,13 +998,31 @@ def main():
                ignore_patterns=["step_*", "epoch_*"],
            )

-        if (args.val_image_url is not None) and (args.validation_prompt is not None):
-            log_validation(
-                pipeline,
-                args,
-                accelerator,
-                generator,
-            )
+        if args.validation_prompt is not None:
+            edited_images = []
+            pipeline = pipeline.to(accelerator.device)
+            with torch.autocast(str(accelerator.device).replace(":0", "")):
+                for _ in range(args.num_validation_images):
+                    edited_images.append(
+                        pipeline(
+                            args.validation_prompt,
+                            image=original_image,
+                            num_inference_steps=20,
+                            image_guidance_scale=1.5,
+                            guidance_scale=7,
+                            generator=generator,
+                        ).images[0]
+                    )
+
+            for tracker in accelerator.trackers:
+                if tracker.name == "wandb":
+                    wandb_table = wandb.Table(columns=WANDB_TABLE_COL_NAMES)
+                    for edited_image in edited_images:
+                        wandb_table.add_data(
+                            wandb.Image(original_image), wandb.Image(edited_image), args.validation_prompt
+                        )
+                    tracker.log({"test": wandb_table})
+
    accelerator.end_training()


@@ -20,7 +20,6 @@ import math
 import os
 import shutil
 import warnings
-from contextlib import nullcontext
 from pathlib import Path
 from urllib.parse import urlparse

@@ -71,7 +70,9 @@ WANDB_TABLE_COL_NAMES = ["file_name", "edited_image", "edit_prompt"]
 TORCH_DTYPE_MAPPING = {"fp32": torch.float32, "fp16": torch.float16, "bf16": torch.bfloat16}


-def log_validation(pipeline, args, accelerator, generator, global_step, is_final_validation=False):
+def log_validation(
+    pipeline, args, accelerator, generator, global_step, is_final_validation=False, enable_autocast=True
+):
    logger.info(
        f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
        f" {args.validation_prompt}."
@@ -90,12 +91,7 @@ def log_validation(pipeline, args, accelerator, generator, global_step, is_final
        else Image.open(image_url_or_path).convert("RGB")
    )(args.val_image_url_or_path)

-    if torch.backends.mps.is_available():
-        autocast_ctx = nullcontext()
-    else:
-        autocast_ctx = torch.autocast(accelerator.device.type)
-
-    with autocast_ctx:
+    with torch.autocast(accelerator.device.type, enabled=enable_autocast):
        edited_images = []
        # Run inference
        for val_img_idx in range(args.num_validation_images):
@@ -511,10 +507,6 @@ def main():
        project_config=accelerator_project_config,
    )

-    # Disable AMP for MPS.
-    if torch.backends.mps.is_available():
-        accelerator.native_amp = False
-
    generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)

    # Make one log on every process with the configuration for debugging.
@@ -991,6 +983,13 @@ def main():
    if accelerator.is_main_process:
        accelerator.init_trackers("instruct-pix2pix-xl", config=vars(args))

+    # Some configurations require autocast to be disabled.
+    enable_autocast = True
+    if torch.backends.mps.is_available() or (
+        accelerator.mixed_precision == "fp16" or accelerator.mixed_precision == "bf16"
+    ):
+        enable_autocast = False
+
    # Train!
    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

@@ -1203,6 +1202,7 @@ def main():
                        generator,
                        global_step,
                        is_final_validation=False,
+                        enable_autocast=enable_autocast,
                    )

                    if args.use_ema:
@@ -1252,6 +1252,7 @@ def main():
                generator,
                global_step,
                is_final_validation=True,
+                enable_autocast=enable_autocast,
            )

    accelerator.end_training()
@@ -458,10 +458,6 @@ def main():
        project_config=accelerator_project_config,
    )

-    # Disable AMP for MPS.
-    if torch.backends.mps.is_available():
-        accelerator.native_amp = False
-
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -343,11 +343,6 @@ def main():
        log_with=args.report_to,
        project_config=accelerator_project_config,
    )
-
-    # Disable AMP for MPS.
-    if torch.backends.mps.is_available():
-        accelerator.native_amp = False
-
    if args.report_to == "wandb":
        if not is_wandb_available():
            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
@@ -356,11 +356,6 @@ def main():
        log_with=args.report_to,
        project_config=accelerator_project_config,
    )
-
-    # Disable AMP for MPS.
-    if torch.backends.mps.is_available():
-        accelerator.native_amp = False
-
    if args.report_to == "wandb":
        if not is_wandb_available():
            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
@@ -459,10 +459,6 @@ def main():
        project_config=accelerator_project_config,
    )

-    # Disable AMP for MPS.
-    if torch.backends.mps.is_available():
-        accelerator.native_amp = False
-
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -916,10 +916,6 @@ def main(args):
        project_config=accelerator_project_config,
    )

-    # Disable AMP for MPS.
-    if torch.backends.mps.is_available():
-        accelerator.native_amp = False
-
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -1,15 +1,3 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
 # ControlNet-XS

 ControlNet-XS was introduced in [ControlNet-XS](https://vislearn.github.io/ControlNet-XS/) by Denis Zavadski and Carsten Rother. It is based on the observation that the control model in the [original ControlNet](https://huggingface.co/papers/2302.05543) can be made much smaller and still produce good results.
@@ -24,16 +12,5 @@ Here's the overview from the [project page](https://vislearn.github.io/ControlNe

 This model was contributed by [UmerHA](https://twitter.com/UmerHAdil). ❤️

-<Tip>

-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
-
-## StableDiffusionControlNetXSPipeline
-[[autodoc]] StableDiffusionControlNetXSPipeline
-	- all
-	- __call__
-
-## StableDiffusionPipelineOutput
-[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput
+> 🧠 Make sure to check out the Schedulers [guide](https://huggingface.co/docs/diffusers/main/en/using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](https://huggingface.co/docs/diffusers/main/en/using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
@@ -1,15 +1,3 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
 # ControlNet-XS with Stable Diffusion XL

 ControlNet-XS was introduced in [ControlNet-XS](https://vislearn.github.io/ControlNet-XS/) by Denis Zavadski and Carsten Rother. It is based on the observation that the control model in the [original ControlNet](https://huggingface.co/papers/2302.05543) can be made much smaller and still produce good results.
@@ -24,22 +12,4 @@ Here's the overview from the [project page](https://vislearn.github.io/ControlNe

 This model was contributed by [UmerHA](https://twitter.com/UmerHAdil). ❤️

-<Tip warning={true}>
-
-🧪 Many of the SDXL ControlNet checkpoints are experimental, and there is a lot of room for improvement. Feel free to open an [Issue](https://github.com/huggingface/diffusers/issues/new/choose) and leave us feedback on how we can improve!
-
-</Tip>
-
-<Tip>
-
-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
-
-## StableDiffusionXLControlNetXSPipeline
-[[autodoc]] StableDiffusionXLControlNetXSPipeline
-	- all
-	- __call__
-
-## StableDiffusionPipelineOutput
-[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput
+> 🧠 Make sure to check out the Schedulers [guide](https://huggingface.co/docs/diffusers/main/en/using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](https://huggingface.co/docs/diffusers/main/en/using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
@@ -0,0 +1,65 @@
+# !pip install opencv-python transformers accelerate
+#
+# Example inference script: extracts Canny edges from an input image and uses them to
+# condition a ControlNet-XS model running on top of the Stable Diffusion 2.1 checkpoint.
+# The generated image is saved as `cnxs_sd.canny.png` in the current working directory.
+import argparse
+
+import cv2
+import numpy as np
+import torch
+from controlnetxs import ControlNetXSModel
+from PIL import Image
+from pipeline_controlnet_xs import StableDiffusionControlNetXSPipeline
+
+from diffusers.utils import load_image
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--prompt", type=str, default="aerial view, a futuristic research complex in a bright foggy jungle, hard lighting"
+)
+parser.add_argument("--negative_prompt", type=str, default="low quality, bad quality, sketches")
+parser.add_argument("--controlnet_conditioning_scale", type=float, default=0.7)
+parser.add_argument(
+    "--image_path",
+    type=str,
+    default="https://hf.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png",
+)
+parser.add_argument("--num_inference_steps", type=int, default=50)
+
+args = parser.parse_args()
+
+prompt = args.prompt
+negative_prompt = args.negative_prompt
+# download an image
+image = load_image(args.image_path)
+
+# initialize the models and pipeline
+controlnet_conditioning_scale = args.controlnet_conditioning_scale
+controlnet = ControlNetXSModel.from_pretrained("UmerHA/ConrolNetXS-SD2.1-canny", torch_dtype=torch.float16)
+pipe = StableDiffusionControlNetXSPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-2-1", controlnet=controlnet, torch_dtype=torch.float16
+)
+pipe.enable_model_cpu_offload()
+
+# get canny image
+# cv2.Canny returns a single-channel edge map; replicate it into three identical channels
+image = np.array(image)
+image = cv2.Canny(image, 100, 200)
+image = image[:, :, None]
+image = np.concatenate([image, image, image], axis=2)
+canny_image = Image.fromarray(image)
+
+num_inference_steps = args.num_inference_steps
+
+# generate image
+# forward `negative_prompt` so that the `--negative_prompt` CLI option actually takes effect
+image = pipe(
+    prompt,
+    negative_prompt=negative_prompt,
+    controlnet_conditioning_scale=controlnet_conditioning_scale,
+    image=canny_image,
+    num_inference_steps=num_inference_steps,
+).images[0]
+image.save("cnxs_sd.canny.png")
@@ -0,0 +1,66 @@
+# !pip install opencv-python transformers accelerate
+#
+# Example inference script: extracts Canny edges from an input image and uses them to
+# condition a ControlNet-XS model running on top of the Stable Diffusion XL base checkpoint.
+# The generated image is saved as `cnxs_sdxl.canny.png` in the current working directory.
+import argparse
+
+import cv2
+import numpy as np
+import torch
+from controlnetxs import ControlNetXSModel
+from PIL import Image
+from pipeline_controlnet_xs import StableDiffusionControlNetXSPipeline
+
+from diffusers.utils import load_image
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--prompt", type=str, default="aerial view, a futuristic research complex in a bright foggy jungle, hard lighting"
+)
+parser.add_argument("--negative_prompt", type=str, default="low quality, bad quality, sketches")
+parser.add_argument("--controlnet_conditioning_scale", type=float, default=0.7)
+parser.add_argument(
+    "--image_path",
+    type=str,
+    default="https://hf.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png",
+)
+parser.add_argument("--num_inference_steps", type=int, default=50)
+
+args = parser.parse_args()
+
+prompt = args.prompt
+negative_prompt = args.negative_prompt
+# download an image
+image = load_image(args.image_path)
+# initialize the models and pipeline
+controlnet_conditioning_scale = args.controlnet_conditioning_scale
+controlnet = ControlNetXSModel.from_pretrained("UmerHA/ConrolNetXS-SDXL-canny", torch_dtype=torch.float16)
+# NOTE(review): an SDXL checkpoint is loaded through the non-XL `StableDiffusionControlNetXSPipeline`;
+# presumably the SDXL ControlNet-XS pipeline class is intended here -- confirm before relying on this script.
+pipe = StableDiffusionControlNetXSPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16
+)
+pipe.enable_model_cpu_offload()
+
+# get canny image
+# cv2.Canny returns a single-channel edge map; replicate it into three identical channels
+image = np.array(image)
+image = cv2.Canny(image, 100, 200)
+image = image[:, :, None]
+image = np.concatenate([image, image, image], axis=2)
+canny_image = Image.fromarray(image)
+
+num_inference_steps = args.num_inference_steps
+
+# generate image
+# forward `negative_prompt` so that the `--negative_prompt` CLI option actually takes effect
+image = pipe(
+    prompt,
+    negative_prompt=negative_prompt,
+    controlnet_conditioning_scale=controlnet_conditioning_scale,
+    image=canny_image,
+    num_inference_steps=num_inference_steps,
+).images[0]
+image.save("cnxs_sdxl.canny.png")
@@ -19,75 +19,30 @@ import numpy as np
 import PIL.Image
 import torch
 import torch.nn.functional as F
+from controlnetxs import ControlNetXSModel
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer

-from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
-from ...models import AutoencoderKL, ControlNetXSAdapter, UNet2DConditionModel, UNetControlNetXSModel
-from ...models.lora import adjust_lora_scale_text_encoder
-from ...schedulers import KarrasDiffusionSchedulers
-from ...utils import (
+from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
+from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
    USE_PEFT_BACKEND,
    deprecate,
    logging,
-    replace_example_docstring,
    scale_lora_layers,
    unscale_lora_layers,
 )
-from ...utils.torch_utils import is_compiled_module, is_torch_version, randn_tensor
-from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
-from ..stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
-from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.utils.torch_utils import is_compiled_module, is_torch_version, randn_tensor


 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


-EXAMPLE_DOC_STRING = """
-    Examples:
-        ```py
-        >>> # !pip install opencv-python transformers accelerate
-        >>> from diffusers import StableDiffusionControlNetXSPipeline, ControlNetXSAdapter
-        >>> from diffusers.utils import load_image
-        >>> import numpy as np
-        >>> import torch
-
-        >>> import cv2
-        >>> from PIL import Image
-
-        >>> prompt = "aerial view, a futuristic research complex in a bright foggy jungle, hard lighting"
-        >>> negative_prompt = "low quality, bad quality, sketches"
-
-        >>> # download an image
-        >>> image = load_image(
-        ...     "https://hf.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png"
-        ... )
-
-        >>> # initialize the models and pipeline
-        >>> controlnet_conditioning_scale = 0.5
-
-        >>> controlnet = ControlNetXSAdapter.from_pretrained(
-        ...     "UmerHA/Testing-ConrolNetXS-SD2.1-canny", torch_dtype=torch.float16
-        ... )
-        >>> pipe = StableDiffusionControlNetXSPipeline.from_pretrained(
-        ...     "stabilityai/stable-diffusion-2-1-base", controlnet=controlnet, torch_dtype=torch.float16
-        ... )
-        >>> pipe.enable_model_cpu_offload()
-
-        >>> # get canny image
-        >>> image = np.array(image)
-        >>> image = cv2.Canny(image, 100, 200)
-        >>> image = image[:, :, None]
-        >>> image = np.concatenate([image, image, image], axis=2)
-        >>> canny_image = Image.fromarray(image)
-        >>> # generate image
-        >>> image = pipe(
-        ...     prompt, controlnet_conditioning_scale=controlnet_conditioning_scale, image=canny_image
-        ... ).images[0]
-        ```
-"""
-
-
 class StableDiffusionControlNetXSPipeline(
    DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
 ):
@@ -101,7 +56,7 @@ class StableDiffusionControlNetXSPipeline(
        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
-        - [`loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files

    Args:
        vae ([`AutoencoderKL`]):
@@ -111,9 +66,9 @@ class StableDiffusionControlNetXSPipeline(
        tokenizer ([`~transformers.CLIPTokenizer`]):
            A `CLIPTokenizer` to tokenize text.
        unet ([`UNet2DConditionModel`]):
-            A [`UNet2DConditionModel`] used to create a UNetControlNetXSModel to denoise the encoded image latents.
-        controlnet ([`ControlNetXSAdapter`]):
-            A [`ControlNetXSAdapter`] to be used in combination with `unet` to denoise the encoded image latents.
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        controlnet ([`ControlNetXSModel`]):
+            Provides additional conditioning to the `unet` during the denoising process.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
@@ -125,18 +80,17 @@ class StableDiffusionControlNetXSPipeline(
            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
    """

-    model_cpu_offload_seq = "text_encoder->unet->vae"
+    model_cpu_offload_seq = "text_encoder->unet->vae->controlnet"  # stage names are separated by "->"
    _optional_components = ["safety_checker", "feature_extractor"]
    _exclude_from_cpu_offload = ["safety_checker"]
-    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]

    def __init__(
        self,
        vae: AutoencoderKL,
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
-        unet: Union[UNet2DConditionModel, UNetControlNetXSModel],
-        controlnet: ControlNetXSAdapter,
+        unet: UNet2DConditionModel,
+        controlnet: ControlNetXSModel,
        scheduler: KarrasDiffusionSchedulers,
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPImageProcessor,
@@ -144,9 +98,6 @@ class StableDiffusionControlNetXSPipeline(
    ):
        super().__init__()

-        if isinstance(unet, UNet2DConditionModel):
-            unet = UNetControlNetXSModel.from_unet(unet, controlnet)
-
        if safety_checker is None and requires_safety_checker:
            logger.warning(
                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
@@ -163,6 +114,14 @@ class StableDiffusionControlNetXSPipeline(
                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
            )

+        vae_compatible, cnxs_condition_downsample_factor, vae_downsample_factor = controlnet._check_if_vae_compatible(
+            vae
+        )
+        if not vae_compatible:
+            raise ValueError(
+                f"The downsampling factors of the VAE ({vae_downsample_factor}) and the conditioning part of ControlNetXS model {cnxs_condition_downsample_factor} need to be equal. Consider building the ControlNetXS model with different `conditioning_block_sizes`."
+            )
+
        self.register_modules(
            vae=vae,
            text_encoder=text_encoder,
@@ -444,19 +403,20 @@ class StableDiffusionControlNetXSPipeline(
        self,
        prompt,
        image,
+        callback_steps,
        negative_prompt=None,
        prompt_embeds=None,
        negative_prompt_embeds=None,
        controlnet_conditioning_scale=1.0,
        control_guidance_start=0.0,
        control_guidance_end=1.0,
-        callback_on_step_end_tensor_inputs=None,
    ):
-        if callback_on_step_end_tensor_inputs is not None and not all(
-            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
        ):
            raise ValueError(
-                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
            )

        if prompt is not None and prompt_embeds is not None:
@@ -485,16 +445,25 @@ class StableDiffusionControlNetXSPipeline(
                    f" {negative_prompt_embeds.shape}."
                )

-        # Check `image` and `controlnet_conditioning_scale`
+        # Check `image`
        is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
-            self.unet, torch._dynamo.eval_frame.OptimizedModule
+            self.controlnet, torch._dynamo.eval_frame.OptimizedModule
        )
        if (
-            isinstance(self.unet, UNetControlNetXSModel)
+            isinstance(self.controlnet, ControlNetXSModel)
            or is_compiled
-            and isinstance(self.unet._orig_mod, UNetControlNetXSModel)
+            and isinstance(self.controlnet._orig_mod, ControlNetXSModel)
        ):
            self.check_image(image, prompt, prompt_embeds)
+        else:
+            assert False
+
+        # Check `controlnet_conditioning_scale`
+        if (
+            isinstance(self.controlnet, ControlNetXSModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetXSModel)
+        ):
            if not isinstance(controlnet_conditioning_scale, float):
                raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
        else:
@@ -578,12 +547,7 @@ class StableDiffusionControlNetXSPipeline(

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (
-            batch_size,
-            num_channels_latents,
-            int(height) // self.vae_scale_factor,
-            int(width) // self.vae_scale_factor,
-        )
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -599,33 +563,7 @@ class StableDiffusionControlNetXSPipeline(
        latents = latents * self.scheduler.init_noise_sigma
        return latents

-    @property
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.guidance_scale
-    def guidance_scale(self):
-        return self._guidance_scale
-
-    @property
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.clip_skip
-    def clip_skip(self):
-        return self._clip_skip
-
-    @property
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.do_classifier_free_guidance
-    def do_classifier_free_guidance(self):
-        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
-
-    @property
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.cross_attention_kwargs
-    def cross_attention_kwargs(self):
-        return self._cross_attention_kwargs
-
-    @property
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.num_timesteps
-    def num_timesteps(self):
-        return self._num_timesteps
-
    @torch.no_grad()
-    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
        self,
        prompt: Union[str, List[str]] = None,
@@ -643,13 +581,13 @@ class StableDiffusionControlNetXSPipeline(
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: int = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
        control_guidance_start: float = 0.0,
        control_guidance_end: float = 1.0,
        clip_skip: Optional[int] = None,
-        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
-        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
    ):
        r"""
        The call function to the pipeline for generation.
@@ -657,7 +595,7 @@ class StableDiffusionControlNetXSPipeline(
        Args:
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
-            image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
+            image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,
                    `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
                The ControlNet input condition to provide guidance to the `unet` for generation. If the type is
                specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be
@@ -701,6 +639,12 @@ class StableDiffusionControlNetXSPipeline(
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
@@ -715,15 +659,7 @@ class StableDiffusionControlNetXSPipeline(
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
-            callback_on_step_end (`Callable`, *optional*):
-                A function that calls at the end of each denoising steps during the inference. The function is called
-                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
-                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
-                `callback_on_step_end_tensor_inputs`.
-            callback_on_step_end_tensor_inputs (`List`, *optional*):
-                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
-                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
-                `._callback_tensor_inputs` attribute of your pipeine class.
+
        Examples:

        Returns:
@@ -733,27 +669,21 @@ class StableDiffusionControlNetXSPipeline(
                second element is a list of `bool`s indicating whether the corresponding generated image contains
                "not-safe-for-work" (nsfw) content.
        """
-
-        unet = self.unet._orig_mod if is_compiled_module(self.unet) else self.unet
+        controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(
            prompt,
            image,
+            callback_steps,
            negative_prompt,
            prompt_embeds,
            negative_prompt_embeds,
            controlnet_conditioning_scale,
            control_guidance_start,
            control_guidance_end,
-            callback_on_step_end_tensor_inputs,
        )

-        self._guidance_scale = guidance_scale
-        self._clip_skip = clip_skip
-        self._cross_attention_kwargs = cross_attention_kwargs
-        self._interrupt = False
-
        # 2. Define call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
@@ -783,7 +713,6 @@ class StableDiffusionControlNetXSPipeline(
            lora_scale=text_encoder_lora_scale,
            clip_skip=clip_skip,
        )
-
        # For classifier free guidance, we need to do two forward passes.
        # Here we concatenate the unconditional and text embeddings into a single batch
        # to avoid doing two forward passes
@@ -791,24 +720,27 @@ class StableDiffusionControlNetXSPipeline(
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])

        # 4. Prepare image
-        image = self.prepare_image(
-            image=image,
-            width=width,
-            height=height,
-            batch_size=batch_size * num_images_per_prompt,
-            num_images_per_prompt=num_images_per_prompt,
-            device=device,
-            dtype=unet.dtype,
-            do_classifier_free_guidance=do_classifier_free_guidance,
-        )
-        height, width = image.shape[-2:]
+        if isinstance(controlnet, ControlNetXSModel):
+            image = self.prepare_image(
+                image=image,
+                width=width,
+                height=height,
+                batch_size=batch_size * num_images_per_prompt,
+                num_images_per_prompt=num_images_per_prompt,
+                device=device,
+                dtype=controlnet.dtype,
+                do_classifier_free_guidance=do_classifier_free_guidance,
+            )
+            height, width = image.shape[-2:]
+        else:
+            assert False

        # 5. Prepare timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps

        # 6. Prepare latent variables
-        num_channels_latents = self.unet.in_channels
+        num_channels_latents = self.unet.config.in_channels
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
@@ -825,33 +757,42 @@ class StableDiffusionControlNetXSPipeline(

        # 8. Denoising loop
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
-        self._num_timesteps = len(timesteps)
-        is_controlnet_compiled = is_compiled_module(self.unet)
+        is_unet_compiled = is_compiled_module(self.unet)
+        is_controlnet_compiled = is_compiled_module(self.controlnet)
        is_torch_higher_equal_2_1 = is_torch_version(">=", "2.1")
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                # Relevant thread:
                # https://dev-discuss.pytorch.org/t/cudagraphs-in-pytorch-2-0/1428
-                if is_controlnet_compiled and is_torch_higher_equal_2_1:
+                if (is_unet_compiled and is_controlnet_compiled) and is_torch_higher_equal_2_1:
                    torch._inductor.cudagraph_mark_step_begin()
                # expand the latents if we are doing classifier free guidance
                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                # predict the noise residual
-                apply_control = (
-                    i / len(timesteps) >= control_guidance_start and (i + 1) / len(timesteps) <= control_guidance_end
+                dont_control = (
+                    i / len(timesteps) < control_guidance_start or (i + 1) / len(timesteps) > control_guidance_end
                )
-                noise_pred = self.unet(
-                    sample=latent_model_input,
-                    timestep=t,
-                    encoder_hidden_states=prompt_embeds,
-                    controlnet_cond=image,
-                    conditioning_scale=controlnet_conditioning_scale,
-                    cross_attention_kwargs=cross_attention_kwargs,
-                    return_dict=True,
-                    apply_control=apply_control,
-                ).sample
+                if dont_control:
+                    noise_pred = self.unet(
+                        sample=latent_model_input,
+                        timestep=t,
+                        encoder_hidden_states=prompt_embeds,
+                        cross_attention_kwargs=cross_attention_kwargs,
+                        return_dict=True,
+                    ).sample
+                else:
+                    noise_pred = self.controlnet(
+                        base_model=self.unet,
+                        sample=latent_model_input,
+                        timestep=t,
+                        encoder_hidden_states=prompt_embeds,
+                        controlnet_cond=image,
+                        conditioning_scale=controlnet_conditioning_scale,
+                        cross_attention_kwargs=cross_attention_kwargs,
+                        return_dict=True,
+                    ).sample

                # perform guidance
                if do_classifier_free_guidance:
@@ -860,18 +801,12 @@ class StableDiffusionControlNetXSPipeline(

                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]

-                if callback_on_step_end is not None:
-                    callback_kwargs = {}
-                    for k in callback_on_step_end_tensor_inputs:
-                        callback_kwargs[k] = locals()[k]
-                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
-
-                    latents = callback_outputs.pop("latents", latents)
-                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
-                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
-
+                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)

        # If we do sequential model offloading, let's offload unet and controlnet
        # manually for max memory savings
@@ -19,93 +19,41 @@ import numpy as np
 import PIL.Image
 import torch
 import torch.nn.functional as F
-from transformers import (
-    CLIPImageProcessor,
-    CLIPTextModel,
-    CLIPTextModelWithProjection,
-    CLIPTokenizer,
-)
+from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer

-from diffusers.utils.import_utils import is_invisible_watermark_available
-
-from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin
-from ...models import AutoencoderKL, ControlNetXSAdapter, UNet2DConditionModel, UNetControlNetXSModel
-from ...models.attention_processor import (
+from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
+from diffusers.loaders import FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.models import AutoencoderKL, ControlNetXSModel, UNet2DConditionModel
+from diffusers.models.attention_processor import (
    AttnProcessor2_0,
    LoRAAttnProcessor2_0,
    LoRAXFormersAttnProcessor,
    XFormersAttnProcessor,
 )
-from ...models.lora import adjust_lora_scale_text_encoder
-from ...schedulers import KarrasDiffusionSchedulers
-from ...utils import (
+from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
    USE_PEFT_BACKEND,
    logging,
-    replace_example_docstring,
    scale_lora_layers,
    unscale_lora_layers,
 )
-from ...utils.torch_utils import is_compiled_module, is_torch_version, randn_tensor
-from ..pipeline_utils import DiffusionPipeline
-from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
+from diffusers.utils.import_utils import is_invisible_watermark_available
+from diffusers.utils.torch_utils import is_compiled_module, is_torch_version, randn_tensor


 if is_invisible_watermark_available():
-    from ..stable_diffusion_xl.watermark import StableDiffusionXLWatermarker
+    from diffusers.pipelines.stable_diffusion_xl.watermark import StableDiffusionXLWatermarker


 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


-EXAMPLE_DOC_STRING = """
-    Examples:
-        ```py
-        >>> # !pip install opencv-python transformers accelerate
-        >>> from diffusers import StableDiffusionXLControlNetXSPipeline, ControlNetXSAdapter, AutoencoderKL
-        >>> from diffusers.utils import load_image
-        >>> import numpy as np
-        >>> import torch
-
-        >>> import cv2
-        >>> from PIL import Image
-
-        >>> prompt = "aerial view, a futuristic research complex in a bright foggy jungle, hard lighting"
-        >>> negative_prompt = "low quality, bad quality, sketches"
-
-        >>> # download an image
-        >>> image = load_image(
-        ...     "https://hf.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png"
-        ... )
-
-        >>> # initialize the models and pipeline
-        >>> controlnet_conditioning_scale = 0.5
-        >>> vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
-        >>> controlnet = ControlNetXSAdapter.from_pretrained(
-        ...     "UmerHA/Testing-ConrolNetXS-SDXL-canny", torch_dtype=torch.float16
-        ... )
-        >>> pipe = StableDiffusionXLControlNetXSPipeline.from_pretrained(
-        ...     "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16
-        ... )
-        >>> pipe.enable_model_cpu_offload()
-
-        >>> # get canny image
-        >>> image = np.array(image)
-        >>> image = cv2.Canny(image, 100, 200)
-        >>> image = image[:, :, None]
-        >>> image = np.concatenate([image, image, image], axis=2)
-        >>> canny_image = Image.fromarray(image)
-
-        >>> # generate image
-        >>> image = pipe(
-        ...     prompt, controlnet_conditioning_scale=controlnet_conditioning_scale, image=canny_image
-        ... ).images[0]
-        ```
-"""
-
-
 class StableDiffusionXLControlNetXSPipeline(
    DiffusionPipeline,
+    StableDiffusionMixin,
    TextualInversionLoaderMixin,
    StableDiffusionXLLoraLoaderMixin,
    FromSingleFileMixin,
@@ -118,8 +66,9 @@ class StableDiffusionXLControlNetXSPipeline(

    The pipeline also inherits the following loading methods:
        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
-        - [`loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-        - [`loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files

    Args:
        vae ([`AutoencoderKL`]):
@@ -134,9 +83,9 @@ class StableDiffusionXLControlNetXSPipeline(
        tokenizer_2 ([`~transformers.CLIPTokenizer`]):
            A `CLIPTokenizer` to tokenize text.
        unet ([`UNet2DConditionModel`]):
-            A [`UNet2DConditionModel`] used to create a UNetControlNetXSModel to denoise the encoded image latents.
-        controlnet ([`ControlNetXSAdapter`]):
-            A [`ControlNetXSAdapter`] to be used in combination with `unet` to denoise the encoded image latents.
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        controlnet ([`ControlNetXSModel`]):
+            Provides additional conditioning to the `unet` during the denoising process.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
@@ -149,15 +98,9 @@ class StableDiffusionXLControlNetXSPipeline(
            watermarker is used.
    """

-    model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
-    _optional_components = [
-        "tokenizer",
-        "tokenizer_2",
-        "text_encoder",
-        "text_encoder_2",
-        "feature_extractor",
-    ]
-    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
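+    # NOTE(review): controlnet is listed last in the offload sequence although it is called together with unet in the denoising loop - confirm this ordering is intended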
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae->controlnet"
+    _optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2"]

    def __init__(
        self,
@@ -166,17 +109,21 @@ class StableDiffusionXLControlNetXSPipeline(
        text_encoder_2: CLIPTextModelWithProjection,
        tokenizer: CLIPTokenizer,
        tokenizer_2: CLIPTokenizer,
-        unet: Union[UNet2DConditionModel, UNetControlNetXSModel],
-        controlnet: ControlNetXSAdapter,
+        unet: UNet2DConditionModel,
+        controlnet: ControlNetXSModel,
        scheduler: KarrasDiffusionSchedulers,
        force_zeros_for_empty_prompt: bool = True,
        add_watermarker: Optional[bool] = None,
-        feature_extractor: CLIPImageProcessor = None,
    ):
        super().__init__()

-        if isinstance(unet, UNet2DConditionModel):
-            unet = UNetControlNetXSModel.from_unet(unet, controlnet)
+        vae_compatible, cnxs_condition_downsample_factor, vae_downsample_factor = controlnet._check_if_vae_compatible(
+            vae
+        )
+        if not vae_compatible:
+            raise ValueError(
+                f"The downsampling factors of the VAE ({vae_downsample_factor}) and the conditioning part of ControlNetXS model {cnxs_condition_downsample_factor} need to be equal. Consider building the ControlNetXS model with different `conditioning_block_sizes`."
+            )

        self.register_modules(
            vae=vae,
@@ -187,7 +134,6 @@ class StableDiffusionXLControlNetXSPipeline(
            unet=unet,
            controlnet=controlnet,
            scheduler=scheduler,
-            feature_extractor=feature_extractor,
        )
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
@@ -461,6 +407,7 @@ class StableDiffusionXLControlNetXSPipeline(
        prompt,
        prompt_2,
        image,
+        callback_steps,
        negative_prompt=None,
        negative_prompt_2=None,
        prompt_embeds=None,
@@ -470,13 +417,13 @@ class StableDiffusionXLControlNetXSPipeline(
        controlnet_conditioning_scale=1.0,
        control_guidance_start=0.0,
        control_guidance_end=1.0,
-        callback_on_step_end_tensor_inputs=None,
    ):
-        if callback_on_step_end_tensor_inputs is not None and not all(
-            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
        ):
            raise ValueError(
-                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
            )

        if prompt is not None and prompt_embeds is not None:
@@ -527,16 +474,25 @@ class StableDiffusionXLControlNetXSPipeline(
                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
            )

-        # Check `image` and ``controlnet_conditioning_scale``
+        # Check `image`
        is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
-            self.unet, torch._dynamo.eval_frame.OptimizedModule
+            self.controlnet, torch._dynamo.eval_frame.OptimizedModule
        )
        if (
-            isinstance(self.unet, UNetControlNetXSModel)
+            isinstance(self.controlnet, ControlNetXSModel)
            or is_compiled
-            and isinstance(self.unet._orig_mod, UNetControlNetXSModel)
+            and isinstance(self.controlnet._orig_mod, ControlNetXSModel)
        ):
            self.check_image(image, prompt, prompt_embeds)
+        else:
+            assert False
+
+        # Check `controlnet_conditioning_scale`
+        if (
+            isinstance(self.controlnet, ControlNetXSModel)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetXSModel)
+        ):
            if not isinstance(controlnet_conditioning_scale, float):
                raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
        else:
@@ -621,12 +577,7 @@ class StableDiffusionXLControlNetXSPipeline(

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (
-            batch_size,
-            num_channels_latents,
-            int(height) // self.vae_scale_factor,
-            int(width) // self.vae_scale_factor,
-        )
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -642,6 +593,7 @@ class StableDiffusionXLControlNetXSPipeline(
        latents = latents * self.scheduler.init_noise_sigma
        return latents

+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._get_add_time_ids
    def _get_add_time_ids(
        self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None
    ):
@@ -650,7 +602,7 @@ class StableDiffusionXLControlNetXSPipeline(
        passed_add_embed_dim = (
            self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
        )
-        expected_add_embed_dim = self.unet.base_add_embedding.linear_1.in_features
+        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features

        if expected_add_embed_dim != passed_add_embed_dim:
            raise ValueError(
@@ -680,33 +632,7 @@ class StableDiffusionXLControlNetXSPipeline(
            self.vae.decoder.conv_in.to(dtype)
            self.vae.decoder.mid_block.to(dtype)

-    @property
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.guidance_scale
-    def guidance_scale(self):
-        return self._guidance_scale
-
-    @property
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.clip_skip
-    def clip_skip(self):
-        return self._clip_skip
-
-    @property
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.do_classifier_free_guidance
-    def do_classifier_free_guidance(self):
-        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
-
-    @property
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.cross_attention_kwargs
-    def cross_attention_kwargs(self):
-        return self._cross_attention_kwargs
-
-    @property
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.num_timesteps
-    def num_timesteps(self):
-        return self._num_timesteps
-
    @torch.no_grad()
-    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
        self,
        prompt: Union[str, List[str]] = None,
@@ -728,6 +654,8 @@ class StableDiffusionXLControlNetXSPipeline(
        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: int = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
        control_guidance_start: float = 0.0,
@@ -739,8 +667,6 @@ class StableDiffusionXLControlNetXSPipeline(
        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
        negative_target_size: Optional[Tuple[int, int]] = None,
        clip_skip: Optional[int] = None,
-        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
-        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
    ):
        r"""
        The call function to the pipeline for generation.
@@ -751,7 +677,7 @@ class StableDiffusionXLControlNetXSPipeline(
            prompt_2 (`str` or `List[str]`, *optional*):
                The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
                used in both text-encoders.
-            image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
+            image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,
                    `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
                The ControlNet input condition to provide guidance to the `unet` for generation. If the type is
                specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be
@@ -809,6 +735,12 @@ class StableDiffusionXLControlNetXSPipeline(
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that is called every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
@@ -851,15 +783,6 @@ class StableDiffusionXLControlNetXSPipeline(
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
-            callback_on_step_end (`Callable`, *optional*):
-                A function that calls at the end of each denoising steps during the inference. The function is called
-                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
-                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
-                `callback_on_step_end_tensor_inputs`.
-            callback_on_step_end_tensor_inputs (`List`, *optional*):
-                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
-                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
-                `._callback_tensor_inputs` attribute of your pipeine class.

        Examples:

@@ -868,14 +791,14 @@ class StableDiffusionXLControlNetXSPipeline(
                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] is
                returned, otherwise a `tuple` is returned containing the output images.
        """
-
-        unet = self.unet._orig_mod if is_compiled_module(self.unet) else self.unet
+        controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(
            prompt,
            prompt_2,
            image,
+            callback_steps,
            negative_prompt,
            negative_prompt_2,
            prompt_embeds,
@@ -885,14 +808,8 @@ class StableDiffusionXLControlNetXSPipeline(
            controlnet_conditioning_scale,
            control_guidance_start,
            control_guidance_end,
-            callback_on_step_end_tensor_inputs,
        )

-        self._guidance_scale = guidance_scale
-        self._clip_skip = clip_skip
-        self._cross_attention_kwargs = cross_attention_kwargs
-        self._interrupt = False
-
        # 2. Define call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
@@ -933,7 +850,7 @@ class StableDiffusionXLControlNetXSPipeline(
        )

        # 4. Prepare image
-        if isinstance(unet, UNetControlNetXSModel):
+        if isinstance(controlnet, ControlNetXSModel):
            image = self.prepare_image(
                image=image,
                width=width,
@@ -941,7 +858,7 @@ class StableDiffusionXLControlNetXSPipeline(
                batch_size=batch_size * num_images_per_prompt,
                num_images_per_prompt=num_images_per_prompt,
                device=device,
-                dtype=unet.dtype,
+                dtype=controlnet.dtype,
                do_classifier_free_guidance=do_classifier_free_guidance,
            )
            height, width = image.shape[-2:]
@@ -953,7 +870,7 @@ class StableDiffusionXLControlNetXSPipeline(
        timesteps = self.scheduler.timesteps

        # 6. Prepare latent variables
-        num_channels_latents = self.unet.in_channels
+        num_channels_latents = self.unet.config.in_channels
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
@@ -1011,14 +928,14 @@ class StableDiffusionXLControlNetXSPipeline(

        # 8. Denoising loop
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
-        self._num_timesteps = len(timesteps)
-        is_controlnet_compiled = is_compiled_module(self.unet)
+        is_unet_compiled = is_compiled_module(self.unet)
+        is_controlnet_compiled = is_compiled_module(self.controlnet)
        is_torch_higher_equal_2_1 = is_torch_version(">=", "2.1")
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                # Relevant thread:
                # https://dev-discuss.pytorch.org/t/cudagraphs-in-pytorch-2-0/1428
-                if is_controlnet_compiled and is_torch_higher_equal_2_1:
+                if (is_unet_compiled and is_controlnet_compiled) and is_torch_higher_equal_2_1:
                    torch._inductor.cudagraph_mark_step_begin()
                # expand the latents if we are doing classifier free guidance
                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
@@ -1027,20 +944,30 @@ class StableDiffusionXLControlNetXSPipeline(
                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}

                # predict the noise residual
-                apply_control = (
-                    i / len(timesteps) >= control_guidance_start and (i + 1) / len(timesteps) <= control_guidance_end
+                dont_control = (
+                    i / len(timesteps) < control_guidance_start or (i + 1) / len(timesteps) > control_guidance_end
                )
-                noise_pred = self.unet(
-                    sample=latent_model_input,
-                    timestep=t,
-                    encoder_hidden_states=prompt_embeds,
-                    controlnet_cond=image,
-                    conditioning_scale=controlnet_conditioning_scale,
-                    cross_attention_kwargs=cross_attention_kwargs,
-                    added_cond_kwargs=added_cond_kwargs,
-                    return_dict=True,
-                    apply_control=apply_control,
-                ).sample
+                if dont_control:
+                    noise_pred = self.unet(
+                        sample=latent_model_input,
+                        timestep=t,
+                        encoder_hidden_states=prompt_embeds,
+                        cross_attention_kwargs=cross_attention_kwargs,
+                        added_cond_kwargs=added_cond_kwargs,
+                        return_dict=True,
+                    ).sample
+                else:
+                    noise_pred = self.controlnet(
+                        base_model=self.unet,
+                        sample=latent_model_input,
+                        timestep=t,
+                        encoder_hidden_states=prompt_embeds,
+                        controlnet_cond=image,
+                        conditioning_scale=controlnet_conditioning_scale,
+                        cross_attention_kwargs=cross_attention_kwargs,
+                        added_cond_kwargs=added_cond_kwargs,
+                        return_dict=True,
+                    ).sample

                # perform guidance
                if do_classifier_free_guidance:
@@ -1050,24 +977,12 @@ class StableDiffusionXLControlNetXSPipeline(
                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]

-                if callback_on_step_end is not None:
-                    callback_kwargs = {}
-                    for k in callback_on_step_end_tensor_inputs:
-                        callback_kwargs[k] = locals()[k]
-                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
-
-                    latents = callback_outputs.pop("latents", latents)
-                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
-                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
-
                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()
-
-        # manually for max memory savings
-        if self.vae.dtype == torch.float16 and self.vae.config.force_upcast:
-            self.upcast_vae()
-            latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)

        if not output_type == "latent":
            # make sure the VAE is in float32 mode, as it overflows in float16
@@ -484,10 +484,6 @@ def main(args):
        project_config=accelerator_project_config,
    )

-    # Disable AMP for MPS.
-    if torch.backends.mps.is_available():
-        accelerator.native_amp = False
-
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -526,10 +526,6 @@ def main(args):
        project_config=accelerator_project_config,
    )

-    # Disable AMP for MPS.
-    if torch.backends.mps.is_available():
-        accelerator.native_amp = False
-
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -516,10 +516,6 @@ def main(args):
        project_config=accelerator_project_config,
    )

-    # Disable AMP for MPS.
-    if torch.backends.mps.is_available():
-        accelerator.native_amp = False
-
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -623,10 +623,6 @@ def main(args):
        project_config=accelerator_project_config,
    )

-    # Disable AMP for MPS.
-    if torch.backends.mps.is_available():
-        accelerator.native_amp = False
-
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -21,7 +21,6 @@ import logging
 import math
 import os
 import shutil
-from contextlib import nullcontext
 from pathlib import Path

 import accelerate
@@ -411,10 +410,6 @@ def main():
        project_config=accelerator_project_config,
    )

-    # Disable AMP for MPS.
-    if torch.backends.mps.is_available():
-        accelerator.native_amp = False
-
    generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)

    if args.report_to == "wandb":
@@ -972,12 +967,9 @@ def main():
                # run inference
                original_image = download_image(args.val_image_url)
                edited_images = []
-                if torch.backends.mps.is_available():
-                    autocast_ctx = nullcontext()
-                else:
-                    autocast_ctx = torch.autocast(accelerator.device.type)
-
-                with autocast_ctx:
+                with torch.autocast(
+                    str(accelerator.device).replace(":0", ""), enabled=accelerator.mixed_precision == "fp16"
+                ):
                    for _ in range(args.num_validation_images):
                        edited_images.append(
                            pipeline(
@@ -378,10 +378,6 @@ def main():
        project_config=accelerator_project_config,
    )

-    # Disable AMP for MPS.
-    if torch.backends.mps.is_available():
-        accelerator.native_amp = False
-
    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)
@@ -411,11 +411,6 @@ def main():
        log_with=args.report_to,
        project_config=accelerator_project_config,
    )
-
-    # Disable AMP for MPS.
-    if torch.backends.mps.is_available():
-        accelerator.native_amp = False
-
    if args.report_to == "wandb":
        if not is_wandb_available():
            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
@@ -698,10 +698,6 @@ def main(args):
        project_config=accelerator_project_config,
    )

-    # Disable AMP for MPS.
-    if torch.backends.mps.is_available():
-        accelerator.native_amp = False
-
    if args.report_to == "wandb":
        if not is_wandb_available():
            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
@@ -566,10 +566,6 @@ def main():
        project_config=accelerator_project_config,
    )

-    # Disable AMP for MPS.
-    if torch.backends.mps.is_available():
-        accelerator.native_amp = False
-
    if args.report_to == "wandb":
        if not is_wandb_available():
            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
@@ -439,10 +439,6 @@ def main():
        project_config=accelerator_project_config,
    )

-    # Disable AMP for MPS.
-    if torch.backends.mps.is_available():
-        accelerator.native_amp = False
-
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -581,10 +581,6 @@ def main():
        project_config=accelerator_project_config,
    )

-    # Disable AMP for MPS.
-    if torch.backends.mps.is_available():
-        accelerator.native_amp = False
-
    if args.report_to == "wandb":
        if not is_wandb_available():
            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
@@ -295,10 +295,6 @@ def main(args):
        project_config=accelerator_project_config,
    )

-    # Disable AMP for MPS.
-    if torch.backends.mps.is_available():
-        accelerator.native_amp = False
-
    if args.logger == "tensorboard":
        if not is_tensorboard_available():
            raise ImportError("Make sure to install tensorboard if you want to use it for logging during training.")
@@ -789,12 +789,7 @@ class PromptDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (
-            batch_size,
-            num_channels_latents,
-            int(height) // self.vae_scale_factor,
-            int(width) // self.vae_scale_factor,
-        )
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -123,12 +123,7 @@ class RDMPipeline(DiffusionPipeline, StableDiffusionMixin):
        return image_embeddings

    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (
-            batch_size,
-            num_channels_latents,
-            int(height) // self.vae_scale_factor,
-            int(width) // self.vae_scale_factor,
-        )
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -1,15 +0,0 @@
-# Scheduled Pseudo-Huber Loss for Diffusers
-
-These are modifications of the Diffusers scripts that add the possibility of training text2image models with Scheduled Pseudo-Huber loss, introduced in https://arxiv.org/abs/2403.16728 (https://github.com/kabachuha/SPHL-for-stable-diffusion).
-
-## Why might this be useful?
-
- If you suspect that part of the training dataset might be corrupted, and you don't want these outliers to distort the model's intended output.
-
- If you want to improve the aesthetic quality of pictures by helping the model disentangle concepts and be less influenced by other sorts of pictures.
-
-See https://github.com/huggingface/diffusers/issues/7488 for the detailed description.
-
-## Instructions
-
-Usage is the same as for the corresponding vanilla Diffusers scripts: https://github.com/huggingface/diffusers/tree/main/examples
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
YiYi Xu	7eb2d2208e	Merge branch 'main' into fix-test	2024-03-31 22:07:28 -10:00
yiyixu	d97bca56ab	fix	2024-04-01 07:52:45 +00:00