update

2025-02-23 13:21:01 +01:00
83 changed files with 1163 additions and 3094 deletions
@@ -53,9 +53,9 @@ jobs:
          HEADREF: ${{ steps.pr_info.outputs.headRef }}
          PRNUMBER: ${{ steps.pr_info.outputs.prNumber }}
        run: |
-          echo "PR number: $PRNUMBER"
-          echo "Head Ref: $HEADREF"
-          echo "Head Repo Full Name: $HEADREPOFULLNAME"
+          echo "PR number: ${PRNUMBER}"  # shell variables from this step's env keep untrusted PR data out of the script text
+          echo "Head Ref: ${HEADREF}"
+          echo "Head Repo Full Name: ${HEADREPOFULLNAME}"

      - name: Set up Python
        uses: actions/setup-python@v4
@@ -89,20 +89,20 @@ jobs:
          PRNUMBER: ${{ steps.pr_info.outputs.prNumber }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
-          echo "HEADREPOFULLNAME: $HEADREPOFULLNAME, HEADREF: $HEADREF"
+          echo "HEADREPOFULLNAME: ${HEADREPOFULLNAME}, HEADREF: ${HEADREF}"  # expanded by the shell at run time, not templated into the script
          # Configure git with the Actions bot user
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"

          # Make sure your 'origin' remote is set to the contributor's fork
-          git remote set-url origin "https://x-access-token:${GITHUB_TOKEN}@github.com/$HEADREPOFULLNAME.git"
+          git remote set-url origin "https://x-access-token:${GITHUB_TOKEN}@github.com/${HEADREPOFULLNAME}.git"  # fork name read from env, never interpolated inline

          # If there are changes after running style/quality, commit them
          if [ -n "$(git status --porcelain)" ]; then
            git add .
            git commit -m "Apply style fixes"
            # Push to the original contributor's forked branch
-            git push origin HEAD:$HEADREF
+            git push origin "HEAD:${HEADREF}"  # quoted shell variable: the branch name is contributor-controlled
            echo "changes_pushed=true" >> $GITHUB_OUTPUT
          else
            echo "No changes to commit."
@@ -1,243 +0,0 @@
-name: Fast GPU Tests on PR 
-
-on:
-  pull_request:
-    branches: main
-    paths:
-      - "src/diffusers/models/modeling_utils.py"
-      - "src/diffusers/models/model_loading_utils.py"
-      - "src/diffusers/pipelines/pipeline_utils.py"
-      - "src/diffusers/pipeline_loading_utils.py"
-      - "src/diffusers/loaders/lora_base.py"
-      - "src/diffusers/loaders/lora_pipeline.py"
-      - "src/diffusers/loaders/peft.py"
-      - "tests/pipelines/test_pipelines_common.py"
-      - "tests/models/test_modeling_common.py"
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  DIFFUSERS_IS_CI: yes
-  OMP_NUM_THREADS: 8
-  MKL_NUM_THREADS: 8
-  HF_HUB_ENABLE_HF_TRANSFER: 1
-  PYTEST_TIMEOUT: 600
-  PIPELINE_USAGE_CUTOFF: 1000000000 # set high cutoff so that only always-test pipelines run
-
-jobs:
-  setup_torch_cuda_pipeline_matrix:
-    name: Setup Torch Pipelines CUDA Slow Tests Matrix
-    runs-on:
-      group: aws-general-8-plus
-    container:
-      image: diffusers/diffusers-pytorch-cpu
-    outputs:
-      pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
-    steps:
-      - name: Checkout diffusers
-        uses: actions/checkout@v3
-        with:
-          fetch-depth: 2
-      - name: Install dependencies
-        run: |
-          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-          python -m uv pip install -e [quality,test]
-      - name: Environment
-        run: |
-          python utils/print_env.py
-      - name: Fetch Pipeline Matrix
-        id: fetch_pipeline_matrix
-        run: |
-          matrix=$(python utils/fetch_torch_cuda_pipeline_test_matrix.py)
-          echo $matrix
-          echo "pipeline_test_matrix=$matrix" >> $GITHUB_OUTPUT
-      - name: Pipeline Tests Artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: test-pipelines.json
-          path: reports
-
-  torch_pipelines_cuda_tests:
-    name: Torch Pipelines CUDA Tests
-    needs: setup_torch_cuda_pipeline_matrix
-    strategy:
-      fail-fast: false
-      max-parallel: 8
-      matrix:
-        module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
-    runs-on:
-      group: aws-g4dn-2xlarge
-    container:
-      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host --gpus 0
-    steps:
-      - name: Checkout diffusers
-        uses: actions/checkout@v3
-        with:
-          fetch-depth: 2
-
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-      - name: Install dependencies
-        run: |
-          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-          python -m uv pip install -e [quality,test]
-          pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
-          pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps
-
-      - name: Environment
-        run: |
-          python utils/print_env.py
-      - name: Extract tests
-        id: extract_tests
-        run: |
-          pattern=$(python utils/extract_tests_from_mixin.py --type pipeline)
-          echo "$pattern" > /tmp/test_pattern.txt
-          echo "pattern_file=/tmp/test_pattern.txt" >> $GITHUB_OUTPUT
-
-      - name: PyTorch CUDA checkpoint tests on Ubuntu
-        env:
-          HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
-          # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
-          CUBLAS_WORKSPACE_CONFIG: :16:8
-        run: |
-          pattern=$(cat ${{ steps.extract_tests.outputs.pattern_file }})
-          python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-            -s -v -k "not Flax and not Onnx and $pattern" \
-            --make-reports=tests_pipeline_${{ matrix.module }}_cuda \
-            tests/pipelines/${{ matrix.module }}
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        run: |
-          cat reports/tests_pipeline_${{ matrix.module }}_cuda_stats.txt
-          cat reports/tests_pipeline_${{ matrix.module }}_cuda_failures_short.txt
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: pipeline_${{ matrix.module }}_test_reports
-          path: reports
-
-  torch_cuda_tests:
-    name: Torch CUDA Tests
-    runs-on:
-      group: aws-g4dn-2xlarge
-    container:
-      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host --gpus 0
-    defaults:
-      run:
-        shell: bash
-    strategy:
-      fail-fast: false
-      max-parallel: 2
-      matrix:
-        module: [models, schedulers, lora, others]
-    steps:
-    - name: Checkout diffusers
-      uses: actions/checkout@v3
-      with:
-        fetch-depth: 2
-
-    - name: Install dependencies
-      run: |
-        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m uv pip install -e [quality,test]
-        python -m uv pip install peft@git+https://github.com/huggingface/peft.git
-        pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
-        pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps
-
-    - name: Environment
-      run: |
-        python utils/print_env.py
-
-    - name: Extract tests
-      id: extract_tests
-      run: |
-        pattern=$(python utils/extract_tests_from_mixin.py --type ${{ matrix.module }})
-        echo "$pattern" > /tmp/test_pattern.txt
-        echo "pattern_file=/tmp/test_pattern.txt" >> $GITHUB_OUTPUT
-
-    - name: Run PyTorch CUDA tests
-      env:
-        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
-        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
-        CUBLAS_WORKSPACE_CONFIG: :16:8
-      run: |
-        pattern=$(cat ${{ steps.extract_tests.outputs.pattern_file }})
-        if [ -z "$pattern" ]; then
-          python -m pytest -n 1 -sv --max-worker-restart=0 --dist=loadfile -k "not Flax and not Onnx" tests/${{ matrix.module }} \
-          --make-reports=tests_torch_cuda_${{ matrix.module }}  
-        else
-          python -m pytest -n 1 -sv --max-worker-restart=0 --dist=loadfile -k "not Flax and not Onnx and $pattern" tests/${{ matrix.module }} \
-          --make-reports=tests_torch_cuda_${{ matrix.module }}  
-        fi
-
-    - name: Failure short reports
-      if: ${{ failure() }}
-      run: |
-        cat reports/tests_torch_cuda_${{ matrix.module }}_stats.txt
-        cat reports/tests_torch_cuda_${{ matrix.module }}_failures_short.txt
-
-    - name: Test suite reports artifacts
-      if: ${{ always() }}
-      uses: actions/upload-artifact@v4
-      with:
-        name: torch_cuda_test_reports_${{ matrix.module }}
-        path: reports
-
-  run_examples_tests:
-    name: Examples PyTorch CUDA tests on Ubuntu
-        pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps
-    runs-on:
-      group: aws-g4dn-2xlarge
-
-    container:
-      image: diffusers/diffusers-pytorch-cuda
-      options: --gpus 0 --shm-size "16gb" --ipc host
-    steps:
-    - name: Checkout diffusers
-      uses: actions/checkout@v3
-      with:
-        fetch-depth: 2
-
-    - name: NVIDIA-SMI
-      run: |
-        nvidia-smi
-    - name: Install dependencies
-      run: |
-        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m uv pip install -e [quality,test,training]
-
-    - name: Environment
-      run: |
-        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python utils/print_env.py
-
-    - name: Run example tests on GPU
-      env:
-        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
-      run: |
-        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m uv pip install timm
-        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
-
-    - name: Failure short reports
-      if: ${{ failure() }}
-      run: |
-        cat reports/examples_torch_cuda_stats.txt
-        cat reports/examples_torch_cuda_failures_short.txt
-
-    - name: Test suite reports artifacts
-      if: ${{ always() }}
-      uses: actions/upload-artifact@v4
-      with:
-        name: examples_test_reports
-        path: reports
-
@@ -1,6 +1,13 @@
 name: Fast GPU Tests on main

 on:
+  pull_request:
+    branches: main
+    paths:  # PR runs are limited to changes in the files listed here
+      - "src/diffusers/models/modeling_utils.py"
+      - "src/diffusers/models/model_loading_utils.py"
+      - "src/diffusers/pipelines/pipeline_utils.py"
+      - "src/diffusers/pipelines/pipeline_loading_utils.py"
  workflow_dispatch:
  push:
    branches:
@@ -160,6 +167,7 @@ jobs:
        path: reports

  flax_tpu_tests:
+    if: ${{ github.event_name != 'pull_request' }}
    name: Flax TPU Tests
    runs-on:
      group: gcp-ct5lp-hightpu-8t
@@ -208,6 +216,7 @@ jobs:
        path: reports

  onnx_cuda_tests:
+    if: ${{ github.event_name != 'pull_request' }}
    name: ONNX CUDA Tests
    runs-on:
      group: aws-g4dn-2xlarge
@@ -256,6 +265,7 @@ jobs:
        path: reports

  run_torch_compile_tests:
+    if: ${{ github.event_name != 'pull_request' }}
    name: PyTorch Compile CUDA tests

    runs-on:
@@ -299,6 +309,7 @@ jobs:
        path: reports

  run_xformers_tests:
+    if: ${{ github.event_name != 'pull_request' }}
    name: PyTorch xformers CUDA tests

    runs-on:
@@ -543,10 +543,6 @@
      title: Overview
    - local: api/schedulers/cm_stochastic_iterative
      title: CMStochasticIterativeScheduler
-    - local: api/schedulers/ddim_cogvideox
-      title: CogVideoXDDIMScheduler
-    - local: api/schedulers/multistep_dpm_solver_cogvideox
-      title: CogVideoXDPMScheduler
    - local: api/schedulers/consistency_decoder
      title: ConsistencyDecoderScheduler
    - local: api/schedulers/cosine_dpm
@@ -359,74 +359,8 @@ image.save('flux_ip_adapter_output.jpg')
    <figcaption class="mt-2 text-sm text-center text-gray-500">IP-Adapter examples with prompt "wearing sunglasses"</figcaption>
 </div>

-## Optimize

-Flux is a very large model and requires ~50GB of RAM/VRAM to load all the modeling components. Enable some of the optimizations below to lower the memory requirements.
-
-### Group offloading
-
-[Group offloading](../../optimization/memory#group-offloading) lowers VRAM usage by offloading groups of internal layers rather than the whole model or weights. You need to use [`~hooks.apply_group_offloading`] on all the model components of a pipeline. The `offload_type` parameter allows you to toggle between block and leaf-level offloading. Setting it to `leaf_level` offloads the lowest leaf-level parameters to the CPU instead of offloading at the module-level.
-
-On CUDA devices that support asynchronous data streaming, set `use_stream=True` to overlap data transfer and computation to accelerate inference.
-
-> [!TIP]
-> It is possible to mix block and leaf-level offloading for different components in a pipeline.
-
-```py
-import torch
-from diffusers import FluxPipeline
-from diffusers.hooks import apply_group_offloading
-
-model_id = "black-forest-labs/FLUX.1-dev"
-dtype = torch.bfloat16
-pipe = FluxPipeline.from_pretrained(
-	model_id,
-	torch_dtype=dtype,
-)
-
-apply_group_offloading(
-    pipe.transformer,
-    offload_type="leaf_level",
-    offload_device=torch.device("cpu"),
-    onload_device=torch.device("cuda"),
-    use_stream=True,
-)
-apply_group_offloading(
-    pipe.text_encoder, 
-    offload_device=torch.device("cpu"),
-    onload_device=torch.device("cuda"),
-    offload_type="leaf_level",
-    use_stream=True,
-)
-apply_group_offloading(
-    pipe.text_encoder_2, 
-    offload_device=torch.device("cpu"),
-    onload_device=torch.device("cuda"),
-    offload_type="leaf_level",
-    use_stream=True,
-)
-apply_group_offloading(
-    pipe.vae, 
-    offload_device=torch.device("cpu"),
-    onload_device=torch.device("cuda"),
-    offload_type="leaf_level",
-    use_stream=True,
-)
-
-prompt="A cat wearing sunglasses and working as a lifeguard at pool."
-
-generator = torch.Generator().manual_seed(181201)
-image = pipe(
-    prompt,
-    width=576,
-    height=1024,
-    num_inference_steps=30,
-    generator=generator
-).images[0]
-image
-```
-
-### Running FP16 inference
+## Running FP16 inference

 Flux can generate high-quality images with FP16 (i.e. to accelerate inference on Turing/Volta GPUs) but produces different outputs compared to FP32/BF16. The issue is that some activations in the text encoders have to be clipped when running in FP16, which affects the overall image. Forcing text encoders to run with FP32 inference thus removes this output difference. See [here](https://github.com/huggingface/diffusers/pull/9097#issuecomment-2272292516) for details.

@@ -455,7 +389,7 @@ out = pipe(
 out.save("image.png")
 ```

-### Quantization
+## Quantization

 Quantization helps reduce the memory requirements of very large models by storing model weights in a lower precision data type. However, quantization may have varying impact on video quality depending on the video model.

@@ -1,6 +1,4 @@
-<!--
-Copyright 2023-2025 Marigold Team, ETH Zürich. All rights reserved.
-Copyright 2024-2025 The HuggingFace Team. All rights reserved.
+<!--Copyright 2024 Marigold authors and The HuggingFace Team. All rights reserved.

 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
@@ -12,120 +10,67 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# Marigold Computer Vision
+# Marigold Pipelines for Computer Vision Tasks

 ![marigold](https://marigoldmonodepth.github.io/images/teaser_collage_compressed.jpg)

-Marigold was proposed in 
-[Repurposing Diffusion-Based Image Generators for Monocular Depth Estimation](https://huggingface.co/papers/2312.02145), 
-a CVPR 2024 Oral paper by 
-[Bingxin Ke](http://www.kebingxin.com/), 
-[Anton Obukhov](https://www.obukhov.ai/), 
-[Shengyu Huang](https://shengyuh.github.io/), 
-[Nando Metzger](https://nandometzger.github.io/), 
-[Rodrigo Caye Daudt](https://rcdaudt.github.io/), and 
-[Konrad Schindler](https://scholar.google.com/citations?user=FZuNgqIAAAAJ&hl=en).
-The core idea is to **repurpose the generative prior of Text-to-Image Latent Diffusion Models (LDMs) for traditional 
-computer vision tasks**.
-This approach was explored by fine-tuning Stable Diffusion for **Monocular Depth Estimation**, as demonstrated in the 
-teaser above.
+Marigold was proposed in [Repurposing Diffusion-Based Image Generators for Monocular Depth Estimation](https://huggingface.co/papers/2312.02145), a CVPR 2024 Oral paper by [Bingxin Ke](http://www.kebingxin.com/), [Anton Obukhov](https://www.obukhov.ai/), [Shengyu Huang](https://shengyuh.github.io/), [Nando Metzger](https://nandometzger.github.io/), [Rodrigo Caye Daudt](https://rcdaudt.github.io/), and [Konrad Schindler](https://scholar.google.com/citations?user=FZuNgqIAAAAJ&hl=en).
+The idea is to repurpose the rich generative prior of Text-to-Image Latent Diffusion Models (LDMs) for traditional computer vision tasks.
+Initially, this idea was explored to fine-tune Stable Diffusion for Monocular Depth Estimation, as shown in the teaser above.
+Later,
+- [Tianfu Wang](https://tianfwang.github.io/) trained the first Latent Consistency Model (LCM) of Marigold, which unlocked fast single-step inference;
+- [Kevin Qu](https://www.linkedin.com/in/kevin-qu-b3417621b/?locale=en_US) extended the approach to Surface Normals Estimation;
+- [Anton Obukhov](https://www.obukhov.ai/) contributed the pipelines and documentation into diffusers (enabled and supported by [YiYi Xu](https://yiyixuxu.github.io/) and [Sayak Paul](https://sayak.dev/)).

-Marigold was later extended in the follow-up paper, 
-[Marigold: Affordable Adaptation of Diffusion-Based Image Generators for Image Analysis](https://huggingface.co/papers/2312.02145), 
-authored by 
-[Bingxin Ke](http://www.kebingxin.com/), 
-[Kevin Qu](https://www.linkedin.com/in/kevin-qu-b3417621b/?locale=en_US), 
-[Tianfu Wang](https://tianfwang.github.io/), 
-[Nando Metzger](https://nandometzger.github.io/), 
-[Shengyu Huang](https://shengyuh.github.io/), 
-[Bo Li](https://www.linkedin.com/in/bobboli0202/), 
-[Anton Obukhov](https://www.obukhov.ai/), and 
-[Konrad Schindler](https://scholar.google.com/citations?user=FZuNgqIAAAAJ&hl=en).
-This work expanded Marigold to support new modalities such as **Surface Normals** and **Intrinsic Image Decomposition** 
-(IID), introduced a training protocol for **Latent Consistency Models** (LCM), and demonstrated **High-Resolution** (HR) 
-processing capability.
+The abstract from the paper is:

-<Tip>
-
-The early Marigold models (`v1-0` and earlier) were optimized for best results with at least 10 inference steps.
-LCM models were later developed to enable high-quality inference in just 1 to 4 steps.
-Marigold models `v1-1` and later use the DDIM scheduler to achieve optimal 
-results in as few as 1 to 4 steps.
-
-</Tip>
+*Monocular depth estimation is a fundamental computer vision task. Recovering 3D depth from a single image is geometrically ill-posed and requires scene understanding, so it is not surprising that the rise of deep learning has led to a breakthrough. The impressive progress of monocular depth estimators has mirrored the growth in model capacity, from relatively modest CNNs to large Transformer architectures. Still, monocular depth estimators tend to struggle when presented with images with unfamiliar content and layout, since their knowledge of the visual world is restricted by the data seen during training, and challenged by zero-shot generalization to new domains. This motivates us to explore whether the extensive priors captured in recent generative diffusion models can enable better, more generalizable depth estimation. We introduce Marigold, a method for affine-invariant monocular depth estimation that is derived from Stable Diffusion and retains its rich prior knowledge. The estimator can be fine-tuned in a couple of days on a single GPU using only synthetic training data. It delivers state-of-the-art performance across a wide range of datasets, including over 20% performance gains in specific cases. Project page: https://marigoldmonodepth.github.io.*

 ## Available Pipelines

-Each pipeline is tailored for a specific computer vision task, processing an input RGB image and generating a 
-corresponding prediction.
-Currently, the following computer vision tasks are implemented:
+Each pipeline supports one Computer Vision task, which takes an RGB image as input and produces a *prediction* of the modality of interest, such as a depth map of the input image.
+Currently, the following tasks are implemented:
+
+| Pipeline                                                                                                                                    | Predicted Modalities                                                                                             |                                                                       Demos                                                                        |
+|---------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------:|
+| [MarigoldDepthPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py)     | [Depth](https://en.wikipedia.org/wiki/Depth_map), [Disparity](https://en.wikipedia.org/wiki/Binocular_disparity) | [Fast Demo (LCM)](https://huggingface.co/spaces/prs-eth/marigold-lcm), [Slow Original Demo (DDIM)](https://huggingface.co/spaces/prs-eth/marigold) |
+| [MarigoldNormalsPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py) | [Surface normals](https://en.wikipedia.org/wiki/Normal_mapping)                                                  |                                   [Fast Demo (LCM)](https://huggingface.co/spaces/prs-eth/marigold-normals-lcm)                                    |

-| Pipeline                                                                                                                                          | Recommended Model Checkpoints                                                                                                                                                                           |                              Spaces (Interactive Apps)                               | Predicted Modalities                                                                                                                                                               |
-|---------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------:|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| [MarigoldDepthPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py)           | [prs-eth/marigold-depth-v1-1](https://huggingface.co/prs-eth/marigold-depth-v1-1)                                                                                                                       |          [Depth Estimation](https://huggingface.co/spaces/prs-eth/marigold)          | [Depth](https://en.wikipedia.org/wiki/Depth_map), [Disparity](https://en.wikipedia.org/wiki/Binocular_disparity)                                                                   |
-| [MarigoldNormalsPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py)       | [prs-eth/marigold-normals-v1-1](https://huggingface.co/prs-eth/marigold-normals-v1-1)                                                                                                                   | [Surface Normals Estimation](https://huggingface.co/spaces/prs-eth/marigold-normals) | [Surface normals](https://en.wikipedia.org/wiki/Normal_mapping)                                                                                                                    |
-| [MarigoldIntrinsicsPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_intrinsics.py) | [prs-eth/marigold-iid-appearance-v1-1](https://huggingface.co/prs-eth/marigold-iid-appearance-v1-1),<br>[prs-eth/marigold-iid-lighting-v1-1](https://huggingface.co/prs-eth/marigold-iid-lighting-v1-1) | [Intrinsic Image Decomposition](https://huggingface.co/spaces/prs-eth/marigold-iid)  | [Albedo](https://en.wikipedia.org/wiki/Albedo), [Materials](https://www.n.aiq3d.com/wiki/roughnessmetalnessao-map), [Lighting](https://en.wikipedia.org/wiki/Diffuse_reflection)   |

 ## Available Checkpoints

-All original checkpoints are available under the [PRS-ETH](https://huggingface.co/prs-eth/) organization on Hugging Face.
-They are designed for use with diffusers pipelines and the [original codebase](https://github.com/prs-eth/marigold), which can also be used to train 
-new model checkpoints.
-The following is a summary of the recommended checkpoints, all of which produce reliable results with 1 to 4 steps. 
-
-| Checkpoint                                                                                          | Modality     | Comment                                                                                                                                                                              |
-|-----------------------------------------------------------------------------------------------------|--------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| [prs-eth/marigold-depth-v1-1](https://huggingface.co/prs-eth/marigold-depth-v1-1)                   | Depth        | Affine-invariant depth prediction assigns each pixel a value between 0 (near plane) and 1 (far plane), with both planes determined by the model during inference.                    |
-| [prs-eth/marigold-normals-v0-1](https://huggingface.co/prs-eth/marigold-normals-v0-1)               | Normals      | The surface normals predictions are unit-length 3D vectors in the screen space camera, with values in the range from -1 to 1.                                                        |
-| [prs-eth/marigold-iid-appearance-v1-1](https://huggingface.co/prs-eth/marigold-iid-appearance-v1-1) | Intrinsics   | InteriorVerse decomposition is comprised of Albedo and two BRDF material properties: Roughness and Metallicity.                                                                      | 
-| [prs-eth/marigold-iid-lighting-v1-1](https://huggingface.co/prs-eth/marigold-iid-lighting-v1-1)     | Intrinsics   | HyperSim decomposition of an image &nbsp\\(I\\)&nbsp is comprised of Albedo &nbsp\\(A\\), Diffuse shading &nbsp\\(S\\), and Non-diffuse residual &nbsp\\(R\\): &nbsp\\(I = A*S+R\\). |
+The original checkpoints can be found under the [PRS-ETH](https://huggingface.co/prs-eth/) Hugging Face organization.

 <Tip>

-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff 
-between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to 
-efficiently load the same components into multiple pipelines. 
-Also, to know more about reducing the memory usage of this pipeline, refer to the ["Reduce memory usage"] section 
-[here](../../using-diffusers/svd#reduce-memory-usage).
+Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. Also, to learn more about reducing the memory usage of this pipeline, refer to the "Reduce memory usage" section [here](../../using-diffusers/svd#reduce-memory-usage).

 </Tip>

 <Tip warning={true}>

-Marigold pipelines were designed and tested with the scheduler embedded in the model checkpoint.
-The optimal number of inference steps varies by scheduler, with no universal value that works best across all cases.
-To accommodate this, the `num_inference_steps` parameter in the pipeline's `__call__` method defaults to `None` (see the 
-API reference).
-Unless set explicitly, it inherits the value from the `default_denoising_steps` field in the checkpoint configuration 
-file (`model_index.json`).
-This ensures high-quality predictions when invoking the pipeline with only the `image` argument.
+Marigold pipelines were designed and tested only with `DDIMScheduler` and `LCMScheduler`.
+Depending on the scheduler, the number of inference steps required to get reliable predictions varies, and there is no universal value that works best across schedulers.
+Because of that, the default value of `num_inference_steps` in the `__call__` method of the pipeline is set to `None` (see the API reference).
+Unless set explicitly, its value will be taken from the checkpoint configuration `model_index.json`.
+This is done to ensure high-quality predictions when calling the pipeline with just the `image` argument.

 </Tip>

-See also Marigold [usage examples](../../using-diffusers/marigold_usage).
-
-## Marigold Depth Prediction API
+See also Marigold [usage examples](../../using-diffusers/marigold_usage).

+## MarigoldDepthPipeline
 [[autodoc]] MarigoldDepthPipeline
+	- all
 	- __call__

+## MarigoldNormalsPipeline
+[[autodoc]] MarigoldNormalsPipeline
+	- all
+	- __call__
+
+## MarigoldDepthOutput
 [[autodoc]] pipelines.marigold.pipeline_marigold_depth.MarigoldDepthOutput

-[[autodoc]] pipelines.marigold.marigold_image_processing.MarigoldImageProcessor.visualize_depth
-
-## Marigold Normals Estimation API
-[[autodoc]] MarigoldNormalsPipeline
-	- __call__
-
-[[autodoc]] pipelines.marigold.pipeline_marigold_normals.MarigoldNormalsOutput
-
-[[autodoc]] pipelines.marigold.marigold_image_processing.MarigoldImageProcessor.visualize_normals
-
-## Marigold Intrinsic Image Decomposition API
-
-[[autodoc]] MarigoldIntrinsicsPipeline
-	- __call__
-
-[[autodoc]] pipelines.marigold.pipeline_marigold_intrinsics.MarigoldIntrinsicsOutput
-
-[[autodoc]] pipelines.marigold.marigold_image_processing.MarigoldImageProcessor.visualize_intrinsics
+## MarigoldNormalsOutput
+[[autodoc]] pipelines.marigold.pipeline_marigold_normals.MarigoldNormalsOutput
@@ -65,7 +65,7 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
 | [Latte](latte) | text2image |
 | [LEDITS++](ledits_pp) | image editing |
 | [Lumina-T2X](lumina) | text2image |
-| [Marigold](marigold) | depth-estimation, normals-estimation, intrinsic-decomposition |
+| [Marigold](marigold) | depth |
 | [MultiDiffusion](panorama) | text2image |
 | [MusicLDM](musicldm) | text2audio |
 | [PAG](pag) | text2image |
@@ -1,19 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# CogVideoXDDIMScheduler
-
-`CogVideoXDDIMScheduler` is based on [Denoising Diffusion Implicit Models](https://huggingface.co/papers/2010.02502), specifically for CogVideoX models.
-
-## CogVideoXDDIMScheduler
-
-[[autodoc]] CogVideoXDDIMScheduler
@@ -1,19 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# CogVideoXDPMScheduler
-
-`CogVideoXDPMScheduler` is based on [DPM-Solver: A Fast ODE Solver for Diffusion Probabilistic Model Sampling in Around 10 Steps](https://huggingface.co/papers/2206.00927) and [DPM-Solver++: Fast Solver for Guided Sampling of Diffusion Probabilistic Models](https://huggingface.co/papers/2211.01095), specifically for CogVideoX models.
-
-## CogVideoXDPMScheduler
-
-[[autodoc]] CogVideoXDPMScheduler
@@ -1,6 +1,4 @@
-<!--
-Copyright 2023-2025 Marigold Team, ETH Zürich. All rights reserved.
-Copyright 2024-2025 The HuggingFace Team. All rights reserved.
+<!--Copyright 2024 Marigold authors and The HuggingFace Team. All rights reserved.

 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
@@ -12,38 +10,31 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# Marigold Computer Vision
+# Marigold Pipelines for Computer Vision Tasks

-**Marigold** is a diffusion-based [method](https://huggingface.co/papers/2312.02145) and a collection of [pipelines](../api/pipelines/marigold) designed for 
-dense computer vision tasks, including **monocular depth prediction**, **surface normals estimation**, and **intrinsic 
-image decomposition**.
+[Marigold](../api/pipelines/marigold) is a novel diffusion-based dense prediction approach, and a set of pipelines for various computer vision tasks, such as monocular depth estimation.

-This guide will walk you through using Marigold to generate fast and high-quality predictions for images and videos.
+This guide will show you how to use Marigold to obtain fast and high-quality predictions for images and videos.

-Each pipeline is tailored for a specific computer vision task, processing an input RGB image and generating a 
-corresponding prediction.
-Currently, the following computer vision tasks are implemented:
+Each pipeline supports one Computer Vision task, which takes an RGB image as input and produces a *prediction* of the modality of interest, such as a depth map of the input image.
+Currently, the following tasks are implemented:

-| Pipeline                                                                                                                                          | Recommended Model Checkpoints                                                                                                                                                                           |                              Spaces (Interactive Apps)                               | Predicted Modalities                                                                                                                                                               |
-|---------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------:|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| [MarigoldDepthPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py)           | [prs-eth/marigold-depth-v1-1](https://huggingface.co/prs-eth/marigold-depth-v1-1)                                                                                                                       |          [Depth Estimation](https://huggingface.co/spaces/prs-eth/marigold)          | [Depth](https://en.wikipedia.org/wiki/Depth_map), [Disparity](https://en.wikipedia.org/wiki/Binocular_disparity)                                                                   |
-| [MarigoldNormalsPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py)       | [prs-eth/marigold-normals-v1-1](https://huggingface.co/prs-eth/marigold-normals-v1-1)                                                                                                                   | [Surface Normals Estimation](https://huggingface.co/spaces/prs-eth/marigold-normals) | [Surface normals](https://en.wikipedia.org/wiki/Normal_mapping)                                                                                                                    |
-| [MarigoldIntrinsicsPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_intrinsics.py) | [prs-eth/marigold-iid-appearance-v1-1](https://huggingface.co/prs-eth/marigold-iid-appearance-v1-1),<br>[prs-eth/marigold-iid-lighting-v1-1](https://huggingface.co/prs-eth/marigold-iid-lighting-v1-1) | [Intrinsic Image Decomposition](https://huggingface.co/spaces/prs-eth/marigold-iid)  | [Albedo](https://en.wikipedia.org/wiki/Albedo), [Materials](https://www.n.aiq3d.com/wiki/roughnessmetalnessao-map), [Lighting](https://en.wikipedia.org/wiki/Diffuse_reflection)   |
+| Pipeline                                                                                                                                    | Predicted Modalities                                                                                             |                                                                       Demos                                                                        |
+|---------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------:|
+| [MarigoldDepthPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py)     | [Depth](https://en.wikipedia.org/wiki/Depth_map), [Disparity](https://en.wikipedia.org/wiki/Binocular_disparity) | [Fast Demo (LCM)](https://huggingface.co/spaces/prs-eth/marigold-lcm), [Slow Original Demo (DDIM)](https://huggingface.co/spaces/prs-eth/marigold) |
+| [MarigoldNormalsPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py) | [Surface normals](https://en.wikipedia.org/wiki/Normal_mapping)                                                  |                                   [Fast Demo (LCM)](https://huggingface.co/spaces/prs-eth/marigold-normals-lcm)                                    |

-All original checkpoints are available under the [PRS-ETH](https://huggingface.co/prs-eth/) organization on Hugging Face.
-They are designed for use with diffusers pipelines and the [original codebase](https://github.com/prs-eth/marigold), which can also be used to train 
-new model checkpoints. 
-The following is a summary of the recommended checkpoints, all of which produce reliable results with 1 to 4 steps. 
+The original checkpoints can be found under the [PRS-ETH](https://huggingface.co/prs-eth/) Hugging Face organization.
+These checkpoints are meant to work with diffusers pipelines and the [original codebase](https://github.com/prs-eth/marigold).
+The original code can also be used to train new checkpoints.

-| Checkpoint                                                                                          | Modality     | Comment                                                                                                                                                           |
-|-----------------------------------------------------------------------------------------------------|--------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| [prs-eth/marigold-depth-v1-1](https://huggingface.co/prs-eth/marigold-depth-v1-1)                   | Depth        | Affine-invariant depth prediction assigns each pixel a value between 0 (near plane) and 1 (far plane), with both planes determined by the model during inference. |
-| [prs-eth/marigold-normals-v0-1](https://huggingface.co/prs-eth/marigold-normals-v0-1)               | Normals      | The surface normals predictions are unit-length 3D vectors in the screen space camera, with values in the range from -1 to 1.                                     |
-| [prs-eth/marigold-iid-appearance-v1-1](https://huggingface.co/prs-eth/marigold-iid-appearance-v1-1) | Intrinsics   | InteriorVerse decomposition is comprised of Albedo and two BRDF material properties: Roughness and Metallicity.                                                   | 
-| [prs-eth/marigold-iid-lighting-v1-1](https://huggingface.co/prs-eth/marigold-iid-lighting-v1-1)     | Intrinsics   | HyperSim decomposition of an image \\(I\\) is comprised of Albedo \\(A\\), Diffuse shading \\(S\\), and Non-diffuse residual \\(R\\): \\(I = A*S+R\\).            | 
-
-The examples below are mostly given for depth prediction, but they can be universally applied to other supported 
-modalities.
+| Checkpoint                                                                                    | Modality | Comment                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
+|-----------------------------------------------------------------------------------------------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| [prs-eth/marigold-v1-0](https://huggingface.co/prs-eth/marigold-v1-0)                         | Depth    | The first Marigold Depth checkpoint, which predicts *affine-invariant depth* maps. The performance of this checkpoint in benchmarks was studied in the original [paper](https://huggingface.co/papers/2312.02145). Designed to be used with the `DDIMScheduler` at inference, it requires at least 10 steps to get reliable predictions. Affine-invariant depth prediction has a range of values in each pixel between 0 (near plane) and 1 (far plane); both planes are chosen by the model as part of the inference process. See the `MarigoldImageProcessor` reference for visualization utilities. |
+| [prs-eth/marigold-depth-lcm-v1-0](https://huggingface.co/prs-eth/marigold-depth-lcm-v1-0)     | Depth    | The fast Marigold Depth checkpoint, fine-tuned from `prs-eth/marigold-v1-0`. Designed to be used with the `LCMScheduler` at inference, it requires as little as 1 step to get reliable predictions. The prediction reliability saturates at 4 steps and declines after that.                                                                                                                                                                                                                                                                                                                           |
+| [prs-eth/marigold-normals-v0-1](https://huggingface.co/prs-eth/marigold-normals-v0-1)         | Normals  | A preview checkpoint for the Marigold Normals pipeline. Designed to be used with the `DDIMScheduler` at inference, it requires at least 10 steps to get reliable predictions. The surface normals predictions are unit-length 3D vectors with values in the range from -1 to 1. *This checkpoint will be phased out after the release of `v1-0` version.*                                                                                                                                                                                                                                              |
+| [prs-eth/marigold-normals-lcm-v0-1](https://huggingface.co/prs-eth/marigold-normals-lcm-v0-1) | Normals  | The fast Marigold Normals checkpoint, fine-tuned from `prs-eth/marigold-normals-v0-1`. Designed to be used with the `LCMScheduler` at inference, it requires as little as 1 step to get reliable predictions. The prediction reliability saturates at 4 steps and declines after that. *This checkpoint will be phased out after the release of `v1-0` version.*                                                                                                                                                                                                                                       |
+The examples below are mostly given for depth prediction, but they can be universally applied to other supported modalities.
 We showcase the predictions using the same input image of Albert Einstein generated by Midjourney.
 This makes it easier to compare visualizations of the predictions across various modalities and checkpoints.

@@ -56,21 +47,19 @@ This makes it easier to compare visualizations of the predictions across various
  </div>
 </div>

-## Depth Prediction
+### Depth Prediction Quick Start

-To get a depth prediction, load the `prs-eth/marigold-depth-v1-1` checkpoint into [`MarigoldDepthPipeline`], 
-put the image through the pipeline, and save the predictions:
+To get the first depth prediction, load the `prs-eth/marigold-depth-lcm-v1-0` checkpoint into the `MarigoldDepthPipeline` pipeline, put the image through the pipeline, and save the predictions:

 ```python
 import diffusers
 import torch

 pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
-    "prs-eth/marigold-depth-v1-1", variant="fp16", torch_dtype=torch.float16
+    "prs-eth/marigold-depth-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
 ).to("cuda")

 image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
-
 depth = pipe(image)

 vis = pipe.image_processor.visualize_depth(depth.prediction)
@@ -80,13 +69,10 @@ depth_16bit = pipe.image_processor.export_depth_to_16bit_png(depth.prediction)
 depth_16bit[0].save("einstein_depth_16bit.png")
 ```

-The [`~pipelines.marigold.marigold_image_processing.MarigoldImageProcessor.visualize_depth`] function applies one of 
-[matplotlib's colormaps](https://matplotlib.org/stable/users/explain/colors/colormaps.html) (`Spectral` by default) to map the predicted pixel values from a single-channel `[0, 1]` 
-depth range into an RGB image.
-With the `Spectral` colormap, pixels with near depth are painted red, and far pixels are blue.
+The visualization function for depth [`~pipelines.marigold.marigold_image_processing.MarigoldImageProcessor.visualize_depth`] applies one of [matplotlib's colormaps](https://matplotlib.org/stable/users/explain/colors/colormaps.html) (`Spectral` by default) to map the predicted pixel values from a single-channel `[0, 1]` depth range into an RGB image.
+With the `Spectral` colormap, pixels with near depth are painted red, and far pixels are painted blue.
 The 16-bit PNG file stores the single channel values mapped linearly from the `[0, 1]` range into `[0, 65535]`.
-Below are the raw and the visualized predictions. The darker and closer areas (mustache) are easier to distinguish in 
-the visualization.
+Below are the raw and the visualized predictions; as can be seen, dark areas (mustache) are easier to distinguish in the visualization:

 <div class="flex gap-4">
  <div style="flex: 1 1 50%; max-width: 50%;">
@@ -103,33 +89,28 @@ the visualization.
  </div>
 </div>

-## Surface Normals Estimation
+### Surface Normals Prediction Quick Start

-Load the `prs-eth/marigold-normals-v1-1` checkpoint into [`MarigoldNormalsPipeline`], put the image through the 
-pipeline, and save the predictions:
+Load the `prs-eth/marigold-normals-lcm-v0-1` checkpoint into the `MarigoldNormalsPipeline` pipeline, put the image through the pipeline, and save the predictions:

 ```python
 import diffusers
 import torch

 pipe = diffusers.MarigoldNormalsPipeline.from_pretrained(
-    "prs-eth/marigold-normals-v1-1", variant="fp16", torch_dtype=torch.float16
+    "prs-eth/marigold-normals-lcm-v0-1", variant="fp16", torch_dtype=torch.float16
 ).to("cuda")

 image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
-
 normals = pipe(image)

 vis = pipe.image_processor.visualize_normals(normals.prediction)
 vis[0].save("einstein_normals.png")
 ```

-The [`~pipelines.marigold.marigold_image_processing.MarigoldImageProcessor.visualize_normals`] maps the three-dimensional 
-prediction with pixel values in the range `[-1, 1]` into an RGB image.
-The visualization function supports flipping surface normals axes to make the visualization compatible with other 
-choices of the frame of reference.
-Conceptually, each pixel is painted according to the surface normal vector in the frame of reference, where `X` axis 
-points right, `Y` axis points up, and `Z` axis points at the viewer.
+The visualization function for normals [`~pipelines.marigold.marigold_image_processing.MarigoldImageProcessor.visualize_normals`] maps the three-dimensional prediction with pixel values in the range `[-1, 1]` into an RGB image.
+The visualization function supports flipping surface normals axes to make the visualization compatible with other choices of the frame of reference.
+Conceptually, each pixel is painted according to the surface normal vector in the frame of reference, where `X` axis points right, `Y` axis points up, and `Z` axis points at the viewer.
 Below is the visualized prediction:

 <div class="flex gap-4" style="justify-content: center; width: 100%;">
@@ -141,121 +122,25 @@ Below is the visualized prediction:
  </div>
 </div>

-In this example, the nose tip almost certainly has a point on the surface, in which the surface normal vector points 
-straight at the viewer, meaning that its coordinates are `[0, 0, 1]`.
+In this example, the nose tip almost certainly has a point on the surface, in which the surface normal vector points straight at the viewer, meaning that its coordinates are `[0, 0, 1]`.
 This vector maps to the RGB `[128, 128, 255]`, which corresponds to the violet-blue color.
-Similarly, a surface normal on the cheek in the right part of the image has a large `X` component, which increases the 
-red hue.
+Similarly, a surface normal on the cheek in the right part of the image has a large `X` component, which increases the red hue.
 Points on the shoulders pointing up with a large `Y` promote green color.

-## Intrinsic Image Decomposition
+### Speeding up inference

-Marigold provides two models for Intrinsic Image Decomposition (IID): "Appearance" and "Lighting". 
-Each model produces Albedo maps, derived from InteriorVerse and Hypersim annotations, respectively.
-
- The "Appearance" model also estimates Material properties: Roughness and Metallicity.
- The "Lighting" model generates Diffuse Shading and Non-diffuse Residual.
-
-Here is the sample code saving predictions made by the "Appearance" model:
-
-```python
-import diffusers
-import torch
-
-pipe = diffusers.MarigoldIntrinsicsPipeline.from_pretrained(
-    "prs-eth/marigold-iid-appearance-v1-1", variant="fp16", torch_dtype=torch.float16
-).to("cuda")
-
-image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
-
-intrinsics = pipe(image)
-
-vis = pipe.image_processor.visualize_intrinsics(intrinsics.prediction, pipe.target_properties)
-vis[0]["albedo"].save("einstein_albedo.png")
-vis[0]["roughness"].save("einstein_roughness.png")
-vis[0]["metallicity"].save("einstein_metallicity.png")
-```
-
-Another example demonstrating the predictions made by the "Lighting" model:
-
-```python
-import diffusers
-import torch
-
-pipe = diffusers.MarigoldIntrinsicsPipeline.from_pretrained(
-    "prs-eth/marigold-iid-lighting-v1-1", variant="fp16", torch_dtype=torch.float16
-).to("cuda")
-
-image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
-
-intrinsics = pipe(image)
-
-vis = pipe.image_processor.visualize_intrinsics(intrinsics.prediction, pipe.target_properties)
-vis[0]["albedo"].save("einstein_albedo.png")
-vis[0]["shading"].save("einstein_shading.png")
-vis[0]["residual"].save("einstein_residual.png")
-```
-
-Both models share the same pipeline while supporting different decomposition types.
-The exact decomposition parameterization (e.g., sRGB vs. linear space) is stored in the 
-`pipe.target_properties` dictionary, which is passed into the 
-[`~pipelines.marigold.marigold_image_processing.MarigoldImageProcessor.visualize_intrinsics`] function.
-
-Below are some examples showcasing the predicted decomposition outputs. 
-All modalities can be inspected in the 
-[Intrinsic Image Decomposition](https://huggingface.co/spaces/prs-eth/marigold-iid) Space.
-
-<div class="flex gap-4">
-  <div style="flex: 1 1 50%; max-width: 50%;">
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/8c7986eaaab5eb9604eb88336311f46a7b0ff5ab/marigold/marigold_einstein_albedo.png"/>
-    <figcaption class="mt-1 text-center text-sm text-gray-500">
-      Predicted albedo ("Appearance" model)
-    </figcaption>
-  </div>
-  <div style="flex: 1 1 50%; max-width: 50%;">
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/8c7986eaaab5eb9604eb88336311f46a7b0ff5ab/marigold/marigold_einstein_diffuse.png"/>
-    <figcaption class="mt-1 text-center text-sm text-gray-500">
-      Predicted diffuse shading ("Lighting" model)
-    </figcaption>
-  </div>
-</div>
-
-## Speeding up inference
-
-The above quick start snippets are already optimized for quality and speed, loading the checkpoint, utilizing the 
-`fp16` variant of weights and computation, and performing the default number (4) of denoising diffusion steps.
-The first step to accelerate inference, at the expense of prediction quality, is to reduce the denoising diffusion 
-steps to the minimum:
+The above quick start snippets are already optimized for speed: they load the LCM checkpoint, use the `fp16` variant of weights and computation, and perform just one denoising diffusion step.
+The `pipe(image)` call completes in 280ms on an RTX 3090 GPU.
+Internally, the input image is encoded with the Stable Diffusion VAE encoder, then the U-Net performs one denoising step, and finally, the prediction latent is decoded with the VAE decoder into pixel space.
+In this case, two out of three module calls are dedicated to converting between pixel and latent space of LDM.
+Because Marigold's latent space is compatible with the base Stable Diffusion, it is possible to speed up the pipeline call by more than 3x (85ms on RTX 3090) by using a [lightweight replacement of the SD VAE](../api/models/autoencoder_tiny):

 ```diff
  import diffusers
  import torch

  pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
-      "prs-eth/marigold-depth-v1-1", variant="fp16", torch_dtype=torch.float16
-  ).to("cuda")
-
-  image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
-  
- depth = pipe(image)
-+ depth = pipe(image, num_inference_steps=1)
-```
-
-With this change, the `pipe` call completes in 280ms on RTX 3090 GPU.
-Internally, the input image is first encoded using the Stable Diffusion VAE encoder, followed by a single denoising 
-step performed by the U-Net. 
-Finally, the prediction latent is decoded with the VAE decoder into pixel space.
-In this setup, two out of three module calls are dedicated to converting between the pixel and latent spaces of the LDM.
-Since Marigold's latent space is compatible with Stable Diffusion 2.0, inference can be accelerated by more than 3x, 
-reducing the call time to 85ms on an RTX 3090, by using a [lightweight replacement of the SD VAE](../api/models/autoencoder_tiny). 
-Note that using a lightweight VAE may slightly reduce the visual quality of the predictions.
-
-```diff
-  import diffusers
-  import torch
-
-  pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
-      "prs-eth/marigold-depth-v1-1", variant="fp16", torch_dtype=torch.float16
+      "prs-eth/marigold-depth-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
  ).to("cuda")

 + pipe.vae = diffusers.AutoencoderTiny.from_pretrained(
@@ -263,77 +148,78 @@ Note that using a lightweight VAE may slightly reduce the visual quality of the
 + ).cuda()

  image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
-
-  depth = pipe(image, num_inference_steps=1)
+  depth = pipe(image)
 ```

-So far, we have optimized the number of diffusion steps and model components. Self-attention operations account for a 
-significant portion of computations. 
-Speeding them up can be achieved by using a more efficient attention processor:
+As suggested in [Optimizations](../optimization/torch2.0#torch.compile), adding `torch.compile` may squeeze extra performance depending on the target hardware:

 ```diff
  import diffusers
  import torch
-+ from diffusers.models.attention_processor import AttnProcessor2_0

  pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
-      "prs-eth/marigold-depth-v1-1", variant="fp16", torch_dtype=torch.float16
+      "prs-eth/marigold-depth-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
  ).to("cuda")

-+ pipe.vae.set_attn_processor(AttnProcessor2_0()) 
-+ pipe.unet.set_attn_processor(AttnProcessor2_0())
-
-  image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
-
-  depth = pipe(image, num_inference_steps=1)
-```
-
-Finally, as suggested in [Optimizations](../optimization/torch2.0#torch.compile), enabling `torch.compile` can further enhance performance depending on 
-the target hardware.
-However, compilation incurs a significant overhead during the first pipeline invocation, making it beneficial only when 
-the same pipeline instance is called repeatedly, such as within a loop.
-
-```diff
-  import diffusers
-  import torch
-  from diffusers.models.attention_processor import AttnProcessor2_0
-
-  pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
-      "prs-eth/marigold-depth-v1-1", variant="fp16", torch_dtype=torch.float16
-  ).to("cuda")
-
-  pipe.vae.set_attn_processor(AttnProcessor2_0()) 
-  pipe.unet.set_attn_processor(AttnProcessor2_0())
-
-+ pipe.vae = torch.compile(pipe.vae, mode="reduce-overhead", fullgraph=True)
 + pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)

  image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
-
-  depth = pipe(image, num_inference_steps=1)
+  depth = pipe(image)
 ```

+## Qualitative Comparison with Depth Anything
+
+With the above speed optimizations, Marigold delivers predictions with more details and faster than [Depth Anything](https://huggingface.co/docs/transformers/main/en/model_doc/depth_anything) with the largest checkpoint [LiheYoung/depth-anything-large-hf](https://huggingface.co/LiheYoung/depth-anything-large-hf):
+
+<div class="flex gap-4">
+  <div style="flex: 1 1 50%; max-width: 50%;">
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/marigold/marigold_einstein_lcm_depth.png"/>
+    <figcaption class="mt-1 text-center text-sm text-gray-500">
+      Marigold LCM fp16 with Tiny AutoEncoder
+    </figcaption>
+  </div>
+  <div style="flex: 1 1 50%; max-width: 50%;">
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/marigold/einstein_depthanything_large.png"/>
+    <figcaption class="mt-1 text-center text-sm text-gray-500">
+      Depth Anything Large
+    </figcaption>
+  </div>
+</div>
+
 ## Maximizing Precision and Ensembling

 Marigold pipelines have a built-in ensembling mechanism combining multiple predictions from different random latents.
 This is a brute-force way of improving the precision of predictions, capitalizing on the generative nature of diffusion.
-The ensembling path is activated automatically when the `ensemble_size` argument is set greater or equal than `3`.
+The ensembling path is activated automatically when the `ensemble_size` argument is set greater than `1`.
 When aiming for maximum precision, it makes sense to adjust `num_inference_steps` simultaneously with `ensemble_size`.
 The recommended values vary across checkpoints but primarily depend on the scheduler type.
 The effect of ensembling is particularly well-seen with surface normals:

-```diff
-  import diffusers
+```python
+import diffusers

-  pipe = diffusers.MarigoldNormalsPipeline.from_pretrained("prs-eth/marigold-normals-v1-1").to("cuda")
+model_path = "prs-eth/marigold-normals-v1-0"

-  image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
+model_paper_kwargs = {
+	diffusers.schedulers.DDIMScheduler: {
+		"num_inference_steps": 10,
+		"ensemble_size": 10,
+	},
+	diffusers.schedulers.LCMScheduler: {
+		"num_inference_steps": 4,
+		"ensemble_size": 5,
+	},
+}

- depth = pipe(image)
-+ depth = pipe(image, num_inference_steps=10, ensemble_size=5)
+image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")

-  vis = pipe.image_processor.visualize_normals(depth.prediction)
-  vis[0].save("einstein_normals.png")
+pipe = diffusers.MarigoldNormalsPipeline.from_pretrained(model_path).to("cuda")
+pipe_kwargs = model_paper_kwargs[type(pipe.scheduler)]
+
+depth = pipe(image, **pipe_kwargs)
+
+vis = pipe.image_processor.visualize_normals(depth.prediction)
+vis[0].save("einstein_normals.png")
 ```

 <div class="flex gap-4">
@@ -351,16 +237,93 @@ The effect of ensembling is particularly well-seen with surface normals:
  </div>
 </div>

-As can be seen, all areas with fine-grained structurers, such as hair, got more conservative and on average more 
-correct predictions.
+As can be seen, all areas with fine-grained structures, such as hair, got more conservative and on average more correct predictions.
 Such a result is more suitable for precision-sensitive downstream tasks, such as 3D reconstruction.

+## Quantitative Evaluation
+
+To evaluate Marigold quantitatively in standard leaderboards and benchmarks (such as NYU, KITTI, and other datasets), follow the evaluation protocol outlined in the paper: load the full precision fp32 model and use appropriate values for `num_inference_steps` and `ensemble_size`.
+Optionally seed randomness to ensure reproducibility. Maximizing `batch_size` will deliver maximum device utilization.
+
+```python
+import diffusers
+import torch
+
+device = "cuda"
+seed = 2024
+model_path = "prs-eth/marigold-v1-0"
+
+model_paper_kwargs = {
+	diffusers.schedulers.DDIMScheduler: {
+		"num_inference_steps": 50,
+		"ensemble_size": 10,
+	},
+	diffusers.schedulers.LCMScheduler: {
+		"num_inference_steps": 4,
+		"ensemble_size": 10,
+	},
+}
+
+image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
+
+generator = torch.Generator(device=device).manual_seed(seed)
+pipe = diffusers.MarigoldDepthPipeline.from_pretrained(model_path).to(device)
+pipe_kwargs = model_paper_kwargs[type(pipe.scheduler)]
+
+depth = pipe(image, generator=generator, **pipe_kwargs)
+
+# evaluate metrics
+```
+
+## Using Predictive Uncertainty
+
+The ensembling mechanism built into Marigold pipelines combines multiple predictions obtained from different random latents.
+As a side effect, it can be used to quantify epistemic (model) uncertainty; simply specify `ensemble_size` greater than 1 and set `output_uncertainty=True`.
+The resulting uncertainty will be available in the `uncertainty` field of the output.
+It can be visualized as follows:
+
+```python
+import diffusers
+import torch
+
+pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
+    "prs-eth/marigold-depth-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
+).to("cuda")
+
+image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
+depth = pipe(
+	image,
+	ensemble_size=10,  # any number greater than 1; higher values yield higher precision
+	output_uncertainty=True,
+)
+
+uncertainty = pipe.image_processor.visualize_uncertainty(depth.uncertainty)
+uncertainty[0].save("einstein_depth_uncertainty.png")
+```
+
+<div class="flex gap-4">
+  <div style="flex: 1 1 50%; max-width: 50%;">
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/marigold/marigold_einstein_depth_uncertainty.png"/>
+    <figcaption class="mt-1 text-center text-sm text-gray-500">
+      Depth uncertainty
+    </figcaption>
+  </div>
+  <div style="flex: 1 1 50%; max-width: 50%;">
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/marigold/marigold_einstein_normals_uncertainty.png"/>
+    <figcaption class="mt-1 text-center text-sm text-gray-500">
+      Surface normals uncertainty
+    </figcaption>
+  </div>
+</div>
+
+The interpretation of uncertainty is easy: higher values (white) correspond to pixels where the model struggles to make consistent predictions.
+Evidently, the depth model is the least confident around edges with discontinuity, where the object depth changes drastically.
+The surface normals model is the least confident in fine-grained structures, such as hair, and dark areas, such as the collar.
+
 ## Frame-by-frame Video Processing with Temporal Consistency

-Due to Marigold's generative nature, each prediction is unique and defined by the random noise sampled for the latent 
-initialization.
-This becomes an obvious drawback compared to traditional end-to-end dense regression networks, as exemplified in the 
-following videos:
+Due to Marigold's generative nature, each prediction is unique and defined by the random noise sampled for the latent initialization.
+This becomes an obvious drawback compared to traditional end-to-end dense regression networks, as exemplified in the following videos:

 <div class="flex gap-4">
  <div style="flex: 1 1 50%; max-width: 50%;">
@@ -373,32 +336,26 @@ following videos:
  </div>
 </div>

-To address this issue, it is possible to pass `latents` argument to the pipelines, which defines the starting point of 
-diffusion.
-Empirically, we found that a convex combination of the very same starting point noise latent and the latent 
-corresponding to the previous frame prediction give sufficiently smooth results, as implemented in the snippet below:
+To address this issue, it is possible to pass the `latents` argument to the pipelines, which defines the starting point of diffusion.
+Empirically, we found that a convex combination of the very same starting point noise latent and the latent corresponding to the previous frame prediction gives sufficiently smooth results, as implemented in the snippet below:

 ```python
 import imageio
-import diffusers
-import torch
-from diffusers.models.attention_processor import AttnProcessor2_0
 from PIL import Image
 from tqdm import tqdm
+import diffusers
+import torch

 device = "cuda"
-path_in = "https://huggingface.co/spaces/prs-eth/marigold-lcm/resolve/c7adb5427947d2680944f898cd91d386bf0d4924/files/video/obama.mp4"
+path_in = "obama.mp4"
 path_out = "obama_depth.gif"

 pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
-    "prs-eth/marigold-depth-v1-1", variant="fp16", torch_dtype=torch.float16
+    "prs-eth/marigold-depth-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
 ).to(device)
 pipe.vae = diffusers.AutoencoderTiny.from_pretrained(
    "madebyollin/taesd", torch_dtype=torch.float16
 ).to(device)
-pipe.unet.set_attn_processor(AttnProcessor2_0())
-pipe.vae = torch.compile(pipe.vae, mode="reduce-overhead", fullgraph=True)
-pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
 pipe.set_progress_bar_config(disable=True)

 with imageio.get_reader(path_in) as reader:
@@ -416,11 +373,7 @@ with imageio.get_reader(path_in) as reader:
            latents = 0.9 * latents + 0.1 * last_frame_latent

        depth = pipe(
-            frame,
-            num_inference_steps=1,
-            match_input_resolution=False, 
-            latents=latents, 
-            output_latent=True,
+			frame, match_input_resolution=False, latents=latents, output_latent=True
        )
        last_frame_latent = depth.latent
        out.append(pipe.image_processor.visualize_depth(depth.prediction)[0])
@@ -429,8 +382,7 @@ with imageio.get_reader(path_in) as reader:
 ```

 Here, the diffusion process starts from the given computed latent.
-The pipeline sets `output_latent=True` to access `out.latent` and computes its contribution to the next frame's latent 
-initialization.
+The pipeline sets `output_latent=True` to access `depth.latent` and computes its contribution to the next frame's latent initialization.
 The result is much more stable now:

 <div class="flex gap-4">
@@ -462,7 +414,7 @@ image = diffusers.utils.load_image(
 )

 pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
-    "prs-eth/marigold-depth-v1-1", torch_dtype=torch.float16, variant="fp16"
+    "prs-eth/marigold-depth-lcm-v1-0", torch_dtype=torch.float16, variant="fp16"
 ).to(device)

 depth_image = pipe(image, generator=generator).prediction
@@ -511,95 +463,4 @@ controlnet_out[0].save("motorcycle_controlnet_out.png")
  </div>
 </div>

-## Quantitative Evaluation
-
-To evaluate Marigold quantitatively in standard leaderboards and benchmarks (such as NYU, KITTI, and other datasets), 
-follow the evaluation protocol outlined in the paper: load the full precision fp32 model and use appropriate values 
-for `num_inference_steps` and `ensemble_size`.
-Optionally seed randomness to ensure reproducibility. 
-Maximizing `batch_size` will deliver maximum device utilization.
-
-```python
-import diffusers
-import torch
-
-device = "cuda"
-seed = 2024
-
-generator = torch.Generator(device=device).manual_seed(seed)
-pipe = diffusers.MarigoldDepthPipeline.from_pretrained("prs-eth/marigold-depth-v1-1").to(device)
-
-image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
-
-depth = pipe(
-    image, 
-    num_inference_steps=4,  # set according to the evaluation protocol from the paper
-    ensemble_size=10,       # set according to the evaluation protocol from the paper
-    generator=generator,
-)
-
-# evaluate metrics
-```
-
-## Using Predictive Uncertainty
-
-The ensembling mechanism built into Marigold pipelines combines multiple predictions obtained from different random 
-latents.
-As a side effect, it can be used to quantify epistemic (model) uncertainty; simply specify `ensemble_size` greater 
-or equal than 3 and set `output_uncertainty=True`.
-The resulting uncertainty will be available in the `uncertainty` field of the output.
-It can be visualized as follows:
-
-```python
-import diffusers
-import torch
-
-pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
-    "prs-eth/marigold-depth-v1-1", variant="fp16", torch_dtype=torch.float16
-).to("cuda")
-
-image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
-
-depth = pipe(
-	image,
-	ensemble_size=10,  # any number >= 3
-	output_uncertainty=True,
-)
-
-uncertainty = pipe.image_processor.visualize_uncertainty(depth.uncertainty)
-uncertainty[0].save("einstein_depth_uncertainty.png")
-```
-
-<div class="flex gap-4">
-  <div style="flex: 1 1 33%; max-width: 33%;">
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/marigold/marigold_einstein_depth_uncertainty.png"/>
-    <figcaption class="mt-1 text-center text-sm text-gray-500">
-      Depth uncertainty
-    </figcaption>
-  </div>
-  <div style="flex: 1 1 33%; max-width: 33%;">
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/marigold/marigold_einstein_normals_uncertainty.png"/>
-    <figcaption class="mt-1 text-center text-sm text-gray-500">
-      Surface normals uncertainty
-    </figcaption>
-  </div>
-  <div style="flex: 1 1 33%; max-width: 33%;">
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/4f83035d84a24e5ec44fdda129b1d51eba12ce04/marigold/marigold_einstein_albedo_uncertainty.png"/>
-    <figcaption class="mt-1 text-center text-sm text-gray-500">
-      Albedo uncertainty
-    </figcaption>
-  </div>
-</div>
-
-The interpretation of uncertainty is easy: higher values (white) correspond to pixels, where the model struggles to 
-make consistent predictions.
- The depth model exhibits the most uncertainty around discontinuities, where object depth changes abruptly.
- The surface normals model is least confident in fine-grained structures like hair and in dark regions such as the 
-collar area.
- Albedo uncertainty is represented as an RGB image, as it captures uncertainty independently for each color channel, 
-unlike depth and surface normals. It is also higher in shaded regions and at discontinuities.
-
-## Conclusion
-
-We hope Marigold proves valuable for your downstream tasks, whether as part of a broader generative workflow or for 
-perception-based applications like 3D reconstruction.
+Hopefully, you will find Marigold useful for solving your downstream tasks, whether as part of a broader generative workflow or as a perception task, such as 3D reconstruction.
@@ -215,7 +215,7 @@ image

 Prompt weighting provides a way to emphasize or de-emphasize certain parts of a prompt, allowing for more control over the generated image. A prompt can include several concepts, which gets turned into contextualized text embeddings. The embeddings are used by the model to condition its cross-attention layers to generate an image (read the Stable Diffusion [blog post](https://huggingface.co/blog/stable_diffusion) to learn more about how it works).

-Prompt weighting works by increasing or decreasing the scale of the text embedding vector that corresponds to its concept in the prompt because you may not necessarily want the model to focus on all concepts equally. The easiest way to prepare the prompt embeddings is to use [Stable Diffusion Long Prompt Weighted Embedding](https://github.com/xhinker/sd_embed) (sd_embed). Once you have the prompt-weighted embeddings, you can pass them to any pipeline that has a [prompt_embeds](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline.__call__.prompt_embeds) (and optionally [negative_prompt_embeds](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline.__call__.negative_prompt_embeds)) parameter, such as [`StableDiffusionPipeline`], [`StableDiffusionControlNetPipeline`], and [`StableDiffusionXLPipeline`].
+Prompt weighting works by increasing or decreasing the scale of the text embedding vector that corresponds to its concept in the prompt because you may not necessarily want the model to focus on all concepts equally. The easiest way to prepare the prompt-weighted embeddings is to use [Compel](https://github.com/damian0815/compel), a text prompt-weighting and blending library. Once you have the prompt-weighted embeddings, you can pass them to any pipeline that has a [`prompt_embeds`](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline.__call__.prompt_embeds) (and optionally [`negative_prompt_embeds`](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline.__call__.negative_prompt_embeds)) parameter, such as [`StableDiffusionPipeline`], [`StableDiffusionControlNetPipeline`], and [`StableDiffusionXLPipeline`].

 <Tip>

@@ -223,99 +223,136 @@ If your favorite pipeline doesn't have a `prompt_embeds` parameter, please open

 </Tip>

-This guide will show you how to weight your prompts with sd_embed.
+This guide will show you how to weight and blend your prompts with Compel in 🤗 Diffusers.

-Before you begin, make sure you have the latest version of sd_embed installed:
-
-```bash
-pip install git+https://github.com/xhinker/sd_embed.git@main
-```
-
-For this example, let's use [`StableDiffusionXLPipeline`].
+Before you begin, make sure you have the latest version of Compel installed:

 ```py
-from diffusers import StableDiffusionXLPipeline, UniPCMultistepScheduler
+# uncomment to install in Colab
+#!pip install compel --upgrade
+```
+
+For this guide, let's generate an image with the prompt `"a red cat playing with a ball"` using the [`StableDiffusionPipeline`]:
+
+```py
+from diffusers import StableDiffusionPipeline, UniPCMultistepScheduler
 import torch

-pipe = StableDiffusionXLPipeline.from_pretrained("Lykon/dreamshaper-xl-1-0", torch_dtype=torch.float16)
+pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", use_safetensors=True)
 pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
 pipe.to("cuda")
-```

-To upweight or downweight a concept, surround the text with parentheses. More parentheses applies a heavier weight on the text. You can also append a numerical multiplier to the text to indicate how much you want to increase or decrease its weights by.
+prompt = "a red cat playing with a ball"

-| format | multiplier |
-|---|---|
-| `(hippo)` | increase by 1.1x |
-| `((hippo))` | increase by 1.21x |
-| `(hippo:1.5)` | increase by 1.5x |
-| `(hippo:0.5)` | decrease by 4x |
+generator = torch.Generator(device="cpu").manual_seed(33)

-Create a prompt and use a combination of parentheses and numerical multipliers to upweight various text.
-
-```py
-from sd_embed.embedding_funcs import get_weighted_text_embeddings_sdxl
-
-prompt = """A whimsical and creative image depicting a hybrid creature that is a mix of a waffle and a hippopotamus. 
-This imaginative creature features the distinctive, bulky body of a hippo, 
-but with a texture and appearance resembling a golden-brown, crispy waffle. 
-The creature might have elements like waffle squares across its skin and a syrup-like sheen. 
-It's set in a surreal environment that playfully combines a natural water habitat of a hippo with elements of a breakfast table setting, 
-possibly including oversized utensils or plates in the background. 
-The image should evoke a sense of playful absurdity and culinary fantasy.
-"""
-
-neg_prompt = """\
-skin spots,acnes,skin blemishes,age spot,(ugly:1.2),(duplicate:1.2),(morbid:1.21),(mutilated:1.2),\
-(tranny:1.2),mutated hands,(poorly drawn hands:1.5),blurry,(bad anatomy:1.2),(bad proportions:1.3),\
-extra limbs,(disfigured:1.2),(missing arms:1.2),(extra legs:1.2),(fused fingers:1.5),\
-(too many fingers:1.5),(unclear eyes:1.2),lowers,bad hands,missing fingers,extra digit,\
-bad hands,missing fingers,(extra arms and legs),(worst quality:2),(low quality:2),\
-(normal quality:2),lowres,((monochrome)),((grayscale))
-"""
-```
-
-Use the `get_weighted_text_embeddings_sdxl` function to generate the prompt embeddings and the negative prompt embeddings. It'll also generated the pooled and negative pooled prompt embeddings since you're using the SDXL model.
-
-> [!TIP]
-> You can safely ignore the error message below about the token index length exceeding the models maximum sequence length. All your tokens will be used in the embedding process.
->
-> ```
-> Token indices sequence length is longer than the specified maximum sequence length for this model
-> ```
-
-```py
-( 
-  prompt_embeds,
-  prompt_neg_embeds,
-  pooled_prompt_embeds,
-  negative_pooled_prompt_embeds
-) = get_weighted_text_embeddings_sdxl(
-    pipe,
-    prompt=prompt,
-    neg_prompt=neg_prompt
-)
-
-image = pipe(
-    prompt_embeds=prompt_embeds,
-    negative_prompt_embeds=prompt_neg_embeds,
-    pooled_prompt_embeds=pooled_prompt_embeds,
-    negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
-    num_inference_steps=30,
-    height=1024,
-    width=1024 + 512,
-    guidance_scale=4.0,
-    generator=torch.Generator("cuda").manual_seed(2)
-).images[0]
+image = pipe(prompt, generator=generator, num_inference_steps=20).images[0]
 image
 ```

 <div class="flex justify-center">
-  <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sd_embed_sdxl.png"/>
+  <img class="rounded-xl" src="https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/compel/forest_0.png"/>
 </div>

-> [!TIP]
-> Refer to the [sd_embed](https://github.com/xhinker/sd_embed) repository for additional details about long prompt weighting for FLUX.1, Stable Cascade, and Stable Diffusion 1.5.
+### Weighting
+
+You'll notice there is no "ball" in the image! Let's use compel to upweight the concept of "ball" in the prompt. Create a [`Compel`](https://github.com/damian0815/compel/blob/main/doc/compel.md#compel-objects) object, and pass it a tokenizer and text encoder:
+
+```py
+from compel import Compel
+
+compel_proc = Compel(tokenizer=pipe.tokenizer, text_encoder=pipe.text_encoder)
+```
+
+Compel uses `+` or `-` to increase or decrease the weight of a word in the prompt. To increase the weight of "ball":
+
+<Tip>
+
+`+` corresponds to the value `1.1`, `++` corresponds to `1.1^2`, and so on. Similarly, `-` corresponds to `0.9` and `--` corresponds to `0.9^2`. Feel free to experiment with adding more `+` or `-` in your prompt!
+
+</Tip>
+
+```py
+prompt = "a red cat playing with a ball++"
+```
+
+Pass the prompt to `compel_proc` to create the new prompt embeddings which are passed to the pipeline:
+
+```py
+prompt_embeds = compel_proc(prompt)
+generator = torch.manual_seed(33)
+
+image = pipe(prompt_embeds=prompt_embeds, generator=generator, num_inference_steps=20).images[0]
+image
+```
+
+<div class="flex justify-center">
+  <img class="rounded-xl" src="https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/compel/forest_1.png"/>
+</div>
+
+To downweight parts of the prompt, use the `-` suffix:
+
+```py
+prompt = "a red------- cat playing with a ball"
+prompt_embeds = compel_proc(prompt)
+
+generator = torch.manual_seed(33)
+
+image = pipe(prompt_embeds=prompt_embeds, generator=generator, num_inference_steps=20).images[0]
+image
+```
+
+<div class="flex justify-center">
+  <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"/>
+</div>
+
+You can even upweight or downweight multiple concepts in the same prompt:
+
+```py
+prompt = "a red cat++ playing with a ball----"
+prompt_embeds = compel_proc(prompt)
+
+generator = torch.manual_seed(33)
+
+image = pipe(prompt_embeds=prompt_embeds, generator=generator, num_inference_steps=20).images[0]
+image
+```
+
+<div class="flex justify-center">
+  <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-pos-neg.png"/>
+</div>
+
+### Blending
+
+You can also create a weighted *blend* of prompts by adding `.blend()` to a list of prompts and passing it some weights. Your blend may not always produce the result you expect because it breaks some assumptions about how the text encoder functions, so just have fun and experiment with it!
+
+```py
+prompt_embeds = compel_proc('("a red cat playing with a ball", "jungle").blend(0.7, 0.8)')
+generator = torch.Generator(device="cuda").manual_seed(33)
+
+image = pipe(prompt_embeds=prompt_embeds, generator=generator, num_inference_steps=20).images[0]
+image
+```
+
+<div class="flex justify-center">
+  <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-blend.png"/>
+</div>
+
+### Conjunction
+
+A conjunction diffuses each prompt independently and concatenates their results by their weighted sum. Add `.and()` to the end of a list of prompts to create a conjunction:
+
+```py
+prompt_embeds = compel_proc('["a red cat", "playing with a", "ball"].and()')
+generator = torch.Generator(device="cuda").manual_seed(55)
+
+image = pipe(prompt_embeds=prompt_embeds, generator=generator, num_inference_steps=20).images[0]
+image
+```
+
+<div class="flex justify-center">
+  <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-conj.png"/>
+</div>

 ### Textual inversion

@@ -326,63 +363,35 @@ Create a pipeline and use the [`~loaders.TextualInversionLoaderMixin.load_textua
 ```py
 import torch
 from diffusers import StableDiffusionPipeline
+from compel import Compel, DiffusersTextualInversionManager

 pipe = StableDiffusionPipeline.from_pretrained(
-  "stable-diffusion-v1-5/stable-diffusion-v1-5",
-  torch_dtype=torch.float16,
-).to("cuda")
+  "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16,
+  use_safetensors=True, variant="fp16").to("cuda")
 pipe.load_textual_inversion("sd-concepts-library/midjourney-style")
 ```

-Add the `<midjourney-style>` text to the prompt to trigger the textual inversion.
+Compel provides a `DiffusersTextualInversionManager` class to simplify prompt weighting with textual inversion. Instantiate `DiffusersTextualInversionManager` and pass it to the `Compel` class:

 ```py
-from sd_embed.embedding_funcs import get_weighted_text_embeddings_sd15
-
-prompt = """<midjourney-style> A whimsical and creative image depicting a hybrid creature that is a mix of a waffle and a hippopotamus. 
-This imaginative creature features the distinctive, bulky body of a hippo, 
-but with a texture and appearance resembling a golden-brown, crispy waffle. 
-The creature might have elements like waffle squares across its skin and a syrup-like sheen. 
-It's set in a surreal environment that playfully combines a natural water habitat of a hippo with elements of a breakfast table setting, 
-possibly including oversized utensils or plates in the background. 
-The image should evoke a sense of playful absurdity and culinary fantasy.
-"""
-
-neg_prompt = """\
-skin spots,acnes,skin blemishes,age spot,(ugly:1.2),(duplicate:1.2),(morbid:1.21),(mutilated:1.2),\
-(tranny:1.2),mutated hands,(poorly drawn hands:1.5),blurry,(bad anatomy:1.2),(bad proportions:1.3),\
-extra limbs,(disfigured:1.2),(missing arms:1.2),(extra legs:1.2),(fused fingers:1.5),\
-(too many fingers:1.5),(unclear eyes:1.2),lowers,bad hands,missing fingers,extra digit,\
-bad hands,missing fingers,(extra arms and legs),(worst quality:2),(low quality:2),\
-(normal quality:2),lowres,((monochrome)),((grayscale))
-"""
+textual_inversion_manager = DiffusersTextualInversionManager(pipe)
+compel_proc = Compel(
+    tokenizer=pipe.tokenizer,
+    text_encoder=pipe.text_encoder,
+    textual_inversion_manager=textual_inversion_manager)
 ```

-Use the `get_weighted_text_embeddings_sd15` function to generate the prompt embeddings and the negative prompt embeddings.
+Incorporate the concept to condition a prompt using the `<concept>` syntax:

 ```py
-( 
-  prompt_embeds,
-  prompt_neg_embeds,
-) = get_weighted_text_embeddings_sd15(
-    pipe,
-    prompt=prompt,
-    neg_prompt=neg_prompt
-)
+prompt_embeds = compel_proc('("A red cat++ playing with a ball <midjourney-style>")')

-image = pipe(
-    prompt_embeds=prompt_embeds,
-    negative_prompt_embeds=prompt_neg_embeds,
-    height=768,
-    width=896,
-    guidance_scale=4.0,
-    generator=torch.Generator("cuda").manual_seed(2)
-).images[0]
+image = pipe(prompt_embeds=prompt_embeds).images[0]
 image
 ```

 <div class="flex justify-center">
-  <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sd_embed_textual_inversion.png"/>
+  <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-text-inversion.png"/>
 </div>

 ### DreamBooth
@@ -392,44 +401,70 @@ image
 ```py
 import torch
 from diffusers import DiffusionPipeline, UniPCMultistepScheduler
+from compel import Compel

 pipe = DiffusionPipeline.from_pretrained("sd-dreambooth-library/dndcoverart-v1", torch_dtype=torch.float16).to("cuda")
 pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
 ```

-Depending on the model you use, you'll need to incorporate the model's unique identifier into your prompt. For example, the `dndcoverart-v1` model uses the identifier `dndcoverart`:
+Create a `Compel` class with a tokenizer and text encoder, and pass your prompt to it. Depending on the model you use, you'll need to incorporate the model's unique identifier into your prompt. For example, the `dndcoverart-v1` model uses the identifier `dndcoverart`:

 ```py
-from sd_embed.embedding_funcs import get_weighted_text_embeddings_sd15
-
-prompt = """dndcoverart of A whimsical and creative image depicting a hybrid creature that is a mix of a waffle and a hippopotamus. 
-This imaginative creature features the distinctive, bulky body of a hippo, 
-but with a texture and appearance resembling a golden-brown, crispy waffle. 
-The creature might have elements like waffle squares across its skin and a syrup-like sheen. 
-It's set in a surreal environment that playfully combines a natural water habitat of a hippo with elements of a breakfast table setting, 
-possibly including oversized utensils or plates in the background. 
-The image should evoke a sense of playful absurdity and culinary fantasy.
-"""
-
-neg_prompt = """\
-skin spots,acnes,skin blemishes,age spot,(ugly:1.2),(duplicate:1.2),(morbid:1.21),(mutilated:1.2),\
-(tranny:1.2),mutated hands,(poorly drawn hands:1.5),blurry,(bad anatomy:1.2),(bad proportions:1.3),\
-extra limbs,(disfigured:1.2),(missing arms:1.2),(extra legs:1.2),(fused fingers:1.5),\
-(too many fingers:1.5),(unclear eyes:1.2),lowers,bad hands,missing fingers,extra digit,\
-bad hands,missing fingers,(extra arms and legs),(worst quality:2),(low quality:2),\
-(normal quality:2),lowres,((monochrome)),((grayscale))
-"""
-
-(
-    prompt_embeds
-    , prompt_neg_embeds
-) = get_weighted_text_embeddings_sd15(
-    pipe
-    , prompt = prompt
-    , neg_prompt = neg_prompt
-)
+compel_proc = Compel(tokenizer=pipe.tokenizer, text_encoder=pipe.text_encoder)
+prompt_embeds = compel_proc('("magazine cover of a dndcoverart dragon, high quality, intricate details, larry elmore art style").and()')
+image = pipe(prompt_embeds=prompt_embeds).images[0]
+image
 ```

 <div class="flex justify-center">
-  <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sd_embed_dreambooth.png"/>
+  <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-dreambooth.png"/>
+</div>
+
+### Stable Diffusion XL
+
+Stable Diffusion XL (SDXL) has two tokenizers and text encoders, so its usage is a bit different. To address this, you should pass both tokenizers and encoders to the `Compel` class:
+
+```py
+from compel import Compel, ReturnedEmbeddingsType
+from diffusers import DiffusionPipeline
+from diffusers.utils import make_image_grid
+import torch
+
+pipeline = DiffusionPipeline.from_pretrained(
+  "stabilityai/stable-diffusion-xl-base-1.0",
+  variant="fp16",
+  use_safetensors=True,
+  torch_dtype=torch.float16
+).to("cuda")
+
+compel = Compel(
+  tokenizer=[pipeline.tokenizer, pipeline.tokenizer_2] ,
+  text_encoder=[pipeline.text_encoder, pipeline.text_encoder_2],
+  returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,
+  requires_pooled=[False, True]
+)
+```
+
+This time, let's upweight "ball" by a factor of 1.5 for the first prompt, and downweight "ball" by 0.6 for the second prompt. The [`StableDiffusionXLPipeline`] also requires [`pooled_prompt_embeds`](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLInpaintPipeline.__call__.pooled_prompt_embeds) (and optionally [`negative_pooled_prompt_embeds`](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLInpaintPipeline.__call__.negative_pooled_prompt_embeds)) so you should pass those to the pipeline along with the conditioning tensors:
+
+```py
+# apply weights
+prompt = ["a red cat playing with a (ball)1.5", "a red cat playing with a (ball)0.6"]
+conditioning, pooled = compel(prompt)
+
+# generate image
+generator = [torch.Generator().manual_seed(33) for _ in range(len(prompt))]
+images = pipeline(prompt_embeds=conditioning, pooled_prompt_embeds=pooled, generator=generator, num_inference_steps=30).images
+make_image_grid(images, rows=1, cols=2)
+```
+
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/compel/sdxl_ball1.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">"a red cat playing with a (ball)1.5"</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/compel/sdxl_ball2.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">"a red cat playing with a (ball)0.6"</figcaption>
+  </div>
 </div>
@@ -92,13 +92,9 @@ class CheckpointMergerPipeline(DiffusionPipeline):
        token = kwargs.pop("token", None)
        variant = kwargs.pop("variant", None)
        revision = kwargs.pop("revision", None)
-        torch_dtype = kwargs.pop("torch_dtype", torch.float32)
+        torch_dtype = kwargs.pop("torch_dtype", None)
        device_map = kwargs.pop("device_map", None)

-        if not isinstance(torch_dtype, torch.dtype):
-            torch_dtype = torch.float32
-            print(f"Passed `torch_dtype` {torch_dtype} is not a `torch.dtype`. Defaulting to `torch.float32`.")
-
        alpha = kwargs.pop("alpha", 0.5)
        interp = kwargs.pop("interp", None)

@@ -203,7 +203,7 @@ def log_validation(

        pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, **scheduler_args)

-    pipeline = pipeline.to(accelerator.device)
+    pipeline = pipeline.to(accelerator.device, dtype=torch_dtype)
    pipeline.set_progress_bar_config(disable=True)

    # run inference
@@ -213,7 +213,7 @@ def log_validation(
    if torch.backends.mps.is_available() or "playground" in args.pretrained_model_name_or_path:
        autocast_ctx = nullcontext()
    else:
-        autocast_ctx = torch.autocast(accelerator.device.type) if not is_final_validation else nullcontext()
+        autocast_ctx = torch.autocast(accelerator.device.type)

    with autocast_ctx:
        images = [pipeline(**pipeline_args, generator=generator).images[0] for _ in range(args.num_validation_images)]
@@ -345,7 +345,6 @@ else:
            "Lumina2Text2ImgPipeline",
            "LuminaText2ImgPipeline",
            "MarigoldDepthPipeline",
-            "MarigoldIntrinsicsPipeline",
            "MarigoldNormalsPipeline",
            "MochiPipeline",
            "MusicLDMPipeline",
@@ -846,7 +845,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            Lumina2Text2ImgPipeline,
            LuminaText2ImgPipeline,
            MarigoldDepthPipeline,
-            MarigoldIntrinsicsPipeline,
            MarigoldNormalsPipeline,
            MochiPipeline,
            MusicLDMPipeline,
@@ -867,7 +865,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            StableCascadeCombinedPipeline,
            StableCascadeDecoderPipeline,
            StableCascadePriorPipeline,
-            StableDiffusion3ControlNetInpaintingPipeline,
            StableDiffusion3ControlNetPipeline,
            StableDiffusion3Img2ImgPipeline,
            StableDiffusion3InpaintPipeline,
@@ -0,0 +1,30 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..models.attention_processor import Attention, MochiAttention
+
+
+_ATTENTION_CLASSES = (Attention, MochiAttention)
+
+# Attribute names used to locate the transformer blocks of a model, grouped by the kind of block they hold.
+_SPATIAL_TRANSFORMER_BLOCK_IDENTIFIERS = ("blocks", "transformer_blocks", "single_transformer_blocks", "layers")
+_TEMPORAL_TRANSFORMER_BLOCK_IDENTIFIERS = ("temporal_transformer_blocks",)
+_CROSS_TRANSFORMER_BLOCK_IDENTIFIERS = ("blocks", "transformer_blocks", "layers")
+
+# Union of the three groups above. `dict.fromkeys` drops duplicates while keeping first-seen order, so the tuple is
+# the same on every interpreter run (going through a `set` of strings would make the order depend on hash
+# randomization).
+_ALL_TRANSFORMER_BLOCK_IDENTIFIERS = tuple(
+    dict.fromkeys(
+        (
+            *_SPATIAL_TRANSFORMER_BLOCK_IDENTIFIERS,
+            *_TEMPORAL_TRANSFORMER_BLOCK_IDENTIFIERS,
+            *_CROSS_TRANSFORMER_BLOCK_IDENTIFIERS,
+        )
+    )
+)
@@ -0,0 +1,262 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from dataclasses import dataclass
+from typing import Tuple, Union
+
+import torch
+
+from ..utils import get_logger
+from ._common import _ALL_TRANSFORMER_BLOCK_IDENTIFIERS
+from .hooks import HookRegistry, ModelHook
+from .utils import _extract_return_information
+
+
+logger = get_logger(__name__)  # pylint: disable=invalid-name
+
+
+# Names under which the First Block Cache hooks are registered in a block's `HookRegistry`: the first one is used
+# for the head (first) block, the second one for every other block, including the tail block.
+_FBC_LEADER_BLOCK_HOOK = "fbc_leader_block_hook"
+_FBC_BLOCK_HOOK = "fbc_block_hook"
+
+
+@dataclass
+class FirstBlockCacheConfig:
+    r"""
+    Configuration for [First Block
+    Cache](https://github.com/chengzeyi/ParaAttention/blob/7a266123671b55e7e5a2fe9af3121f07a36afc78/README.md#first-block-cache-our-dynamic-caching).
+
+    Args:
+        threshold (`float`, defaults to `0.05`):
+            The threshold that decides whether a forward pass through the remaining transformer blocks is required.
+            It is compared against the relative change of the first transformer block's residual (its output
+            `hidden_states` minus its input `hidden_states`): the mean absolute difference between the current
+            residual and the last stored residual, divided by the mean absolute value of the stored residual. If
+            that value does not exceed the threshold, the remaining blocks are skipped and the cached residuals
+            are reused. A higher threshold usually results in fewer full forward passes and faster inference, but
+            might lead to poorer generation quality. A lower threshold may not result in a significant speedup.
+    """
+
+    threshold: float = 0.05
+
+
+class FBCSharedBlockState:
+    r"""
+    Mutable state shared by all First Block Cache hooks that are attached to the blocks of one model.
+
+    The head block hook writes `head_block_output`, `head_block_residual` and `should_compute`; the tail block hook
+    writes `tail_block_residuals`; every non-head hook reads `should_compute`.
+    """
+
+    def __init__(self) -> None:
+        # Output of the head block from the last full forward pass (a tensor, or an indexable whose items 0 and 1
+        # are the `hidden_states` and `encoder_hidden_states` outputs).
+        self.head_block_output: Union[torch.Tensor, Tuple[torch.Tensor, ...]] = None
+        # Residual (output minus input `hidden_states`) of the head block from the last full forward pass.
+        self.head_block_residual: torch.Tensor = None
+        # Pair `(hidden_states residual, encoder_hidden_states residual)`: tail block output minus head block output.
+        self.tail_block_residuals: Union[torch.Tensor, Tuple[torch.Tensor, ...]] = None
+        # Whether the blocks after the head block have to run their original forward pass.
+        self.should_compute: bool = True
+
+    def reset(self):
+        r"""Drops every cached tensor so that the next forward pass is computed in full."""
+        # The head residual must be cleared together with the tail residuals: a stale head residual could make the
+        # head hook decide to skip while there are no tail residuals left to apply.
+        self.head_block_output = None
+        self.head_block_residual = None
+        self.tail_block_residuals = None
+        self.should_compute = True
+
+    def __repr__(self):
+        # Only report which caches are populated; printing the tensors themselves would be unreadable.
+        return (
+            f"FBCSharedBlockState(should_compute={self.should_compute}, "
+            f"has_head_block_residual={self.head_block_residual is not None}, "
+            f"has_tail_block_residuals={self.tail_block_residuals is not None})"
+        )
+
+
+class FBCHeadBlockHook(ModelHook):
+    r"""
+    Hook for the first ("head") transformer block of a model that uses First Block Cache.
+
+    On every call the wrapped block is executed and its residual (output `hidden_states` minus input
+    `hidden_states`) is compared with the residual stored by the last full forward pass. The decision is written to
+    `shared_state.should_compute`, which the hooks on the remaining blocks read. When the remaining blocks are
+    skipped, this hook already returns the final result: its own output plus the residuals cached by the tail block.
+
+    Args:
+        shared_state (`FBCSharedBlockState`):
+            State object shared with the hooks of all other blocks of the same model.
+        threshold (`float`):
+            Relative residual change above which the remaining blocks are recomputed.
+    """
+
+    _is_stateful = True
+
+    def __init__(self, shared_state: FBCSharedBlockState, threshold: float):
+        super().__init__()
+        self.shared_state = shared_state
+        self.threshold = threshold
+
+    def initialize_hook(self, module: torch.nn.Module) -> torch.nn.Module:
+        r"""Records the positions of the `forward` parameters and of the returned variables of `module`."""
+        # `forward` is inspected on the class, so its first parameter is `self`. It is dropped because the positional
+        # `args` seen in `new_forward` are handed unchanged to `original_forward`, i.e. they start at the first real
+        # argument. NOTE(review): assumes `original_forward` is the bound method - confirm against `HookRegistry`.
+        parameter_names = list(inspect.signature(module.__class__.forward).parameters.keys())[1:]
+        inputs_index_to_str = dict(enumerate(parameter_names))
+        inputs_str_to_index = {v: k for k, v in inputs_index_to_str.items()}
+
+        try:
+            outputs = _extract_return_information(module.__class__.forward)
+        except RuntimeError as e:
+            logger.error(f"Failed to extract return information for {module.__class__}")
+            raise NotImplementedError(
+                f"Module {module.__class__} is not supported with FirstBlockCache. Please open an issue at "
+                f"https://github.com/huggingface/diffusers to notify us about the error with a minimal example "
+                f"in order for us to add support for this module."
+            ) from e
+
+        # A `forward` with several `return` statements may report the same names more than once; keep the first
+        # occurrence only so that every name maps to its position within a single returned tuple.
+        outputs = list(dict.fromkeys(outputs))
+        outputs_index_to_str = dict(enumerate(outputs))
+        outputs_str_to_index = {v: k for k, v in outputs_index_to_str.items()}
+
+        self._inputs_index_to_str = inputs_index_to_str
+        self._inputs_str_to_index = inputs_str_to_index
+        self._outputs_index_to_str = outputs_index_to_str
+        self._outputs_str_to_index = outputs_str_to_index
+        return module
+
+    def new_forward(self, module: torch.nn.Module, *args, **kwargs):
+        r"""
+        Runs the head block and decides whether the remaining blocks must be computed.
+
+        Returns:
+            The unmodified block output when the remaining blocks will run. Otherwise an output of the same layout
+            in which `hidden_states` (and `encoder_hidden_states`, if the block returns it) already contain the
+            cached tail residuals; any other tuple position is `None`.
+        """
+        hs_input_idx = self._inputs_str_to_index.get("hidden_states")
+        ehs_input_idx = self._inputs_str_to_index.get("encoder_hidden_states", None)
+        hs_output_idx = self._outputs_str_to_index.get("hidden_states")
+        ehs_output_idx = self._outputs_str_to_index.get("encoder_hidden_states", None)
+        assert (ehs_input_idx is None) == (ehs_output_idx is None)
+
+        original_hs = kwargs.get("hidden_states", None)
+        if original_hs is None:
+            original_hs = args[hs_input_idx]
+
+        output = self.fn_ref.original_forward(*args, **kwargs)
+
+        # A block either returns a tuple (indexed by the recorded output positions) or a single tensor.
+        is_tuple_output = isinstance(output, tuple)
+        head_hs = output[hs_output_idx] if is_tuple_output else output
+        head_ehs = output[ehs_output_idx] if is_tuple_output and ehs_output_idx is not None else None
+        hs_residual = head_hs - original_hs
+
+        should_compute = self._should_compute_remaining_blocks(hs_residual)
+        self.shared_state.should_compute = should_compute
+
+        if should_compute:
+            logger.info("Computing forward pass through remaining blocks")
+            # The tail block hook reads item 0 as `hidden_states` and item 1 as `encoder_hidden_states` for tuple
+            # outputs, and the bare tensor otherwise.
+            self.shared_state.head_block_output = (head_hs, head_ehs) if is_tuple_output else output
+            self.shared_state.head_block_residual = hs_residual
+            return output
+
+        # Apply caching
+        logger.info("Skipping forward pass through remaining blocks")
+        tail_block_residuals = self.shared_state.tail_block_residuals
+        hs = tail_block_residuals[0] + head_hs
+        if not is_tuple_output:
+            return hs
+
+        return_output = [None] * len(output)
+        return_output[hs_output_idx] = hs
+        if ehs_output_idx is not None:
+            return_output[ehs_output_idx] = tail_block_residuals[1] + head_ehs
+        return tuple(return_output)
+
+    def reset_state(self, module):
+        r"""Resets the state shared with the other First Block Cache hooks."""
+        self.shared_state.reset()
+        return module
+
+    def _should_compute_remaining_blocks(self, hs_residual: torch.Tensor) -> bool:
+        r"""Returns `True` when the head block residual changed by more than `threshold` (relative absmean)."""
+        prev_hs_residual = self.shared_state.head_block_residual
+        # Nothing to compare against, or nothing cached to apply: a full forward pass is the only option.
+        if prev_hs_residual is None or self.shared_state.tail_block_residuals is None:
+            return True
+        # A residual of another shape belongs to a different input; it can be neither compared nor reused.
+        if prev_hs_residual.shape != hs_residual.shape:
+            return True
+        hs_absmean = (hs_residual - prev_hs_residual).abs().mean()
+        prev_hs_mean = prev_hs_residual.abs().mean()
+        diff = (hs_absmean / prev_hs_mean).item()
+        logger.info(f"Diff: {diff}, Threshold: {self.threshold}")
+        return diff > self.threshold
+
+
+class FBCBlockHook(ModelHook):
+    r"""
+    Hook for every transformer block after the head block of a model that uses First Block Cache.
+
+    While `shared_state.should_compute` is `True` the wrapped block runs normally; the tail (last) block
+    additionally stores the difference between its output and the head block output. While it is `False` the block
+    is skipped and its inputs are passed through unchanged.
+
+    Args:
+        shared_state (`FBCSharedBlockState`):
+            State object shared with the hooks of all other blocks of the same model.
+        is_tail (`bool`, defaults to `False`):
+            Whether the hooked block is the last block, i.e. the one that records the cached residuals.
+    """
+
+    def __init__(self, shared_state: FBCSharedBlockState, is_tail: bool = False):
+        super().__init__()
+        self.shared_state = shared_state
+        self.is_tail = is_tail
+
+    def initialize_hook(self, module):
+        r"""Records the positions of the `forward` parameters and of the returned variables of `module`."""
+        # `forward` is inspected on the class, so its first parameter is `self`. It is dropped because the positional
+        # `args` seen in `new_forward` are handed unchanged to `original_forward`, i.e. they start at the first real
+        # argument. NOTE(review): assumes `original_forward` is the bound method - confirm against `HookRegistry`.
+        parameter_names = list(inspect.signature(module.__class__.forward).parameters.keys())[1:]
+        inputs_index_to_str = dict(enumerate(parameter_names))
+        inputs_str_to_index = {v: k for k, v in inputs_index_to_str.items()}
+
+        try:
+            outputs = _extract_return_information(module.__class__.forward)
+        except RuntimeError as e:
+            logger.error(f"Failed to extract return information for {module.__class__}")
+            raise NotImplementedError(
+                f"Module {module.__class__} is not supported with FirstBlockCache. Please open an issue at "
+                f"https://github.com/huggingface/diffusers to notify us about the error with a minimal example "
+                f"in order for us to add support for this module."
+            ) from e
+
+        # A `forward` with several `return` statements may report the same names more than once; keep the first
+        # occurrence only so that every name maps to its position within a single returned tuple.
+        outputs = list(dict.fromkeys(outputs))
+        outputs_index_to_str = dict(enumerate(outputs))
+        outputs_str_to_index = {v: k for k, v in outputs_index_to_str.items()}
+
+        self._inputs_index_to_str = inputs_index_to_str
+        self._inputs_str_to_index = inputs_str_to_index
+        self._outputs_index_to_str = outputs_index_to_str
+        self._outputs_str_to_index = outputs_str_to_index
+        return module
+
+    def new_forward(self, module: torch.nn.Module, *args, **kwargs):
+        r"""
+        Runs the block, or passes its inputs through when the head block decided that the cache can be used.
+
+        Returns:
+            The block output when it is computed. Otherwise the input `hidden_states` (single-output blocks) or a
+            tuple holding the input `hidden_states` / `encoder_hidden_states` at their output positions.
+        """
+        hs_input_idx = self._inputs_str_to_index.get("hidden_states")
+        ehs_input_idx = self._inputs_str_to_index.get("encoder_hidden_states", None)
+        hs_output_idx = self._outputs_str_to_index.get("hidden_states")
+        ehs_output_idx = self._outputs_str_to_index.get("encoder_hidden_states", None)
+        assert (ehs_input_idx is None) == (ehs_output_idx is None)
+
+        if self.shared_state.should_compute:
+            output = self.fn_ref.original_forward(*args, **kwargs)
+            if self.is_tail:
+                head_block_output = self.shared_state.head_block_output
+                ehs_residual = None
+                if isinstance(output, tuple):
+                    hs_residual = output[hs_output_idx] - head_block_output[0]
+                    if ehs_output_idx is not None:
+                        ehs_residual = output[ehs_output_idx] - head_block_output[1]
+                else:
+                    hs_residual = output - head_block_output
+                # Always a pair, so the head block hook can index it without checking the block's output layout.
+                self.shared_state.tail_block_residuals = (hs_residual, ehs_residual)
+            return output
+
+        # Skipped: the head block hook has already produced the final tensors, so they only need to travel through
+        # this block untouched.
+        original_hs = kwargs.get("hidden_states", None)
+        if original_hs is None:
+            original_hs = args[hs_input_idx]
+
+        output_count = len(self._outputs_index_to_str)
+        if output_count <= 1:
+            return original_hs
+
+        return_output = [None] * output_count
+        return_output[hs_output_idx] = original_hs
+        if ehs_output_idx is not None:
+            original_ehs = kwargs.get("encoder_hidden_states", None)
+            if original_ehs is None:
+                original_ehs = args[ehs_input_idx]
+            return_output[ehs_output_idx] = original_ehs
+        # Same container type as a computed multi-output block returns.
+        return tuple(return_output)
+
+
+def apply_first_block_cache(module: torch.nn.Module, config: FirstBlockCacheConfig) -> None:
+    r"""
+    Attaches the First Block Cache hooks to the transformer blocks of `module`.
+
+    Every direct child of `module` that is a `torch.nn.ModuleList` and whose attribute name is listed in
+    `_ALL_TRANSFORMER_BLOCK_IDENTIFIERS` contributes its blocks, in iteration order. The first collected block
+    receives the head hook, the last one the tail hook and all blocks in between the regular block hook. All hooks
+    share one `FBCSharedBlockState`.
+
+    Args:
+        module (`torch.nn.Module`):
+            The model whose transformer blocks should be hooked.
+        config (`FirstBlockCacheConfig`):
+            Provides the `threshold` used by the head block hook.
+
+    Raises:
+        `ValueError`: If fewer than two transformer blocks are found, since a separate head and tail block are
+        required.
+    """
+    remaining_blocks = [
+        (name, block)
+        for name, submodule in module.named_children()
+        if name in _ALL_TRANSFORMER_BLOCK_IDENTIFIERS and isinstance(submodule, torch.nn.ModuleList)
+        for block in submodule
+    ]
+
+    if len(remaining_blocks) < 2:
+        raise ValueError(
+            f"FirstBlockCache requires at least two transformer blocks, but found {len(remaining_blocks)} in "
+            f"{module.__class__.__name__}. Blocks are searched in `torch.nn.ModuleList` children named one of "
+            f"{_ALL_TRANSFORMER_BLOCK_IDENTIFIERS}."
+        )
+
+    shared_state = FBCSharedBlockState()
+    head_block_name, head_block = remaining_blocks.pop(0)
+    tail_block_name, tail_block = remaining_blocks.pop(-1)
+
+    logger.debug(f"Apply FBCHeadBlockHook to '{head_block_name}'")
+    apply_fbc_head_block_hook(head_block, shared_state, config.threshold)
+
+    for name, block in remaining_blocks:
+        logger.debug(f"Apply FBCBlockHook to '{name}'")
+        apply_fbc_block_hook(block, shared_state)
+
+    logger.debug(f"Apply FBCBlockHook to tail block '{tail_block_name}'")
+    apply_fbc_block_hook(tail_block, shared_state, is_tail=True)
+
+
+def apply_fbc_head_block_hook(block: torch.nn.Module, state: FBCSharedBlockState, threshold: float) -> None:
+    r"""Registers a `FBCHeadBlockHook` built from `state` and `threshold` on `block`."""
+    hook_registry = HookRegistry.check_if_exists_or_initialize(block)
+    hook_registry.register_hook(FBCHeadBlockHook(state, threshold), _FBC_LEADER_BLOCK_HOOK)
+
+
+def apply_fbc_block_hook(block: torch.nn.Module, state: FBCSharedBlockState, is_tail: bool = False) -> None:
+    r"""Registers a `FBCBlockHook` built from `state` on `block`; `is_tail` marks the last transformer block."""
+    hook_registry = HookRegistry.check_if_exists_or_initialize(block)
+    hook_registry.register_hook(FBCBlockHook(state, is_tail), _FBC_BLOCK_HOOK)
@@ -20,19 +20,18 @@ import torch

 from ..models.attention_processor import Attention, MochiAttention
 from ..utils import logging
+from ._common import (
+    _ATTENTION_CLASSES,
+    _CROSS_TRANSFORMER_BLOCK_IDENTIFIERS,
+    _SPATIAL_TRANSFORMER_BLOCK_IDENTIFIERS,
+    _TEMPORAL_TRANSFORMER_BLOCK_IDENTIFIERS,
+)
 from .hooks import HookRegistry, ModelHook


 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


-_ATTENTION_CLASSES = (Attention, MochiAttention)
-
-_SPATIAL_ATTENTION_BLOCK_IDENTIFIERS = ("blocks", "transformer_blocks", "single_transformer_blocks")
-_TEMPORAL_ATTENTION_BLOCK_IDENTIFIERS = ("temporal_transformer_blocks",)
-_CROSS_ATTENTION_BLOCK_IDENTIFIERS = ("blocks", "transformer_blocks")
-
-
@dataclass
 class PyramidAttentionBroadcastConfig:
    r"""
@@ -76,9 +75,9 @@ class PyramidAttentionBroadcastConfig:
    temporal_attention_timestep_skip_range: Tuple[int, int] = (100, 800)
    cross_attention_timestep_skip_range: Tuple[int, int] = (100, 800)

-    spatial_attention_block_identifiers: Tuple[str, ...] = _SPATIAL_ATTENTION_BLOCK_IDENTIFIERS
-    temporal_attention_block_identifiers: Tuple[str, ...] = _TEMPORAL_ATTENTION_BLOCK_IDENTIFIERS
-    cross_attention_block_identifiers: Tuple[str, ...] = _CROSS_ATTENTION_BLOCK_IDENTIFIERS
+    spatial_attention_block_identifiers: Tuple[str, ...] = _SPATIAL_TRANSFORMER_BLOCK_IDENTIFIERS
+    temporal_attention_block_identifiers: Tuple[str, ...] = _TEMPORAL_TRANSFORMER_BLOCK_IDENTIFIERS
+    cross_attention_block_identifiers: Tuple[str, ...] = _CROSS_TRANSFORMER_BLOCK_IDENTIFIERS

    current_timestep_callback: Callable[[], int] = None

@@ -0,0 +1,59 @@
+import ast
+import inspect
+import textwrap
+from typing import List
+
+
+def _iter_own_returns(node):
+    """Yields the `return` statements below `node` in source order, without entering nested functions or classes."""
+    for child in ast.iter_child_nodes(node):
+        if isinstance(child, (ast.FunctionDef, ast.AsyncFunctionDef, ast.Lambda, ast.ClassDef)):
+            continue
+        if isinstance(child, ast.Return):
+            yield child
+        yield from _iter_own_returns(child)
+
+
+def _extract_return_information(func) -> List[str]:
+    """
+    Extracts return variable names in order from a function.
+
+    Only plain variable names are reported: `return a, b` contributes `a` and `b`, `return a` contributes `a`, and
+    any other returned expression (a call, an attribute, ...) is ignored. Each name is reported once, at its first
+    occurrence, so several `return` statements that return the same variables do not produce duplicates. `return`
+    statements that belong to functions nested inside `func` are not considered.
+
+    Args:
+        func: The function whose source code should be analyzed.
+
+    Returns:
+        `List[str]`: The returned variable names in order of first appearance.
+
+    Raises:
+        `RuntimeError`: If the source code of `func` cannot be retrieved or parsed.
+    """
+    try:
+        source = inspect.getsource(func)
+    except (OSError, TypeError):
+        try:
+            source_file = inspect.getfile(func)
+            with open(source_file, "r", encoding="utf-8") as f:
+                source = f.read()
+
+            # Extract function definition manually
+            source_lines = source.splitlines()
+            func_name = func.__name__
+            start_line = None
+            indent_level = None
+            extracted_lines = []
+
+            for i, line in enumerate(source_lines):
+                stripped = line.strip()
+                if stripped.startswith(f"def {func_name}("):
+                    start_line = i
+                    indent_level = len(line) - len(line.lstrip())
+                    extracted_lines.append(line)
+                    continue
+
+                if start_line is not None:
+                    # Stop when indentation level decreases (end of function)
+                    current_indent = len(line) - len(line.lstrip())
+                    if current_indent <= indent_level and line.strip():
+                        break
+                    extracted_lines.append(line)
+
+            source = "\n".join(extracted_lines)
+        except Exception as e:
+            raise RuntimeError(f"Failed to retrieve function source: {e}") from e
+
+    # Methods are indented inside their class, whichever way the source was obtained; strip the common indentation
+    # so that the snippet parses on its own.
+    source = textwrap.dedent(source)
+
+    # Parse source code using AST
+    try:
+        tree = ast.parse(source)
+    except SyntaxError as e:
+        raise RuntimeError(f"Failed to parse function source: {e}") from e
+
+    # The snippet normally holds exactly one top-level definition; search the whole tree if none is found.
+    function_node = next(
+        (node for node in tree.body if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))), tree
+    )
+
+    return_vars = []
+    for return_node in _iter_own_returns(function_node):
+        value = return_node.value
+        if isinstance(value, ast.Tuple):
+            # Multiple return values
+            names = [element.id for element in value.elts if isinstance(element, ast.Name)]
+        elif isinstance(value, ast.Name):
+            # Single return value
+            names = [value.id]
+        else:
+            names = []
+        for name in names:
+            if name not in return_vars:
+                return_vars.append(name)
+    return return_vars
@@ -23,9 +23,7 @@ from safetensors import safe_open
 from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, load_state_dict
 from ..utils import (
    USE_PEFT_BACKEND,
-    _get_detailed_type,
    _get_model_file,
-    _is_valid_type,
    is_accelerate_available,
    is_torch_version,
    is_transformers_available,
@@ -579,36 +577,29 @@ class FluxIPAdapterMixin:
        pipeline.set_ip_adapter_scale(ip_strengths)
        ```
        """
-
-        scale_type = Union[int, float]
-        num_ip_adapters = self.transformer.encoder_hid_proj.num_ip_adapters
-        num_layers = self.transformer.config.num_layers
-
-        # Single value for all layers of all IP-Adapters
-        if isinstance(scale, scale_type):
-            scale = [scale for _ in range(num_ip_adapters)]
-        # List of per-layer scales for a single IP-Adapter
-        elif _is_valid_type(scale, List[scale_type]) and num_ip_adapters == 1:
+        transformer = self.transformer
+        if not isinstance(scale, list):
+            scale = [[scale] * transformer.config.num_layers]
+        elif isinstance(scale, list) and isinstance(scale[0], int) or isinstance(scale[0], float):
+            if len(scale) != transformer.config.num_layers:
+                raise ValueError(f"Expected list of {transformer.config.num_layers} scales, got {len(scale)}.")
            scale = [scale]
-        # Invalid scale type
-        elif not _is_valid_type(scale, List[Union[scale_type, List[scale_type]]]):
-            raise TypeError(f"Unexpected type {_get_detailed_type(scale)} for scale.")

-        if len(scale) != num_ip_adapters:
-            raise ValueError(f"Cannot assign {len(scale)} scales to {num_ip_adapters} IP-Adapters.")
+        scale_configs = scale

-        if any(len(s) != num_layers for s in scale if isinstance(s, list)):
-            invalid_scale_sizes = {len(s) for s in scale if isinstance(s, list)} - {num_layers}
-            raise ValueError(
-                f"Expected list of {num_layers} scales, got {', '.join(str(x) for x in invalid_scale_sizes)}."
-            )
-
-        # Scalars are transformed to lists with length num_layers
-        scale_configs = [[s] * num_layers if isinstance(s, scale_type) else s for s in scale]
-
-        # Set scales. zip over scale_configs prevents going into single transformer layers
-        for attn_processor, *scale in zip(self.transformer.attn_processors.values(), *scale_configs):
-            attn_processor.scale = scale
+        key_id = 0
+        for attn_name, attn_processor in transformer.attn_processors.items():
+            if isinstance(attn_processor, (FluxIPAdapterJointAttnProcessor2_0)):
+                if len(scale_configs) != len(attn_processor.scale):
+                    raise ValueError(
+                        f"Cannot assign {len(scale_configs)} scale_configs to "
+                        f"{len(attn_processor.scale)} IP-Adapter."
+                    )
+                elif len(scale_configs) == 1:
+                    scale_configs = scale_configs * len(attn_processor.scale)
+                for i, scale_config in enumerate(scale_configs):
+                    attn_processor.scale[i] = scale_config[key_id]
+                key_id += 1

    def unload_ip_adapter(self):
        """
@@ -63,9 +63,6 @@ def _maybe_adjust_config(config):
    method removes the ambiguity by following what is described here:
    https://github.com/huggingface/diffusers/pull/9985#issuecomment-2493840028.
    """
-    # Track keys that have been explicitly removed to prevent re-adding them.
-    deleted_keys = set()
-
    rank_pattern = config["rank_pattern"].copy()
    target_modules = config["target_modules"]
    original_r = config["r"]
@@ -83,22 +80,21 @@ def _maybe_adjust_config(config):
        ambiguous_key = key

        if exact_matches and substring_matches:
-            # if ambiguous, update the rank associated with the ambiguous key (`proj_out`, for example)
+            # if ambiguous we update the rank associated with the ambiguous key (`proj_out`, for example)
            config["r"] = key_rank
-            # remove the ambiguous key from `rank_pattern` and record it as deleted
+            # remove the ambiguous key from `rank_pattern` and update its rank to `r`, instead
            del config["rank_pattern"][key]
-            deleted_keys.add(key)
-            # For substring matches, add them with the original rank only if they haven't been assigned already
            for mod in substring_matches:
-                if mod not in config["rank_pattern"] and mod not in deleted_keys:
+                # avoid overwriting if the module already has a specific rank
+                if mod not in config["rank_pattern"]:
                    config["rank_pattern"][mod] = original_r

-            # Update the rest of the target modules with the original rank if not already set and not deleted
+            # update the rest of the keys with the `original_r`
            for mod in target_modules:
-                if mod != ambiguous_key and mod not in config["rank_pattern"] and mod not in deleted_keys:
+                if mod != ambiguous_key and mod not in config["rank_pattern"]:
                    config["rank_pattern"][mod] = original_r

-    # Handle alphas to deal with cases like:
+    # handle alphas to deal with cases like
    # https://github.com/huggingface/diffusers/pull/9999#issuecomment-2516180777
    has_different_ranks = len(config["rank_pattern"]) > 1 and list(config["rank_pattern"])[0] != config["r"]
    if has_different_ranks:
@@ -191,11 +187,6 @@ class PeftAdapterMixin:
        from peft import LoraConfig, inject_adapter_in_model, set_peft_model_state_dict
        from peft.tuners.tuners_utils import BaseTunerLayer

-        try:
-            from peft.utils.constants import FULLY_QUALIFIED_PATTERN_KEY_PREFIX
-        except ImportError:
-            FULLY_QUALIFIED_PATTERN_KEY_PREFIX = None
-
        cache_dir = kwargs.pop("cache_dir", None)
        force_download = kwargs.pop("force_download", False)
        proxies = kwargs.pop("proxies", None)
@@ -260,22 +251,14 @@ class PeftAdapterMixin:
                # Cannot figure out rank from lora layers that don't have atleast 2 dimensions.
                # Bias layers in LoRA only have a single dimension
                if "lora_B" in key and val.ndim > 1:
-                    # Support to handle cases where layer patterns are treated as full layer names
-                    # was added later in PEFT. So, we handle it accordingly.
-                    # TODO: when we fix the minimal PEFT version for Diffusers,
-                    # we should remove `_maybe_adjust_config()`.
-                    if FULLY_QUALIFIED_PATTERN_KEY_PREFIX:
-                        rank[f"{FULLY_QUALIFIED_PATTERN_KEY_PREFIX}{key}"] = val.shape[1]
-                    else:
-                        rank[key] = val.shape[1]
+                    rank[key] = val.shape[1]

            if network_alphas is not None and len(network_alphas) >= 1:
                alpha_keys = [k for k in network_alphas.keys() if k.startswith(f"{prefix}.")]
                network_alphas = {k.replace(f"{prefix}.", ""): v for k, v in network_alphas.items() if k in alpha_keys}

            lora_config_kwargs = get_peft_kwargs(rank, network_alpha_dict=network_alphas, peft_state_dict=state_dict)
-            if not FULLY_QUALIFIED_PATTERN_KEY_PREFIX:
-                lora_config_kwargs = _maybe_adjust_config(lora_config_kwargs)
+            lora_config_kwargs = _maybe_adjust_config(lora_config_kwargs)

            if "use_dora" in lora_config_kwargs:
                if lora_config_kwargs["use_dora"]:
@@ -360,17 +360,11 @@ class FromSingleFileMixin:
        cache_dir = kwargs.pop("cache_dir", None)
        local_files_only = kwargs.pop("local_files_only", False)
        revision = kwargs.pop("revision", None)
-        torch_dtype = kwargs.pop("torch_dtype", torch.float32)
+        torch_dtype = kwargs.pop("torch_dtype", None)
        disable_mmap = kwargs.pop("disable_mmap", False)

        is_legacy_loading = False

-        if not isinstance(torch_dtype, torch.dtype):
-            torch_dtype = torch.float32
-            logger.warning(
-                f"Passed `torch_dtype` {torch_dtype} is not a `torch.dtype`. Defaulting to `torch.float32`."
-            )
-
        # We shouldn't allow configuring individual models components through a Pipeline creation method
        # These model kwargs should be deprecated
        scaling_factor = kwargs.get("scaling_factor", None)
@@ -240,17 +240,11 @@ class FromOriginalModelMixin:
        subfolder = kwargs.pop("subfolder", None)
        revision = kwargs.pop("revision", None)
        config_revision = kwargs.pop("config_revision", None)
-        torch_dtype = kwargs.pop("torch_dtype", torch.float32)
+        torch_dtype = kwargs.pop("torch_dtype", None)
        quantization_config = kwargs.pop("quantization_config", None)
        device = kwargs.pop("device", None)
        disable_mmap = kwargs.pop("disable_mmap", False)

-        if not isinstance(torch_dtype, torch.dtype):
-            torch_dtype = torch.float32
-            logger.warning(
-                f"Passed `torch_dtype` {torch_dtype} is not a `torch.dtype`. Defaulting to `torch.float32`."
-            )
-
        if isinstance(pretrained_model_link_or_path_or_dict, dict):
            checkpoint = pretrained_model_link_or_path_or_dict
        else:
@@ -213,9 +213,7 @@ class Attention(nn.Module):
            self.norm_q = LpNorm(p=2, dim=-1, eps=eps)
            self.norm_k = LpNorm(p=2, dim=-1, eps=eps)
        else:
-            raise ValueError(
-                f"unknown qk_norm: {qk_norm}. Should be one of None, 'layer_norm', 'fp32_layer_norm', 'layer_norm_across_heads', 'rms_norm', 'rms_norm_across_heads', 'l2'."
-            )
+            raise ValueError(f"unknown qk_norm: {qk_norm}. Should be None,'layer_norm','fp32_layer_norm','rms_norm'")

        if cross_attention_norm is None:
            self.norm_cross = None
@@ -1410,7 +1408,7 @@ class JointAttnProcessor2_0:

    def __init__(self):
        if not hasattr(F, "scaled_dot_product_attention"):
-            raise ImportError("JointAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")

    def __call__(
        self,
@@ -2780,8 +2778,9 @@ class FluxIPAdapterJointAttnProcessor2_0(torch.nn.Module):

            # IP-adapter
            ip_query = hidden_states_query_proj
-            ip_attn_output = torch.zeros_like(hidden_states)
-
+            ip_attn_output = None
+            # for ip-adapter
+            # TODO: support for multiple adapters
            for current_ip_hidden_states, scale, to_k_ip, to_v_ip in zip(
                ip_hidden_states, self.scale, self.to_k_ip, self.to_v_ip
            ):
@@ -2792,14 +2791,12 @@ class FluxIPAdapterJointAttnProcessor2_0(torch.nn.Module):
                ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
                # the output of sdp = (batch, num_heads, seq_len, head_dim)
                # TODO: add support for attn.scale when we move to Torch 2.1
-                current_ip_hidden_states = F.scaled_dot_product_attention(
+                ip_attn_output = F.scaled_dot_product_attention(
                    ip_query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False
                )
-                current_ip_hidden_states = current_ip_hidden_states.transpose(1, 2).reshape(
-                    batch_size, -1, attn.heads * head_dim
-                )
-                current_ip_hidden_states = current_ip_hidden_states.to(ip_query.dtype)
-                ip_attn_output += scale * current_ip_hidden_states
+                ip_attn_output = ip_attn_output.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+                ip_attn_output = scale * ip_attn_output
+                ip_attn_output = ip_attn_output.to(ip_query.dtype)

            return hidden_states, encoder_hidden_states, ip_attn_output
        else:
@@ -40,48 +40,6 @@ class SD3ControlNetOutput(BaseOutput):


 class SD3ControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
-    r"""
-    ControlNet model for [Stable Diffusion 3](https://huggingface.co/papers/2403.03206).
-
-    Parameters:
-        sample_size (`int`, defaults to `128`):
-            The width/height of the latents. This is fixed during training since it is used to learn a number of
-            position embeddings.
-        patch_size (`int`, defaults to `2`):
-            Patch size to turn the input data into small patches.
-        in_channels (`int`, defaults to `16`):
-            The number of latent channels in the input.
-        num_layers (`int`, defaults to `18`):
-            The number of layers of transformer blocks to use.
-        attention_head_dim (`int`, defaults to `64`):
-            The number of channels in each head.
-        num_attention_heads (`int`, defaults to `18`):
-            The number of heads to use for multi-head attention.
-        joint_attention_dim (`int`, defaults to `4096`):
-            The embedding dimension to use for joint text-image attention.
-        caption_projection_dim (`int`, defaults to `1152`):
-            The embedding dimension of caption embeddings.
-        pooled_projection_dim (`int`, defaults to `2048`):
-            The embedding dimension of pooled text projections.
-        out_channels (`int`, defaults to `16`):
-            The number of latent channels in the output.
-        pos_embed_max_size (`int`, defaults to `96`):
-            The maximum latent height/width of positional embeddings.
-        extra_conditioning_channels (`int`, defaults to `0`):
-            The number of extra channels to use for conditioning for patch embedding.
-        dual_attention_layers (`Tuple[int, ...]`, defaults to `()`):
-            The number of dual-stream transformer blocks to use.
-        qk_norm (`str`, *optional*, defaults to `None`):
-            The normalization to use for query and key in the attention layer. If `None`, no normalization is used.
-        pos_embed_type (`str`, defaults to `"sincos"`):
-            The type of positional embedding to use. Choose between `"sincos"` and `None`.
-        use_pos_embed (`bool`, defaults to `True`):
-            Whether to use positional embeddings.
-        force_zeros_for_pooled_projection (`bool`, defaults to `True`):
-            Whether to force zeros for pooled projection embeddings. This is handled in the pipelines by reading the
-            config value of the ControlNet model.
-    """
-
    _supports_gradient_checkpointing = True

    @register_to_config
@@ -135,7 +93,7 @@ class SD3ControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginal
                    JointTransformerBlock(
                        dim=self.inner_dim,
                        num_attention_heads=num_attention_heads,
-                        attention_head_dim=attention_head_dim,
+                        attention_head_dim=self.config.attention_head_dim,
                        context_pre_only=False,
                        qk_norm=qk_norm,
                        use_dual_attention=True if i in dual_attention_layers else False,
@@ -150,7 +108,7 @@ class SD3ControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginal
                    SD3SingleTransformerBlock(
                        dim=self.inner_dim,
                        num_attention_heads=num_attention_heads,
-                        attention_head_dim=attention_head_dim,
+                        attention_head_dim=self.config.attention_head_dim,
                    )
                    for _ in range(num_layers)
                ]
@@ -339,28 +297,28 @@ class SD3ControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginal

    def forward(
        self,
-        hidden_states: torch.Tensor,
+        hidden_states: torch.FloatTensor,
        controlnet_cond: torch.Tensor,
        conditioning_scale: float = 1.0,
-        encoder_hidden_states: torch.Tensor = None,
-        pooled_projections: torch.Tensor = None,
+        encoder_hidden_states: torch.FloatTensor = None,
+        pooled_projections: torch.FloatTensor = None,
        timestep: torch.LongTensor = None,
        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
        return_dict: bool = True,
-    ) -> Union[torch.Tensor, Transformer2DModelOutput]:
+    ) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
        """
        The [`SD3Transformer2DModel`] forward method.

        Args:
-            hidden_states (`torch.Tensor` of shape `(batch size, channel, height, width)`):
+            hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
                Input `hidden_states`.
            controlnet_cond (`torch.Tensor`):
                The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`.
            conditioning_scale (`float`, defaults to `1.0`):
                The scale factor for ControlNet outputs.
-            encoder_hidden_states (`torch.Tensor` of shape `(batch size, sequence_len, embed_dims)`):
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`):
                Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
-            pooled_projections (`torch.Tensor` of shape `(batch_size, projection_dim)`): Embeddings projected
+            pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`): Embeddings projected
                from the embeddings of input conditions.
            timestep ( `torch.LongTensor`):
                Used to indicate denoising step.
@@ -479,11 +437,11 @@ class SD3MultiControlNetModel(ModelMixin):

    def forward(
        self,
-        hidden_states: torch.Tensor,
+        hidden_states: torch.FloatTensor,
        controlnet_cond: List[torch.tensor],
        conditioning_scale: List[float],
-        pooled_projections: torch.Tensor,
-        encoder_hidden_states: torch.Tensor = None,
+        pooled_projections: torch.FloatTensor,
+        encoder_hidden_states: torch.FloatTensor = None,
        timestep: torch.LongTensor = None,
        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
        return_dict: bool = True,
@@ -2583,11 +2583,6 @@ class MultiIPAdapterImageProjection(nn.Module):
        super().__init__()
        self.image_projection_layers = nn.ModuleList(IPAdapterImageProjectionLayers)

-    @property
-    def num_ip_adapters(self) -> int:
-        """Number of IP-Adapters loaded."""
-        return len(self.image_projection_layers)
-
    def forward(self, image_embeds: List[torch.Tensor]):
        projected_image_embeds = []

@@ -866,7 +866,7 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
        local_files_only = kwargs.pop("local_files_only", None)
        token = kwargs.pop("token", None)
        revision = kwargs.pop("revision", None)
-        torch_dtype = kwargs.pop("torch_dtype", torch.float32)
+        torch_dtype = kwargs.pop("torch_dtype", None)
        subfolder = kwargs.pop("subfolder", None)
        device_map = kwargs.pop("device_map", None)
        max_memory = kwargs.pop("max_memory", None)
@@ -879,12 +879,6 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
        dduf_entries: Optional[Dict[str, DDUFEntry]] = kwargs.pop("dduf_entries", None)
        disable_mmap = kwargs.pop("disable_mmap", False)

-        if not isinstance(torch_dtype, torch.dtype):
-            torch_dtype = torch.float32
-            logger.warning(
-                f"Passed `torch_dtype` {torch_dtype} is not a `torch.dtype`. Defaulting to `torch.float32`."
-            )
-
        allow_pickle = False
        if use_safetensors is None:
            use_safetensors = True
@@ -18,6 +18,7 @@ from typing import Any, Dict, Optional, Tuple, Union
 import numpy as np
 import torch
 import torch.nn as nn
+import torch.nn.functional as F

 from ...configuration_utils import ConfigMixin, register_to_config
 from ...loaders import FluxTransformer2DLoadersMixin, FromOriginalModelMixin, PeftAdapterMixin
@@ -31,7 +32,7 @@ from ...models.attention_processor import (
 )
 from ...models.modeling_utils import ModelMixin
 from ...models.normalization import AdaLayerNormContinuous, AdaLayerNormZero, AdaLayerNormZeroSingle
-from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
+from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
 from ...utils.import_utils import is_torch_npu_available
 from ...utils.torch_utils import maybe_allow_in_graph
 from ..cache_utils import CacheMixin
@@ -44,7 +45,20 @@ logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

@maybe_allow_in_graph
 class FluxSingleTransformerBlock(nn.Module):
-    def __init__(self, dim: int, num_attention_heads: int, attention_head_dim: int, mlp_ratio: float = 4.0):
+    r"""
+    A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3.
+
+    Reference: https://arxiv.org/abs/2403.03206
+
+    Parameters:
+        dim (`int`): The number of channels in the input and output.
+        num_attention_heads (`int`): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`): The number of channels in each head.
+        mlp_ratio (`float`, defaults to `4.0`): Multiplier applied to `dim` to obtain the MLP hidden width,
+            i.e. `mlp_hidden_dim = int(dim * mlp_ratio)`.
+    """
+
+    def __init__(self, dim, num_attention_heads, attention_head_dim, mlp_ratio=4.0):
        super().__init__()
        self.mlp_hidden_dim = int(dim * mlp_ratio)

@@ -54,15 +68,9 @@ class FluxSingleTransformerBlock(nn.Module):
        self.proj_out = nn.Linear(dim + self.mlp_hidden_dim, dim)

        if is_torch_npu_available():
-            deprecation_message = (
-                "Defaulting to FluxAttnProcessor2_0_NPU for NPU devices will be removed. Attention processors "
-                "should be set explicitly using the `set_attn_processor` method."
-            )
-            deprecate("npu_processor", "0.34.0", deprecation_message)
            processor = FluxAttnProcessor2_0_NPU()
        else:
            processor = FluxAttnProcessor2_0()
-
        self.attn = Attention(
            query_dim=dim,
            cross_attention_dim=None,
@@ -79,10 +87,13 @@ class FluxSingleTransformerBlock(nn.Module):
    def forward(
        self,
        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
        temb: torch.Tensor,
        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
    ) -> torch.Tensor:
+        hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
+
        residual = hidden_states
        norm_hidden_states, gate = self.norm(hidden_states, emb=temb)
        mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))
@@ -100,19 +111,47 @@ class FluxSingleTransformerBlock(nn.Module):
        if hidden_states.dtype == torch.float16:
            hidden_states = hidden_states.clip(-65504, 65504)

-        return hidden_states
+        encoder_hidden_states, hidden_states = hidden_states.split(
+            [encoder_hidden_states.size(1), hidden_states.size(1) - encoder_hidden_states.size(1)], dim=1
+        )
+        return hidden_states, encoder_hidden_states


@maybe_allow_in_graph
 class FluxTransformerBlock(nn.Module):
+    r"""
+    A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3.
+
+    Reference: https://arxiv.org/abs/2403.03206
+
+    Args:
+        dim (`int`):
+            The embedding dimension of the block.
+        num_attention_heads (`int`):
+            The number of attention heads to use.
+        attention_head_dim (`int`):
+            The number of dimensions to use for each attention head.
+        qk_norm (`str`, defaults to `"rms_norm"`):
+            The normalization to use for the query and key tensors.
+        eps (`float`, defaults to `1e-6`):
+            The epsilon value to use for the normalization.
+    """
+
    def __init__(
        self, dim: int, num_attention_heads: int, attention_head_dim: int, qk_norm: str = "rms_norm", eps: float = 1e-6
    ):
        super().__init__()

        self.norm1 = AdaLayerNormZero(dim)
+
        self.norm1_context = AdaLayerNormZero(dim)

+        if hasattr(F, "scaled_dot_product_attention"):
+            processor = FluxAttnProcessor2_0()
+        else:
+            raise ValueError(
+                "The current PyTorch version does not support the `scaled_dot_product_attention` function."
+            )
        self.attn = Attention(
            query_dim=dim,
            cross_attention_dim=None,
@@ -122,7 +161,7 @@ class FluxTransformerBlock(nn.Module):
            out_dim=dim,
            context_pre_only=False,
            bias=True,
-            processor=FluxAttnProcessor2_0(),
+            processor=processor,
            qk_norm=qk_norm,
            eps=eps,
        )
@@ -133,6 +172,10 @@ class FluxTransformerBlock(nn.Module):
        self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
        self.ff_context = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")

+        # let chunk size default to None
+        self._chunk_size = None
+        self._chunk_dim = 0
+
    def forward(
        self,
        hidden_states: torch.Tensor,
@@ -187,7 +230,7 @@ class FluxTransformerBlock(nn.Module):
        if encoder_hidden_states.dtype == torch.float16:
            encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)

-        return encoder_hidden_states, hidden_states
+        return hidden_states, encoder_hidden_states


 class FluxTransformer2DModel(
@@ -480,7 +523,7 @@ class FluxTransformer2DModel(

        for index_block, block in enumerate(self.transformer_blocks):
            if torch.is_grad_enabled() and self.gradient_checkpointing:
-                encoder_hidden_states, hidden_states = self._gradient_checkpointing_func(
+                hidden_states, encoder_hidden_states = self._gradient_checkpointing_func(
                    block,
                    hidden_states,
                    encoder_hidden_states,
@@ -489,7 +532,7 @@ class FluxTransformer2DModel(
                )

            else:
-                encoder_hidden_states, hidden_states = block(
+                hidden_states, encoder_hidden_states = block(
                    hidden_states=hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    temb=temb,
@@ -508,20 +551,21 @@ class FluxTransformer2DModel(
                    )
                else:
                    hidden_states = hidden_states + controlnet_block_samples[index_block // interval_control]
-        hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)

        for index_block, block in enumerate(self.single_transformer_blocks):
            if torch.is_grad_enabled() and self.gradient_checkpointing:
-                hidden_states = self._gradient_checkpointing_func(
+                hidden_states, encoder_hidden_states = self._gradient_checkpointing_func(
                    block,
                    hidden_states,
+                    encoder_hidden_states,
                    temb,
                    image_rotary_emb,
                )

            else:
-                hidden_states = block(
+                hidden_states, encoder_hidden_states = block(
                    hidden_states=hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
                    temb=temb,
                    image_rotary_emb=image_rotary_emb,
                    joint_attention_kwargs=joint_attention_kwargs,
@@ -531,12 +575,7 @@ class FluxTransformer2DModel(
            if controlnet_single_block_samples is not None:
                interval_control = len(self.single_transformer_blocks) / len(controlnet_single_block_samples)
                interval_control = int(np.ceil(interval_control))
-                hidden_states[:, encoder_hidden_states.shape[1] :, ...] = (
-                    hidden_states[:, encoder_hidden_states.shape[1] :, ...]
-                    + controlnet_single_block_samples[index_block // interval_control]
-                )
-
-        hidden_states = hidden_states[:, encoder_hidden_states.shape[1] :, ...]
+                hidden_states = hidden_states + controlnet_single_block_samples[index_block // interval_control]

        hidden_states = self.norm_out(hidden_states, temb)
        output = self.proj_out(hidden_states)
@@ -15,6 +15,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union

 import torch
 import torch.nn as nn
+import torch.nn.functional as F

 from ...configuration_utils import ConfigMixin, register_to_config
 from ...loaders import FromOriginalModelMixin, PeftAdapterMixin, SD3Transformer2DLoadersMixin
@@ -38,6 +39,17 @@ logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

@maybe_allow_in_graph
 class SD3SingleTransformerBlock(nn.Module):
+    r"""
+    A Single Transformer block as part of the MMDiT architecture, used in Stable Diffusion 3 ControlNet.
+
+    Reference: https://arxiv.org/abs/2403.03206
+
+    Parameters:
+        dim (`int`): The number of channels in the input and output.
+        num_attention_heads (`int`): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`): The number of channels in each head.
+    """
+
    def __init__(
        self,
        dim: int,
@@ -47,13 +59,21 @@ class SD3SingleTransformerBlock(nn.Module):
        super().__init__()

        self.norm1 = AdaLayerNormZero(dim)
+
+        if hasattr(F, "scaled_dot_product_attention"):
+            processor = JointAttnProcessor2_0()
+        else:
+            raise ValueError(
+                "The current PyTorch version does not support the `scaled_dot_product_attention` function."
+            )
+
        self.attn = Attention(
            query_dim=dim,
            dim_head=attention_head_dim,
            heads=num_attention_heads,
            out_dim=dim,
            bias=True,
-            processor=JointAttnProcessor2_0(),
+            processor=processor,
            eps=1e-6,
        )

@@ -61,17 +81,23 @@ class SD3SingleTransformerBlock(nn.Module):
        self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")

    def forward(self, hidden_states: torch.Tensor, temb: torch.Tensor):
-        # 1. Attention
        norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
-        attn_output = self.attn(hidden_states=norm_hidden_states, encoder_hidden_states=None)
+        # Attention.
+        attn_output = self.attn(
+            hidden_states=norm_hidden_states,
+            encoder_hidden_states=None,
+        )
+
+        # Process attention outputs for the `hidden_states`.
        attn_output = gate_msa.unsqueeze(1) * attn_output
        hidden_states = hidden_states + attn_output

-        # 2. Feed Forward
        norm_hidden_states = self.norm2(hidden_states)
-        norm_hidden_states = norm_hidden_states * (1 + scale_mlp.unsqueeze(1)) + shift_mlp.unsqueeze(1)
+        norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
+
        ff_output = self.ff(norm_hidden_states)
        ff_output = gate_mlp.unsqueeze(1) * ff_output
+
        hidden_states = hidden_states + ff_output

        return hidden_states
@@ -81,40 +107,26 @@ class SD3Transformer2DModel(
    ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, SD3Transformer2DLoadersMixin
 ):
    """
-    The Transformer model introduced in [Stable Diffusion 3](https://huggingface.co/papers/2403.03206).
+    The Transformer model introduced in Stable Diffusion 3.
+
+    Reference: https://arxiv.org/abs/2403.03206

    Parameters:
-        sample_size (`int`, defaults to `128`):
-            The width/height of the latents. This is fixed during training since it is used to learn a number of
-            position embeddings.
-        patch_size (`int`, defaults to `2`):
-            Patch size to turn the input data into small patches.
-        in_channels (`int`, defaults to `16`):
-            The number of latent channels in the input.
-        num_layers (`int`, defaults to `18`):
-            The number of layers of transformer blocks to use.
-        attention_head_dim (`int`, defaults to `64`):
-            The number of channels in each head.
-        num_attention_heads (`int`, defaults to `18`):
-            The number of heads to use for multi-head attention.
-        joint_attention_dim (`int`, defaults to `4096`):
-            The embedding dimension to use for joint text-image attention.
-        caption_projection_dim (`int`, defaults to `1152`):
-            The embedding dimension of caption embeddings.
-        pooled_projection_dim (`int`, defaults to `2048`):
-            The embedding dimension of pooled text projections.
-        out_channels (`int`, defaults to `16`):
-            The number of latent channels in the output.
-        pos_embed_max_size (`int`, defaults to `96`):
-            The maximum latent height/width of positional embeddings.
-        dual_attention_layers (`Tuple[int, ...]`, defaults to `()`):
-            The number of dual-stream transformer blocks to use.
-        qk_norm (`str`, *optional*, defaults to `None`):
-            The normalization to use for query and key in the attention layer. If `None`, no normalization is used.
+        sample_size (`int`): The width of the latent images. This is fixed during training since
+            it is used to learn a number of position embeddings.
+        patch_size (`int`): Patch size to turn the input data into small patches.
+        in_channels (`int`, *optional*, defaults to 16): The number of channels in the input.
+        num_layers (`int`, *optional*, defaults to 18): The number of layers of Transformer blocks to use.
+        attention_head_dim (`int`, *optional*, defaults to 64): The number of channels in each head.
+        num_attention_heads (`int`, *optional*, defaults to 18): The number of heads to use for multi-head attention.
+        joint_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
+        caption_projection_dim (`int`): Number of dimensions to use when projecting the `encoder_hidden_states`.
+        pooled_projection_dim (`int`): Number of dimensions to use when projecting the `pooled_projections`.
+        out_channels (`int`, defaults to 16): Number of output channels.
+
    """

    _supports_gradient_checkpointing = True
-    _no_split_modules = ["JointTransformerBlock"]
    _skip_layerwise_casting_patterns = ["pos_embed", "norm"]

    @register_to_config
@@ -137,33 +149,36 @@ class SD3Transformer2DModel(
        qk_norm: Optional[str] = None,
    ):
        super().__init__()
-        self.out_channels = out_channels if out_channels is not None else in_channels
-        self.inner_dim = num_attention_heads * attention_head_dim
+        default_out_channels = in_channels
+        self.out_channels = out_channels if out_channels is not None else default_out_channels
+        self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim

        self.pos_embed = PatchEmbed(
-            height=sample_size,
-            width=sample_size,
-            patch_size=patch_size,
-            in_channels=in_channels,
+            height=self.config.sample_size,
+            width=self.config.sample_size,
+            patch_size=self.config.patch_size,
+            in_channels=self.config.in_channels,
            embed_dim=self.inner_dim,
            pos_embed_max_size=pos_embed_max_size,  # hard-code for now.
        )
        self.time_text_embed = CombinedTimestepTextProjEmbeddings(
-            embedding_dim=self.inner_dim, pooled_projection_dim=pooled_projection_dim
+            embedding_dim=self.inner_dim, pooled_projection_dim=self.config.pooled_projection_dim
        )
-        self.context_embedder = nn.Linear(joint_attention_dim, caption_projection_dim)
+        self.context_embedder = nn.Linear(self.config.joint_attention_dim, self.config.caption_projection_dim)

+        # `attention_head_dim` is doubled to account for the mixing.
+        # It needs to be crafted when we get the actual checkpoints.
        self.transformer_blocks = nn.ModuleList(
            [
                JointTransformerBlock(
                    dim=self.inner_dim,
-                    num_attention_heads=num_attention_heads,
-                    attention_head_dim=attention_head_dim,
+                    num_attention_heads=self.config.num_attention_heads,
+                    attention_head_dim=self.config.attention_head_dim,
                    context_pre_only=i == num_layers - 1,
                    qk_norm=qk_norm,
                    use_dual_attention=True if i in dual_attention_layers else False,
                )
-                for i in range(num_layers)
+                for i in range(self.config.num_layers)
            ]
        )

@@ -316,24 +331,24 @@ class SD3Transformer2DModel(

    def forward(
        self,
-        hidden_states: torch.Tensor,
-        encoder_hidden_states: torch.Tensor = None,
-        pooled_projections: torch.Tensor = None,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: torch.FloatTensor = None,
+        pooled_projections: torch.FloatTensor = None,
        timestep: torch.LongTensor = None,
        block_controlnet_hidden_states: List = None,
        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
        return_dict: bool = True,
        skip_layers: Optional[List[int]] = None,
-    ) -> Union[torch.Tensor, Transformer2DModelOutput]:
+    ) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
        """
        The [`SD3Transformer2DModel`] forward method.

        Args:
-            hidden_states (`torch.Tensor` of shape `(batch size, channel, height, width)`):
+            hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
                Input `hidden_states`.
-            encoder_hidden_states (`torch.Tensor` of shape `(batch size, sequence_len, embed_dims)`):
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`):
                Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
-            pooled_projections (`torch.Tensor` of shape `(batch_size, projection_dim)`):
+            pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`):
                Embeddings projected from the embeddings of input conditions.
            timestep (`torch.LongTensor`):
                Used to indicate denoising step.
@@ -261,7 +261,6 @@ else:
    _import_structure["marigold"].extend(
        [
            "MarigoldDepthPipeline",
-            "MarigoldIntrinsicsPipeline",
            "MarigoldNormalsPipeline",
        ]
    )
@@ -604,7 +603,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
        from .lumina2 import Lumina2Text2ImgPipeline
        from .marigold import (
            MarigoldDepthPipeline,
-            MarigoldIntrinsicsPipeline,
            MarigoldNormalsPipeline,
        )
        from .mochi import MochiPipeline
@@ -224,7 +224,7 @@ class AnimateDiffVideoToVideoPipeline(
        vae: AutoencoderKL,
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
-        unet: Union[UNet2DConditionModel, UNetMotionModel],
+        unet: UNet2DConditionModel,
        motion_adapter: MotionAdapter,
        scheduler: Union[
            DDIMScheduler,
@@ -246,7 +246,7 @@ class AnimateDiffVideoToVideoControlNetPipeline(
        vae: AutoencoderKL,
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
-        unet: Union[UNet2DConditionModel, UNetMotionModel],
+        unet: UNet2DConditionModel,
        motion_adapter: MotionAdapter,
        controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
        scheduler: Union[
@@ -34,10 +34,6 @@ from .controlnet import (
    StableDiffusionXLControlNetUnionInpaintPipeline,
    StableDiffusionXLControlNetUnionPipeline,
 )
-from .controlnet_sd3 import (
-    StableDiffusion3ControlNetInpaintingPipeline,
-    StableDiffusion3ControlNetPipeline,
-)
 from .deepfloyd_if import IFImg2ImgPipeline, IFInpaintingPipeline, IFPipeline
 from .flux import (
    FluxControlImg2ImgPipeline,
@@ -124,7 +120,6 @@ AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
        ("stable-diffusion-controlnet", StableDiffusionControlNetPipeline),
        ("stable-diffusion-xl-controlnet", StableDiffusionXLControlNetPipeline),
        ("stable-diffusion-xl-controlnet-union", StableDiffusionXLControlNetUnionPipeline),
-        ("stable-diffusion-3-controlnet", StableDiffusion3ControlNetPipeline),
        ("wuerstchen", WuerstchenCombinedPipeline),
        ("cascade", StableCascadeCombinedPipeline),
        ("lcm", LatentConsistencyModelPipeline),
@@ -183,7 +178,6 @@ AUTO_INPAINT_PIPELINES_MAPPING = OrderedDict(
        ("stable-diffusion-controlnet-pag", StableDiffusionControlNetPAGInpaintPipeline),
        ("stable-diffusion-xl-controlnet", StableDiffusionXLControlNetInpaintPipeline),
        ("stable-diffusion-xl-controlnet-union", StableDiffusionXLControlNetUnionInpaintPipeline),
-        ("stable-diffusion-3-controlnet", StableDiffusion3ControlNetInpaintingPipeline),
        ("stable-diffusion-xl-pag", StableDiffusionXLPAGInpaintPipeline),
        ("flux", FluxInpaintPipeline),
        ("flux-controlnet", FluxControlNetInpaintPipeline),
@@ -207,7 +207,7 @@ class StableDiffusionControlNetPipeline(
    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
    _exclude_from_cpu_offload = ["safety_checker"]
-    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "image"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]

    def __init__(
        self,
@@ -1323,7 +1323,6 @@ class StableDiffusionControlNetPipeline(
                    latents = callback_outputs.pop("latents", latents)
                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
-                    image = callback_outputs.pop("image", image)

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
@@ -185,7 +185,7 @@ class StableDiffusionControlNetImg2ImgPipeline(
    model_cpu_offload_seq = "text_encoder->unet->vae"
    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
    _exclude_from_cpu_offload = ["safety_checker"]
-    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "control_image"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]

    def __init__(
        self,
@@ -1294,7 +1294,6 @@ class StableDiffusionControlNetImg2ImgPipeline(
                    latents = callback_outputs.pop("latents", latents)
                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
-                    control_image = callback_outputs.pop("control_image", control_image)

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
@@ -184,7 +184,7 @@ class StableDiffusionControlNetInpaintPipeline(
    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
    _exclude_from_cpu_offload = ["safety_checker"]
-    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "control_image"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]

    def __init__(
        self,
@@ -1476,7 +1476,6 @@ class StableDiffusionControlNetInpaintPipeline(
                    latents = callback_outputs.pop("latents", latents)
                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
-                    control_image = callback_outputs.pop("control_image", control_image)

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
@@ -237,7 +237,6 @@ class StableDiffusionXLControlNetInpaintPipeline(
        "add_neg_time_ids",
        "mask",
        "masked_image_latents",
-        "control_image",
    ]

    def __init__(
@@ -744,7 +743,7 @@ class StableDiffusionXLControlNetInpaintPipeline(
        if padding_mask_crop is not None:
            if not isinstance(image, PIL.Image.Image):
                raise ValueError(
-                    f"The image should be a PIL image when inpainting mask crop, but is of type {type(image)}."
+                    f"The image should be a PIL image when inpainting mask crop, but is of type" f" {type(image)}."
                )
            if not isinstance(mask_image, PIL.Image.Image):
                raise ValueError(
@@ -752,7 +751,7 @@ class StableDiffusionXLControlNetInpaintPipeline(
                    f" {type(mask_image)}."
                )
            if output_type != "pil":
-                raise ValueError(f"The output type should be PIL when inpainting mask crop, but is {output_type}.")
+                raise ValueError(f"The output type should be PIL when inpainting mask crop, but is" f" {output_type}.")

        if prompt_embeds is not None and pooled_prompt_embeds is None:
            raise ValueError(
@@ -1645,7 +1644,7 @@ class StableDiffusionXLControlNetInpaintPipeline(
                    f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
                    f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
                    f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
-                    f" = {num_channels_latents + num_channels_masked_image + num_channels_mask}. Please verify the config of"
+                    f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
                    " `pipeline.unet` or your `mask_image` or `image` input."
                )
        elif num_channels_unet != 4:
@@ -1836,7 +1835,6 @@ class StableDiffusionXLControlNetInpaintPipeline(
                    latents = callback_outputs.pop("latents", latents)
                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
-                    control_image = callback_outputs.pop("control_image", control_image)

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
@@ -242,7 +242,6 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
        "add_time_ids",
        "negative_pooled_prompt_embeds",
        "add_neg_time_ids",
-        "control_image",
    ]

    def __init__(
@@ -1615,7 +1614,6 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
                    )
                    add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
                    add_neg_time_ids = callback_outputs.pop("add_neg_time_ids", add_neg_time_ids)
-                    control_image = callback_outputs.pop("control_image", control_image)

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
@@ -219,7 +219,6 @@ class StableDiffusionXLControlNetUnionInpaintPipeline(
        "add_time_ids",
        "mask",
        "masked_image_latents",
-        "control_image",
    ]

    def __init__(
@@ -727,7 +726,7 @@ class StableDiffusionXLControlNetUnionInpaintPipeline(
        if padding_mask_crop is not None:
            if not isinstance(image, PIL.Image.Image):
                raise ValueError(
-                    f"The image should be a PIL image when inpainting mask crop, but is of type {type(image)}."
+                    f"The image should be a PIL image when inpainting mask crop, but is of type" f" {type(image)}."
                )
            if not isinstance(mask_image, PIL.Image.Image):
                raise ValueError(
@@ -735,7 +734,7 @@ class StableDiffusionXLControlNetUnionInpaintPipeline(
                    f" {type(mask_image)}."
                )
            if output_type != "pil":
-                raise ValueError(f"The output type should be PIL when inpainting mask crop, but is {output_type}.")
+                raise ValueError(f"The output type should be PIL when inpainting mask crop, but is" f" {output_type}.")

        if prompt_embeds is not None and pooled_prompt_embeds is None:
            raise ValueError(
@@ -1744,7 +1743,6 @@ class StableDiffusionXLControlNetUnionInpaintPipeline(
                    latents = callback_outputs.pop("latents", latents)
                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
-                    control_image = callback_outputs.pop("control_image", control_image)

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
@@ -252,7 +252,12 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
        "feature_extractor",
        "image_encoder",
    ]
-    _callback_tensor_inputs = ["latents", "prompt_embeds", "add_text_embeds", "add_time_ids", "control_image"]
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "add_text_embeds",
+        "add_time_ids",
+    ]

    def __init__(
        self,
@@ -1557,7 +1562,6 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
                    add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
                    add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
-                    control_image = callback_outputs.pop("control_image", control_image)

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
@@ -232,8 +232,8 @@ class HunyuanDiTControlNetPipeline(DiffusionPipeline):
            Tuple[HunyuanDiT2DControlNetModel],
            HunyuanDiT2DMultiControlNetModel,
        ],
-        text_encoder_2: Optional[T5EncoderModel] = None,
-        tokenizer_2: Optional[MT5Tokenizer] = None,
+        text_encoder_2=T5EncoderModel,
+        tokenizer_2=MT5Tokenizer,
        requires_safety_checker: bool = True,
    ):
        super().__init__()
@@ -17,10 +17,10 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union

 import torch
 from transformers import (
+    BaseImageProcessor,
    CLIPTextModelWithProjection,
    CLIPTokenizer,
-    SiglipImageProcessor,
-    SiglipVisionModel,
+    PreTrainedModel,
    T5EncoderModel,
    T5TokenizerFast,
 )
@@ -178,9 +178,9 @@ class StableDiffusion3ControlNetPipeline(
            Provides additional conditioning to the `unet` during the denoising process. If you set multiple
            ControlNets as a list, the outputs from each ControlNet are added together to create one combined
            additional conditioning.
-        image_encoder (`SiglipVisionModel`, *optional*):
+        image_encoder (`PreTrainedModel`, *optional*):
            Pre-trained Vision Model for IP Adapter.
-        feature_extractor (`SiglipImageProcessor`, *optional*):
+        feature_extractor (`BaseImageProcessor`, *optional*):
            Image processor for IP Adapter.
    """

@@ -202,8 +202,8 @@ class StableDiffusion3ControlNetPipeline(
        controlnet: Union[
            SD3ControlNetModel, List[SD3ControlNetModel], Tuple[SD3ControlNetModel], SD3MultiControlNetModel
        ],
-        image_encoder: Optional[SiglipVisionModel] = None,
-        feature_extractor: Optional[SiglipImageProcessor] = None,
+        image_encoder: PreTrainedModel = None,
+        feature_extractor: BaseImageProcessor = None,
    ):
        super().__init__()
        if isinstance(controlnet, (list, tuple)):
@@ -17,10 +17,10 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union

 import torch
 from transformers import (
+    BaseImageProcessor,
    CLIPTextModelWithProjection,
    CLIPTokenizer,
-    SiglipImageProcessor,
-    SiglipModel,
+    PreTrainedModel,
    T5EncoderModel,
    T5TokenizerFast,
 )
@@ -223,8 +223,8 @@ class StableDiffusion3ControlNetInpaintingPipeline(
        controlnet: Union[
            SD3ControlNetModel, List[SD3ControlNetModel], Tuple[SD3ControlNetModel], SD3MultiControlNetModel
        ],
-        image_encoder: SiglipModel = None,
-        feature_extractor: Optional[SiglipImageProcessor] = None,
+        image_encoder: PreTrainedModel = None,
+        feature_extractor: BaseImageProcessor = None,
    ):
        super().__init__()

@@ -17,8 +17,6 @@ from typing import List, Optional, Tuple, Union

 import torch

-from ...models import UNet1DModel
-from ...schedulers import SchedulerMixin
 from ...utils import is_torch_xla_available, logging
 from ...utils.torch_utils import randn_tensor
 from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline
@@ -51,7 +49,7 @@ class DanceDiffusionPipeline(DiffusionPipeline):

    model_cpu_offload_seq = "unet"

-    def __init__(self, unet: UNet1DModel, scheduler: SchedulerMixin):
+    def __init__(self, unet, scheduler):
        super().__init__()
        self.register_modules(unet=unet, scheduler=scheduler)

@@ -16,7 +16,6 @@ from typing import List, Optional, Tuple, Union

 import torch

-from ...models import UNet2DModel
 from ...schedulers import DDIMScheduler
 from ...utils import is_torch_xla_available
 from ...utils.torch_utils import randn_tensor
@@ -48,7 +47,7 @@ class DDIMPipeline(DiffusionPipeline):

    model_cpu_offload_seq = "unet"

-    def __init__(self, unet: UNet2DModel, scheduler: DDIMScheduler):
+    def __init__(self, unet, scheduler):
        super().__init__()

        # make sure scheduler can always be converted to DDIM
@@ -17,8 +17,6 @@ from typing import List, Optional, Tuple, Union

 import torch

-from ...models import UNet2DModel
-from ...schedulers import DDPMScheduler
 from ...utils import is_torch_xla_available
 from ...utils.torch_utils import randn_tensor
 from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
@@ -49,7 +47,7 @@ class DDPMPipeline(DiffusionPipeline):

    model_cpu_offload_seq = "unet"

-    def __init__(self, unet: UNet2DModel, scheduler: DDPMScheduler):
+    def __init__(self, unet, scheduler):
        super().__init__()
        self.register_modules(unet=unet, scheduler=scheduler)

@@ -91,7 +91,7 @@ class RePaintPipeline(DiffusionPipeline):
    scheduler: RePaintScheduler
    model_cpu_offload_seq = "unet"

-    def __init__(self, unet: UNet2DModel, scheduler: RePaintScheduler):
+    def __init__(self, unet, scheduler):
        super().__init__()
        self.register_modules(unet=unet, scheduler=scheduler)

@@ -405,28 +405,23 @@ class FluxPipeline(
            if not isinstance(ip_adapter_image, list):
                ip_adapter_image = [ip_adapter_image]

-            if len(ip_adapter_image) != self.transformer.encoder_hid_proj.num_ip_adapters:
+            if len(ip_adapter_image) != len(self.transformer.encoder_hid_proj.image_projection_layers):
                raise ValueError(
-                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {self.transformer.encoder_hid_proj.num_ip_adapters} IP Adapters."
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.transformer.encoder_hid_proj.image_projection_layers)} IP Adapters."
                )

-            for single_ip_adapter_image in ip_adapter_image:
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.transformer.encoder_hid_proj.image_projection_layers
+            ):
                single_image_embeds = self.encode_image(single_ip_adapter_image, device, 1)
+
                image_embeds.append(single_image_embeds[None, :])
        else:
-            if not isinstance(ip_adapter_image_embeds, list):
-                ip_adapter_image_embeds = [ip_adapter_image_embeds]
-
-            if len(ip_adapter_image_embeds) != self.transformer.encoder_hid_proj.num_ip_adapters:
-                raise ValueError(
-                    f"`ip_adapter_image_embeds` must have same length as the number of IP Adapters. Got {len(ip_adapter_image_embeds)} image embeds and {self.transformer.encoder_hid_proj.num_ip_adapters} IP Adapters."
-                )
-
            for single_image_embeds in ip_adapter_image_embeds:
                image_embeds.append(single_image_embeds)

        ip_adapter_image_embeds = []
-        for single_image_embeds in image_embeds:
+        for i, single_image_embeds in enumerate(image_embeds):
            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
            single_image_embeds = single_image_embeds.to(device=device)
            ip_adapter_image_embeds.append(single_image_embeds)
@@ -877,13 +872,10 @@ class FluxPipeline(
            negative_ip_adapter_image is None and negative_ip_adapter_image_embeds is None
        ):
            negative_ip_adapter_image = np.zeros((width, height, 3), dtype=np.uint8)
-            negative_ip_adapter_image = [negative_ip_adapter_image] * self.transformer.encoder_hid_proj.num_ip_adapters
-
        elif (ip_adapter_image is None and ip_adapter_image_embeds is None) and (
            negative_ip_adapter_image is not None or negative_ip_adapter_image_embeds is not None
        ):
            ip_adapter_image = np.zeros((width, height, 3), dtype=np.uint8)
-            ip_adapter_image = [ip_adapter_image] * self.transformer.encoder_hid_proj.num_ip_adapters

        if self.joint_attention_kwargs is None:
            self._joint_attention_kwargs = {}
@@ -207,8 +207,8 @@ class HunyuanDiTPipeline(DiffusionPipeline):
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPImageProcessor,
        requires_safety_checker: bool = True,
-        text_encoder_2: Optional[T5EncoderModel] = None,
-        tokenizer_2: Optional[MT5Tokenizer] = None,
+        text_encoder_2=T5EncoderModel,
+        tokenizer_2=MT5Tokenizer,
    ):
        super().__init__()

@@ -20,7 +20,7 @@ import urllib.parse as ul
 from typing import Callable, Dict, List, Optional, Tuple, Union

 import torch
-from transformers import GemmaPreTrainedModel, GemmaTokenizer, GemmaTokenizerFast
+from transformers import AutoModel, AutoTokenizer

 from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...image_processor import VaeImageProcessor
@@ -144,10 +144,13 @@ class LuminaText2ImgPipeline(DiffusionPipeline):
    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
-        text_encoder ([`GemmaPreTrainedModel`]):
-            Frozen Gemma text-encoder.
-        tokenizer (`GemmaTokenizer` or `GemmaTokenizerFast`):
-            Gemma tokenizer.
+        text_encoder ([`AutoModel`]):
+            Frozen text-encoder. Lumina-T2I uses
+            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.AutoModel), specifically the
+            [t5-v1_1-xxl](https://huggingface.co/Alpha-VLLM/tree/main/t5-v1_1-xxl) variant.
+        tokenizer (`AutoTokenizer`):
+            Tokenizer of class
+            [AutoTokenizer](https://huggingface.co/docs/transformers/model_doc/auto#transformers.AutoTokenizer).
        transformer ([`Transformer2DModel`]):
            A text conditioned `Transformer2DModel` to denoise the encoded image latents.
        scheduler ([`SchedulerMixin`]):
@@ -182,8 +185,8 @@ class LuminaText2ImgPipeline(DiffusionPipeline):
        transformer: LuminaNextDiT2DModel,
        scheduler: FlowMatchEulerDiscreteScheduler,
        vae: AutoencoderKL,
-        text_encoder: GemmaPreTrainedModel,
-        tokenizer: Union[GemmaTokenizer, GemmaTokenizerFast],
+        text_encoder: AutoModel,
+        tokenizer: AutoTokenizer,
    ):
        super().__init__()

@@ -17,7 +17,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union

 import numpy as np
 import torch
-from transformers import Gemma2PreTrainedModel, GemmaTokenizer, GemmaTokenizerFast
+from transformers import AutoModel, AutoTokenizer

 from ...image_processor import VaeImageProcessor
 from ...loaders import Lumina2LoraLoaderMixin
@@ -143,10 +143,13 @@ class Lumina2Text2ImgPipeline(DiffusionPipeline, Lumina2LoraLoaderMixin):
    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
-        text_encoder ([`Gemma2PreTrainedModel`]):
-            Frozen Gemma2 text-encoder.
-        tokenizer (`GemmaTokenizer` or `GemmaTokenizerFast`):
-            Gemma tokenizer.
+        text_encoder ([`AutoModel`]):
+            Frozen text-encoder. Lumina-T2I uses
+            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.AutoModel), specifically the
+            [t5-v1_1-xxl](https://huggingface.co/Alpha-VLLM/tree/main/t5-v1_1-xxl) variant.
+        tokenizer (`AutoTokenizer`):
+            Tokenizer of class
+            [AutoTokenizer](https://huggingface.co/docs/transformers/model_doc/auto#transformers.AutoTokenizer).
        transformer ([`Transformer2DModel`]):
            A text conditioned `Transformer2DModel` to denoise the encoded image latents.
        scheduler ([`SchedulerMixin`]):
@@ -162,8 +165,8 @@ class Lumina2Text2ImgPipeline(DiffusionPipeline, Lumina2LoraLoaderMixin):
        transformer: Lumina2Transformer2DModel,
        scheduler: FlowMatchEulerDiscreteScheduler,
        vae: AutoencoderKL,
-        text_encoder: Gemma2PreTrainedModel,
-        tokenizer: Union[GemmaTokenizer, GemmaTokenizerFast],
+        text_encoder: AutoModel,
+        tokenizer: AutoTokenizer,
    ):
        super().__init__()

@@ -23,7 +23,6 @@ except OptionalDependencyNotAvailable:
 else:
    _import_structure["marigold_image_processing"] = ["MarigoldImageProcessor"]
    _import_structure["pipeline_marigold_depth"] = ["MarigoldDepthOutput", "MarigoldDepthPipeline"]
-    _import_structure["pipeline_marigold_intrinsics"] = ["MarigoldIntrinsicsOutput", "MarigoldIntrinsicsPipeline"]
    _import_structure["pipeline_marigold_normals"] = ["MarigoldNormalsOutput", "MarigoldNormalsPipeline"]

 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
@@ -36,7 +35,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    else:
        from .marigold_image_processing import MarigoldImageProcessor
        from .pipeline_marigold_depth import MarigoldDepthOutput, MarigoldDepthPipeline
-        from .pipeline_marigold_intrinsics import MarigoldIntrinsicsOutput, MarigoldIntrinsicsPipeline
        from .pipeline_marigold_normals import MarigoldNormalsOutput, MarigoldNormalsPipeline

 else:
@@ -1,22 +1,4 @@
-# Copyright 2023-2025 Marigold Team, ETH Zürich. All rights reserved.
-# Copyright 2024-2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# --------------------------------------------------------------------------
-# More information and citation instructions are available on the
-# Marigold project website: https://marigoldcomputervision.github.io
-# --------------------------------------------------------------------------
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import List, Optional, Tuple, Union

 import numpy as np
 import PIL
@@ -397,7 +379,7 @@ class MarigoldImageProcessor(ConfigMixin):
        val_min: float = 0.0,
        val_max: float = 1.0,
        color_map: str = "Spectral",
-    ) -> List[PIL.Image.Image]:
+    ) -> Union[PIL.Image.Image, List[PIL.Image.Image]]:
        """
        Visualizes depth maps, such as predictions of the `MarigoldDepthPipeline`.

@@ -409,7 +391,7 @@ class MarigoldImageProcessor(ConfigMixin):
            color_map (`str`, *optional*, defaults to `"Spectral"`): Color map used to convert a single-channel
                      depth prediction into colored representation.

-        Returns: `List[PIL.Image.Image]` with depth maps visualization.
+        Returns: `PIL.Image.Image` or `List[PIL.Image.Image]` with depth maps visualization.
        """
        if val_max <= val_min:
            raise ValueError(f"Invalid values range: [{val_min}, {val_max}].")
@@ -454,7 +436,7 @@ class MarigoldImageProcessor(ConfigMixin):
        depth: Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]],
        val_min: float = 0.0,
        val_max: float = 1.0,
-    ) -> List[PIL.Image.Image]:
+    ) -> Union[PIL.Image.Image, List[PIL.Image.Image]]:
        def export_depth_to_16bit_png_one(img, idx=None):
            prefix = "Depth" + (f"[{idx}]" if idx else "")
            if not isinstance(img, np.ndarray) and not torch.is_tensor(img):
@@ -496,7 +478,7 @@ class MarigoldImageProcessor(ConfigMixin):
        flip_x: bool = False,
        flip_y: bool = False,
        flip_z: bool = False,
-    ) -> List[PIL.Image.Image]:
+    ) -> Union[PIL.Image.Image, List[PIL.Image.Image]]:
        """
        Visualizes surface normals, such as predictions of the `MarigoldNormalsPipeline`.

@@ -510,7 +492,7 @@ class MarigoldImageProcessor(ConfigMixin):
            flip_z (`bool`, *optional*, defaults to `False`): Flips the Z axis of the normals frame of reference.
                      Default direction is facing the observer.

-        Returns: `List[PIL.Image.Image]` with surface normals visualization.
+        Returns: `PIL.Image.Image` or `List[PIL.Image.Image]` with surface normals visualization.
        """
        flip_vec = None
        if any((flip_x, flip_y, flip_z)):
@@ -546,99 +528,6 @@ class MarigoldImageProcessor(ConfigMixin):
        else:
            raise ValueError(f"Unexpected input type: {type(normals)}")

-    @staticmethod
-    def visualize_intrinsics(
-        prediction: Union[
-            np.ndarray,
-            torch.Tensor,
-            List[np.ndarray],
-            List[torch.Tensor],
-        ],
-        target_properties: Dict[str, Any],
-        color_map: Union[str, Dict[str, str]] = "binary",
-    ) -> List[Dict[str, PIL.Image.Image]]:
-        """
-        Visualizes intrinsic image decomposition, such as predictions of the `MarigoldIntrinsicsPipeline`.
-
-        Args:
-            prediction (`Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]]`):
-                Intrinsic image decomposition.
-            target_properties (`Dict[str, Any]`):
-                Decomposition properties. Expected entries: `target_names: List[str]` and a dictionary with keys
-                `prediction_space: str`, `sub_target_names: List[Union[str, Null]]` (must have 3 entries, null for
-                missing modalities), `up_to_scale: bool`, one for each target and sub-target.
-            color_map (`Union[str, Dict[str, str]]`, *optional*, defaults to `"Spectral"`):
-                Color map used to convert a single-channel predictions into colored representations. When a dictionary
-                is passed, each modality can be colored with its own color map.
-
-        Returns: `List[Dict[str, PIL.Image.Image]]` with intrinsic image decomposition visualization.
-        """
-        if "target_names" not in target_properties:
-            raise ValueError("Missing `target_names` in target_properties")
-        if not isinstance(color_map, str) and not (
-            isinstance(color_map, dict)
-            and all(isinstance(k, str) and isinstance(v, str) for k, v in color_map.items())
-        ):
-            raise ValueError("`color_map` must be a string or a dictionary of strings")
-        n_targets = len(target_properties["target_names"])
-
-        def visualize_targets_one(images, idx=None):
-            # img: [T, 3, H, W]
-            out = {}
-            for target_name, img in zip(target_properties["target_names"], images):
-                img = img.permute(1, 2, 0)  # [H, W, 3]
-                prediction_space = target_properties[target_name].get("prediction_space", "srgb")
-                if prediction_space == "stack":
-                    sub_target_names = target_properties[target_name]["sub_target_names"]
-                    if len(sub_target_names) != 3 or any(
-                        not (isinstance(s, str) or s is None) for s in sub_target_names
-                    ):
-                        raise ValueError(f"Unexpected target sub-names {sub_target_names} in {target_name}")
-                    for i, sub_target_name in enumerate(sub_target_names):
-                        if sub_target_name is None:
-                            continue
-                        sub_img = img[:, :, i]
-                        sub_prediction_space = target_properties[sub_target_name].get("prediction_space", "srgb")
-                        if sub_prediction_space == "linear":
-                            sub_up_to_scale = target_properties[sub_target_name].get("up_to_scale", False)
-                            if sub_up_to_scale:
-                                sub_img = sub_img / max(sub_img.max().item(), 1e-6)
-                            sub_img = sub_img ** (1 / 2.2)
-                        cmap_name = (
-                            color_map if isinstance(color_map, str) else color_map.get(sub_target_name, "binary")
-                        )
-                        sub_img = MarigoldImageProcessor.colormap(sub_img, cmap=cmap_name, bytes=True)
-                        sub_img = PIL.Image.fromarray(sub_img.cpu().numpy())
-                        out[sub_target_name] = sub_img
-                elif prediction_space == "linear":
-                    up_to_scale = target_properties[target_name].get("up_to_scale", False)
-                    if up_to_scale:
-                        img = img / max(img.max().item(), 1e-6)
-                    img = img ** (1 / 2.2)
-                elif prediction_space == "srgb":
-                    pass
-                img = (img * 255).to(dtype=torch.uint8, device="cpu").numpy()
-                img = PIL.Image.fromarray(img)
-                out[target_name] = img
-            return out
-
-        if prediction is None or isinstance(prediction, list) and any(o is None for o in prediction):
-            raise ValueError("Input prediction is `None`")
-        if isinstance(prediction, (np.ndarray, torch.Tensor)):
-            prediction = MarigoldImageProcessor.expand_tensor_or_array(prediction)
-            if isinstance(prediction, np.ndarray):
-                prediction = MarigoldImageProcessor.numpy_to_pt(prediction)  # [N*T,3,H,W]
-            if not (prediction.ndim == 4 and prediction.shape[1] == 3 and prediction.shape[0] % n_targets == 0):
-                raise ValueError(f"Unexpected input shape={prediction.shape}, expecting [N*T,3,H,W].")
-            N_T, _, H, W = prediction.shape
-            N = N_T // n_targets
-            prediction = prediction.reshape(N, n_targets, 3, H, W)
-            return [visualize_targets_one(img, idx) for idx, img in enumerate(prediction)]
-        elif isinstance(prediction, list):
-            return [visualize_targets_one(img, idx) for idx, img in enumerate(prediction)]
-        else:
-            raise ValueError(f"Unexpected input type: {type(prediction)}")
-
    @staticmethod
    def visualize_uncertainty(
        uncertainty: Union[
@@ -648,10 +537,9 @@ class MarigoldImageProcessor(ConfigMixin):
            List[torch.Tensor],
        ],
        saturation_percentile=95,
-    ) -> List[PIL.Image.Image]:
+    ) -> Union[PIL.Image.Image, List[PIL.Image.Image]]:
        """
-        Visualizes dense uncertainties, such as produced by `MarigoldDepthPipeline`, `MarigoldNormalsPipeline`, or
-        `MarigoldIntrinsicsPipeline`.
+        Visualizes dense uncertainties, such as produced by `MarigoldDepthPipeline` or `MarigoldNormalsPipeline`.

        Args:
            uncertainty (`Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]]`):
@@ -659,15 +547,14 @@ class MarigoldImageProcessor(ConfigMixin):
            saturation_percentile (`int`, *optional*, defaults to `95`):
                Specifies the percentile uncertainty value visualized with maximum intensity.

-        Returns: `List[PIL.Image.Image]` with uncertainty visualization.
+        Returns: `PIL.Image.Image` or `List[PIL.Image.Image]` with uncertainty visualization.
        """

        def visualize_uncertainty_one(img, idx=None):
            prefix = "Uncertainty" + (f"[{idx}]" if idx else "")
            if img.min() < 0:
-                raise ValueError(f"{prefix}: unexpected data range, min={img.min()}.")
-            img = img.permute(1, 2, 0)  # [H,W,C]
-            img = img.squeeze(2).cpu().numpy()  # [H,W] or [H,W,3]
+                raise ValueError(f"{prefix}: unexected data range, min={img.min()}.")
+            img = img.squeeze(0).cpu().numpy()
            saturation_value = np.percentile(img, saturation_percentile)
            img = np.clip(img * 255 / saturation_value, 0, 255)
            img = img.astype(np.uint8)
@@ -679,9 +566,9 @@ class MarigoldImageProcessor(ConfigMixin):
        if isinstance(uncertainty, (np.ndarray, torch.Tensor)):
            uncertainty = MarigoldImageProcessor.expand_tensor_or_array(uncertainty)
            if isinstance(uncertainty, np.ndarray):
-                uncertainty = MarigoldImageProcessor.numpy_to_pt(uncertainty)  # [N,C,H,W]
-            if not (uncertainty.ndim == 4 and uncertainty.shape[1] in (1, 3)):
-                raise ValueError(f"Unexpected input shape={uncertainty.shape}, expecting [N,C,H,W] with C in (1,3).")
+                uncertainty = MarigoldImageProcessor.numpy_to_pt(uncertainty)  # [N,1,H,W]
+            if not (uncertainty.ndim == 4 and uncertainty.shape[1] == 1):
+                raise ValueError(f"Unexpected input shape={uncertainty.shape}, expecting [N,1,H,W].")
            return [visualize_uncertainty_one(img, idx) for idx, img in enumerate(uncertainty)]
        elif isinstance(uncertainty, list):
            return [visualize_uncertainty_one(img, idx) for idx, img in enumerate(uncertainty)]
@@ -1,5 +1,5 @@
-# Copyright 2023-2025 Marigold Team, ETH Zürich. All rights reserved.
-# Copyright 2024-2025 The HuggingFace Team. All rights reserved.
+# Copyright 2024 Marigold authors, PRS ETH Zurich. All rights reserved.
+# Copyright 2024 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
 # limitations under the License.
 # --------------------------------------------------------------------------
 # More information and citation instructions are available on the
-# Marigold project website: https://marigoldcomputervision.github.io
+# Marigold project website: https://marigoldmonodepth.github.io
 # --------------------------------------------------------------------------
 from dataclasses import dataclass
 from functools import partial
@@ -64,7 +64,7 @@ Examples:
 >>> import torch

 >>> pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
-...     "prs-eth/marigold-depth-v1-1", variant="fp16", torch_dtype=torch.float16
+...     "prs-eth/marigold-depth-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
 ... ).to("cuda")

 >>> image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
@@ -86,12 +86,11 @@ class MarigoldDepthOutput(BaseOutput):

    Args:
        prediction (`np.ndarray`, `torch.Tensor`):
-            Predicted depth maps with values in the range [0, 1]. The shape is $numimages \times 1 \times height \times
-            width$ for `torch.Tensor` or $numimages \times height \times width \times 1$ for `np.ndarray`.
+            Predicted depth maps with values in the range [0, 1]. The shape is always $numimages \times 1 \times height
+            \times width$, regardless of whether the images were passed as a 4D array or a list.
        uncertainty (`None`, `np.ndarray`, `torch.Tensor`):
            Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is $numimages
-            \times 1 \times height \times width$ for `torch.Tensor` or $numimages \times height \times width \times 1$
-            for `np.ndarray`.
+            \times 1 \times height \times width$.
        latent (`None`, `torch.Tensor`):
            Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
            The shape is $numimages * numensemble \times 4 \times latentheight \times latentwidth$.
@@ -209,11 +208,6 @@ class MarigoldDepthPipeline(DiffusionPipeline):
        output_type: str,
        output_uncertainty: bool,
    ) -> int:
-        actual_vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
-        if actual_vae_scale_factor != self.vae_scale_factor:
-            raise ValueError(
-                f"`vae_scale_factor` computed at initialization ({self.vae_scale_factor}) differs from the actual one ({actual_vae_scale_factor})."
-            )
        if num_inference_steps is None:
            raise ValueError("`num_inference_steps` is not specified and could not be resolved from the model config.")
        if num_inference_steps < 1:
@@ -326,7 +320,6 @@ class MarigoldDepthPipeline(DiffusionPipeline):

        return num_images

-    @torch.compiler.disable
    def progress_bar(self, iterable=None, total=None, desc=None, leave=True):
        if not hasattr(self, "_progress_bar_config"):
            self._progress_bar_config = {}
@@ -377,9 +370,11 @@ class MarigoldDepthPipeline(DiffusionPipeline):
                same width and height.
            num_inference_steps (`int`, *optional*, defaults to `None`):
                Number of denoising diffusion steps during inference. The default value `None` results in automatic
-                selection.
+                selection. The number of steps should be at least 10 with the full Marigold models, and between 1 and 4
+                for Marigold-LCM models.
            ensemble_size (`int`, defaults to `1`):
-                Number of ensemble predictions. Higher values result in measurable improvements and visual degradation.
+                Number of ensemble predictions. Recommended values are 5 and higher for better precision, or 1 for
+                faster inference.
            processing_resolution (`int`, *optional*, defaults to `None`):
                Effective processing resolution. When set to `0`, matches the larger input image dimension. This
                produces crisper predictions, but may also lead to the overall loss of global context. The default
@@ -491,7 +486,9 @@ class MarigoldDepthPipeline(DiffusionPipeline):
        # `pred_latent` variable. The variable `image_latent` is of the same shape: it contains each input image encoded
        # into latent space and replicated `E` times. The latents can be either generated (see `generator` to ensure
        # reproducibility), or passed explicitly via the `latents` argument. The latter can be set outside the pipeline
-        # code. This behavior can be achieved by setting the `output_latent` argument to `True`. The latent space
+        # code. For example, in the Marigold-LCM video processing demo, the latents initialization of a frame is taken
+        # as a convex combination of the latents output of the pipeline for the previous frame and a newly-sampled
+        # noise. This behavior can be achieved by setting the `output_latent` argument to `True`. The latent space
        # dimensions are `(h, w)`. Encoding into latent space happens in batches of size `batch_size`.
        # Model invocation: self.vae.encoder.
        image_latent, pred_latent = self.prepare_latents(
@@ -736,7 +733,6 @@ class MarigoldDepthPipeline(DiffusionPipeline):
                param = init_s.cpu().numpy()
            else:
                raise ValueError("Unrecognized alignment.")
-            param = param.astype(np.float64)

            return param

@@ -779,7 +775,7 @@ class MarigoldDepthPipeline(DiffusionPipeline):

            if regularizer_strength > 0:
                prediction, _ = ensemble(depth_aligned, return_uncertainty=False)
-                err_near = prediction.min().abs().item()
+                err_near = (0.0 - prediction.min()).abs().item()
                err_far = (1.0 - prediction.max()).abs().item()
                cost += (err_near + err_far) * regularizer_strength

@@ -1,721 +0,0 @@
-# Copyright 2023-2025 Marigold Team, ETH Zürich. All rights reserved.
-# Copyright 2024-2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# --------------------------------------------------------------------------
-# More information and citation instructions are available on the
-# Marigold project website: https://marigoldcomputervision.github.io
-# --------------------------------------------------------------------------
-from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Tuple, Union
-
-import numpy as np
-import torch
-from PIL import Image
-from tqdm.auto import tqdm
-from transformers import CLIPTextModel, CLIPTokenizer
-
-from ...image_processor import PipelineImageInput
-from ...models import (
-    AutoencoderKL,
-    UNet2DConditionModel,
-)
-from ...schedulers import (
-    DDIMScheduler,
-    LCMScheduler,
-)
-from ...utils import (
-    BaseOutput,
-    is_torch_xla_available,
-    logging,
-    replace_example_docstring,
-)
-from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline
-from .marigold_image_processing import MarigoldImageProcessor
-
-
-if is_torch_xla_available():
-    import torch_xla.core.xla_model as xm
-
-    XLA_AVAILABLE = True
-else:
-    XLA_AVAILABLE = False
-
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-
-
-EXAMPLE_DOC_STRING = """
-Examples:
-```py
->>> import diffusers
->>> import torch
-
->>> pipe = diffusers.MarigoldIntrinsicsPipeline.from_pretrained(
-...     "prs-eth/marigold-iid-appearance-v1-1", variant="fp16", torch_dtype=torch.float16
-... ).to("cuda")
-
->>> image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
->>> intrinsics = pipe(image)
-
->>> vis = pipe.image_processor.visualize_intrinsics(intrinsics.prediction, pipe.target_properties)
->>> vis[0]["albedo"].save("einstein_albedo.png")
->>> vis[0]["roughness"].save("einstein_roughness.png")
->>> vis[0]["metallicity"].save("einstein_metallicity.png")
-```
-```py
->>> import diffusers
->>> import torch
-
->>> pipe = diffusers.MarigoldIntrinsicsPipeline.from_pretrained(
-...     "prs-eth/marigold-iid-lighting-v1-1", variant="fp16", torch_dtype=torch.float16
-... ).to("cuda")
-
->>> image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
->>> intrinsics = pipe(image)
-
->>> vis = pipe.image_processor.visualize_intrinsics(intrinsics.prediction, pipe.target_properties)
->>> vis[0]["albedo"].save("einstein_albedo.png")
->>> vis[0]["shading"].save("einstein_shading.png")
->>> vis[0]["residual"].save("einstein_residual.png")
-```
-"""
-
-
-@dataclass
-class MarigoldIntrinsicsOutput(BaseOutput):
-    """
-    Output class for Marigold Intrinsic Image Decomposition pipeline.
-
-    Args:
-        prediction (`np.ndarray`, `torch.Tensor`):
-            Predicted image intrinsics with values in the range [0, 1]. The shape is $(numimages * numtargets) \times 3
-            \times height \times width$ for `torch.Tensor` or $(numimages * numtargets) \times height \times width
-            \times 3$ for `np.ndarray`, where `numtargets` corresponds to the number of predicted target modalities of
-            the intrinsic image decomposition.
-        uncertainty (`None`, `np.ndarray`, `torch.Tensor`):
-            Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is $(numimages *
-            numtargets) \times 3 \times height \times width$ for `torch.Tensor` or $(numimages * numtargets) \times
-            height \times width \times 3$ for `np.ndarray`.
-        latent (`None`, `torch.Tensor`):
-            Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
-            The shape is $(numimages * numensemble) \times (numtargets * 4) \times latentheight \times latentwidth$.
-    """
-
-    prediction: Union[np.ndarray, torch.Tensor]
-    uncertainty: Union[None, np.ndarray, torch.Tensor]
-    latent: Union[None, torch.Tensor]
-
-
-class MarigoldIntrinsicsPipeline(DiffusionPipeline):
-    """
-    Pipeline for Intrinsic Image Decomposition (IID) using the Marigold method:
-    https://marigoldcomputervision.github.io.
-
-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
-    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
-
-    Args:
-        unet (`UNet2DConditionModel`):
-            Conditional U-Net to denoise the targets latent, conditioned on image latent.
-        vae (`AutoencoderKL`):
-            Variational Auto-Encoder (VAE) Model to encode and decode images and predictions to and from latent
-            representations.
-        scheduler (`DDIMScheduler` or `LCMScheduler`):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latents.
-        text_encoder (`CLIPTextModel`):
-            Text-encoder, for empty text embedding.
-        tokenizer (`CLIPTokenizer`):
-            CLIP tokenizer.
-        prediction_type (`str`, *optional*):
-            Type of predictions made by the model.
-        target_properties (`Dict[str, Any]`, *optional*):
-            Properties of the predicted modalities, such as `target_names`, a `List[str]` used to define the number,
-            order and names of the predicted modalities, and any other metadata that may be required to interpret the
-            predictions.
-        default_denoising_steps (`int`, *optional*):
-            The minimum number of denoising diffusion steps that are required to produce a prediction of reasonable
-            quality with the given model. This value must be set in the model config. When the pipeline is called
-            without explicitly setting `num_inference_steps`, the default value is used. This is required to ensure
-            reasonable results with various model flavors compatible with the pipeline, such as those relying on very
-            short denoising schedules (`LCMScheduler`) and those with full diffusion schedules (`DDIMScheduler`).
-        default_processing_resolution (`int`, *optional*):
-            The recommended value of the `processing_resolution` parameter of the pipeline. This value must be set in
-            the model config. When the pipeline is called without explicitly setting `processing_resolution`, the
-            default value is used. This is required to ensure reasonable results with various model flavors trained
-            with varying optimal processing resolution values.
-    """
-
-    model_cpu_offload_seq = "text_encoder->unet->vae"
-    supported_prediction_types = ("intrinsics",)
-
-    def __init__(
-        self,
-        unet: UNet2DConditionModel,
-        vae: AutoencoderKL,
-        scheduler: Union[DDIMScheduler, LCMScheduler],
-        text_encoder: CLIPTextModel,
-        tokenizer: CLIPTokenizer,
-        prediction_type: Optional[str] = None,
-        target_properties: Optional[Dict[str, Any]] = None,
-        default_denoising_steps: Optional[int] = None,
-        default_processing_resolution: Optional[int] = None,
-    ):
-        super().__init__()
-
-        if prediction_type not in self.supported_prediction_types:
-            logger.warning(
-                f"Potentially unsupported `prediction_type='{prediction_type}'`; values supported by the pipeline: "
-                f"{self.supported_prediction_types}."
-            )
-
-        self.register_modules(
-            unet=unet,
-            vae=vae,
-            scheduler=scheduler,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-        )
-        self.register_to_config(
-            prediction_type=prediction_type,
-            target_properties=target_properties,
-            default_denoising_steps=default_denoising_steps,
-            default_processing_resolution=default_processing_resolution,
-        )
-
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
-
-        self.target_properties = target_properties
-        self.default_denoising_steps = default_denoising_steps
-        self.default_processing_resolution = default_processing_resolution
-
-        self.empty_text_embedding = None
-
-        self.image_processor = MarigoldImageProcessor(vae_scale_factor=self.vae_scale_factor)
-
-    @property
-    def n_targets(self):
-        return self.unet.config.out_channels // self.vae.config.latent_channels
-
-    def check_inputs(
-        self,
-        image: PipelineImageInput,
-        num_inference_steps: int,
-        ensemble_size: int,
-        processing_resolution: int,
-        resample_method_input: str,
-        resample_method_output: str,
-        batch_size: int,
-        ensembling_kwargs: Optional[Dict[str, Any]],
-        latents: Optional[torch.Tensor],
-        generator: Optional[Union[torch.Generator, List[torch.Generator]]],
-        output_type: str,
-        output_uncertainty: bool,
-    ) -> int:
-        actual_vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
-        if actual_vae_scale_factor != self.vae_scale_factor:
-            raise ValueError(
-                f"`vae_scale_factor` computed at initialization ({self.vae_scale_factor}) differs from the actual one ({actual_vae_scale_factor})."
-            )
-        if num_inference_steps is None:
-            raise ValueError("`num_inference_steps` is not specified and could not be resolved from the model config.")
-        if num_inference_steps < 1:
-            raise ValueError("`num_inference_steps` must be positive.")
-        if ensemble_size < 1:
-            raise ValueError("`ensemble_size` must be positive.")
-        if ensemble_size == 2:
-            logger.warning(
-                "`ensemble_size` == 2 results are similar to no ensembling (1); "
-                "consider increasing the value to at least 3."
-            )
-        if ensemble_size == 1 and output_uncertainty:
-            raise ValueError(
-                "Computing uncertainty by setting `output_uncertainty=True` also requires setting `ensemble_size` "
-                "greater than 1."
-            )
-        if processing_resolution is None:
-            raise ValueError(
-                "`processing_resolution` is not specified and could not be resolved from the model config."
-            )
-        if processing_resolution < 0:
-            raise ValueError(
-                "`processing_resolution` must be non-negative: 0 for native resolution, or any positive value for "
-                "downsampled processing."
-            )
-        if processing_resolution % self.vae_scale_factor != 0:
-            raise ValueError(f"`processing_resolution` must be a multiple of {self.vae_scale_factor}.")
-        if resample_method_input not in ("nearest", "nearest-exact", "bilinear", "bicubic", "area"):
-            raise ValueError(
-                "`resample_method_input` takes string values compatible with PIL library: "
-                "nearest, nearest-exact, bilinear, bicubic, area."
-            )
-        if resample_method_output not in ("nearest", "nearest-exact", "bilinear", "bicubic", "area"):
-            raise ValueError(
-                "`resample_method_output` takes string values compatible with PIL library: "
-                "nearest, nearest-exact, bilinear, bicubic, area."
-            )
-        if batch_size < 1:
-            raise ValueError("`batch_size` must be positive.")
-        if output_type not in ["pt", "np"]:
-            raise ValueError("`output_type` must be one of `pt` or `np`.")
-        if latents is not None and generator is not None:
-            raise ValueError("`latents` and `generator` cannot be used together.")
-        if ensembling_kwargs is not None:
-            if not isinstance(ensembling_kwargs, dict):
-                raise ValueError("`ensembling_kwargs` must be a dictionary.")
-            if "reduction" in ensembling_kwargs and ensembling_kwargs["reduction"] not in ("median", "mean"):
-                raise ValueError("`ensembling_kwargs['reduction']` can be either `'median'` or `'mean'`.")
-
-        # image checks
-        num_images = 0
-        W, H = None, None
-        if not isinstance(image, list):
-            image = [image]
-        for i, img in enumerate(image):
-            if isinstance(img, np.ndarray) or torch.is_tensor(img):
-                if img.ndim not in (2, 3, 4):
-                    raise ValueError(f"`image[{i}]` has unsupported dimensions or shape: {img.shape}.")
-                H_i, W_i = img.shape[-2:]
-                N_i = 1
-                if img.ndim == 4:
-                    N_i = img.shape[0]
-            elif isinstance(img, Image.Image):
-                W_i, H_i = img.size
-                N_i = 1
-            else:
-                raise ValueError(f"Unsupported `image[{i}]` type: {type(img)}.")
-            if W is None:
-                W, H = W_i, H_i
-            elif (W, H) != (W_i, H_i):
-                raise ValueError(
-                    f"Input `image[{i}]` has incompatible dimensions {(W_i, H_i)} with the previous images {(W, H)}"
-                )
-            num_images += N_i
-
-        # latents checks
-        if latents is not None:
-            if not torch.is_tensor(latents):
-                raise ValueError("`latents` must be a torch.Tensor.")
-            if latents.dim() != 4:
-                raise ValueError(f"`latents` has unsupported dimensions or shape: {latents.shape}.")
-
-            if processing_resolution > 0:
-                max_orig = max(H, W)
-                new_H = H * processing_resolution // max_orig
-                new_W = W * processing_resolution // max_orig
-                if new_H == 0 or new_W == 0:
-                    raise ValueError(f"Extreme aspect ratio of the input image: [{W} x {H}]")
-                W, H = new_W, new_H
-            w = (W + self.vae_scale_factor - 1) // self.vae_scale_factor
-            h = (H + self.vae_scale_factor - 1) // self.vae_scale_factor
-            shape_expected = (num_images * ensemble_size, self.unet.config.out_channels, h, w)
-
-            if latents.shape != shape_expected:
-                raise ValueError(f"`latents` has unexpected shape={latents.shape} expected={shape_expected}.")
-
-        # generator checks
-        if generator is not None:
-            if isinstance(generator, list):
-                if len(generator) != num_images * ensemble_size:
-                    raise ValueError(
-                        "The number of generators must match the total number of ensemble members for all input images."
-                    )
-                if not all(g.device.type == generator[0].device.type for g in generator):
-                    raise ValueError("`generator` device placement is not consistent in the list.")
-            elif not isinstance(generator, torch.Generator):
-                raise ValueError(f"Unsupported generator type: {type(generator)}.")
-
-        return num_images
-
-    @torch.compiler.disable
-    def progress_bar(self, iterable=None, total=None, desc=None, leave=True):
-        if not hasattr(self, "_progress_bar_config"):
-            self._progress_bar_config = {}
-        elif not isinstance(self._progress_bar_config, dict):
-            raise ValueError(
-                f"`self._progress_bar_config` should be of type `dict`, but is {type(self._progress_bar_config)}."
-            )
-
-        progress_bar_config = dict(**self._progress_bar_config)
-        progress_bar_config["desc"] = progress_bar_config.get("desc", desc)
-        progress_bar_config["leave"] = progress_bar_config.get("leave", leave)
-        if iterable is not None:
-            return tqdm(iterable, **progress_bar_config)
-        elif total is not None:
-            return tqdm(total=total, **progress_bar_config)
-        else:
-            raise ValueError("Either `total` or `iterable` has to be defined.")
-
-    @torch.no_grad()
-    @replace_example_docstring(EXAMPLE_DOC_STRING)
-    def __call__(
-        self,
-        image: PipelineImageInput,
-        num_inference_steps: Optional[int] = None,
-        ensemble_size: int = 1,
-        processing_resolution: Optional[int] = None,
-        match_input_resolution: bool = True,
-        resample_method_input: str = "bilinear",
-        resample_method_output: str = "bilinear",
-        batch_size: int = 1,
-        ensembling_kwargs: Optional[Dict[str, Any]] = None,
-        latents: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
-        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        output_type: str = "np",
-        output_uncertainty: bool = False,
-        output_latent: bool = False,
-        return_dict: bool = True,
-    ):
-        """
-        Function invoked when calling the pipeline.
-
-        Args:
-            image (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`),
-                `List[torch.Tensor]`: An input image or images used as an input for the intrinsic decomposition task.
-                For arrays and tensors, the expected value range is between `[0, 1]`. Passing a batch of images is
-                possible by providing a four-dimensional array or a tensor. Additionally, a list of images of two- or
-                three-dimensional arrays or tensors can be passed. In the latter case, all list elements must have the
-                same width and height.
-            num_inference_steps (`int`, *optional*, defaults to `None`):
-                Number of denoising diffusion steps during inference. The default value `None` results in automatic
-                selection.
-            ensemble_size (`int`, defaults to `1`):
-                Number of ensemble predictions. Higher values result in measurable improvements and visual degradation.
-            processing_resolution (`int`, *optional*, defaults to `None`):
-                Effective processing resolution. When set to `0`, matches the larger input image dimension. This
-                produces crisper predictions, but may also lead to the overall loss of global context. The default
-                value `None` resolves to the optimal value from the model config.
-            match_input_resolution (`bool`, *optional*, defaults to `True`):
-                When enabled, the output prediction is resized to match the input dimensions. When disabled, the longer
-                side of the output will equal to `processing_resolution`.
-            resample_method_input (`str`, *optional*, defaults to `"bilinear"`):
-                Resampling method used to resize input images to `processing_resolution`. The accepted values are:
-                `"nearest"`, `"nearest-exact"`, `"bilinear"`, `"bicubic"`, or `"area"`.
-            resample_method_output (`str`, *optional*, defaults to `"bilinear"`):
-                Resampling method used to resize output predictions to match the input resolution. The accepted values
-                are `"nearest"`, `"nearest-exact"`, `"bilinear"`, `"bicubic"`, or `"area"`.
-            batch_size (`int`, *optional*, defaults to `1`):
-                Batch size; only matters when setting `ensemble_size` or passing a tensor of images.
-            ensembling_kwargs (`dict`, *optional*, defaults to `None`)
-                Extra dictionary with arguments for precise ensembling control. The following options are available:
-                - reduction (`str`, *optional*, defaults to `"median"`): Defines the ensembling function applied in
-                  every pixel location, can be either `"median"` or `"mean"`.
-            latents (`torch.Tensor`, *optional*, defaults to `None`):
-                Latent noise tensors to replace the random initialization. These can be taken from the previous
-                function call's output.
-            generator (`torch.Generator`, or `List[torch.Generator]`, *optional*, defaults to `None`):
-                Random number generator object to ensure reproducibility.
-            output_type (`str`, *optional*, defaults to `"np"`):
-                Preferred format of the output's `prediction` and the optional `uncertainty` fields. The accepted
-                values are: `"np"` (numpy array) or `"pt"` (torch tensor).
-            output_uncertainty (`bool`, *optional*, defaults to `False`):
-                When enabled, the output's `uncertainty` field contains the predictive uncertainty map, provided that
-                the `ensemble_size` argument is set to a value above 2.
-            output_latent (`bool`, *optional*, defaults to `False`):
-                When enabled, the output's `latent` field contains the latent codes corresponding to the predictions
-                within the ensemble. These codes can be saved, modified, and used for subsequent calls with the
-                `latents` argument.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipelines.marigold.MarigoldIntrinsicsOutput`] instead of a plain tuple.
-
-        Examples:
-
-        Returns:
-            [`~pipelines.marigold.MarigoldIntrinsicsOutput`] or `tuple`:
-                If `return_dict` is `True`, [`~pipelines.marigold.MarigoldIntrinsicsOutput`] is returned, otherwise a
-                `tuple` is returned where the first element is the prediction, the second element is the uncertainty
-                (or `None`), and the third is the latent (or `None`).
-        """
-
-        # 0. Resolving variables.
-        device = self._execution_device
-        dtype = self.dtype
-
-        # Model-specific optimal default values leading to fast and reasonable results.
-        if num_inference_steps is None:
-            num_inference_steps = self.default_denoising_steps
-        if processing_resolution is None:
-            processing_resolution = self.default_processing_resolution
-
-        # 1. Check inputs.
-        num_images = self.check_inputs(
-            image,
-            num_inference_steps,
-            ensemble_size,
-            processing_resolution,
-            resample_method_input,
-            resample_method_output,
-            batch_size,
-            ensembling_kwargs,
-            latents,
-            generator,
-            output_type,
-            output_uncertainty,
-        )
-
-        # 2. Prepare empty text conditioning.
-        # Model invocation: self.tokenizer, self.text_encoder.
-        if self.empty_text_embedding is None:
-            prompt = ""
-            text_inputs = self.tokenizer(
-                prompt,
-                padding="do_not_pad",
-                max_length=self.tokenizer.model_max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-            text_input_ids = text_inputs.input_ids.to(device)
-            self.empty_text_embedding = self.text_encoder(text_input_ids)[0]  # [1,2,1024]
-
-        # 3. Preprocess input images. This function loads input image or images of compatible dimensions `(H, W)`,
-        # optionally downsamples them to the `processing_resolution` `(PH, PW)`, where
-        # `max(PH, PW) == processing_resolution`, and pads the dimensions to `(PPH, PPW)` such that these values are
-        # divisible by the latent space downscaling factor (typically 8 in Stable Diffusion). The default value `None`
-        # of `processing_resolution` resolves to the optimal value from the model config. It is a recommended mode of
-        # operation and leads to the most reasonable results. Using the native image resolution or any other processing
-        # resolution can lead to loss of either fine details or global context in the output predictions.
-        image, padding, original_resolution = self.image_processor.preprocess(
-            image, processing_resolution, resample_method_input, device, dtype
-        )  # [N,3,PPH,PPW]
-
-        # 4. Encode input image into latent space. At this step, each of the `N` input images is represented with `E`
-        # ensemble members. Each ensemble member is an independent diffused prediction, just initialized independently.
-        # Latents of each such predictions across all input images and all ensemble members are represented in the
-        # `pred_latent` variable. The variable `image_latent` contains each input image encoded into latent space and
-        # replicated `E` times. The variable `pred_latent` contains latents initialization, where the latent space is
-        # replicated `T` times relative to the single latent space of `image_latent`, where `T` is the number of the
-        # predicted targets. The latents can be either generated (see `generator` to ensure reproducibility), or passed
-        # explicitly via the `latents` argument. The latter can be set outside the pipeline code. This behavior can be
-        # achieved by setting the `output_latent` argument to `True`. The latent space dimensions are `(h, w)`. Encoding
-        # into latent space happens in batches of size `batch_size`.
-        # Model invocation: self.vae.encoder.
-        image_latent, pred_latent = self.prepare_latents(
-            image, latents, generator, ensemble_size, batch_size
-        )  # [N*E,4,h,w], [N*E,T*4,h,w]
-
-        del image
-
-        batch_empty_text_embedding = self.empty_text_embedding.to(device=device, dtype=dtype).repeat(
-            batch_size, 1, 1
-        )  # [B,2,1024]
-
-        # 5. Process the denoising loop. All `N * E` latents are processed sequentially in batches of size `batch_size`.
-        # The unet model takes concatenated latent spaces of the input image and the predicted modality as an input, and
-        # outputs noise for the predicted modality's latent space. The number of denoising diffusion steps is defined by
-        # `num_inference_steps`. It is either set directly, or resolves to the optimal value specific to the loaded
-        # model.
-        # Model invocation: self.unet.
-        pred_latents = []
-
-        for i in self.progress_bar(
-            range(0, num_images * ensemble_size, batch_size), leave=True, desc="Marigold predictions..."
-        ):
-            batch_image_latent = image_latent[i : i + batch_size]  # [B,4,h,w]
-            batch_pred_latent = pred_latent[i : i + batch_size]  # [B,T*4,h,w]
-            effective_batch_size = batch_image_latent.shape[0]
-            text = batch_empty_text_embedding[:effective_batch_size]  # [B,2,1024]
-
-            self.scheduler.set_timesteps(num_inference_steps, device=device)
-            for t in self.progress_bar(self.scheduler.timesteps, leave=False, desc="Diffusion steps..."):
-                batch_latent = torch.cat([batch_image_latent, batch_pred_latent], dim=1)  # [B,(1+T)*4,h,w]
-                noise = self.unet(batch_latent, t, encoder_hidden_states=text, return_dict=False)[0]  # [B,T*4,h,w]
-                batch_pred_latent = self.scheduler.step(
-                    noise, t, batch_pred_latent, generator=generator
-                ).prev_sample  # [B,T*4,h,w]
-
-                if XLA_AVAILABLE:
-                    xm.mark_step()
-
-            pred_latents.append(batch_pred_latent)
-
-        pred_latent = torch.cat(pred_latents, dim=0)  # [N*E,T*4,h,w]
-
-        del (
-            pred_latents,
-            image_latent,
-            batch_empty_text_embedding,
-            batch_image_latent,
-            batch_pred_latent,
-            text,
-            batch_latent,
-            noise,
-        )
-
-        # 6. Decode predictions from latent into pixel space. The resulting `N * E` predictions have shape `(PPH, PPW)`,
-        # which requires slight postprocessing. Decoding into pixel space happens in batches of size `batch_size`.
-        # Model invocation: self.vae.decoder.
-        pred_latent_for_decoding = pred_latent.reshape(
-            num_images * ensemble_size * self.n_targets, self.vae.config.latent_channels, *pred_latent.shape[2:]
-        )  # [N*E*T,4,h,w]
-        prediction = torch.cat(
-            [
-                self.decode_prediction(pred_latent_for_decoding[i : i + batch_size])
-                for i in range(0, pred_latent_for_decoding.shape[0], batch_size)
-            ],
-            dim=0,
-        )  # [N*E*T,3,PPH,PPW]
-
-        del pred_latent_for_decoding
-        if not output_latent:
-            pred_latent = None
-
-        # 7. Remove padding. The output shape is (PH, PW).
-        prediction = self.image_processor.unpad_image(prediction, padding)  # [N*E*T,3,PH,PW]
-
-        # 8. Ensemble and compute uncertainty (when `output_uncertainty` is set). This code treats each of the `N*T`
-        # groups of `E` ensemble predictions independently. For each group it computes an ensembled prediction of shape
-        # `(PH, PW)` and an optional uncertainty map of the same dimensions. After computing this pair of outputs for
-        # each group independently, it stacks them respectively into batches of `N*T` almost final predictions and
-        # uncertainty maps.
-        uncertainty = None
-        if ensemble_size > 1:
-            prediction = prediction.reshape(
-                num_images, ensemble_size, self.n_targets, *prediction.shape[1:]
-            )  # [N,E,T,3,PH,PW]
-            prediction = [
-                self.ensemble_intrinsics(prediction[i], output_uncertainty, **(ensembling_kwargs or {}))
-                for i in range(num_images)
-            ]  # [ [[T,3,PH,PW], [T,3,PH,PW]], ... ]
-            prediction, uncertainty = zip(*prediction)  # [[T,3,PH,PW], ... ], [[T,3,PH,PW], ... ]
-            prediction = torch.cat(prediction, dim=0)  # [N*T,3,PH,PW]
-            if output_uncertainty:
-                uncertainty = torch.cat(uncertainty, dim=0)  # [N*T,3,PH,PW]
-            else:
-                uncertainty = None
-
-        # 9. If `match_input_resolution` is set, the output prediction and the uncertainty are upsampled to match the
-        # input resolution `(H, W)`. This step may introduce upsampling artifacts, and therefore can be disabled.
-        # Depending on the downstream use-case, upsampling can be also chosen based on the tolerated artifacts by
-        # setting the `resample_method_output` parameter (e.g., to `"nearest"`).
-        if match_input_resolution:
-            prediction = self.image_processor.resize_antialias(
-                prediction, original_resolution, resample_method_output, is_aa=False
-            )  # [N*T,3,H,W]
-            if uncertainty is not None and output_uncertainty:
-                uncertainty = self.image_processor.resize_antialias(
-                    uncertainty, original_resolution, resample_method_output, is_aa=False
-                )  # [N*T,3,H,W]
-
-        # 10. Prepare the final outputs.
-        if output_type == "np":
-            prediction = self.image_processor.pt_to_numpy(prediction)  # [N*T,H,W,3]
-            if uncertainty is not None and output_uncertainty:
-                uncertainty = self.image_processor.pt_to_numpy(uncertainty)  # [N*T,H,W,3]
-
-        # 11. Offload all models
-        self.maybe_free_model_hooks()
-
-        if not return_dict:
-            return (prediction, uncertainty, pred_latent)
-
-        return MarigoldIntrinsicsOutput(
-            prediction=prediction,
-            uncertainty=uncertainty,
-            latent=pred_latent,
-        )
-
-    def prepare_latents(
-        self,
-        image: torch.Tensor,
-        latents: Optional[torch.Tensor],
-        generator: Optional[torch.Generator],
-        ensemble_size: int,
-        batch_size: int,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        def retrieve_latents(encoder_output):
-            if hasattr(encoder_output, "latent_dist"):
-                return encoder_output.latent_dist.mode()
-            elif hasattr(encoder_output, "latents"):
-                return encoder_output.latents
-            else:
-                raise AttributeError("Could not access latents of provided encoder_output")
-
-        image_latent = torch.cat(
-            [
-                retrieve_latents(self.vae.encode(image[i : i + batch_size]))
-                for i in range(0, image.shape[0], batch_size)
-            ],
-            dim=0,
-        )  # [N,4,h,w]
-        image_latent = image_latent * self.vae.config.scaling_factor
-        image_latent = image_latent.repeat_interleave(ensemble_size, dim=0)  # [N*E,4,h,w]
-        N_E, C, H, W = image_latent.shape
-
-        pred_latent = latents
-        if pred_latent is None:
-            pred_latent = randn_tensor(
-                (N_E, self.n_targets * C, H, W),
-                generator=generator,
-                device=image_latent.device,
-                dtype=image_latent.dtype,
-            )  # [N*E,T*4,h,w]
-
-        return image_latent, pred_latent
-
-    def decode_prediction(self, pred_latent: torch.Tensor) -> torch.Tensor:
-        if pred_latent.dim() != 4 or pred_latent.shape[1] != self.vae.config.latent_channels:
-            raise ValueError(
-                f"Expecting 4D tensor of shape [B,{self.vae.config.latent_channels},H,W]; got {pred_latent.shape}."
-            )
-
-        prediction = self.vae.decode(pred_latent / self.vae.config.scaling_factor, return_dict=False)[0]  # [B,3,H,W]
-
-        prediction = torch.clip(prediction, -1.0, 1.0)  # [B,3,H,W]
-        prediction = (prediction + 1.0) / 2.0
-
-        return prediction  # [B,3,H,W]
-
-    @staticmethod
-    def ensemble_intrinsics(
-        targets: torch.Tensor,
-        output_uncertainty: bool = False,
-        reduction: str = "median",
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
-        """
-        Ensembles the intrinsic decomposition represented by the `targets` tensor with expected shape `(B, T, 3, H,
-        W)`, where B is the number of ensemble members for a given prediction of size `(H x W)`, and T is the number of
-        predicted targets.
-
-        Args:
-            targets (`torch.Tensor`):
-                Input ensemble of intrinsic image decomposition maps.
-            output_uncertainty (`bool`, *optional*, defaults to `False`):
-                Whether to output uncertainty map.
-            reduction (`str`, *optional*, defaults to `"median"`):
-                Reduction method used to ensemble aligned predictions. The accepted values are: `"median"` and
-                `"mean"`.
-
-        Returns:
-            A tensor of aligned and ensembled intrinsic decomposition maps with shape `(T, 3, H, W)` and optionally a
-            tensor of uncertainties of shape `(T, 3, H, W)`.
-        """
-        if targets.dim() != 5 or targets.shape[2] != 3:
-            raise ValueError(f"Expecting 4D tensor of shape [B,T,3,H,W]; got {targets.shape}.")
-        if reduction not in ("median", "mean"):
-            raise ValueError(f"Unrecognized reduction method: {reduction}.")
-
-        B, T, _, H, W = targets.shape
-        uncertainty = None
-        if reduction == "mean":
-            prediction = torch.mean(targets, dim=0)  # [T,3,H,W]
-            if output_uncertainty:
-                uncertainty = torch.std(targets, dim=0)  # [T,3,H,W]
-        elif reduction == "median":
-            prediction = torch.median(targets, dim=0, keepdim=True).values  # [1,T,3,H,W]
-            if output_uncertainty:
-                uncertainty = torch.abs(targets - prediction)  # [B,T,3,H,W]
-                uncertainty = torch.median(uncertainty, dim=0).values  # [T,3,H,W]
-            prediction = prediction.squeeze(0)  # [T,3,H,W]
-        else:
-            raise ValueError(f"Unrecognized reduction method: {reduction}.")
-        return prediction, uncertainty
@@ -1,5 +1,5 @@
-# Copyright 2023-2025 Marigold Team, ETH Zürich. All rights reserved.
-# Copyright 2024-2025 The HuggingFace Team. All rights reserved.
+# Copyright 2024 Marigold authors, PRS ETH Zurich. All rights reserved.
+# Copyright 2024 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
 # limitations under the License.
 # --------------------------------------------------------------------------
 # More information and citation instructions are available on the
-# Marigold project website: https://marigoldcomputervision.github.io
+# Marigold project website: https://marigoldmonodepth.github.io
 # --------------------------------------------------------------------------
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Union
@@ -62,7 +62,7 @@ Examples:
 >>> import torch

 >>> pipe = diffusers.MarigoldNormalsPipeline.from_pretrained(
-...     "prs-eth/marigold-normals-v1-1", variant="fp16", torch_dtype=torch.float16
+...     "prs-eth/marigold-normals-lcm-v0-1", variant="fp16", torch_dtype=torch.float16
 ... ).to("cuda")

 >>> image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
@@ -81,12 +81,11 @@ class MarigoldNormalsOutput(BaseOutput):

    Args:
        prediction (`np.ndarray`, `torch.Tensor`):
-            Predicted normals with values in the range [-1, 1]. The shape is $numimages \times 3 \times height \times
-            width$ for `torch.Tensor` or $numimages \times height \times width \times 3$ for `np.ndarray`.
+            Predicted normals with values in the range [-1, 1]. The shape is always $numimages \times 3 \times height
+            \times width$, regardless of whether the images were passed as a 4D array or a list.
        uncertainty (`None`, `np.ndarray`, `torch.Tensor`):
            Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is $numimages
-            \times 1 \times height \times width$ for `torch.Tensor` or $numimages \times height \times width \times 1$
-            for `np.ndarray`.
+            \times 1 \times height \times width$.
        latent (`None`, `torch.Tensor`):
            Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
            The shape is $numimages * numensemble \times 4 \times latentheight \times latentwidth$.
@@ -165,7 +164,6 @@ class MarigoldNormalsPipeline(DiffusionPipeline):
            tokenizer=tokenizer,
        )
        self.register_to_config(
-            prediction_type=prediction_type,
            use_full_z_range=use_full_z_range,
            default_denoising_steps=default_denoising_steps,
            default_processing_resolution=default_processing_resolution,
@@ -196,11 +194,6 @@ class MarigoldNormalsPipeline(DiffusionPipeline):
        output_type: str,
        output_uncertainty: bool,
    ) -> int:
-        actual_vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
-        if actual_vae_scale_factor != self.vae_scale_factor:
-            raise ValueError(
-                f"`vae_scale_factor` computed at initialization ({self.vae_scale_factor}) differs from the actual one ({actual_vae_scale_factor})."
-            )
        if num_inference_steps is None:
            raise ValueError("`num_inference_steps` is not specified and could not be resolved from the model config.")
        if num_inference_steps < 1:
@@ -311,7 +304,6 @@ class MarigoldNormalsPipeline(DiffusionPipeline):

        return num_images

-    @torch.compiler.disable
    def progress_bar(self, iterable=None, total=None, desc=None, leave=True):
        if not hasattr(self, "_progress_bar_config"):
            self._progress_bar_config = {}
@@ -362,9 +354,11 @@ class MarigoldNormalsPipeline(DiffusionPipeline):
                same width and height.
            num_inference_steps (`int`, *optional*, defaults to `None`):
                Number of denoising diffusion steps during inference. The default value `None` results in automatic
-                selection.
+                selection. The number of steps should be at least 10 with the full Marigold models, and between 1 and 4
+                for Marigold-LCM models.
            ensemble_size (`int`, defaults to `1`):
-                Number of ensemble predictions. Higher values result in measurable improvements and visual degradation.
+                Number of ensemble predictions. Recommended values are 5 and higher for better precision, or 1 for
+                faster inference.
            processing_resolution (`int`, *optional*, defaults to `None`):
                Effective processing resolution. When set to `0`, matches the larger input image dimension. This
                produces crisper predictions, but may also lead to the overall loss of global context. The default
@@ -400,7 +394,7 @@ class MarigoldNormalsPipeline(DiffusionPipeline):
                within the ensemble. These codes can be saved, modified, and used for subsequent calls with the
                `latents` argument.
            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipelines.marigold.MarigoldNormalsOutput`] instead of a plain tuple.
+                Whether or not to return a [`~pipelines.marigold.MarigoldDepthOutput`] instead of a plain tuple.

        Examples:

@@ -468,7 +462,9 @@ class MarigoldNormalsPipeline(DiffusionPipeline):
        # `pred_latent` variable. The variable `image_latent` is of the same shape: it contains each input image encoded
        # into latent space and replicated `E` times. The latents can be either generated (see `generator` to ensure
        # reproducibility), or passed explicitly via the `latents` argument. The latter can be set outside the pipeline
-        # code. This behavior can be achieved by setting the `output_latent` argument to `True`. The latent space
+        # code. For example, in the Marigold-LCM video processing demo, the latents initialization of a frame is taken
+        # as a convex combination of the latents output of the pipeline for the previous frame and a newly-sampled
+        # noise. This behavior can be achieved by setting the `output_latent` argument to `True`. The latent space
        # dimensions are `(h, w)`. Encoding into latent space happens in batches of size `batch_size`.
        # Model invocation: self.vae.encoder.
        image_latent, pred_latent = self.prepare_latents(
@@ -20,7 +20,7 @@ import warnings
 from typing import Callable, Dict, List, Optional, Tuple, Union

 import torch
-from transformers import Gemma2PreTrainedModel, GemmaTokenizer, GemmaTokenizerFast
+from transformers import AutoModelForCausalLM, AutoTokenizer

 from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...image_processor import PixArtImageProcessor
@@ -160,8 +160,8 @@ class SanaPAGPipeline(DiffusionPipeline, PAGMixin):

    def __init__(
        self,
-        tokenizer: Union[GemmaTokenizer, GemmaTokenizerFast],
-        text_encoder: Gemma2PreTrainedModel,
+        tokenizer: AutoTokenizer,
+        text_encoder: AutoModelForCausalLM,
        vae: AutoencoderDC,
        transformer: SanaTransformer2DModel,
        scheduler: FlowMatchEulerDiscreteScheduler,
@@ -13,6 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import enum
 import fnmatch
 import importlib
 import inspect
@@ -54,8 +55,6 @@ from ..utils import (
    DEPRECATED_REVISION_ARGS,
    BaseOutput,
    PushToHubMixin,
-    _get_detailed_type,
-    _is_valid_type,
    is_accelerate_available,
    is_accelerate_version,
    is_torch_npu_available,
@@ -685,7 +684,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
        token = kwargs.pop("token", None)
        revision = kwargs.pop("revision", None)
        from_flax = kwargs.pop("from_flax", False)
-        torch_dtype = kwargs.pop("torch_dtype", torch.float32)
+        torch_dtype = kwargs.pop("torch_dtype", None)
        custom_pipeline = kwargs.pop("custom_pipeline", None)
        custom_revision = kwargs.pop("custom_revision", None)
        provider = kwargs.pop("provider", None)
@@ -702,12 +701,6 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
        use_onnx = kwargs.pop("use_onnx", None)
        load_connected_pipeline = kwargs.pop("load_connected_pipeline", False)

-        if not isinstance(torch_dtype, torch.dtype):
-            torch_dtype = torch.float32
-            logger.warning(
-                f"Passed `torch_dtype` {torch_dtype} is not a `torch.dtype`. Defaulting to `torch.float32`."
-            )
-
        if low_cpu_mem_usage and not is_accelerate_available():
            low_cpu_mem_usage = False
            logger.warning(
@@ -883,6 +876,26 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):

        init_dict = {k: v for k, v in init_dict.items() if load_module(k, v)}

+        for key in init_dict.keys():
+            if key not in passed_class_obj:
+                continue
+            if "scheduler" in key:
+                continue
+
+            class_obj = passed_class_obj[key]
+            _expected_class_types = []
+            for expected_type in expected_types[key]:
+                if isinstance(expected_type, enum.EnumMeta):
+                    _expected_class_types.extend(expected_type.__members__.keys())
+                else:
+                    _expected_class_types.append(expected_type.__name__)
+
+            _is_valid_type = class_obj.__class__.__name__ in _expected_class_types
+            if not _is_valid_type:
+                logger.warning(
+                    f"Expected types for {key}: {_expected_class_types}, got {class_obj.__class__.__name__}."
+                )
+
        # Special case: safety_checker must be loaded separately when using `from_flax`
        if from_flax and "safety_checker" in init_dict and "safety_checker" not in passed_class_obj:
            raise NotImplementedError(
@@ -1002,26 +1015,10 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
                f"Pipeline {pipeline_class} expected {expected_modules}, but only {passed_modules} were passed."
            )

-        # 10. Type checking init arguments
-        for kw, arg in init_kwargs.items():
-            # Too complex to validate with type annotation alone
-            if "scheduler" in kw:
-                continue
-            # Many tokenizer annotations don't include its "Fast" variant, so skip this
-            # e.g T5Tokenizer but not T5TokenizerFast
-            elif "tokenizer" in kw:
-                continue
-            elif (
-                arg is not None  # Skip if None
-                and not expected_types[kw] == (inspect.Signature.empty,)  # Skip if no type annotations
-                and not _is_valid_type(arg, expected_types[kw])  # Check type
-            ):
-                logger.warning(f"Expected types for {kw}: {expected_types[kw]}, got {_get_detailed_type(arg)}.")
-
-        # 11. Instantiate the pipeline
+        # 10. Instantiate the pipeline
        model = pipeline_class(**init_kwargs)

-        # 12. Save where the model was instantiated from
+        # 11. Save where the model was instantiated from
        model.register_to_config(_name_or_path=pretrained_model_name_or_path)
        if device_map is not None:
            setattr(model, "hf_device_map", final_device_map)
@@ -1832,7 +1829,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
        """

        original_config = dict(pipeline.config)
-        torch_dtype = kwargs.pop("torch_dtype", torch.float32)
+        torch_dtype = kwargs.pop("torch_dtype", None)

        # derive the pipeline class to instantiate
        custom_pipeline = kwargs.pop("custom_pipeline", None)
@@ -20,7 +20,7 @@ import warnings
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union

 import torch
-from transformers import Gemma2PreTrainedModel, GemmaTokenizer, GemmaTokenizerFast
+from transformers import AutoModelForCausalLM, AutoTokenizer

 from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...image_processor import PixArtImageProcessor
@@ -200,8 +200,8 @@ class SanaPipeline(DiffusionPipeline, SanaLoraLoaderMixin):

    def __init__(
        self,
-        tokenizer: Union[GemmaTokenizer, GemmaTokenizerFast],
-        text_encoder: Gemma2PreTrainedModel,
+        tokenizer: AutoTokenizer,
+        text_encoder: AutoModelForCausalLM,
        vae: AutoencoderDC,
        transformer: SanaTransformer2DModel,
        scheduler: DPMSolverMultistepScheduler,
@@ -15,7 +15,7 @@
 from typing import Callable, Dict, List, Optional, Union

 import torch
-from transformers import CLIPTextModelWithProjection, CLIPTokenizer
+from transformers import CLIPTextModel, CLIPTokenizer

 from ...models import StableCascadeUNet
 from ...schedulers import DDPMWuerstchenScheduler
@@ -65,7 +65,7 @@ class StableCascadeDecoderPipeline(DiffusionPipeline):
    Args:
        tokenizer (`CLIPTokenizer`):
            The CLIP tokenizer.
-        text_encoder (`CLIPTextModelWithProjection`):
+        text_encoder (`CLIPTextModel`):
            The CLIP text encoder.
        decoder ([`StableCascadeUNet`]):
            The Stable Cascade decoder unet.
@@ -93,7 +93,7 @@ class StableCascadeDecoderPipeline(DiffusionPipeline):
        self,
        decoder: StableCascadeUNet,
        tokenizer: CLIPTokenizer,
-        text_encoder: CLIPTextModelWithProjection,
+        text_encoder: CLIPTextModel,
        scheduler: DDPMWuerstchenScheduler,
        vqgan: PaellaVQModel,
        latent_dim_scale: float = 10.67,
@@ -15,7 +15,7 @@ from typing import Callable, Dict, List, Optional, Union

 import PIL
 import torch
-from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

 from ...models import StableCascadeUNet
 from ...schedulers import DDPMWuerstchenScheduler
@@ -52,7 +52,7 @@ class StableCascadeCombinedPipeline(DiffusionPipeline):
    Args:
        tokenizer (`CLIPTokenizer`):
            The decoder tokenizer to be used for text inputs.
-        text_encoder (`CLIPTextModelWithProjection`):
+        text_encoder (`CLIPTextModel`):
            The decoder text encoder to be used for text inputs.
        decoder (`StableCascadeUNet`):
            The decoder model to be used for decoder image generation pipeline.
@@ -60,18 +60,14 @@ class StableCascadeCombinedPipeline(DiffusionPipeline):
            The scheduler to be used for decoder image generation pipeline.
        vqgan (`PaellaVQModel`):
            The VQGAN model to be used for decoder image generation pipeline.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `image_encoder`.
+        image_encoder ([`CLIPVisionModelWithProjection`]):
+            Frozen CLIP image-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
        prior_prior (`StableCascadeUNet`):
            The prior model to be used for prior pipeline.
-        prior_text_encoder (`CLIPTextModelWithProjection`):
-            The prior text encoder to be used for text inputs.
-        prior_tokenizer (`CLIPTokenizer`):
-            The prior tokenizer to be used for text inputs.
        prior_scheduler (`DDPMWuerstchenScheduler`):
            The scheduler to be used for prior pipeline.
-        prior_feature_extractor ([`~transformers.CLIPImageProcessor`]):
-            Model that extracts features from generated images to be used as inputs for the `image_encoder`.
-        prior_image_encoder ([`CLIPVisionModelWithProjection`]):
-            Frozen CLIP image-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
    """

    _load_connected_pipes = True
@@ -80,12 +76,12 @@ class StableCascadeCombinedPipeline(DiffusionPipeline):
    def __init__(
        self,
        tokenizer: CLIPTokenizer,
-        text_encoder: CLIPTextModelWithProjection,
+        text_encoder: CLIPTextModel,
        decoder: StableCascadeUNet,
        scheduler: DDPMWuerstchenScheduler,
        vqgan: PaellaVQModel,
        prior_prior: StableCascadeUNet,
-        prior_text_encoder: CLIPTextModelWithProjection,
+        prior_text_encoder: CLIPTextModel,
        prior_tokenizer: CLIPTokenizer,
        prior_scheduler: DDPMWuerstchenScheduler,
        prior_feature_extractor: Optional[CLIPImageProcessor] = None,
@@ -141,7 +141,7 @@ class StableUnCLIPPipeline(
        image_noising_scheduler: KarrasDiffusionSchedulers,
        # regular denoising components
        tokenizer: CLIPTokenizer,
-        text_encoder: CLIPTextModel,
+        text_encoder: CLIPTextModelWithProjection,
        unet: UNet2DConditionModel,
        scheduler: KarrasDiffusionSchedulers,
        # vae
@@ -17,10 +17,10 @@ from typing import Any, Callable, Dict, List, Optional, Union

 import torch
 from transformers import (
+    BaseImageProcessor,
    CLIPTextModelWithProjection,
    CLIPTokenizer,
-    SiglipImageProcessor,
-    SiglipVisionModel,
+    PreTrainedModel,
    T5EncoderModel,
    T5TokenizerFast,
 )
@@ -176,9 +176,9 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
        tokenizer_3 (`T5TokenizerFast`):
            Tokenizer of class
            [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
-        image_encoder (`SiglipVisionModel`, *optional*):
+        image_encoder (`PreTrainedModel`, *optional*):
            Pre-trained Vision Model for IP Adapter.
-        feature_extractor (`SiglipImageProcessor`, *optional*):
+        feature_extractor (`BaseImageProcessor`, *optional*):
            Image processor for IP Adapter.
    """

@@ -197,8 +197,8 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
        tokenizer_2: CLIPTokenizer,
        text_encoder_3: T5EncoderModel,
        tokenizer_3: T5TokenizerFast,
-        image_encoder: SiglipVisionModel = None,
-        feature_extractor: SiglipImageProcessor = None,
+        image_encoder: PreTrainedModel = None,
+        feature_extractor: BaseImageProcessor = None,
    ):
        super().__init__()

@@ -18,10 +18,10 @@ from typing import Any, Callable, Dict, List, Optional, Union
 import PIL.Image
 import torch
 from transformers import (
+    BaseImageProcessor,
    CLIPTextModelWithProjection,
    CLIPTokenizer,
-    SiglipImageProcessor,
-    SiglipVisionModel,
+    PreTrainedModel,
    T5EncoderModel,
    T5TokenizerFast,
 )
@@ -197,10 +197,6 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
        tokenizer_3 (`T5TokenizerFast`):
            Tokenizer of class
            [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
-        image_encoder (`SiglipVisionModel`, *optional*):
-            Pre-trained Vision Model for IP Adapter.
-        feature_extractor (`SiglipImageProcessor`, *optional*):
-            Image processor for IP Adapter.
    """

    model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->image_encoder->transformer->vae"
@@ -218,8 +214,8 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
        tokenizer_2: CLIPTokenizer,
        text_encoder_3: T5EncoderModel,
        tokenizer_3: T5TokenizerFast,
-        image_encoder: Optional[SiglipVisionModel] = None,
-        feature_extractor: Optional[SiglipImageProcessor] = None,
+        image_encoder: PreTrainedModel = None,
+        feature_extractor: BaseImageProcessor = None,
    ):
        super().__init__()

@@ -17,10 +17,10 @@ from typing import Any, Callable, Dict, List, Optional, Union

 import torch
 from transformers import (
+    BaseImageProcessor,
    CLIPTextModelWithProjection,
    CLIPTokenizer,
-    SiglipImageProcessor,
-    SiglipVisionModel,
+    PreTrainedModel,
    T5EncoderModel,
    T5TokenizerFast,
 )
@@ -196,9 +196,9 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
        tokenizer_3 (`T5TokenizerFast`):
            Tokenizer of class
            [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
-        image_encoder (`SiglipVisionModel`, *optional*):
+        image_encoder (`PreTrainedModel`, *optional*):
            Pre-trained Vision Model for IP Adapter.
-        feature_extractor (`SiglipImageProcessor`, *optional*):
+        feature_extractor (`BaseImageProcessor`, *optional*):
            Image processor for IP Adapter.
    """

@@ -217,8 +217,8 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
        tokenizer_2: CLIPTokenizer,
        text_encoder_3: T5EncoderModel,
        tokenizer_3: T5TokenizerFast,
-        image_encoder: Optional[SiglipVisionModel] = None,
-        feature_extractor: Optional[SiglipImageProcessor] = None,
+        image_encoder: PreTrainedModel = None,
+        feature_extractor: BaseImageProcessor = None,
    ):
        super().__init__()

@@ -19,31 +19,15 @@ from typing import Callable, List, Optional, Union
 import torch
 from k_diffusion.external import CompVisDenoiser, CompVisVDenoiser
 from k_diffusion.sampling import BrownianTreeNoiseSampler, get_sigmas_karras
-from transformers import (
-    CLIPImageProcessor,
-    CLIPTextModel,
-    CLIPTokenizer,
-    CLIPTokenizerFast,
-)

 from ...image_processor import VaeImageProcessor
-from ...loaders import (
-    StableDiffusionLoraLoaderMixin,
-    TextualInversionLoaderMixin,
-)
-from ...models import AutoencoderKL, UNet2DConditionModel
+from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from ...models.lora import adjust_lora_scale_text_encoder
-from ...schedulers import KarrasDiffusionSchedulers, LMSDiscreteScheduler
-from ...utils import (
-    USE_PEFT_BACKEND,
-    deprecate,
-    logging,
-    scale_lora_layers,
-    unscale_lora_layers,
-)
+from ...schedulers import LMSDiscreteScheduler
+from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
 from ...utils.torch_utils import randn_tensor
 from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
-from ..stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
+from ..stable_diffusion import StableDiffusionPipelineOutput


 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
@@ -111,13 +95,13 @@ class StableDiffusionKDiffusionPipeline(

    def __init__(
        self,
-        vae: AutoencoderKL,
-        text_encoder: CLIPTextModel,
-        tokenizer: Union[CLIPTokenizer, CLIPTokenizerFast],
-        unet: UNet2DConditionModel,
-        scheduler: KarrasDiffusionSchedulers,
-        safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPImageProcessor,
+        vae,
+        text_encoder,
+        tokenizer,
+        unet,
+        scheduler,
+        safety_checker,
+        feature_extractor,
        requires_safety_checker: bool = True,
    ):
        super().__init__()
@@ -123,7 +123,6 @@ from .state_dict_utils import (
    convert_state_dict_to_peft,
    convert_unet_state_dict_to_peft,
 )
-from .typing_utils import _get_detailed_type, _is_valid_type


 logger = get_logger(__name__)
@@ -1217,21 +1217,6 @@ class MarigoldDepthPipeline(metaclass=DummyObject):
        requires_backends(cls, ["torch", "transformers"])


-class MarigoldIntrinsicsPipeline(metaclass=DummyObject):
-    _backends = ["torch", "transformers"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "transformers"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-
 class MarigoldNormalsPipeline(metaclass=DummyObject):
    _backends = ["torch", "transformers"]

@@ -1532,21 +1517,6 @@ class StableCascadePriorPipeline(metaclass=DummyObject):
        requires_backends(cls, ["torch", "transformers"])


-class StableDiffusion3ControlNetInpaintingPipeline(metaclass=DummyObject):
-    _backends = ["torch", "transformers"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "transformers"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-
 class StableDiffusion3ControlNetPipeline(metaclass=DummyObject):
    _backends = ["torch", "transformers"]

@@ -1,91 +0,0 @@
-# Copyright 2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Typing utilities: Utilities related to type checking and validation
-"""
-
-from typing import Any, Dict, List, Set, Tuple, Type, Union, get_args, get_origin
-
-
-def _is_valid_type(obj: Any, class_or_tuple: Union[Type, Tuple[Type, ...]]) -> bool:
-    """
-    Checks if an object is an instance of any of the provided types. For collections, it checks if every element is of
-    the correct type as well.
-    """
-    if not isinstance(class_or_tuple, tuple):
-        class_or_tuple = (class_or_tuple,)
-
-    # Unpack unions
-    unpacked_class_or_tuple = []
-    for t in class_or_tuple:
-        if get_origin(t) is Union:
-            unpacked_class_or_tuple.extend(get_args(t))
-        else:
-            unpacked_class_or_tuple.append(t)
-    class_or_tuple = tuple(unpacked_class_or_tuple)
-
-    if Any in class_or_tuple:
-        return True
-
-    obj_type = type(obj)
-    # Classes with obj's type
-    class_or_tuple = {t for t in class_or_tuple if isinstance(obj, get_origin(t) or t)}
-
-    # Singular types (e.g. int, ControlNet, ...)
-    # Untyped collections (e.g. List, but not List[int])
-    elem_class_or_tuple = {get_args(t) for t in class_or_tuple}
-    if () in elem_class_or_tuple:
-        return True
-    # Typed lists or sets
-    elif obj_type in (list, set):
-        return any(all(_is_valid_type(x, t) for x in obj) for t in elem_class_or_tuple)
-    # Typed tuples
-    elif obj_type is tuple:
-        return any(
-            # Tuples with any length and single type (e.g. Tuple[int, ...])
-            (len(t) == 2 and t[-1] is Ellipsis and all(_is_valid_type(x, t[0]) for x in obj))
-            or
-            # Tuples with fixed length and any types (e.g. Tuple[int, str])
-            (len(obj) == len(t) and all(_is_valid_type(x, tt) for x, tt in zip(obj, t)))
-            for t in elem_class_or_tuple
-        )
-    # Typed dicts
-    elif obj_type is dict:
-        return any(
-            all(_is_valid_type(k, kt) and _is_valid_type(v, vt) for k, v in obj.items())
-            for kt, vt in elem_class_or_tuple
-        )
-
-    else:
-        return False
-
-
-def _get_detailed_type(obj: Any) -> Type:
-    """
-    Gets a detailed type for an object, including nested types for collections.
-    """
-    obj_type = type(obj)
-
-    if obj_type in (list, set):
-        obj_origin_type = List if obj_type is list else Set
-        elems_type = Union[tuple({_get_detailed_type(x) for x in obj})]
-        return obj_origin_type[elems_type]
-    elif obj_type is tuple:
-        return Tuple[tuple(_get_detailed_type(x) for x in obj)]
-    elif obj_type is dict:
-        keys_type = Union[tuple({_get_detailed_type(k) for k in obj.keys()})]
-        values_type = Union[tuple({_get_detailed_type(k) for k in obj.values()})]
-        return Dict[keys_type, values_type]
-    else:
-        return obj_type
@@ -18,7 +18,7 @@ from typing import Optional, Tuple, Union

 import torch

-from diffusers import DiffusionPipeline, ImagePipelineOutput, SchedulerMixin, UNet2DModel
+from diffusers import DiffusionPipeline, ImagePipelineOutput


 class CustomLocalPipeline(DiffusionPipeline):
@@ -33,7 +33,7 @@ class CustomLocalPipeline(DiffusionPipeline):
            [`DDPMScheduler`], or [`DDIMScheduler`].
    """

-    def __init__(self, unet: UNet2DModel, scheduler: SchedulerMixin):
+    def __init__(self, unet, scheduler):
        super().__init__()
        self.register_modules(unet=unet, scheduler=scheduler)

@@ -18,7 +18,6 @@ from typing import Optional, Tuple, Union

 import torch

-from diffusers import SchedulerMixin, UNet2DModel
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput


@@ -34,7 +33,7 @@ class CustomLocalPipeline(DiffusionPipeline):
            [`DDPMScheduler`], or [`DDIMScheduler`].
    """

-    def __init__(self, unet: UNet2DModel, scheduler: SchedulerMixin):
+    def __init__(self, unet, scheduler):
        super().__init__()
        self.register_modules(unet=unet, scheduler=scheduler)

@@ -15,8 +15,6 @@
 import sys
 import unittest

-import numpy as np
-import pytest
 import torch
 from transformers import AutoTokenizer, GemmaForCausalLM

@@ -26,12 +24,12 @@ from diffusers import (
    Lumina2Text2ImgPipeline,
    Lumina2Transformer2DModel,
 )
-from diffusers.utils.testing_utils import floats_tensor, is_torch_version, require_peft_backend, skip_mps, torch_device
+from diffusers.utils.testing_utils import floats_tensor, require_peft_backend


 sys.path.append(".")

-from utils import PeftLoraLoaderMixinTests, check_if_lora_correctly_set  # noqa: E402
+from utils import PeftLoraLoaderMixinTests  # noqa: E402


@require_peft_backend
@@ -132,41 +130,3 @@ class Lumina2LoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):
    @unittest.skip("Text encoder LoRA is not supported in Lumina2.")
    def test_simple_inference_with_text_lora_save_load(self):
        pass
-
-    @skip_mps
-    @pytest.mark.xfail(
-        condition=torch.device(torch_device).type == "cpu" and is_torch_version(">=", "2.5"),
-        reason="Test currently fails on CPU and PyTorch 2.5.1 but not on PyTorch 2.4.1.",
-        strict=False,
-    )
-    def test_lora_fuse_nan(self):
-        for scheduler_cls in self.scheduler_classes:
-            components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls)
-            pipe = self.pipeline_class(**components)
-            pipe = pipe.to(torch_device)
-            pipe.set_progress_bar_config(disable=None)
-            _, _, inputs = self.get_dummy_inputs(with_generator=False)
-
-            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
-                pipe.text_encoder.add_adapter(text_lora_config, "adapter-1")
-                self.assertTrue(
-                    check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder"
-                )
-
-            denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
-            denoiser.add_adapter(denoiser_lora_config, "adapter-1")
-            self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.")
-
-            # corrupt one LoRA weight with `inf` values
-            with torch.no_grad():
-                pipe.transformer.layers[0].attn.to_q.lora_A["adapter-1"].weight += float("inf")
-
-            # with `safe_fusing=True` we should see an Error
-            with self.assertRaises(ValueError):
-                pipe.fuse_lora(components=self.pipeline_class._lora_loadable_modules, safe_fusing=True)
-
-            # without we should not see an error, but every image will be black
-            pipe.fuse_lora(components=self.pipeline_class._lora_loadable_modules, safe_fusing=False)
-            out = pipe(**inputs)[0]
-
-            self.assertTrue(np.isnan(out).all())
@@ -1169,16 +1169,17 @@ class ModelTesterMixin:
        base_output = model(**inputs_dict)

        model_size = compute_module_sizes(model)[""]
-        max_size = int(self.model_split_percents[0] * model_size)
-        # Force disk offload by setting very small CPU memory
-        max_memory = {0: max_size, "cpu": int(0.1 * max_size)}
-
        with tempfile.TemporaryDirectory() as tmp_dir:
            model.cpu().save_pretrained(tmp_dir, safe_serialization=False)
+
            with self.assertRaises(ValueError):
+                max_size = int(self.model_split_percents[0] * model_size)
+                max_memory = {0: max_size, "cpu": max_size}
                # This errors out because it's missing an offload folder
                new_model = self.model_class.from_pretrained(tmp_dir, device_map="auto", max_memory=max_memory)

+            max_size = int(self.model_split_percents[0] * model_size)
+            max_memory = {0: max_size, "cpu": max_size}
            new_model = self.model_class.from_pretrained(
                tmp_dir, device_map="auto", max_memory=max_memory, offload_folder=tmp_dir
            )
@@ -30,7 +30,6 @@ class OmniGenTransformerTests(ModelTesterMixin, unittest.TestCase):
    model_class = OmniGenTransformer2DModel
    main_input_name = "hidden_states"
    uses_custom_attn_processor = True
-    model_split_percents = [0.1, 0.1, 0.1]

    @property
    def dummy_input(self):
@@ -74,9 +73,9 @@ class OmniGenTransformerTests(ModelTesterMixin, unittest.TestCase):
            "num_attention_heads": 4,
            "num_key_value_heads": 4,
            "intermediate_size": 32,
-            "num_layers": 20,
+            "num_layers": 1,
            "pad_token_id": 0,
-            "vocab_size": 1000,
+            "vocab_size": 100,
            "in_channels": 4,
            "time_step_dim": 4,
            "rope_scaling": {"long_factor": list(range(1, 3)), "short_factor": list(range(1, 3))},
@@ -33,7 +33,6 @@ enable_full_determinism()
 class SD3TransformerTests(ModelTesterMixin, unittest.TestCase):
    model_class = SD3Transformer2DModel
    main_input_name = "hidden_states"
-    model_split_percents = [0.8, 0.8, 0.9]

    @property
    def dummy_input(self):
@@ -68,7 +67,7 @@ class SD3TransformerTests(ModelTesterMixin, unittest.TestCase):
            "sample_size": 32,
            "patch_size": 1,
            "in_channels": 4,
-            "num_layers": 4,
+            "num_layers": 1,
            "attention_head_dim": 8,
            "num_attention_heads": 4,
            "caption_projection_dim": 32,
@@ -108,7 +107,6 @@ class SD3TransformerTests(ModelTesterMixin, unittest.TestCase):
 class SD35TransformerTests(ModelTesterMixin, unittest.TestCase):
    model_class = SD3Transformer2DModel
    main_input_name = "hidden_states"
-    model_split_percents = [0.8, 0.8, 0.9]

    @property
    def dummy_input(self):
@@ -143,7 +141,7 @@ class SD35TransformerTests(ModelTesterMixin, unittest.TestCase):
            "sample_size": 32,
            "patch_size": 1,
            "in_channels": 4,
-            "num_layers": 4,
+            "num_layers": 2,
            "attention_head_dim": 8,
            "num_attention_heads": 4,
            "caption_projection_dim": 32,
@@ -89,9 +89,7 @@ class KolorsPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
            sample_size=128,
        )
        torch.manual_seed(0)
-        text_encoder = ChatGLMModel.from_pretrained(
-            "hf-internal-testing/tiny-random-chatglm3-6b", torch_dtype=torch.bfloat16
-        )
+        text_encoder = ChatGLMModel.from_pretrained("hf-internal-testing/tiny-random-chatglm3-6b")
        tokenizer = ChatGLMTokenizer.from_pretrained("hf-internal-testing/tiny-random-chatglm3-6b")

        components = {
@@ -93,9 +93,7 @@ class KolorsPipelineImg2ImgFastTests(PipelineTesterMixin, unittest.TestCase):
            sample_size=128,
        )
        torch.manual_seed(0)
-        text_encoder = ChatGLMModel.from_pretrained(
-            "hf-internal-testing/tiny-random-chatglm3-6b", torch_dtype=torch.bfloat16
-        )
+        text_encoder = ChatGLMModel.from_pretrained("hf-internal-testing/tiny-random-chatglm3-6b")
        tokenizer = ChatGLMTokenizer.from_pretrained("hf-internal-testing/tiny-random-chatglm3-6b")

        components = {
@@ -91,10 +91,10 @@ class Lumina2Text2ImgPipelinePipelineFastTests(unittest.TestCase, PipelineTester
        text_encoder = Gemma2Model(config)

        components = {
-            "transformer": transformer,
+            "transformer": transformer.eval(),
            "vae": vae.eval(),
            "scheduler": scheduler,
-            "text_encoder": text_encoder,
+            "text_encoder": text_encoder.eval(),
            "tokenizer": tokenizer,
        }
        return components
@@ -1,5 +1,5 @@
-# Copyright 2023-2025 Marigold Team, ETH Zürich. All rights reserved.
-# Copyright 2024-2025 The HuggingFace Team. All rights reserved.
+# Copyright 2024 Marigold authors, PRS ETH Zurich. All rights reserved.
+# Copyright 2024 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
 # limitations under the License.
 # --------------------------------------------------------------------------
 # More information and citation instructions are available on the
-# Marigold project website: https://marigoldcomputervision.github.io
+# Marigold project website: https://marigoldmonodepth.github.io
 # --------------------------------------------------------------------------
 import gc
 import random
@@ -1,571 +0,0 @@
-# Copyright 2023-2025 Marigold Team, ETH Zürich. All rights reserved.
-# Copyright 2024-2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# --------------------------------------------------------------------------
-# More information and citation instructions are available on the
-# Marigold project website: https://marigoldcomputervision.github.io
-# --------------------------------------------------------------------------
-import gc
-import random
-import unittest
-
-import numpy as np
-import torch
-from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
-
-import diffusers
-from diffusers import (
-    AutoencoderKL,
-    AutoencoderTiny,
-    DDIMScheduler,
-    MarigoldIntrinsicsPipeline,
-    UNet2DConditionModel,
-)
-from diffusers.utils.testing_utils import (
-    enable_full_determinism,
-    floats_tensor,
-    load_image,
-    require_torch_gpu,
-    slow,
-    torch_device,
-)
-
-from ..test_pipelines_common import PipelineTesterMixin, to_np
-
-
-enable_full_determinism()
-
-
-class MarigoldIntrinsicsPipelineTesterMixin(PipelineTesterMixin):
-    def _test_inference_batch_single_identical(
-        self,
-        batch_size=2,
-        expected_max_diff=1e-4,
-        additional_params_copy_to_batched_inputs=["num_inference_steps"],
-    ):
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        for components in pipe.components.values():
-            if hasattr(components, "set_default_attn_processor"):
-                components.set_default_attn_processor()
-
-        pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        inputs = self.get_dummy_inputs(torch_device)
-        # Reset generator in case it is has been used in self.get_dummy_inputs
-        inputs["generator"] = self.get_generator(0)
-
-        logger = diffusers.logging.get_logger(pipe.__module__)
-        logger.setLevel(level=diffusers.logging.FATAL)
-
-        # batchify inputs
-        batched_inputs = {}
-        batched_inputs.update(inputs)
-
-        for name in self.batch_params:
-            if name not in inputs:
-                continue
-
-            value = inputs[name]
-            if name == "prompt":
-                len_prompt = len(value)
-                batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
-                batched_inputs[name][-1] = 100 * "very long"
-
-            else:
-                batched_inputs[name] = batch_size * [value]
-
-        if "generator" in inputs:
-            batched_inputs["generator"] = [self.get_generator(i) for i in range(batch_size)]
-
-        if "batch_size" in inputs:
-            batched_inputs["batch_size"] = batch_size
-
-        for arg in additional_params_copy_to_batched_inputs:
-            batched_inputs[arg] = inputs[arg]
-
-        output = pipe(**inputs)
-        output_batch = pipe(**batched_inputs)
-
-        assert output_batch[0].shape[0] == batch_size * output[0].shape[0]  # only changed here
-
-        max_diff = np.abs(to_np(output_batch[0][0]) - to_np(output[0][0])).max()
-        assert max_diff < expected_max_diff
-
-    def _test_inference_batch_consistent(
-        self, batch_sizes=[2], additional_params_copy_to_batched_inputs=["num_inference_steps"], batch_generator=True
-    ):
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(torch_device)
-        inputs["generator"] = self.get_generator(0)
-
-        logger = diffusers.logging.get_logger(pipe.__module__)
-        logger.setLevel(level=diffusers.logging.FATAL)
-
-        # prepare batched inputs
-        batched_inputs = []
-        for batch_size in batch_sizes:
-            batched_input = {}
-            batched_input.update(inputs)
-
-            for name in self.batch_params:
-                if name not in inputs:
-                    continue
-
-                value = inputs[name]
-                if name == "prompt":
-                    len_prompt = len(value)
-                    # make unequal batch sizes
-                    batched_input[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
-
-                    # make last batch super long
-                    batched_input[name][-1] = 100 * "very long"
-
-                else:
-                    batched_input[name] = batch_size * [value]
-
-            if batch_generator and "generator" in inputs:
-                batched_input["generator"] = [self.get_generator(i) for i in range(batch_size)]
-
-            if "batch_size" in inputs:
-                batched_input["batch_size"] = batch_size
-
-            batched_inputs.append(batched_input)
-
-        logger.setLevel(level=diffusers.logging.WARNING)
-        for batch_size, batched_input in zip(batch_sizes, batched_inputs):
-            output = pipe(**batched_input)
-            assert len(output[0]) == batch_size * pipe.n_targets  # only changed here
-
-
-class MarigoldIntrinsicsPipelineFastTests(MarigoldIntrinsicsPipelineTesterMixin, unittest.TestCase):
-    pipeline_class = MarigoldIntrinsicsPipeline
-    params = frozenset(["image"])
-    batch_params = frozenset(["image"])
-    image_params = frozenset(["image"])
-    image_latents_params = frozenset(["latents"])
-    callback_cfg_params = frozenset([])
-    test_xformers_attention = False
-    required_optional_params = frozenset(
-        [
-            "num_inference_steps",
-            "generator",
-            "output_type",
-        ]
-    )
-
-    def get_dummy_components(self, time_cond_proj_dim=None):
-        torch.manual_seed(0)
-        unet = UNet2DConditionModel(
-            block_out_channels=(32, 64),
-            layers_per_block=2,
-            time_cond_proj_dim=time_cond_proj_dim,
-            sample_size=32,
-            in_channels=12,
-            out_channels=8,
-            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
-            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
-            cross_attention_dim=32,
-        )
-        torch.manual_seed(0)
-        scheduler = DDIMScheduler(
-            beta_start=0.00085,
-            beta_end=0.012,
-            prediction_type="v_prediction",
-            set_alpha_to_one=False,
-            steps_offset=1,
-            beta_schedule="scaled_linear",
-            clip_sample=False,
-            thresholding=False,
-        )
-        torch.manual_seed(0)
-        vae = AutoencoderKL(
-            block_out_channels=[32, 64],
-            in_channels=3,
-            out_channels=3,
-            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
-            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
-            latent_channels=4,
-        )
-        torch.manual_seed(0)
-        text_encoder_config = CLIPTextConfig(
-            bos_token_id=0,
-            eos_token_id=2,
-            hidden_size=32,
-            intermediate_size=37,
-            layer_norm_eps=1e-05,
-            num_attention_heads=4,
-            num_hidden_layers=5,
-            pad_token_id=1,
-            vocab_size=1000,
-        )
-        text_encoder = CLIPTextModel(text_encoder_config)
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        components = {
-            "unet": unet,
-            "scheduler": scheduler,
-            "vae": vae,
-            "text_encoder": text_encoder,
-            "tokenizer": tokenizer,
-            "prediction_type": "intrinsics",
-        }
-        return components
-
-    def get_dummy_tiny_autoencoder(self):
-        return AutoencoderTiny(in_channels=3, out_channels=3, latent_channels=4)
-
-    def get_dummy_inputs(self, device, seed=0):
-        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
-        image = image / 2 + 0.5
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-        inputs = {
-            "image": image,
-            "num_inference_steps": 1,
-            "processing_resolution": 0,
-            "generator": generator,
-            "output_type": "np",
-        }
-        return inputs
-
-    def _test_marigold_intrinsics(
-        self,
-        generator_seed: int = 0,
-        expected_slice: np.ndarray = None,
-        atol: float = 1e-4,
-        **pipe_kwargs,
-    ):
-        device = "cpu"
-        components = self.get_dummy_components()
-
-        pipe = self.pipeline_class(**components)
-        pipe.to(device)
-        pipe.set_progress_bar_config(disable=None)
-
-        pipe_inputs = self.get_dummy_inputs(device, seed=generator_seed)
-        pipe_inputs.update(**pipe_kwargs)
-
-        prediction = pipe(**pipe_inputs).prediction
-
-        prediction_slice = prediction[0, -3:, -3:, -1].flatten()
-
-        if pipe_inputs.get("match_input_resolution", True):
-            self.assertEqual(prediction.shape, (2, 32, 32, 3), "Unexpected output resolution")
-        else:
-            self.assertTrue(prediction.shape[0] == 2 and prediction.shape[3] == 3, "Unexpected output dimensions")
-            self.assertEqual(
-                max(prediction.shape[1:3]),
-                pipe_inputs.get("processing_resolution", 768),
-                "Unexpected output resolution",
-            )
-
-        np.set_printoptions(precision=5, suppress=True)
-        msg = f"{prediction_slice}"
-        self.assertTrue(np.allclose(prediction_slice, expected_slice, atol=atol), msg)
-        # self.assertTrue(np.allclose(prediction_slice, expected_slice, atol=atol))
-
-    def test_marigold_depth_dummy_defaults(self):
-        self._test_marigold_intrinsics(
-            expected_slice=np.array([0.6423, 0.40664, 0.41185, 0.65832, 0.63935, 0.43971, 0.51786, 0.55216, 0.47683]),
-        )
-
-    def test_marigold_depth_dummy_G0_S1_P32_E1_B1_M1(self):
-        self._test_marigold_intrinsics(
-            generator_seed=0,
-            expected_slice=np.array([0.6423, 0.40664, 0.41185, 0.65832, 0.63935, 0.43971, 0.51786, 0.55216, 0.47683]),
-            num_inference_steps=1,
-            processing_resolution=32,
-            ensemble_size=1,
-            batch_size=1,
-            match_input_resolution=True,
-        )
-
-    def test_marigold_depth_dummy_G0_S1_P16_E1_B1_M1(self):
-        self._test_marigold_intrinsics(
-            generator_seed=0,
-            expected_slice=np.array([0.53132, 0.44487, 0.40164, 0.5326, 0.49073, 0.46979, 0.53324, 0.51366, 0.50387]),
-            num_inference_steps=1,
-            processing_resolution=16,
-            ensemble_size=1,
-            batch_size=1,
-            match_input_resolution=True,
-        )
-
-    def test_marigold_depth_dummy_G2024_S1_P32_E1_B1_M1(self):
-        self._test_marigold_intrinsics(
-            generator_seed=2024,
-            expected_slice=np.array([0.40257, 0.39468, 0.51373, 0.4161, 0.40162, 0.58535, 0.43581, 0.47834, 0.48951]),
-            num_inference_steps=1,
-            processing_resolution=32,
-            ensemble_size=1,
-            batch_size=1,
-            match_input_resolution=True,
-        )
-
-    def test_marigold_depth_dummy_G0_S2_P32_E1_B1_M1(self):
-        self._test_marigold_intrinsics(
-            generator_seed=0,
-            expected_slice=np.array([0.49636, 0.4518, 0.42722, 0.59044, 0.6362, 0.39011, 0.53522, 0.55153, 0.48699]),
-            num_inference_steps=2,
-            processing_resolution=32,
-            ensemble_size=1,
-            batch_size=1,
-            match_input_resolution=True,
-        )
-
-    def test_marigold_depth_dummy_G0_S1_P64_E1_B1_M1(self):
-        self._test_marigold_intrinsics(
-            generator_seed=0,
-            expected_slice=np.array([0.55547, 0.43511, 0.4887, 0.56399, 0.63867, 0.56337, 0.47889, 0.52925, 0.49235]),
-            num_inference_steps=1,
-            processing_resolution=64,
-            ensemble_size=1,
-            batch_size=1,
-            match_input_resolution=True,
-        )
-
-    def test_marigold_depth_dummy_G0_S1_P32_E3_B1_M1(self):
-        self._test_marigold_intrinsics(
-            generator_seed=0,
-            expected_slice=np.array([0.57249, 0.49824, 0.54438, 0.57733, 0.52404, 0.5255, 0.56493, 0.56336, 0.48579]),
-            num_inference_steps=1,
-            processing_resolution=32,
-            ensemble_size=3,
-            ensembling_kwargs={"reduction": "mean"},
-            batch_size=1,
-            match_input_resolution=True,
-        )
-
-    def test_marigold_depth_dummy_G0_S1_P32_E4_B2_M1(self):
-        self._test_marigold_intrinsics(
-            generator_seed=0,
-            expected_slice=np.array([0.6294, 0.5575, 0.53414, 0.61077, 0.57156, 0.53974, 0.52956, 0.55467, 0.48751]),
-            num_inference_steps=1,
-            processing_resolution=32,
-            ensemble_size=4,
-            ensembling_kwargs={"reduction": "mean"},
-            batch_size=2,
-            match_input_resolution=True,
-        )
-
-    def test_marigold_depth_dummy_G0_S1_P16_E1_B1_M0(self):
-        self._test_marigold_intrinsics(
-            generator_seed=0,
-            expected_slice=np.array([0.63511, 0.68137, 0.48783, 0.46689, 0.58505, 0.36757, 0.58465, 0.54302, 0.50387]),
-            num_inference_steps=1,
-            processing_resolution=16,
-            ensemble_size=1,
-            batch_size=1,
-            match_input_resolution=False,
-        )
-
-    def test_marigold_depth_dummy_no_num_inference_steps(self):
-        with self.assertRaises(ValueError) as e:
-            self._test_marigold_intrinsics(
-                num_inference_steps=None,
-                expected_slice=np.array([0.0]),
-            )
-            self.assertIn("num_inference_steps", str(e))
-
-    def test_marigold_depth_dummy_no_processing_resolution(self):
-        with self.assertRaises(ValueError) as e:
-            self._test_marigold_intrinsics(
-                processing_resolution=None,
-                expected_slice=np.array([0.0]),
-            )
-            self.assertIn("processing_resolution", str(e))
-
-
-@slow
-@require_torch_gpu
-class MarigoldIntrinsicsPipelineIntegrationTests(unittest.TestCase):
-    def setUp(self):
-        super().setUp()
-        gc.collect()
-        torch.cuda.empty_cache()
-
-    def tearDown(self):
-        super().tearDown()
-        gc.collect()
-        torch.cuda.empty_cache()
-
-    def _test_marigold_intrinsics(
-        self,
-        is_fp16: bool = True,
-        device: str = "cuda",
-        generator_seed: int = 0,
-        expected_slice: np.ndarray = None,
-        model_id: str = "prs-eth/marigold-iid-appearance-v1-1",
-        image_url: str = "https://marigoldmonodepth.github.io/images/einstein.jpg",
-        atol: float = 1e-4,
-        **pipe_kwargs,
-    ):
-        from_pretrained_kwargs = {}
-        if is_fp16:
-            from_pretrained_kwargs["variant"] = "fp16"
-            from_pretrained_kwargs["torch_dtype"] = torch.float16
-
-        pipe = MarigoldIntrinsicsPipeline.from_pretrained(model_id, **from_pretrained_kwargs)
-        if device == "cuda":
-            pipe.enable_model_cpu_offload()
-        pipe.set_progress_bar_config(disable=None)
-
-        generator = torch.Generator(device=device).manual_seed(generator_seed)
-
-        image = load_image(image_url)
-        width, height = image.size
-
-        prediction = pipe(image, generator=generator, **pipe_kwargs).prediction
-
-        prediction_slice = prediction[0, -3:, -3:, -1].flatten()
-
-        if pipe_kwargs.get("match_input_resolution", True):
-            self.assertEqual(prediction.shape, (2, height, width, 3), "Unexpected output resolution")
-        else:
-            self.assertTrue(prediction.shape[0] == 2 and prediction.shape[3] == 3, "Unexpected output dimensions")
-            self.assertEqual(
-                max(prediction.shape[1:3]),
-                pipe_kwargs.get("processing_resolution", 768),
-                "Unexpected output resolution",
-            )
-
-        msg = f"{prediction_slice}"
-        self.assertTrue(np.allclose(prediction_slice, expected_slice, atol=atol), msg)
-        # self.assertTrue(np.allclose(prediction_slice, expected_slice, atol=atol))
-
-    def test_marigold_intrinsics_einstein_f32_cpu_G0_S1_P32_E1_B1_M1(self):
-        self._test_marigold_intrinsics(
-            is_fp16=False,
-            device="cpu",
-            generator_seed=0,
-            expected_slice=np.array([0.9162, 0.9162, 0.9162, 0.9162, 0.9162, 0.9162, 0.9162, 0.9162, 0.9162]),
-            num_inference_steps=1,
-            processing_resolution=32,
-            ensemble_size=1,
-            batch_size=1,
-            match_input_resolution=True,
-        )
-
-    def test_marigold_intrinsics_einstein_f32_cuda_G0_S1_P768_E1_B1_M1(self):
-        self._test_marigold_intrinsics(
-            is_fp16=False,
-            device="cuda",
-            generator_seed=0,
-            expected_slice=np.array([0.62127, 0.61906, 0.61687, 0.61946, 0.61903, 0.61961, 0.61808, 0.62099, 0.62894]),
-            num_inference_steps=1,
-            processing_resolution=768,
-            ensemble_size=1,
-            batch_size=1,
-            match_input_resolution=True,
-        )
-
-    def test_marigold_intrinsics_einstein_f16_cuda_G0_S1_P768_E1_B1_M1(self):
-        self._test_marigold_intrinsics(
-            is_fp16=True,
-            device="cuda",
-            generator_seed=0,
-            expected_slice=np.array([0.62109, 0.61914, 0.61719, 0.61963, 0.61914, 0.61963, 0.61816, 0.62109, 0.62891]),
-            num_inference_steps=1,
-            processing_resolution=768,
-            ensemble_size=1,
-            batch_size=1,
-            match_input_resolution=True,
-        )
-
-    def test_marigold_intrinsics_einstein_f16_cuda_G2024_S1_P768_E1_B1_M1(self):
-        self._test_marigold_intrinsics(
-            is_fp16=True,
-            device="cuda",
-            generator_seed=2024,
-            expected_slice=np.array([0.64111, 0.63916, 0.63623, 0.63965, 0.63916, 0.63965, 0.6377, 0.64062, 0.64941]),
-            num_inference_steps=1,
-            processing_resolution=768,
-            ensemble_size=1,
-            batch_size=1,
-            match_input_resolution=True,
-        )
-
-    def test_marigold_intrinsics_einstein_f16_cuda_G0_S2_P768_E1_B1_M1(self):
-        self._test_marigold_intrinsics(
-            is_fp16=True,
-            device="cuda",
-            generator_seed=0,
-            expected_slice=np.array([0.60254, 0.60059, 0.59961, 0.60156, 0.60107, 0.60205, 0.60254, 0.60449, 0.61133]),
-            num_inference_steps=2,
-            processing_resolution=768,
-            ensemble_size=1,
-            batch_size=1,
-            match_input_resolution=True,
-        )
-
-    def test_marigold_intrinsics_einstein_f16_cuda_G0_S1_P512_E1_B1_M1(self):
-        self._test_marigold_intrinsics(
-            is_fp16=True,
-            device="cuda",
-            generator_seed=0,
-            expected_slice=np.array([0.64551, 0.64453, 0.64404, 0.64502, 0.64844, 0.65039, 0.64502, 0.65039, 0.65332]),
-            num_inference_steps=1,
-            processing_resolution=512,
-            ensemble_size=1,
-            batch_size=1,
-            match_input_resolution=True,
-        )
-
-    def test_marigold_intrinsics_einstein_f16_cuda_G0_S1_P768_E3_B1_M1(self):
-        self._test_marigold_intrinsics(
-            is_fp16=True,
-            device="cuda",
-            generator_seed=0,
-            expected_slice=np.array([0.61572, 0.61377, 0.61182, 0.61426, 0.61377, 0.61426, 0.61279, 0.61572, 0.62354]),
-            num_inference_steps=1,
-            processing_resolution=768,
-            ensemble_size=3,
-            ensembling_kwargs={"reduction": "mean"},
-            batch_size=1,
-            match_input_resolution=True,
-        )
-
-    def test_marigold_intrinsics_einstein_f16_cuda_G0_S1_P768_E4_B2_M1(self):
-        self._test_marigold_intrinsics(
-            is_fp16=True,
-            device="cuda",
-            generator_seed=0,
-            expected_slice=np.array([0.61914, 0.6167, 0.61475, 0.61719, 0.61719, 0.61768, 0.61572, 0.61914, 0.62695]),
-            num_inference_steps=1,
-            processing_resolution=768,
-            ensemble_size=4,
-            ensembling_kwargs={"reduction": "mean"},
-            batch_size=2,
-            match_input_resolution=True,
-        )
-
-    def test_marigold_intrinsics_einstein_f16_cuda_G0_S1_P512_E1_B1_M0(self):
-        self._test_marigold_intrinsics(
-            is_fp16=True,
-            device="cuda",
-            generator_seed=0,
-            expected_slice=np.array([0.65332, 0.64697, 0.64648, 0.64844, 0.64697, 0.64111, 0.64941, 0.64209, 0.65332]),
-            num_inference_steps=1,
-            processing_resolution=512,
-            ensemble_size=1,
-            batch_size=1,
-            match_input_resolution=False,
-        )
@@ -1,5 +1,5 @@
-# Copyright 2023-2025 Marigold Team, ETH Zürich. All rights reserved.
-# Copyright 2024-2025 The HuggingFace Team. All rights reserved.
+# Copyright 2024 Marigold authors, PRS ETH Zurich. All rights reserved.
+# Copyright 2024 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
 # limitations under the License.
 # --------------------------------------------------------------------------
 # More information and citation instructions are available on the
-# Marigold project website: https://marigoldcomputervision.github.io
+# Marigold project website: https://marigoldmonodepth.github.io
 # --------------------------------------------------------------------------
 import gc
 import random
@@ -98,9 +98,7 @@ class KolorsPAGPipelineFastTests(
            sample_size=128,
        )
        torch.manual_seed(0)
-        text_encoder = ChatGLMModel.from_pretrained(
-            "hf-internal-testing/tiny-random-chatglm3-6b", torch_dtype=torch.bfloat16
-        )
+        text_encoder = ChatGLMModel.from_pretrained("hf-internal-testing/tiny-random-chatglm3-6b")
        tokenizer = ChatGLMTokenizer.from_pretrained("hf-internal-testing/tiny-random-chatglm3-6b")

        components = {
@@ -527,9 +527,7 @@ class FluxIPAdapterTesterMixin:

        The following scenarios are tested:
          - Single IP-Adapter with scale=0 should produce same output as no IP-Adapter.
-          - Multi IP-Adapter with scale=0 should produce same output as no IP-Adapter.
          - Single IP-Adapter with scale!=0 should produce different output compared to no IP-Adapter.
-          - Multi IP-Adapter with scale!=0 should produce different output compared to no IP-Adapter.
        """
        # Raising the tolerance for this test when it's run on a CPU because we
        # compare against static slices and that can be shaky (with a VVVV low probability).
@@ -547,7 +545,6 @@ class FluxIPAdapterTesterMixin:
        else:
            output_without_adapter = expected_pipe_slice

-        # 1. Single IP-Adapter test cases
        adapter_state_dict = create_flux_ip_adapter_state_dict(pipe.transformer)
        pipe.transformer._load_ip_adapter_weights(adapter_state_dict)

@@ -581,44 +578,6 @@ class FluxIPAdapterTesterMixin:
            max_diff_with_adapter_scale, 1e-2, "Output with ip-adapter must be different from normal inference"
        )

-        # 2. Multi IP-Adapter test cases
-        adapter_state_dict_1 = create_flux_ip_adapter_state_dict(pipe.transformer)
-        adapter_state_dict_2 = create_flux_ip_adapter_state_dict(pipe.transformer)
-        pipe.transformer._load_ip_adapter_weights([adapter_state_dict_1, adapter_state_dict_2])
-
-        # forward pass with multi ip adapter, but scale=0 which should have no effect
-        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
-        inputs["ip_adapter_image_embeds"] = [self._get_dummy_image_embeds(image_embed_dim)] * 2
-        inputs["negative_ip_adapter_image_embeds"] = [self._get_dummy_image_embeds(image_embed_dim)] * 2
-        pipe.set_ip_adapter_scale([0.0, 0.0])
-        output_without_multi_adapter_scale = pipe(**inputs)[0]
-        if expected_pipe_slice is not None:
-            output_without_multi_adapter_scale = output_without_multi_adapter_scale[0, -3:, -3:, -1].flatten()
-
-        # forward pass with multi ip adapter, but with scale of adapter weights
-        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
-        inputs["ip_adapter_image_embeds"] = [self._get_dummy_image_embeds(image_embed_dim)] * 2
-        inputs["negative_ip_adapter_image_embeds"] = [self._get_dummy_image_embeds(image_embed_dim)] * 2
-        pipe.set_ip_adapter_scale([42.0, 42.0])
-        output_with_multi_adapter_scale = pipe(**inputs)[0]
-        if expected_pipe_slice is not None:
-            output_with_multi_adapter_scale = output_with_multi_adapter_scale[0, -3:, -3:, -1].flatten()
-
-        max_diff_without_multi_adapter_scale = np.abs(
-            output_without_multi_adapter_scale - output_without_adapter
-        ).max()
-        max_diff_with_multi_adapter_scale = np.abs(output_with_multi_adapter_scale - output_without_adapter).max()
-        self.assertLess(
-            max_diff_without_multi_adapter_scale,
-            expected_max_diff,
-            "Output without multi-ip-adapter must be same as normal inference",
-        )
-        self.assertGreater(
-            max_diff_with_multi_adapter_scale,
-            1e-2,
-            "Output with multi-ip-adapter scale must be different from normal inference",
-        )
-

 class PipelineLatentTesterMixin:
    """
@@ -1,61 +0,0 @@
-import argparse
-import inspect
-import sys
-from pathlib import Path
-from typing import List, Type
-
-
-root_dir = Path(__file__).parent.parent.absolute()
-sys.path.insert(0, str(root_dir))
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--type", type=str, default=None)
-args = parser.parse_args()
-
-
-def get_test_methods_from_class(cls: Type) -> List[str]:
-    """
-    Get all test method names from a given class.
-    Only returns methods that start with 'test_'.
-    """
-    test_methods = []
-    for name, obj in inspect.getmembers(cls):
-        if name.startswith("test_") and inspect.isfunction(obj):
-            test_methods.append(name)
-    return sorted(test_methods)
-
-
-def generate_pytest_pattern(test_methods: List[str]) -> str:
-    """Generate pytest pattern string for the -k flag."""
-    return " or ".join(test_methods)
-
-
-def generate_pattern_for_mixin(mixin_class: Type) -> str:
-    """
-    Generate pytest pattern for a specific mixin class.
-    """
-    if mixin_cls is None:
-        return ""
-    test_methods = get_test_methods_from_class(mixin_class)
-    return generate_pytest_pattern(test_methods)
-
-
-if __name__ == "__main__":
-    mixin_cls = None
-    if args.type == "pipeline":
-        from tests.pipelines.test_pipelines_common import PipelineTesterMixin
-
-        mixin_cls = PipelineTesterMixin
-
-    elif args.type == "models":
-        from tests.models.test_modeling_common import ModelTesterMixin
-
-        mixin_cls = ModelTesterMixin
-
-    elif args.type == "lora":
-        from tests.lora.utils import PeftLoraLoaderMixinTests
-
-        mixin_cls = PeftLoraLoaderMixinTests
-
-    pattern = generate_pattern_for_mixin(mixin_cls)
-    print(pattern)