Merge branch 'main' into layerwise-upcasting

[CI] Multiple Slow Test fixes. (#9198 )
* update * update * update * update
2024-08-19 14:22:12 +05:30 · 2024-08-19 13:31:09 +05:30 · 2024-08-19 11:35:22 +05:30 · 2024-08-19 09:26:27 +05:30 · 2024-08-19 09:13:18 +05:30 · 2024-08-19 08:29:03 +05:30
264 changed files with 28262 additions and 2422 deletions
@@ -13,13 +13,13 @@ env:

 jobs:
  torch_pipelines_cuda_benchmark_tests:
-    env: 
+    env:
      SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL_BENCHMARK }}
    name: Torch Core Pipelines CUDA Benchmarking Tests
    strategy:
      fail-fast: false
      max-parallel: 1
-    runs-on: 
+    runs-on:
      group: aws-g6-4xlarge-plus
    container:
      image: diffusers/diffusers-pytorch-compile-cuda
@@ -59,7 +59,7 @@ jobs:
        if: ${{ success() }}
        run: |
          pip install requests && python utils/notify_benchmarking_status.py --status=success
-        
+
      - name: Report failure status
        if: ${{ failure() }}
        run: |
@@ -21,7 +21,7 @@ env:
 jobs:
  test-build-docker-images:
    runs-on:
-      group: aws-general-8-plus-cache
+      group: aws-general-8-plus
    if: github.event_name == 'pull_request'
    steps:
      - name: Set up Docker Buildx
@@ -52,7 +52,7 @@ jobs:

  build-and-push-docker-images:
    runs-on:
-      group: aws-general-8-plus-cache
+      group: aws-general-8-plus
    if: github.event_name != 'pull_request'

    permissions:
@@ -24,7 +24,7 @@ jobs:
  mirror_community_pipeline:
    env:
      SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL_COMMUNITY_MIRROR }}
-    
+
    runs-on: ubuntu-latest
    steps:
      # Checkout to correct ref
@@ -95,7 +95,7 @@ jobs:
        if: ${{ success() }}
        run: |
          pip install requests && python utils/notify_community_pipelines_mirror.py --status=success
-      
+
      - name: Report failure status
        if: ${{ failure() }}
        run: |
@@ -20,7 +20,7 @@ jobs:
  setup_torch_cuda_pipeline_matrix:
    name: Setup Torch Pipelines CUDA Slow Tests Matrix
    runs-on:
-      group: aws-general-8-plus-cache
+      group: aws-general-8-plus
    container:
      image: diffusers/diffusers-pytorch-cpu
    outputs:
@@ -32,7 +32,7 @@ jobs:
          fetch-depth: 2
      - name: Install dependencies
        run: |
-          pip install -e .
+          pip install -e .[test]
          pip install huggingface_hub
      - name: Fetch Pipeline Matrix
        id: fetch_pipeline_matrix
@@ -16,7 +16,7 @@ jobs:
  setup_pr_tests:
    name: Setup PR Tests
    runs-on:
-      group: aws-general-8-plus-cache
+      group: aws-general-8-plus
    container:
      image: diffusers/diffusers-pytorch-cpu
      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
@@ -75,7 +75,7 @@ jobs:
      matrix:
        modules: ${{ fromJson(needs.setup_pr_tests.outputs.matrix) }}
    runs-on:
-      group: aws-general-8-plus-cache
+      group: aws-general-8-plus
    container:
      image: diffusers/diffusers-pytorch-cpu
      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
@@ -125,7 +125,7 @@ jobs:
        config:
          - name: Hub tests for models, schedulers, and pipelines
            framework: hub_tests_pytorch
-            runner: aws-general-8-plus-cache
+            runner: aws-general-8-plus
            image: diffusers/diffusers-pytorch-cpu
            report: torch_hub

@@ -72,7 +72,7 @@ jobs:
    name: LoRA - ${{ matrix.lib-versions }}

    runs-on:
-      group: aws-general-8-plus-cache
+      group: aws-general-8-plus

    container:
      image: diffusers/diffusers-pytorch-cpu
@@ -82,17 +82,17 @@ jobs:
            report: torch_cpu_pipelines
          - name: Fast PyTorch Models & Schedulers CPU tests
            framework: pytorch_models
-            runner: aws-general-8-plus-cache
+            runner: aws-general-8-plus
            image: diffusers/diffusers-pytorch-cpu
            report: torch_cpu_models_schedulers
          - name: Fast Flax CPU tests
            framework: flax
-            runner: aws-general-8-plus-cache
+            runner: aws-general-8-plus
            image: diffusers/diffusers-flax-cpu
            report: flax_cpu
          - name: PyTorch Example CPU tests
            framework: pytorch_examples
-            runner: aws-general-8-plus-cache
+            runner: aws-general-8-plus
            image: diffusers/diffusers-pytorch-cpu
            report: torch_example_cpu

@@ -182,7 +182,7 @@ jobs:
          - name: Hub tests for models, schedulers, and pipelines
            framework: hub_tests_pytorch
            runner:
-              group: aws-general-8-plus-cache
+              group: aws-general-8-plus
            image: diffusers/diffusers-pytorch-cpu
            report: torch_hub

@@ -20,7 +20,7 @@ jobs:
  setup_torch_cuda_pipeline_matrix:
    name: Setup Torch Pipelines CUDA Slow Tests Matrix
    runs-on:
-      group: aws-general-8-plus-cache
+      group: aws-general-8-plus
    container:
      image: diffusers/diffusers-pytorch-cpu
    outputs:
@@ -29,22 +29,22 @@ jobs:
        config:
          - name: Fast PyTorch CPU tests on Ubuntu
            framework: pytorch
-            runner: aws-general-8-plus-cache
+            runner: aws-general-8-plus
            image: diffusers/diffusers-pytorch-cpu
            report: torch_cpu
          - name: Fast Flax CPU tests on Ubuntu
            framework: flax
-            runner: aws-general-8-plus-cache
+            runner: aws-general-8-plus
            image: diffusers/diffusers-flax-cpu
            report: flax_cpu
          - name: Fast ONNXRuntime CPU tests on Ubuntu
            framework: onnxruntime
-            runner: aws-general-8-plus-cache
+            runner: aws-general-8-plus
            image: diffusers/diffusers-onnxruntime-cpu
            report: onnx_cpu
          - name: PyTorch Example CPU tests on Ubuntu
            framework: pytorch_examples
-            runner: aws-general-8-plus-cache
+            runner: aws-general-8-plus
            image: diffusers/diffusers-pytorch-cpu
            report: torch_example_cpu

@@ -63,7 +63,7 @@ In the same spirit, you are of immense help to the community by answering such q

 **Please** keep in mind that the more effort you put into asking or answering a question, the higher
 the quality of the publicly documented knowledge. In the same way, well-posed and well-answered questions create a high-quality knowledge database accessible to everybody, while badly posed questions or answers reduce the overall quality of the public knowledge database.
-In short, a high quality question or answer is *precise*, *concise*, *relevant*, *easy-to-understand*, *accessible*, and *well-formated/well-posed*. For more information, please have a look through the [How to write a good issue](#how-to-write-a-good-issue) section.
+In short, a high quality question or answer is *precise*, *concise*, *relevant*, *easy-to-understand*, *accessible*, and *well-formatted/well-posed*. For more information, please have a look through the [How to write a good issue](#how-to-write-a-good-issue) section.

 **NOTE about channels**:
 [*The forum*](https://discuss.huggingface.co/c/discussion-related-to-httpsgithubcomhuggingfacediffusers/63) is much better indexed by search engines, such as Google. Posts are ranked by popularity rather than chronologically. Hence, it's easier to look up questions and answers that we posted some time ago.
@@ -67,7 +67,7 @@ Please refer to the [How to use Stable Diffusion in Apple Silicon](https://huggi

 ## Quickstart

-Generating outputs is super easy with 🤗 Diffusers. To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 27.000+ checkpoints):
+Generating outputs is super easy with 🤗 Diffusers. To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 30,000+ checkpoints):

 ```python
 from diffusers import DiffusionPipeline
@@ -202,6 +202,7 @@ Also, say 👋 in our public Discord channel <a href="https://discord.gg/G7tWnz9

 - https://github.com/microsoft/TaskMatrix
 - https://github.com/invoke-ai/InvokeAI
+- https://github.com/InstantID/InstantID
 - https://github.com/apple/ml-stable-diffusion
 - https://github.com/Sanster/lama-cleaner
 - https://github.com/IDEA-Research/Grounded-Segment-Anything
@@ -209,7 +210,7 @@ Also, say 👋 in our public Discord channel <a href="https://discord.gg/G7tWnz9
 - https://github.com/deep-floyd/IF
 - https://github.com/bentoml/BentoML
 - https://github.com/bmaltais/kohya_ss
- +12.000 other amazing GitHub repositories 💪
+- +14,000 other amazing GitHub repositories 💪

 Thank you for using us ❤️.

@@ -190,6 +190,10 @@
  - local: conceptual/evaluation
    title: Evaluating Diffusion Models
  title: Conceptual Guides
+- sections:
+  - local: community_projects
+    title: Projects built with Diffusers
+  title: Community Projects
 - sections:
  - isExpanded: false
    sections:
@@ -219,60 +223,76 @@
    sections:
    - local: api/models/overview
      title: Overview
-    - local: api/models/unet
-      title: UNet1DModel
-    - local: api/models/unet2d
-      title: UNet2DModel
-    - local: api/models/unet2d-cond
-      title: UNet2DConditionModel
-    - local: api/models/unet3d-cond
-      title: UNet3DConditionModel
-    - local: api/models/unet-motion
-      title: UNetMotionModel
-    - local: api/models/uvit2d
-      title: UViT2DModel
-    - local: api/models/vq
-      title: VQModel
-    - local: api/models/autoencoderkl
-      title: AutoencoderKL
-    - local: api/models/asymmetricautoencoderkl
-      title: AsymmetricAutoencoderKL
-    - local: api/models/autoencoder_tiny
-      title: Tiny AutoEncoder
-    - local: api/models/autoencoder_oobleck
-      title: Oobleck AutoEncoder
-    - local: api/models/consistency_decoder_vae
-      title: ConsistencyDecoderVAE
-    - local: api/models/transformer2d
-      title: Transformer2DModel
-    - local: api/models/pixart_transformer2d
-      title: PixArtTransformer2DModel
-    - local: api/models/dit_transformer2d
-      title: DiTTransformer2DModel
-    - local: api/models/hunyuan_transformer2d
-      title: HunyuanDiT2DModel
-    - local: api/models/aura_flow_transformer2d
-      title: AuraFlowTransformer2DModel
-    - local: api/models/latte_transformer3d
-      title: LatteTransformer3DModel
-    - local: api/models/lumina_nextdit2d
-      title: LuminaNextDiT2DModel
-    - local: api/models/transformer_temporal
-      title: TransformerTemporalModel
-    - local: api/models/sd3_transformer2d
-      title: SD3Transformer2DModel
-    - local: api/models/stable_audio_transformer
-      title: StableAudioDiTModel
-    - local: api/models/prior_transformer
-      title: PriorTransformer
-    - local: api/models/controlnet
-      title: ControlNetModel
-    - local: api/models/controlnet_hunyuandit
-      title: HunyuanDiT2DControlNetModel
-    - local: api/models/controlnet_sd3
-      title: SD3ControlNetModel
-    - local: api/models/controlnet_sparsectrl
-      title: SparseControlNetModel
+    - sections:
+      - local: api/models/controlnet
+        title: ControlNetModel
+      - local: api/models/controlnet_hunyuandit
+        title: HunyuanDiT2DControlNetModel
+      - local: api/models/controlnet_sd3
+        title: SD3ControlNetModel
+      - local: api/models/controlnet_sparsectrl
+        title: SparseControlNetModel
+      title: ControlNets
+    - sections:
+      - local: api/models/aura_flow_transformer2d
+        title: AuraFlowTransformer2DModel
+      - local: api/models/cogvideox_transformer3d
+        title: CogVideoXTransformer3DModel
+      - local: api/models/dit_transformer2d
+        title: DiTTransformer2DModel
+      - local: api/models/flux_transformer
+        title: FluxTransformer2DModel
+      - local: api/models/hunyuan_transformer2d
+        title: HunyuanDiT2DModel
+      - local: api/models/latte_transformer3d
+        title: LatteTransformer3DModel
+      - local: api/models/lumina_nextdit2d
+        title: LuminaNextDiT2DModel
+      - local: api/models/pixart_transformer2d
+        title: PixArtTransformer2DModel
+      - local: api/models/prior_transformer
+        title: PriorTransformer
+      - local: api/models/sd3_transformer2d
+        title: SD3Transformer2DModel
+      - local: api/models/stable_audio_transformer
+        title: StableAudioDiTModel
+      - local: api/models/transformer2d
+        title: Transformer2DModel
+      - local: api/models/transformer_temporal
+        title: TransformerTemporalModel
+      title: Transformers
+    - sections:
+      - local: api/models/stable_cascade_unet
+        title: StableCascadeUNet
+      - local: api/models/unet
+        title: UNet1DModel
+      - local: api/models/unet2d
+        title: UNet2DModel
+      - local: api/models/unet2d-cond
+        title: UNet2DConditionModel
+      - local: api/models/unet3d-cond
+        title: UNet3DConditionModel
+      - local: api/models/unet-motion
+        title: UNetMotionModel
+      - local: api/models/uvit2d
+        title: UViT2DModel
+      title: UNets
+    - sections:
+      - local: api/models/autoencoderkl
+        title: AutoencoderKL
+      - local: api/models/autoencoderkl_cogvideox
+        title: AutoencoderKLCogVideoX
+      - local: api/models/asymmetricautoencoderkl
+        title: AsymmetricAutoencoderKL
+      - local: api/models/consistency_decoder_vae
+        title: ConsistencyDecoderVAE
+      - local: api/models/autoencoder_oobleck
+        title: Oobleck AutoEncoder
+      - local: api/models/autoencoder_tiny
+        title: Tiny AutoEncoder
+      - local: api/models/vq
+        title: VQModel
+      title: VAEs
    title: Models
  - isExpanded: false
    sections:
@@ -294,6 +314,8 @@
      title: AutoPipeline
    - local: api/pipelines/blip_diffusion
      title: BLIP-Diffusion
+    - local: api/pipelines/cogvideox
+      title: CogVideoX
    - local: api/pipelines/consistency_models
      title: Consistency Models
    - local: api/pipelines/controlnet
@@ -320,6 +342,8 @@
      title: DiffEdit
    - local: api/pipelines/dit
      title: DiT
+    - local: api/pipelines/flux
+      title: Flux
    - local: api/pipelines/hunyuandit
      title: Hunyuan-DiT
    - local: api/pipelines/i2vgenxl
@@ -22,6 +22,7 @@ The [`~loaders.FromSingleFileMixin.from_single_file`] method allows you to load:

 ## Supported pipelines

+- [`CogVideoXPipeline`]
 - [`StableDiffusionPipeline`]
 - [`StableDiffusionImg2ImgPipeline`]
 - [`StableDiffusionInpaintPipeline`]
@@ -49,8 +50,10 @@ The [`~loaders.FromSingleFileMixin.from_single_file`] method allows you to load:
 - [`UNet2DConditionModel`]
 - [`StableCascadeUNet`]
 - [`AutoencoderKL`]
+- [`AutoencoderKLCogVideoX`]
 - [`ControlNetModel`]
 - [`SD3Transformer2DModel`]
+- [`FluxTransformer2DModel`]

 ## FromSingleFileMixin

@@ -0,0 +1,37 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License. -->
+
+# AutoencoderKLCogVideoX
+
+The 3D variational autoencoder (VAE) model with KL loss used in [CogVideoX](https://github.com/THUDM/CogVideo) was introduced in [CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer](https://github.com/THUDM/CogVideo/blob/main/resources/CogVideoX.pdf) by Tsinghua University & ZhipuAI.
+
+The model can be loaded with the following code snippet.
+
+```python
+from diffusers import AutoencoderKLCogVideoX
+
+vae = AutoencoderKLCogVideoX.from_pretrained("THUDM/CogVideoX-2b", subfolder="vae", torch_dtype=torch.float16).to("cuda")
+```
+
+## AutoencoderKLCogVideoX
+
+[[autodoc]] AutoencoderKLCogVideoX
+    - decode
+    - encode
+    - all
+
+## AutoencoderKLOutput
+
+[[autodoc]] models.autoencoders.autoencoder_kl.AutoencoderKLOutput
+
+## DecoderOutput
+
+[[autodoc]] models.autoencoders.vae.DecoderOutput
@@ -0,0 +1,30 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License. -->
+
+# CogVideoXTransformer3DModel
+
+A Diffusion Transformer model for 3D data from [CogVideoX](https://github.com/THUDM/CogVideo) was introduced in [CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer](https://github.com/THUDM/CogVideo/blob/main/resources/CogVideoX.pdf) by Tsinghua University & ZhipuAI.
+
+The model can be loaded with the following code snippet.
+
+```python
+from diffusers import CogVideoXTransformer3DModel
+
+vae = CogVideoXTransformer3DModel.from_pretrained("THUDM/CogVideoX-2b", subfolder="transformer", torch_dtype=torch.float16).to("cuda")
+```
+
+## CogVideoXTransformer3DModel
+
+[[autodoc]] CogVideoXTransformer3DModel
+
+## Transformer2DModelOutput
+
+[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
@@ -0,0 +1,19 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# FluxTransformer2DModel
+
+A Transformer model for image-like data from [Flux](https://blackforestlabs.ai/announcing-black-forest-labs/).
+
+## FluxTransformer2DModel
+
+[[autodoc]] FluxTransformer2DModel
@@ -0,0 +1,19 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# StableCascadeUNet
+
+A UNet model from the [Stable Cascade pipeline](../pipelines/stable_cascade.md).
+
+## StableCascadeUNet
+
+[[autodoc]] models.unets.unet_stable_cascade.StableCascadeUNet
@@ -18,7 +18,7 @@ It was developed by the Fal team and more details about it can be found in [this

 <Tip>

-AuraFlow can be quite expensive to run on consumer hardware devices. However, you can perform a suite of optimizations to run it faster and in a more memory-friendly manner. Check out [this section](https://huggingface.co/blog/sd3#memory-optimizations-for-sd3) for more details. 
+AuraFlow can be quite expensive to run on consumer hardware devices. However, you can perform a suite of optimizations to run it faster and in a more memory-friendly manner. Check out [this section](https://huggingface.co/blog/sd3#memory-optimizations-for-sd3) for more details.

 </Tip>

@@ -0,0 +1,88 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+-->
+
+# CogVideoX
+
+[CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer](https://arxiv.org/abs/2408.06072) from Tsinghua University & ZhipuAI, by Zhuoyi Yang, Jiayan Teng, Wendi Zheng, Ming Ding, Shiyu Huang, Jiazheng Xu, Yuanming Yang, Wenyi Hong, Xiaohan Zhang, Guanyu Feng, Da Yin, Xiaotao Gu, Yuxuan Zhang, Weihan Wang, Yean Cheng, Ting Liu, Bin Xu, Yuxiao Dong, Jie Tang.
+
+The abstract from the paper is:
+
+*We introduce CogVideoX, a large-scale diffusion transformer model designed for generating videos based on text prompts. To efficently model video data, we propose to levearge a 3D Variational Autoencoder (VAE) to compresses videos along both spatial and temporal dimensions. To improve the text-video alignment, we propose an expert transformer with the expert adaptive LayerNorm to facilitate the deep fusion between the two modalities. By employing a progressive training technique, CogVideoX is adept at producing coherent, long-duration videos characterized by significant motion. In addition, we develop an effectively text-video data processing pipeline that includes various data preprocessing strategies and a video captioning method. It significantly helps enhance the performance of CogVideoX, improving both generation quality and semantic alignment. Results show that CogVideoX demonstrates state-of-the-art performance across both multiple machine metrics and human evaluations. The model weight of CogVideoX-2B is publicly available at https://github.com/THUDM/CogVideo.*
+
+<Tip>
+
+Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
+
+</Tip>
+
+This pipeline was contributed by [zRzRzRzRzRzRzR](https://github.com/zRzRzRzRzRzRzR). The original codebase can be found [here](https://huggingface.co/THUDM). The original weights can be found under [hf.co/THUDM](https://huggingface.co/THUDM).
+
+## Inference
+
+Use [`torch.compile`](https://huggingface.co/docs/diffusers/main/en/tutorials/fast_diffusion#torchcompile) to reduce the inference latency.
+
+First, load the pipeline:
+
+```python
+import torch
+from diffusers import CogVideoXPipeline
+from diffusers.utils import export_to_video
+
+pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b").to("cuda")
+```
+
+Then change the memory layout of the pipelines `transformer` component to `torch.channels_last`:
+
+```python
+pipe.transformer.to(memory_format=torch.channels_last)
+```
+
+Finally, compile the components and run inference:
+
+```python
+pipe.transformer = torch.compile(pipeline.transformer, mode="max-autotune", fullgraph=True)
+
+# CogVideoX works well with long and well-described prompts
+prompt = "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other pandas gather, watching curiously and some clapping in rhythm. Sunlight filters through the tall bamboo, casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical atmosphere of this unique musical performance."
+video = pipe(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
+```
+
+The [benchmark](https://gist.github.com/a-r-r-o-w/5183d75e452a368fd17448fcc810bd3f) results on an 80GB A100 machine are:
+
+```
+Without torch.compile(): Average inference time: 96.89 seconds.
+With torch.compile(): Average inference time: 76.27 seconds.
+```
+
+### Memory optimization
+
+CogVideoX requires about 19 GB of GPU memory to decode 49 frames (6 seconds of video at 8 FPS) with output resolution 720x480 (W x H), which makes it not possible to run on consumer GPUs or free-tier T4 Colab. The following memory optimizations could be used to reduce the memory footprint. For replication, you can refer to [this](https://gist.github.com/a-r-r-o-w/3959a03f15be5c9bd1fe545b09dfcc93) script.
+
+- `pipe.enable_model_cpu_offload()`:
+  - Without enabling cpu offloading, memory usage is `33 GB`
+  - With enabling cpu offloading, memory usage is `19 GB`
+- `pipe.vae.enable_tiling()`:
+  - With enabling cpu offloading and tiling, memory usage is `11 GB`
+- `pipe.vae.enable_slicing()`
+
+## CogVideoXPipeline
+
+[[autodoc]] CogVideoXPipeline
+  - all
+  - __call__
+
+## CogVideoXPipelineOutput
+
+[[autodoc]] pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipelineOutput
@@ -1,4 +1,4 @@
-<!--Copyright 2023 The HuggingFace Team and The InstantX Team. All rights reserved.
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.

 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
@@ -22,7 +22,16 @@ The abstract from the paper is:

 *We present ControlNet, a neural network architecture to add spatial conditioning controls to large, pretrained text-to-image diffusion models. ControlNet locks the production-ready large diffusion models, and reuses their deep and robust encoding layers pretrained with billions of images as a strong backbone to learn a diverse set of conditional controls. The neural architecture is connected with "zero convolutions" (zero-initialized convolution layers) that progressively grow the parameters from zero and ensure that no harmful noise could affect the finetuning. We test various conditioning controls, eg, edges, depth, segmentation, human pose, etc, with Stable Diffusion, using single or multiple conditions, with or without prompts. We show that the training of ControlNets is robust with small (<50k) and large (>1m) datasets. Extensive results show that ControlNet may facilitate wider applications to control image diffusion models.*

-This code is implemented by [The InstantX Team](https://huggingface.co/InstantX). You can find pre-trained checkpoints for SD3-ControlNet on [The InstantX Team](https://huggingface.co/InstantX) Hub profile.
+This controlnet code is mainly implemented by [The InstantX Team](https://huggingface.co/InstantX). The inpainting-related code was developed by [The Alimama Creative Team](https://huggingface.co/alimama-creative). You can find pre-trained checkpoints for SD3-ControlNet in the table below:
+
+
+| ControlNet type | Developer | Link |
+| -------- | ---------- | ---- |
+| Canny | [The InstantX Team](https://huggingface.co/InstantX) | [Link](https://huggingface.co/InstantX/SD3-Controlnet-Canny) |
+| Pose | [The InstantX Team](https://huggingface.co/InstantX) | [Link](https://huggingface.co/InstantX/SD3-Controlnet-Pose) |
+| Tile | [The InstantX Team](https://huggingface.co/InstantX) | [Link](https://huggingface.co/InstantX/SD3-Controlnet-Tile) |
+| Inpainting | [The AlimamaCreative Team](https://huggingface.co/alimama-creative) | [link](https://huggingface.co/alimama-creative/SD3-Controlnet-Inpainting) |
+

 <Tip>

@@ -35,5 +44,10 @@ Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers)
 	- all
 	- __call__

+## StableDiffusion3ControlNetInpaintingPipeline
+[[autodoc]] pipelines.controlnet_sd3.pipeline_stable_diffusion_3_controlnet_inpainting.StableDiffusion3ControlNetInpaintingPipeline
+	- all
+	- __call__
+
 ## StableDiffusion3PipelineOutput
 [[autodoc]] pipelines.stable_diffusion_3.pipeline_output.StableDiffusion3PipelineOutput
@@ -0,0 +1,165 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Flux
+
+Flux is a series of text-to-image generation models based on diffusion transformers. To know more about Flux, check out the original [blog post](https://blackforestlabs.ai/announcing-black-forest-labs/) by the creators of Flux, Black Forest Labs.
+
+Original model checkpoints for Flux can be found [here](https://huggingface.co/black-forest-labs). Original inference code can be found [here](https://github.com/black-forest-labs/flux).
+
+<Tip>
+
+Flux can be quite expensive to run on consumer hardware devices. However, you can perform a suite of optimizations to run it faster and in a more memory-friendly manner. Check out [this section](https://huggingface.co/blog/sd3#memory-optimizations-for-sd3) for more details. Additionally, Flux can benefit from quantization for memory efficiency with a trade-off in inference latency. Refer to [this blog post](https://huggingface.co/blog/quanto-diffusers) to learn more.  For an exhaustive list of resources, check out [this gist](https://gist.github.com/sayakpaul/b664605caf0aa3bf8585ab109dd5ac9c).
+
+</Tip>
+
+Flux comes in two variants:
+
+* Timestep-distilled (`black-forest-labs/FLUX.1-schnell`)
+* Guidance-distilled (`black-forest-labs/FLUX.1-dev`)
+
+Both checkpoints have slightly difference usage which we detail below.
+
+### Timestep-distilled
+
+* `max_sequence_length` cannot be more than 256.
+* `guidance_scale` needs to be 0.
+* As this is a timestep-distilled model, it benefits from fewer sampling steps.
+
+```python
+import torch
+from diffusers import FluxPipeline
+
+pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
+pipe.enable_model_cpu_offload()
+
+prompt = "A cat holding a sign that says hello world"
+out = pipe(
+    prompt=prompt,
+    guidance_scale=0.,
+    height=768,
+    width=1360,
+    num_inference_steps=4,
+    max_sequence_length=256,
+).images[0]
+out.save("image.png")
+```
+
+### Guidance-distilled
+
+* The guidance-distilled variant takes about 50 sampling steps for good-quality generation.
+* It doesn't have any limitations around the `max_sequence_length`.
+
+```python
+import torch
+from diffusers import FluxPipeline
+
+pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)
+pipe.enable_model_cpu_offload()
+
+prompt = "a tiny astronaut hatching from an egg on the moon"
+out = pipe(
+    prompt=prompt,
+    guidance_scale=3.5,
+    height=768,
+    width=1360,
+    num_inference_steps=50,
+).images[0]
+out.save("image.png")
+```
+
+## Running FP16 inference
+Flux can generate high-quality images with FP16 (i.e. to accelerate inference on Turing/Volta GPUs) but produces different outputs compared to FP32/BF16. The issue is that some activations in the text encoders have to be clipped when running in FP16, which affects the overall image. Forcing text encoders to run with FP32 inference thus removes this output difference. See [here](https://github.com/huggingface/diffusers/pull/9097#issuecomment-2272292516) for details.
+
+FP16 inference code:
+```python
+import torch
+from diffusers import FluxPipeline
+
+pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16) # can replace schnell with dev
+# to run on low vram GPUs (i.e. between 4 and 32 GB VRAM)
+pipe.enable_sequential_cpu_offload()
+pipe.vae.enable_slicing()
+pipe.vae.enable_tiling()
+
+pipe.to(torch.float16) # casting here instead of in the pipeline constructor because doing so in the constructor loads all models into CPU memory at once
+
+prompt = "A cat holding a sign that says hello world"
+out = pipe(
+    prompt=prompt,
+    guidance_scale=0.,
+    height=768,
+    width=1360,
+    num_inference_steps=4,
+    max_sequence_length=256,
+).images[0]
+out.save("image.png")
+```
+
+## Single File Loading for the `FluxTransformer2DModel`
+
+The `FluxTransformer2DModel` supports loading checkpoints in the original format shipped by Black Forest Labs. This is also useful when trying to load finetunes or quantized versions of the models that have been published by the community.
+
+<Tip>
+`FP8` inference can be brittle depending on the GPU type, CUDA version, and `torch` version that you are using. It is recommended that you use the `optimum-quanto` library in order to run FP8 inference on your machine.
+</Tip>
+
+The following example demonstrates how to run Flux with less than 16GB of VRAM.
+
+First install `optimum-quanto`
+
+```shell
+pip install optimum-quanto
+```
+
+Then run the following example
+
+```python
+import torch
+from diffusers import FluxTransformer2DModel, FluxPipeline
+from transformers import T5EncoderModel, CLIPTextModel
+from optimum.quanto import freeze, qfloat8, quantize
+
+bfl_repo = "black-forest-labs/FLUX.1-dev"
+dtype = torch.bfloat16
+
+transformer = FluxTransformer2DModel.from_single_file("https://huggingface.co/Kijai/flux-fp8/blob/main/flux1-dev-fp8.safetensors", torch_dtype=dtype)
+quantize(transformer, weights=qfloat8)
+freeze(transformer)
+
+text_encoder_2 = T5EncoderModel.from_pretrained(bfl_repo, subfolder="text_encoder_2", torch_dtype=dtype)
+quantize(text_encoder_2, weights=qfloat8)
+freeze(text_encoder_2)
+
+pipe = FluxPipeline.from_pretrained(bfl_repo, transformer=None, text_encoder_2=None, torch_dtype=dtype)
+pipe.transformer = transformer
+pipe.text_encoder_2 = text_encoder_2
+
+pipe.enable_model_cpu_offload()
+
+prompt = "A cat holding a sign that says hello world"
+image = pipe(
+    prompt,
+    guidance_scale=3.5,
+    output_type="pil",
+    num_inference_steps=20,
+    generator=torch.Generator("cpu").manual_seed(0)
+).images[0]
+
+image.save("flux-fp8-dev.png")
+```
+
+## FluxPipeline
+
+[[autodoc]] FluxPipeline
+	- all
+	- __call__
@@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License.

 ![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/kolors/kolors_header_collage.png)

-Kolors is a large-scale text-to-image generation model based on latent diffusion, developed by [the Kuaishou Kolors team](kwai-kolors@kuaishou.com). Trained on billions of text-image pairs, Kolors exhibits significant advantages over both open-source and closed-source models in visual quality, complex semantic accuracy, and text rendering for both Chinese and English characters. Furthermore, Kolors supports both Chinese and English inputs, demonstrating strong performance in understanding and generating Chinese-specific content. For more details, please refer to this [technical report](https://github.com/Kwai-Kolors/Kolors/blob/master/imgs/Kolors_paper.pdf).
+Kolors is a large-scale text-to-image generation model based on latent diffusion, developed by [the Kuaishou Kolors team](https://github.com/Kwai-Kolors/Kolors). Trained on billions of text-image pairs, Kolors exhibits significant advantages over both open-source and closed-source models in visual quality, complex semantic accuracy, and text rendering for both Chinese and English characters. Furthermore, Kolors supports both Chinese and English inputs, demonstrating strong performance in understanding and generating Chinese-specific content. For more details, please refer to this [technical report](https://github.com/Kwai-Kolors/Kolors/blob/master/imgs/Kolors_paper.pdf).

 The abstract from the technical report is:

@@ -74,7 +74,7 @@ image_encoder = CLIPVisionModelWithProjection.from_pretrained(

 pipe = KolorsPipeline.from_pretrained(
    "Kwai-Kolors/Kolors-diffusers", image_encoder=image_encoder, torch_dtype=torch.float16, variant="fp16"
-).to("cuda")
+)
 pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config, use_karras_sigmas=True)

 pipe.load_ip_adapter(
@@ -59,7 +59,7 @@ First, load the pipeline:

 ```python
 from diffusers import LuminaText2ImgPipeline
-import torch 
+import torch

 pipeline = LuminaText2ImgPipeline.from_pretrained(
 	"Alpha-VLLM/Lumina-Next-SFT-diffusers", torch_dtype=torch.bfloat16
@@ -87,4 +87,4 @@ image = pipeline(prompt="Upper body of a young woman in a Victorian-era outfit w
 [[autodoc]] LuminaText2ImgPipeline
 	- all
 	- __call__
-	
+
@@ -20,6 +20,34 @@ The abstract from the paper is:

 *Recent studies have demonstrated that diffusion models are capable of generating high-quality samples, but their quality heavily depends on sampling guidance techniques, such as classifier guidance (CG) and classifier-free guidance (CFG). These techniques are often not applicable in unconditional generation or in various downstream tasks such as image restoration. In this paper, we propose a novel sampling guidance, called Perturbed-Attention Guidance (PAG), which improves diffusion sample quality across both unconditional and conditional settings, achieving this without requiring additional training or the integration of external modules. PAG is designed to progressively enhance the structure of samples throughout the denoising process. It involves generating intermediate samples with degraded structure by substituting selected self-attention maps in diffusion U-Net with an identity matrix, by considering the self-attention mechanisms' ability to capture structural information, and guiding the denoising process away from these degraded samples. In both ADM and Stable Diffusion, PAG surprisingly improves sample quality in conditional and even unconditional scenarios. Moreover, PAG significantly improves the baseline performance in various downstream tasks where existing guidances such as CG or CFG cannot be fully utilized, including ControlNet with empty prompts and image restoration such as inpainting and deblurring.*

+PAG can be used by specifying the `pag_applied_layers` as a parameter when instantiating a PAG pipeline. It can be a single string or a list of strings. Each string can be a unique layer identifier or a regular expression to identify one or more layers.
+
+- Full identifier as a normal string: `down_blocks.2.attentions.0.transformer_blocks.0.attn1.processor`
+- Full identifier as a RegEx: `down_blocks.2.(attentions|motion_modules).0.transformer_blocks.0.attn1.processor`
+- Partial identifier as a RegEx: `down_blocks.2`, or `attn1`
+- List of identifiers (can be combo of strings and ReGex): `["blocks.1", "blocks.(14|20)", r"down_blocks\.(2,3)"]`
+
+<Tip warning={true}>
+
+Since RegEx is supported as a way for matching layer identifiers, it is crucial to use it correctly otherwise there might be unexpected behaviour. The recommended way to use PAG is by specifying layers as `blocks.{layer_index}` and `blocks.({layer_index_1|layer_index_2|...})`. Using it in any other way, while doable, may bypass our basic validation checks and give you unexpected results.
+
+</Tip>
+
+## AnimateDiffPAGPipeline
+[[autodoc]] AnimateDiffPAGPipeline
+  - all
+  - __call__
+
+## HunyuanDiTPAGPipeline
+[[autodoc]] HunyuanDiTPAGPipeline
+  - all
+  - __call__
+
+## KolorsPAGPipeline
+[[autodoc]] KolorsPAGPipeline
+  - all
+  - __call__
+
 ## StableDiffusionPAGPipeline
 [[autodoc]] StableDiffusionPAGPipeline
 	- all
@@ -49,3 +77,15 @@ The abstract from the paper is:
 [[autodoc]] StableDiffusionXLControlNetPAGPipeline
 	- all
 	- __call__
+
+
+## StableDiffusion3PAGPipeline
+[[autodoc]] StableDiffusion3PAGPipeline
+	- all
+	- __call__
+
+
+## PixArtSigmaPAGPipeline
+[[autodoc]] PixArtSigmaPAGPipeline
+	- all
+	- __call__
@@ -16,12 +16,12 @@ Stable Audio was proposed in [Stable Audio Open](https://arxiv.org/abs/2407.1435

 Stable Audio Open generates variable-length (up to 47s) stereo audio at 44.1kHz from text prompts. It comprises three components: an autoencoder that compresses waveforms into a manageable sequence length, a T5-based text embedding for text conditioning, and a transformer-based diffusion (DiT) model that operates in the latent space of the autoencoder.

-Stable Audio is trained on a corpus of around 48k audio recordings, where around 47k are from Freesound and the rest are from the Free Music Archive (FMA). All audio files are licensed under CC0, CC BY, or CC Sampling+. This data is used to train the autoencoder and the DiT. 
+Stable Audio is trained on a corpus of around 48k audio recordings, where around 47k are from Freesound and the rest are from the Free Music Archive (FMA). All audio files are licensed under CC0, CC BY, or CC Sampling+. This data is used to train the autoencoder and the DiT.

 The abstract of the paper is the following:
 *Open generative models are vitally important for the community, allowing for fine-tunes and serving as baselines when presenting new models. However, most current text-to-audio models are private and not accessible for artists and researchers to build upon. Here we describe the architecture and training process of a new open-weights text-to-audio model trained with Creative Commons data. Our evaluation shows that the model's performance is competitive with the state-of-the-art across various metrics. Notably, the reported FDopenl3 results (measuring the realism of the generations) showcase its potential for high-quality stereo sound synthesis at 44.1kHz.*

-This pipeline was contributed by [Yoach Lacombe](https://huggingface.co/ylacombe). The original codebase can be found at [Stability-AI/stable-audio-tool](https://github.com/Stability-AI/stable-audio-tool).
+This pipeline was contributed by [Yoach Lacombe](https://huggingface.co/ylacombe). The original codebase can be found at [Stability-AI/stable-audio-tools](https://github.com/Stability-AI/stable-audio-tools).

 ## Tips

@@ -0,0 +1,78 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Community Projects
+
+Welcome to Community Projects. This space is dedicated to showcasing the incredible work and innovative applications created by our vibrant community using the `diffusers` library.
+
+This section aims to:
+
+- Highlight diverse and inspiring projects built with `diffusers`
+- Foster knowledge sharing within our community
+- Provide real-world examples of how `diffusers` can be leveraged
+
+Happy exploring, and thank you for being part of the Diffusers community!
+
+<table>
+    <tr>
+        <th>Project Name</th>
+        <th>Description</th>
+    </tr>
+  <tr style="border-top: 2px solid black">
+    <td><a href="https://github.com/carson-katri/dream-textures"> dream-textures </a></td>
+    <td>Stable Diffusion built-in to Blender</td>
+  </tr>
+  <tr style="border-top: 2px solid black">
+    <td><a href="https://github.com/megvii-research/HiDiffusion"> HiDiffusion </a></td>
+    <td>Increases the resolution and speed of your diffusion model by only adding a single line of code</td>
+  </tr>
+  <tr style="border-top: 2px solid black">
+    <td><a href="https://github.com/lllyasviel/IC-Light"> IC-Light </a></td>
+    <td>IC-Light is a project to manipulate the illumination of images</td>
+  </tr>
+  <tr style="border-top: 2px solid black">
+    <td><a href="https://github.com/InstantID/InstantID"> InstantID </a></td>
+    <td>InstantID : Zero-shot Identity-Preserving Generation in Seconds</td>
+  </tr>
+  <tr style="border-top: 2px solid black">
+    <td><a href="https://github.com/Sanster/IOPaint"> IOPaint </a></td>
+    <td>Image inpainting tool powered by SOTA AI Model. Remove any unwanted object, defect, people from your pictures or erase and replace(powered by stable diffusion) any thing on your pictures.</td>
+  </tr>
+  <tr style="border-top: 2px solid black">
+    <td><a href="https://github.com/bmaltais/kohya_ss"> Kohya </a></td>
+    <td>Gradio GUI for Kohya's Stable Diffusion trainers</td>
+  </tr>
+  <tr style="border-top: 2px solid black">
+    <td><a href="https://github.com/magic-research/magic-animate"> MagicAnimate </a></td>
+    <td>MagicAnimate: Temporally Consistent Human Image Animation using Diffusion Model</td>
+  </tr>
+  <tr style="border-top: 2px solid black">
+    <td><a href="https://github.com/levihsu/OOTDiffusion"> OOTDiffusion </a></td>
+    <td>Outfitting Fusion based Latent Diffusion for Controllable Virtual Try-on</td>
+  </tr>
+  <tr style="border-top: 2px solid black">
+    <td><a href="https://github.com/vladmandic/automatic"> SD.Next </a></td>
+    <td>SD.Next: Advanced Implementation of Stable Diffusion and other Diffusion-based generative image models</td>
+  </tr>
+  <tr style="border-top: 2px solid black">
+    <td><a href="https://github.com/ashawkey/stable-dreamfusion"> stable-dreamfusion </a></td>
+    <td>Text-to-3D & Image-to-3D & Mesh Exportation with NeRF + Diffusion</td>
+  </tr>
+  <tr style="border-top: 2px solid black">
+    <td><a href="https://github.com/HVision-NKU/StoryDiffusion"> StoryDiffusion </a></td>
+    <td>StoryDiffusion can create a magic story by generating consistent images and videos.</td>
+  </tr>
+  <tr style="border-top: 2px solid black">
+    <td><a href="https://github.com/cumulo-autumn/StreamDiffusion"> StreamDiffusion </a></td>
+    <td>A Pipeline-Level Solution for Real-Time Interactive Generation</td>
+  </tr>
+</table>
@@ -125,3 +125,5 @@ image
    <figcaption class="mt-2 text-center text-sm text-gray-500">distilled Stable Diffusion + Tiny AutoEncoder</figcaption>
  </div>
 </div>
+
+More tiny autoencoder models for other Stable Diffusion models, like Stable Diffusion 3, are available from [madebyollin](https://huggingface.co/madebyollin).
@@ -48,7 +48,7 @@ accelerate launch run_distributed.py --num_processes=2

 <Tip>

-To learn more, take a look at the [Distributed Inference with 🤗 Accelerate](https://huggingface.co/docs/accelerate/en/usage_guides/distributed_inference#distributed-inference-with-accelerate) guide.
+Refer to this minimal example [script](https://gist.github.com/sayakpaul/cfaebd221820d7b43fae638b4dfa01ba) for running inference across multiple GPUs. To learn more, take a look at the [Distributed Inference with 🤗 Accelerate](https://huggingface.co/docs/accelerate/en/usage_guides/distributed_inference#distributed-inference-with-accelerate) guide.

 </Tip>

@@ -108,4 +108,4 @@ torchrun run_distributed.py --nproc_per_node=2
 ```

 > [!TIP]
-> You can use `device_map` within a [`DiffusionPipeline`] to distribute its model-level components on multiple devices. Refer to the [Device placement](../tutorials/inference_with_big_models#device-placement) guide to learn more.
+> You can use `device_map` within a [`DiffusionPipeline`] to distribute its model-level components on multiple devices. Refer to the [Device placement](../tutorials/inference_with_big_models#device-placement) guide to learn more.
@@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License.

 [InstructPix2Pix](https://hf.co/papers/2211.09800) is a Stable Diffusion model trained to edit images from human-provided instructions. For example, your prompt can be "turn the clouds rainy" and the model will edit the input image accordingly. This model is conditioned on the text prompt (or editing instruction) and the input image.

-This guide will explore the [train_instruct_pix2pix.py](https://github.com/huggingface/diffusers/blob/main/examples/instruct_pix2pix/train_instruct_pix2pix.py) training script to help you become familiar with it, and how you can adapt it for your own use-case.
+This guide will explore the [train_instruct_pix2pix.py](https://github.com/huggingface/diffusers/blob/main/examples/instruct_pix2pix/train_instruct_pix2pix.py) training script to help you become familiar with it, and how you can adapt it for your own use case.

 Before running the script, make sure you install the library from source:

@@ -117,7 +117,7 @@ optimizer = optimizer_cls(
 )
 ```

-Next, the edited images and and edit instructions are [preprocessed](https://github.com/huggingface/diffusers/blob/64603389da01082055a901f2883c4810d1144edb/examples/instruct_pix2pix/train_instruct_pix2pix.py#L624) and [tokenized](https://github.com/huggingface/diffusers/blob/64603389da01082055a901f2883c4810d1144edb/examples/instruct_pix2pix/train_instruct_pix2pix.py#L610C24-L610C24). It is important the same image transformations are applied to the original and edited images.
+Next, the edited images and edit instructions are [preprocessed](https://github.com/huggingface/diffusers/blob/64603389da01082055a901f2883c4810d1144edb/examples/instruct_pix2pix/train_instruct_pix2pix.py#L624) and [tokenized](https://github.com/huggingface/diffusers/blob/64603389da01082055a901f2883c4810d1144edb/examples/instruct_pix2pix/train_instruct_pix2pix.py#L610C24-L610C24). It is important the same image transformations are applied to the original and edited images.

 ```py
 def preprocess_train(examples):
@@ -249,4 +249,4 @@ The SDXL training script is discussed in more detail in the [SDXL training](sdxl

 Congratulations on training your own InstructPix2Pix model! 🥳 To learn more about the model, it may be helpful to:

- Read the [Instruction-tuning Stable Diffusion with InstructPix2Pix](https://huggingface.co/blog/instruction-tuning-sd) blog post to learn more about some experiments we've done with InstructPix2Pix, dataset preparation, and results for different instructions.
+- Read the [Instruction-tuning Stable Diffusion with InstructPix2Pix](https://huggingface.co/blog/instruction-tuning-sd) blog post to learn more about some experiments we've done with InstructPix2Pix, dataset preparation, and results for different instructions.
@@ -35,7 +35,7 @@ pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu
 ```

 > [!TIP]
-> The results reported below are from a 80GB 400W A100 with its clock rate set to the maximum. 
+> The results reported below are from a 80GB 400W A100 with its clock rate set to the maximum.
 > If you're interested in the full benchmarking code, take a look at [huggingface/diffusion-fast](https://github.com/huggingface/diffusion-fast).


@@ -168,7 +168,7 @@ Using SDPA attention and compiling both the UNet and VAE cuts the latency from 3
 </div>

 > [!TIP]
-> From PyTorch 2.3.1, you can control the caching behavior of `torch.compile()`. This is particularly beneficial for compilation modes like `"max-autotune"` which performs a grid-search over several compilation flags to find the optimal configuration. Learn more in the [Compile Time Caching in torch.compile](https://pytorch.org/tutorials/recipes/torch_compile_caching_tutorial.html) tutorial. 
+> From PyTorch 2.3.1, you can control the caching behavior of `torch.compile()`. This is particularly beneficial for compilation modes like `"max-autotune"` which performs a grid-search over several compilation flags to find the optimal configuration. Learn more in the [Compile Time Caching in torch.compile](https://pytorch.org/tutorials/recipes/torch_compile_caching_tutorial.html) tutorial.

 ### Prevent graph breaks

@@ -18,13 +18,13 @@ A modern diffusion model, like [Stable Diffusion XL (SDXL)](../using-diffusers/s
 * Two text encoders
 * A UNet for denoising

-Usually, the text encoders and the denoiser are much larger compared to the VAE. 
+Usually, the text encoders and the denoiser are much larger compared to the VAE.

 As models get bigger and better, it’s possible your model is so big that even a single copy won’t fit in memory. But that doesn’t mean it can’t be loaded. If you have more than one GPU, there is more memory available to store your model. In this case, it’s better to split your model checkpoint into several smaller *checkpoint shards*.

 When a text encoder checkpoint has multiple shards, like [T5-xxl for SD3](https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers/tree/main/text_encoder_3), it is automatically handled by the [Transformers](https://huggingface.co/docs/transformers/index) library as it is a required dependency of Diffusers when using the [`StableDiffusion3Pipeline`]. More specifically, Transformers will automatically handle the loading of multiple shards within the requested model class and get it ready so that inference can be performed.

-The denoiser checkpoint can also have multiple shards and supports inference thanks to the [Accelerate](https://huggingface.co/docs/accelerate/index) library. 
+The denoiser checkpoint can also have multiple shards and supports inference thanks to the [Accelerate](https://huggingface.co/docs/accelerate/index) library.

 > [!TIP]
 > Refer to the [Handling big models for inference](https://huggingface.co/docs/accelerate/main/en/concept_guides/big_model_inference) guide for general guidance when working with big models that are hard to fit into memory.
@@ -43,7 +43,7 @@ unet.save_pretrained("sdxl-unet-sharded", max_shard_size="5GB")
 The size of the fp32 variant of the SDXL UNet checkpoint is ~10.4GB. Set the `max_shard_size` parameter to 5GB to create 3 shards. After saving, you can load them in [`StableDiffusionXLPipeline`]:

 ```python
-from diffusers import UNet2DConditionModel, StableDiffusionXLPipeline 
+from diffusers import UNet2DConditionModel, StableDiffusionXLPipeline
 import torch

 unet = UNet2DConditionModel.from_pretrained(
@@ -57,14 +57,14 @@ image = pipeline("a cute dog running on the grass", num_inference_steps=30).imag
 image.save("dog.png")
 ```

-If placing all the model-level components on the GPU at once is not feasible, use [`~DiffusionPipeline.enable_model_cpu_offload`] to help you: 
+If placing all the model-level components on the GPU at once is not feasible, use [`~DiffusionPipeline.enable_model_cpu_offload`] to help you:

 ```diff
 - pipeline.to("cuda")
 + pipeline.enable_model_cpu_offload()
 ```

-In general, we recommend sharding when a checkpoint is more than 5GB (in fp32). 
+In general, we recommend sharding when a checkpoint is more than 5GB (in fp32).

 ## Device placement

@@ -34,7 +34,7 @@ pipe_id = "stabilityai/stable-diffusion-xl-base-1.0"
 pipe = DiffusionPipeline.from_pretrained(pipe_id, torch_dtype=torch.float16).to("cuda")
 ```

-Next, load a [CiroN2022/toy-face](https://huggingface.co/CiroN2022/toy-face) adapter with the [`~diffusers.loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] method. With the 🤗 PEFT integration, you can assign a specific `adapter_name` to the checkpoint, which let's you easily switch between different LoRA checkpoints. Let's call this adapter `"toy"`.
+Next, load a [CiroN2022/toy-face](https://huggingface.co/CiroN2022/toy-face) adapter with the [`~diffusers.loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] method. With the 🤗 PEFT integration, you can assign a specific `adapter_name` to the checkpoint, which lets you easily switch between different LoRA checkpoints. Let's call this adapter `"toy"`.

 ```python
 pipe.load_lora_weights("CiroN2022/toy-face", weight_name="toy_face_sdxl.safetensors", adapter_name="toy")
@@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.

 # Pipeline callbacks

-The denoising loop of a pipeline can be modified with custom defined functions using the `callback_on_step_end` parameter. The callback function is executed at the end of each step, and modifies the pipeline attributes and variables for the next step. This is really useful for *dynamically* adjusting certain pipeline attributes or modifying tensor variables. This versatility allows for interesting use-cases such as changing the prompt embeddings at each timestep, assigning different weights to the prompt embeddings, and editing the guidance scale. With callbacks, you can implement new features without modifying the underlying code!
+The denoising loop of a pipeline can be modified with custom defined functions using the `callback_on_step_end` parameter. The callback function is executed at the end of each step, and modifies the pipeline attributes and variables for the next step. This is really useful for *dynamically* adjusting certain pipeline attributes or modifying tensor variables. This versatility allows for interesting use cases such as changing the prompt embeddings at each timestep, assigning different weights to the prompt embeddings, and editing the guidance scale. With callbacks, you can implement new features without modifying the underlying code!

 > [!TIP]
 > 🤗 Diffusers currently only supports `callback_on_step_end`, but feel free to open a [feature request](https://github.com/huggingface/diffusers/issues/new/choose) if you have a cool use-case and require a callback function with a different execution point!
@@ -75,7 +75,7 @@ out.images[0].save("official_callback.png")
    <figcaption class="mt-2 text-center text-sm text-gray-500">without SDXLCFGCutoffCallback</figcaption>
  </div>
  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/with_cfg_callback.png" alt="generated image of a a sports car at the road with cfg callback" />
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/with_cfg_callback.png" alt="generated image of a sports car at the road with cfg callback" />
    <figcaption class="mt-2 text-center text-sm text-gray-500">with SDXLCFGCutoffCallback</figcaption>
  </div>
 </div>
@@ -256,7 +256,7 @@ make_image_grid([init_image, mask_image, output], rows=1, cols=3)

 ## Guess mode

-[Guess mode](https://github.com/lllyasviel/ControlNet/discussions/188) does not require supplying a prompt to a ControlNet at all! This forces the ControlNet encoder to do it's best to "guess" the contents of the input control map (depth map, pose estimation, canny edge, etc.).
+[Guess mode](https://github.com/lllyasviel/ControlNet/discussions/188) does not require supplying a prompt to a ControlNet at all! This forces the ControlNet encoder to do its best to "guess" the contents of the input control map (depth map, pose estimation, canny edge, etc.).

 Guess mode adjusts the scale of the output residuals from a ControlNet by a fixed ratio depending on the block depth. The shallowest `DownBlock` corresponds to 0.1, and as the blocks get deeper, the scale increases exponentially such that the scale of the `MidBlock` output becomes 1.0.

@@ -289,9 +289,9 @@ scheduler = DPMSolverMultistepScheduler.from_pretrained(pipe_id, subfolder="sche
 3. Load an image processor:

 ```python
-from transformers import CLIPFeatureExtractor
+from transformers import CLIPImageProcessor

-feature_extractor = CLIPFeatureExtractor.from_pretrained(pipe_id, subfolder="feature_extractor")
+feature_extractor = CLIPImageProcessor.from_pretrained(pipe_id, subfolder="feature_extractor")
 ```

 <Tip warning={true}>
@@ -212,14 +212,14 @@ TCD-LoRA is very versatile, and it can be combined with other adapter types like
 import torch
 import numpy as np
 from PIL import Image
-from transformers import DPTFeatureExtractor, DPTForDepthEstimation
+from transformers import DPTImageProcessor, DPTForDepthEstimation
 from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline
 from diffusers.utils import load_image, make_image_grid
 from scheduling_tcd import TCDScheduler

 device = "cuda"
 depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to(device)
-feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-hybrid-midas")
+feature_extractor = DPTImageProcessor.from_pretrained("Intel/dpt-hybrid-midas")

 def get_depth_map(image):
    image = feature_extractor(images=image, return_tensors="pt").pixel_values.to(device)
@@ -14,9 +14,9 @@ specific language governing permissions and limitations under the License.

 It can be fun and creative to use multiple [LoRAs]((https://huggingface.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora)) together to generate something entirely new and unique. This works by merging multiple LoRA weights together to produce images that are a blend of different styles. Diffusers provides a few methods to merge LoRAs depending on *how* you want to merge their weights, which can affect image quality.

-This guide will show you how to merge LoRAs using the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] and [`~peft.LoraModel.add_weighted_adapter`] methods. To improve inference speed and reduce memory-usage of merged LoRAs, you'll also see how to use the [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] method to fuse the LoRA weights with the original weights of the underlying model.
+This guide will show you how to merge LoRAs using the [`~loaders.PeftAdapterMixin.set_adapters`] and [add_weighted_adapter](https://huggingface.co/docs/peft/package_reference/lora#peft.LoraModel.add_weighted_adapter) methods. To improve inference speed and reduce memory-usage of merged LoRAs, you'll also see how to use the [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] method to fuse the LoRA weights with the original weights of the underlying model.

-For this guide, load a Stable Diffusion XL (SDXL) checkpoint and the [KappaNeuro/studio-ghibli-style]() and [Norod78/sdxl-chalkboarddrawing-lora]() LoRAs with the [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method. You'll need to assign each LoRA an `adapter_name` to combine them later.
+For this guide, load a Stable Diffusion XL (SDXL) checkpoint and the [KappaNeuro/studio-ghibli-style](https://huggingface.co/KappaNeuro/studio-ghibli-style) and [Norod78/sdxl-chalkboarddrawing-lora](https://huggingface.co/Norod78/sdxl-chalkboarddrawing-lora) LoRAs with the [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method. You'll need to assign each LoRA an `adapter_name` to combine them later.

 ```py
 from diffusers import DiffusionPipeline
@@ -29,7 +29,7 @@ pipeline.load_lora_weights("lordjia/by-feng-zikai", weight_name="fengzikai_v1.0_

 ## set_adapters

-The [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] method merges LoRA adapters by concatenating their weighted matrices. Use the adapter name to specify which LoRAs to merge, and the `adapter_weights` parameter to control the scaling for each LoRA. For example, if `adapter_weights=[0.5, 0.5]`, then the merged LoRA output is an average of both LoRAs. Try adjusting the adapter weights to see how it affects the generated image!
+The [`~loaders.PeftAdapterMixin.set_adapters`] method merges LoRA adapters by concatenating their weighted matrices. Use the adapter name to specify which LoRAs to merge, and the `adapter_weights` parameter to control the scaling for each LoRA. For example, if `adapter_weights=[0.5, 0.5]`, then the merged LoRA output is an average of both LoRAs. Try adjusting the adapter weights to see how it affects the generated image!

 ```py
 pipeline.set_adapters(["ikea", "feng"], adapter_weights=[0.7, 0.8])
@@ -47,19 +47,19 @@ image
 ## add_weighted_adapter

 > [!WARNING]
-> This is an experimental method that adds PEFTs [`~peft.LoraModel.add_weighted_adapter`] method to Diffusers to enable more efficient merging methods. Check out this [issue](https://github.com/huggingface/diffusers/issues/6892) if you're interested in learning more about the motivation and design behind this integration.
+> This is an experimental method that adds PEFTs [add_weighted_adapter](https://huggingface.co/docs/peft/package_reference/lora#peft.LoraModel.add_weighted_adapter) method to Diffusers to enable more efficient merging methods. Check out this [issue](https://github.com/huggingface/diffusers/issues/6892) if you're interested in learning more about the motivation and design behind this integration.

-The [`~peft.LoraModel.add_weighted_adapter`] method provides access to more efficient merging method such as [TIES and DARE](https://huggingface.co/docs/peft/developer_guides/model_merging). To use these merging methods, make sure you have the latest stable version of Diffusers and PEFT installed.
+The [add_weighted_adapter](https://huggingface.co/docs/peft/package_reference/lora#peft.LoraModel.add_weighted_adapter) method provides access to more efficient merging method such as [TIES and DARE](https://huggingface.co/docs/peft/developer_guides/model_merging). To use these merging methods, make sure you have the latest stable version of Diffusers and PEFT installed.

 ```bash
 pip install -U diffusers peft
 ```

-There are three steps to merge LoRAs with the [`~peft.LoraModel.add_weighted_adapter`] method:
+There are three steps to merge LoRAs with the [add_weighted_adapter](https://huggingface.co/docs/peft/package_reference/lora#peft.LoraModel.add_weighted_adapter) method:

-1. Create a [`~peft.PeftModel`] from the underlying model and LoRA checkpoint.
+1. Create a [PeftModel](https://huggingface.co/docs/peft/package_reference/peft_model#peft.PeftModel) from the underlying model and LoRA checkpoint.
 2. Load a base UNet model and the LoRA adapters.
-3. Merge the adapters using the [`~peft.LoraModel.add_weighted_adapter`] method and the merging method of your choice.
+3. Merge the adapters using the [add_weighted_adapter](https://huggingface.co/docs/peft/package_reference/lora#peft.LoraModel.add_weighted_adapter) method and the merging method of your choice.

 Let's dive deeper into what these steps entail.

@@ -92,7 +92,7 @@ pipeline = DiffusionPipeline.from_pretrained(
 pipeline.load_lora_weights("ostris/ikea-instructions-lora-sdxl", weight_name="ikea_instructions_xl_v1_5.safetensors", adapter_name="ikea")
 ```

-Now you'll create a [`~peft.PeftModel`] from the loaded LoRA checkpoint by combining the SDXL UNet and the LoRA UNet from the pipeline.
+Now you'll create a [PeftModel](https://huggingface.co/docs/peft/package_reference/peft_model#peft.PeftModel) from the loaded LoRA checkpoint by combining the SDXL UNet and the LoRA UNet from the pipeline.

 ```python
 from peft import get_peft_model, LoraConfig
@@ -112,7 +112,7 @@ ikea_peft_model.load_state_dict(original_state_dict, strict=True)
 > [!TIP]
 > You can optionally push the ikea_peft_model to the Hub by calling `ikea_peft_model.push_to_hub("ikea_peft_model", token=TOKEN)`.

-Repeat this process to create a [`~peft.PeftModel`] from the [lordjia/by-feng-zikai](https://huggingface.co/lordjia/by-feng-zikai) LoRA.
+Repeat this process to create a [PeftModel](https://huggingface.co/docs/peft/package_reference/peft_model#peft.PeftModel) from the [lordjia/by-feng-zikai](https://huggingface.co/lordjia/by-feng-zikai) LoRA.

 ```python
 pipeline.delete_adapters("ikea")
@@ -148,7 +148,7 @@ model = PeftModel.from_pretrained(base_unet, "stevhliu/ikea_peft_model", use_saf
 model.load_adapter("stevhliu/feng_peft_model", use_safetensors=True, subfolder="feng", adapter_name="feng")
 ```

-3. Merge the adapters using the [`~peft.LoraModel.add_weighted_adapter`] method and the merging method of your choice (learn more about other merging methods in this [blog post](https://huggingface.co/blog/peft_merging)). For this example, let's use the `"dare_linear"` method to merge the LoRAs.
+3. Merge the adapters using the [add_weighted_adapter](https://huggingface.co/docs/peft/package_reference/lora#peft.LoraModel.add_weighted_adapter) method and the merging method of your choice (learn more about other merging methods in this [blog post](https://huggingface.co/blog/peft_merging)). For this example, let's use the `"dare_linear"` method to merge the LoRAs.

 > [!WARNING]
 > Keep in mind the LoRAs need to have the same rank to be merged!
@@ -182,9 +182,9 @@ image

 ## fuse_lora

-Both the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] and [`~peft.LoraModel.add_weighted_adapter`] methods require loading the base model and the LoRA adapters separately which incurs some overhead. The [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] method allows you to fuse the LoRA weights directly with the original weights of the underlying model. This way, you're only loading the model once which can increase inference and lower memory-usage.
+Both the [`~loaders.PeftAdapterMixin.set_adapters`] and [add_weighted_adapter](https://huggingface.co/docs/peft/package_reference/lora#peft.LoraModel.add_weighted_adapter) methods require loading the base model and the LoRA adapters separately which incurs some overhead. The [`~loaders.lora_base.LoraBaseMixin.fuse_lora`] method allows you to fuse the LoRA weights directly with the original weights of the underlying model. This way, you're only loading the model once which can increase inference and lower memory-usage.

-You can use PEFT to easily fuse/unfuse multiple adapters directly into the model weights (both UNet and text encoder) using the [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] method, which can lead to a speed-up in inference and lower VRAM usage.
+You can use PEFT to easily fuse/unfuse multiple adapters directly into the model weights (both UNet and text encoder) using the [`~loaders.lora_base.LoraBaseMixin.fuse_lora`] method, which can lead to a speed-up in inference and lower VRAM usage.

 For example, if you have a base model and adapters loaded and set as active with the following adapter weights:

@@ -199,7 +199,7 @@ pipeline.load_lora_weights("lordjia/by-feng-zikai", weight_name="fengzikai_v1.0_
 pipeline.set_adapters(["ikea", "feng"], adapter_weights=[0.7, 0.8])
 ```

-Fuse these LoRAs into the UNet with the [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] method. The `lora_scale` parameter controls how much to scale the output by with the LoRA weights. It is important to make the `lora_scale` adjustments in the [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] method because it won’t work if you try to pass `scale` to the `cross_attention_kwargs` in the pipeline.
+Fuse these LoRAs into the UNet with the [`~loaders.lora_base.LoraBaseMixin.fuse_lora`] method. The `lora_scale` parameter controls how much to scale the output by with the LoRA weights. It is important to make the `lora_scale` adjustments in the [`~loaders.lora_base.LoraBaseMixin.fuse_lora`] method because it won’t work if you try to pass `scale` to the `cross_attention_kwargs` in the pipeline.

 ```py
 pipeline.fuse_lora(adapter_names=["ikea", "feng"], lora_scale=1.0)
@@ -226,7 +226,7 @@ image = pipeline("A bowl of ramen shaped like a cute kawaii bear, by Feng Zikai"
 image
 ```

-You can call [`~loaders.StableDiffusionLoraLoaderMixin.unfuse_lora`] to restore the original model's weights (for example, if you want to use a different `lora_scale` value). However, this only works if you've only fused one LoRA adapter to the original model. If you've fused multiple LoRAs, you'll need to reload the model.
+You can call [`~~loaders.lora_base.LoraBaseMixin.unfuse_lora`] to restore the original model's weights (for example, if you want to use a different `lora_scale` value). However, this only works if you've only fused one LoRA adapter to the original model. If you've fused multiple LoRAs, you'll need to reload the model.

 ```py
 pipeline.unfuse_lora()
@@ -22,7 +22,7 @@ This guide will show you how to use PAG for various tasks and use cases.
 You can apply PAG to the [`StableDiffusionXLPipeline`] for tasks such as text-to-image, image-to-image, and inpainting. To enable PAG for a specific task, load the pipeline using the [AutoPipeline](../api/pipelines/auto_pipeline) API with the `enable_pag=True` flag and the `pag_applied_layers` argument.

 > [!TIP]
-> 🤗 Diffusers currently only supports using PAG with selected SDXL pipelines, but feel free to open a [feature request](https://github.com/huggingface/diffusers/issues/new/choose) if you want to add PAG support to a new pipeline!
+> 🤗 Diffusers currently only supports using PAG with selected SDXL pipelines and [`PixArtSigmaPAGPipeline`]. But feel free to open a [feature request](https://github.com/huggingface/diffusers/issues/new/choose) if you want to add PAG support to a new pipeline!

 <hfoptions id="tasks">
 <hfoption id="Text-to-image">
@@ -130,10 +130,10 @@ prompt = "a dog catching a frisbee in the jungle"

 generator = torch.Generator(device="cpu").manual_seed(0)
 image = pipeline(
-    prompt, 
-    image=init_image, 
-    strength=0.8, 
-    guidance_scale=guidance_scale, 
+    prompt,
+    image=init_image,
+    strength=0.8,
+    guidance_scale=guidance_scale,
    pag_scale=pag_scale,
    generator=generator).images[0]
 ```
@@ -161,14 +161,14 @@ pipeline_inpaint = AutoPipelineForInpaiting.from_pretrained("stabilityai/stable-
 pipeline = AutoPipelineForInpaiting.from_pipe(pipeline_inpaint, enable_pag=True)
 ```

-This still works when your pipeline has a different task: 
+This still works when your pipeline has a different task:

 ```py
 pipeline_t2i = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16)
 pipeline = AutoPipelineForInpaiting.from_pipe(pipeline_t2i, enable_pag=True)
 ```

-Let's generate an image! 
+Let's generate an image!

 ```py
 img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
@@ -258,7 +258,7 @@ for pag_scale in [0.0, 3.0]:
  </div>
 </div>

-## PAG with IP-Adapter 
+## PAG with IP-Adapter

 [IP-Adapter](https://hf.co/papers/2308.06721) is a popular model that can be plugged into diffusion models to enable image prompting without any changes to the underlying model. You can enable PAG on a pipeline with IP-Adapter loaded.

@@ -317,7 +317,7 @@ PAG reduces artifacts and improves the overall compposition.
 </div>


-## Configure parameters 
+## Configure parameters

 ### pag_applied_layers

@@ -10,7 +10,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# 철학 [[philosophy]] 
+# 철학 [[philosophy]]

 🧨 Diffusers는 다양한 모달리티에서 **최신의** 사전 훈련된 diffusion 모델을 제공합니다.
 그 목적은 추론과 훈련을 위한 **모듈식 툴박스**로 사용되는 것입니다.
@@ -307,7 +307,7 @@ print(pipeline)

 위의 코드 출력 결과를 확인해보면, `pipeline`은 [`StableDiffusionPipeline`]의 인스턴스이며, 다음과 같이 총 7개의 컴포넌트로 구성된다는 것을 알 수 있습니다.

- `"feature_extractor"`: [`~transformers.CLIPFeatureExtractor`]의 인스턴스
+- `"feature_extractor"`: [`~transformers.CLIPImageProcessor`]의 인스턴스
 - `"safety_checker"`: 유해한 컨텐츠를 스크리닝하기 위한 [컴포넌트](https://github.com/huggingface/diffusers/blob/e55687e1e15407f60f32242027b7bb8170e58266/src/diffusers/pipelines/stable_diffusion/safety_checker.py#L32)
 - `"scheduler"`: [`PNDMScheduler`]의 인스턴스
 - `"text_encoder"`: [`~transformers.CLIPTextModel`]의 인스턴스
@@ -52,7 +52,7 @@ pipeline = pipeline.to("cuda")

 Text-to-image의 경우 텍스트 프롬프트를 전달합니다. 기본적으로 SDXL Turbo는 512x512 이미지를 생성하며, 이 해상도에서 최상의 결과를 제공합니다. `height` 및 `width` 매개 변수를 768x768 또는 1024x1024로 설정할 수 있지만 이 경우 품질 저하를 예상할 수 있습니다.

-모델이 `guidance_scale` 없이 학습되었으므로 이를 0.0으로 설정해 비활성화해야 합니다. 단일 추론 스텝만으로도 고품질 이미지를 생성할 수 있습니다. 
+모델이 `guidance_scale` 없이 학습되었으므로 이를 0.0으로 설정해 비활성화해야 합니다. 단일 추론 스텝만으로도 고품질 이미지를 생성할 수 있습니다.
 스텝 수를 2, 3 또는 4로 늘리면 이미지 품질이 향상됩니다.

 ```py
@@ -74,7 +74,7 @@ image

 ## Image-to-image

-Image-to-image 생성의 경우 `num_inference_steps * strength`가 1보다 크거나 같은지 확인하세요. 
+Image-to-image 생성의 경우 `num_inference_steps * strength`가 1보다 크거나 같은지 확인하세요.
 Image-to-image 파이프라인은 아래 예제에서 `0.5 * 2.0 = 1` 스텝과 같이 `int(num_inference_steps * strength)` 스텝으로 실행됩니다.

 ```py
@@ -21,7 +21,7 @@ specific language governing permissions and limitations under the License.
 시작하기 전에 다음 라이브러리가 설치되어 있는지 확인하세요:

 ```py
-!pip install -q -U diffusers transformers accelerate 
+!pip install -q -U diffusers transformers accelerate
 ```

 이 모델에는 [SVD](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid)와 [SVD-XT](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt) 두 가지 종류가 있습니다. SVD 체크포인트는 14개의 프레임을 생성하도록 학습되었고, SVD-XT 체크포인트는 25개의 프레임을 생성하도록 파인튜닝되었습니다.
@@ -24,7 +24,7 @@ import PIL
 from PIL import Image

 from diffusers import StableDiffusionPipeline
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer


 def image_grid(imgs, rows, cols):
@@ -71,7 +71,7 @@ from diffusers.utils.import_utils import is_xformers_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.30.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)

@@ -79,7 +79,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.30.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)

@@ -71,6 +71,7 @@ Please also check out our [Community Scripts](https://github.com/huggingface/dif
 | Stable Diffusion BoxDiff Pipeline | Training-free controlled generation with bounding boxes using [BoxDiff](https://github.com/showlab/BoxDiff) | [Stable Diffusion BoxDiff Pipeline](#stable-diffusion-boxdiff) | - | [Jingyang Zhang](https://github.com/zjysteven/) |
 |   FRESCO V2V Pipeline                                                                                                    | Implementation of [[CVPR 2024] FRESCO: Spatial-Temporal Correspondence for Zero-Shot Video Translation](https://arxiv.org/abs/2403.12962)                                                                                                                                                                                                                                                                                                                                                                                                                                      | [FRESCO V2V Pipeline](#fresco)      | - |              [Yifan Zhou](https://github.com/SingleZombie) |
 | AnimateDiff IPEX Pipeline | Accelerate AnimateDiff inference pipeline with BF16/FP32 precision on Intel Xeon CPUs with [IPEX](https://github.com/intel/intel-extension-for-pytorch) | [AnimateDiff on IPEX](#animatediff-on-ipex) | - | [Dan Li](https://github.com/ustcuna/) |
+| HunyuanDiT Differential Diffusion Pipeline | Applies [Differential Diffsuion](https://github.com/exx8/differential-diffusion) to [HunyuanDiT](https://github.com/huggingface/diffusers/pull/8240). | [HunyuanDiT with Differential Diffusion](#hunyuandit-with-differential-diffusion) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1v44a5fpzyr4Ffr4v2XBQ7BajzG874N4P?usp=sharing) | [Monjoy Choudhury](https://github.com/MnCSSJ4x) |

 To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly.

@@ -1435,9 +1436,9 @@ import requests
 import torch
 from diffusers import DiffusionPipeline
 from PIL import Image
-from transformers import CLIPFeatureExtractor, CLIPModel
+from transformers import CLIPImageProcessor, CLIPModel

-feature_extractor = CLIPFeatureExtractor.from_pretrained(
+feature_extractor = CLIPImageProcessor.from_pretrained(
    "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
 )
 clip_model = CLIPModel.from_pretrained(
@@ -1487,17 +1488,16 @@ NOTE: The ONNX conversions and TensorRT engine build may take up to 30 minutes.
 ```python
 import torch
 from diffusers import DDIMScheduler
-from diffusers.pipelines.stable_diffusion import StableDiffusionPipeline
+from diffusers.pipelines import DiffusionPipeline

 # Use the DDIMScheduler scheduler here instead
-scheduler = DDIMScheduler.from_pretrained("stabilityai/stable-diffusion-2-1",
-                                            subfolder="scheduler")
+scheduler = DDIMScheduler.from_pretrained("stabilityai/stable-diffusion-2-1", subfolder="scheduler")

-pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1",
-                                                custom_pipeline="stable_diffusion_tensorrt_txt2img",
-                                                variant='fp16',
-                                                torch_dtype=torch.float16,
-                                                scheduler=scheduler,)
+pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1",
+    custom_pipeline="stable_diffusion_tensorrt_txt2img",
+    variant='fp16',
+    torch_dtype=torch.float16,
+    scheduler=scheduler,)

 # re-use cached folder to save ONNX models and TensorRT Engines
 pipe.set_cached_folder("stabilityai/stable-diffusion-2-1", variant='fp16',)
@@ -1647,7 +1647,6 @@ from diffusers import DiffusionPipeline
 scheduler = DDIMScheduler.from_pretrained("stabilityai/stable-diffusion-2-1",
                                            subfolder="scheduler")

-
 pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1",
                                            custom_pipeline="stable_diffusion_tensorrt_img2img",
                                            variant='fp16',
@@ -1662,7 +1661,6 @@ pipe = pipe.to("cuda")
 url = "https://pajoca.com/wp-content/uploads/2022/09/tekito-yamakawa-1.png"
 response = requests.get(url)
 input_image = Image.open(BytesIO(response.content)).convert("RGB")
-
 prompt = "photorealistic new zealand hills"
 image = pipe(prompt, image=input_image, strength=0.75,).images[0]
 image.save('tensorrt_img2img_new_zealand_hills.png')
@@ -2123,7 +2121,7 @@ import torch
 import open_clip
 from open_clip import SimpleTokenizer
 from diffusers import DiffusionPipeline
-from transformers import CLIPFeatureExtractor, CLIPModel
+from transformers import CLIPImageProcessor, CLIPModel


 def download_image(url):
@@ -2131,7 +2129,7 @@ def download_image(url):
    return PIL.Image.open(BytesIO(response.content)).convert("RGB")

 # Loading additional models
-feature_extractor = CLIPFeatureExtractor.from_pretrained(
+feature_extractor = CLIPImageProcessor.from_pretrained(
    "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
 )
 clip_model = CLIPModel.from_pretrained(
@@ -2231,12 +2229,12 @@ from io import BytesIO
 from PIL import Image
 import torch
 from diffusers import PNDMScheduler
-from diffusers.pipelines.stable_diffusion import StableDiffusionInpaintPipeline
+from diffusers.pipelines import DiffusionPipeline

 # Use the PNDMScheduler scheduler here instead
 scheduler = PNDMScheduler.from_pretrained("stabilityai/stable-diffusion-2-inpainting", subfolder="scheduler")

-pipe = StableDiffusionInpaintPipeline.from_pretrained("stabilityai/stable-diffusion-2-inpainting",
+pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-inpainting",
    custom_pipeline="stable_diffusion_tensorrt_inpaint",
    variant='fp16',
    torch_dtype=torch.float16,
@@ -4210,6 +4208,52 @@ print("Latency of AnimateDiffPipelineIpex--fp32", latency, "s for total", step,
 latency = elapsed_time(pipe4, num_inference_steps=step)
 print("Latency of AnimateDiffPipeline--fp32",latency, "s for total", step, "steps")
 ```
+### HunyuanDiT with Differential Diffusion
+
+#### Usage
+
+```python
+import torch
+from diffusers import FlowMatchEulerDiscreteScheduler
+from diffusers.utils import load_image
+from PIL import Image
+from torchvision import transforms
+
+from pipeline_hunyuandit_differential_img2img import (
+    HunyuanDiTDifferentialImg2ImgPipeline,
+)
+
+
+pipe = HunyuanDiTDifferentialImg2ImgPipeline.from_pretrained(
+    "Tencent-Hunyuan/HunyuanDiT-Diffusers", torch_dtype=torch.float16
+).to("cuda")
+
+
+source_image = load_image(
+    "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/differential/20240329211129_4024911930.png"
+)
+map = load_image(
+    "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/differential/gradient_mask_2.png"
+)
+prompt = "a green pear"
+negative_prompt = "blurry"
+
+image = pipe(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    image=source_image,
+    num_inference_steps=28,
+    guidance_scale=4.5,
+    strength=1.0,
+    map=map,
+).images[0]
+```
+
+| ![Gradient](https://github.com/user-attachments/assets/e38ce4d5-1ae6-4df0-ab43-adc1b45716b5) | ![Input](https://github.com/user-attachments/assets/9c95679c-e9d7-4f5a-90d6-560203acd6b3) | ![Output](https://github.com/user-attachments/assets/5313ff64-a0c4-418b-8b55-a38f1a5e7532) |
+| ------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------- |
+| Gradient                                                                                   | Input                                                                                   | Output                                                                                   |
+
+A colab notebook demonstrating all results can be found [here](https://colab.research.google.com/drive/1v44a5fpzyr4Ffr4v2XBQ7BajzG874N4P?usp=sharing). Depth Maps have also been added in the same colab.

 # Perturbed-Attention Guidance

@@ -4286,4 +4330,4 @@ grid_image.save(grid_dir + "sample.png")

 `pag_scale` : guidance scale of PAG (ex: 5.0)

-`pag_applied_layers_index` : index of the layer to apply perturbation (ex: ['m0'])
+`pag_applied_layers_index` : index of the layer to apply perturbation (ex: ['m0'])
@@ -7,7 +7,7 @@ import PIL.Image
 import torch
 from torch.nn import functional as F
 from torchvision import transforms
-from transformers import CLIPFeatureExtractor, CLIPModel, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPImageProcessor, CLIPModel, CLIPTextModel, CLIPTokenizer

 from diffusers import (
    AutoencoderKL,
@@ -86,7 +86,7 @@ class CLIPGuidedImagesMixingStableDiffusion(DiffusionPipeline, StableDiffusionMi
        tokenizer: CLIPTokenizer,
        unet: UNet2DConditionModel,
        scheduler: Union[PNDMScheduler, LMSDiscreteScheduler, DDIMScheduler, DPMSolverMultistepScheduler],
-        feature_extractor: CLIPFeatureExtractor,
+        feature_extractor: CLIPImageProcessor,
        coca_model=None,
        coca_tokenizer=None,
        coca_transform=None,
@@ -7,7 +7,7 @@ import torch
 from torch import nn
 from torch.nn import functional as F
 from torchvision import transforms
-from transformers import CLIPFeatureExtractor, CLIPModel, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPImageProcessor, CLIPModel, CLIPTextModel, CLIPTokenizer

 from diffusers import (
    AutoencoderKL,
@@ -32,9 +32,9 @@ EXAMPLE_DOC_STRING = """
        import torch
        from diffusers import DiffusionPipeline
        from PIL import Image
-        from transformers import CLIPFeatureExtractor, CLIPModel
+        from transformers import CLIPImageProcessor, CLIPModel

-        feature_extractor = CLIPFeatureExtractor.from_pretrained(
+        feature_extractor = CLIPImageProcessor.from_pretrained(
            "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
        )
        clip_model = CLIPModel.from_pretrained(
@@ -139,7 +139,7 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline, StableDiffusionMixin):
        tokenizer: CLIPTokenizer,
        unet: UNet2DConditionModel,
        scheduler: Union[PNDMScheduler, LMSDiscreteScheduler, DDIMScheduler, DPMSolverMultistepScheduler],
-        feature_extractor: CLIPFeatureExtractor,
+        feature_extractor: CLIPImageProcessor,
    ):
        super().__init__()
        self.register_modules(
@@ -2436,7 +2436,7 @@ class FrescoV2VPipeline(StableDiffusionControlNetImg2ImgPipeline):
                )

                if guess_mode and self.do_classifier_free_guidance:
-                    # Infered ControlNet only for the conditional batch.
+                    # Inferred ControlNet only for the conditional batch.
                    # To apply the output of ControlNet to both the unconditional and conditional batches,
                    # add 0 to the unconditional batch to keep it unchanged.
                    down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
@@ -43,7 +43,7 @@ from diffusers.utils import BaseOutput, check_min_version


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.30.0.dev0")
+check_min_version("0.31.0.dev0")


 class MarigoldDepthOutput(BaseOutput):
@@ -9,7 +9,7 @@ import torch
 from numpy import exp, pi, sqrt
 from torchvision.transforms.functional import resize
 from tqdm.auto import tqdm
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer

 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
@@ -275,7 +275,7 @@ class StableDiffusionCanvasPipeline(DiffusionPipeline, StableDiffusionMixin):
        unet: UNet2DConditionModel,
        scheduler: Union[DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler],
        safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPFeatureExtractor,
+        feature_extractor: CLIPImageProcessor,
    ):
        super().__init__()
        self.register_modules(
@@ -15,7 +15,7 @@ from diffusers.utils import logging

 try:
    from ligo.segments import segment
-    from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+    from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
 except ImportError:
    raise ImportError("Please install transformers and ligo-segments to use the mixture pipeline")

@@ -144,7 +144,7 @@ class StableDiffusionTilingPipeline(DiffusionPipeline, StableDiffusionExtrasMixi
        unet: UNet2DConditionModel,
        scheduler: Union[DDIMScheduler, PNDMScheduler],
        safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPFeatureExtractor,
+        feature_extractor: CLIPImageProcessor,
    ):
        super().__init__()
        self.register_modules(
@@ -189,7 +189,7 @@ class StableDiffusionXLControlNetAdapterPipeline(
        safety_checker ([`StableDiffusionSafetyChecker`]):
            Classification module that estimates whether generated images could be considered offensive or harmful.
            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
-        feature_extractor ([`CLIPFeatureExtractor`]):
+        feature_extractor ([`CLIPImageProcessor`]):
            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
    """

@@ -332,7 +332,7 @@ class StableDiffusionXLControlNetAdapterInpaintPipeline(
        safety_checker ([`StableDiffusionSafetyChecker`]):
            Classification module that estimates whether generated images could be considered offensive or harmful.
            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
-        feature_extractor ([`CLIPFeatureExtractor`]):
+        feature_extractor ([`CLIPImageProcessor`]):
            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
        requires_aesthetics_score (`bool`, *optional*, defaults to `"False"`):
            Whether the `unet` requires a aesthetic_score condition to be passed during inference. Also see the config
@@ -1002,7 +1002,7 @@ class StableDiffusionXLInstantIDImg2ImgPipeline(StableDiffusionXLControlNetImg2I
                )

                if guess_mode and self.do_classifier_free_guidance:
-                    # Infered ControlNet only for the conditional batch.
+                    # Inferred ControlNet only for the conditional batch.
                    # To apply the output of ControlNet to both the unconditional and conditional batches,
                    # add 0 to the unconditional batch to keep it unchanged.
                    down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
@@ -991,7 +991,7 @@ class StableDiffusionXLInstantIDPipeline(StableDiffusionXLControlNetPipeline):
                )

                if guess_mode and self.do_classifier_free_guidance:
-                    # Infered ControlNet only for the conditional batch.
+                    # Inferred ControlNet only for the conditional batch.
                    # To apply the output of ControlNet to both the unconditional and conditional batches,
                    # add 0 to the unconditional batch to keep it unchanged.
                    down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
@@ -9,7 +9,7 @@ import numpy as np
 import PIL.Image
 import torch
 from packaging import version
-from transformers import CLIPFeatureExtractor, CLIPVisionModelWithProjection
+from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection

 # from ...configuration_utils import FrozenDict
 # from ...models import AutoencoderKL, UNet2DConditionModel
@@ -87,7 +87,7 @@ class Zero1to3StableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
        safety_checker ([`StableDiffusionSafetyChecker`]):
            Classification module that estimates whether generated images could be considered offensive or harmful.
            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
-        feature_extractor ([`CLIPFeatureExtractor`]):
+        feature_extractor ([`CLIPImageProcessor`]):
            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
        cc_projection ([`CCProjection`]):
            Projection layer to project the concated CLIP features and pose embeddings to the original CLIP feature size.
@@ -102,7 +102,7 @@ class Zero1to3StableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
        unet: UNet2DConditionModel,
        scheduler: KarrasDiffusionSchedulers,
        safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPFeatureExtractor,
+        feature_extractor: CLIPImageProcessor,
        cc_projection: CCProjection,
        requires_safety_checker: bool = True,
    ):
@@ -3,7 +3,7 @@ from typing import Dict, Optional

 import torch
 import torchvision.transforms.functional as FF
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer

 from diffusers import StableDiffusionPipeline
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
@@ -69,7 +69,7 @@ class RegionalPromptingStableDiffusionPipeline(StableDiffusionPipeline):
        unet: UNet2DConditionModel,
        scheduler: KarrasDiffusionSchedulers,
        safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPFeatureExtractor,
+        feature_extractor: CLIPImageProcessor,
        requires_safety_checker: bool = True,
    ):
        super().__init__(
@@ -864,7 +864,7 @@ class RerenderAVideoPipeline(StableDiffusionControlNetImg2ImgPipeline):
                )

                if guess_mode and do_classifier_free_guidance:
-                    # Infered ControlNet only for the conditional batch.
+                    # Inferred ControlNet only for the conditional batch.
                    # To apply the output of ControlNet to both the unconditional and conditional batches,
                    # add 0 to the unconditional batch to keep it unchanged.
                    down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
@@ -1038,7 +1038,7 @@ class RerenderAVideoPipeline(StableDiffusionControlNetImg2ImgPipeline):
                        )

                        if guess_mode and do_classifier_free_guidance:
-                            # Infered ControlNet only for the conditional batch.
+                            # Inferred ControlNet only for the conditional batch.
                            # To apply the output of ControlNet to both the unconditional and conditional batches,
                            # add 0 to the unconditional batch to keep it unchanged.
                            down_block_res_samples = [
@@ -752,7 +752,7 @@ class StableDiffusionControlNetReferencePipeline(StableDiffusionControlNetPipeli
                )

                if guess_mode and do_classifier_free_guidance:
-                    # Infered ControlNet only for the conditional batch.
+                    # Inferred ControlNet only for the conditional batch.
                    # To apply the output of ControlNet to both the unconditional and conditional batches,
                    # add 0 to the unconditional batch to keep it unchanged.
                    down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
@@ -18,7 +18,7 @@ from typing import Any, Callable, Dict, List, Optional, Union
 import intel_extension_for_pytorch as ipex
 import torch
 from packaging import version
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer

 from diffusers.configuration_utils import FrozenDict
 from diffusers.loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
@@ -86,7 +86,7 @@ class StableDiffusionIPEXPipeline(
        safety_checker ([`StableDiffusionSafetyChecker`]):
            Classification module that estimates whether generated images could be considered offensive or harmful.
            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
-        feature_extractor ([`CLIPFeatureExtractor`]):
+        feature_extractor ([`CLIPImageProcessor`]):
            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
    """

@@ -100,7 +100,7 @@ class StableDiffusionIPEXPipeline(
        unet: UNet2DConditionModel,
        scheduler: KarrasDiffusionSchedulers,
        safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPFeatureExtractor,
+        feature_extractor: CLIPImageProcessor,
        requires_safety_checker: bool = True,
    ):
        super().__init__()
@@ -42,7 +42,7 @@ from polygraphy.backend.trt import (
    network_from_onnx_path,
    save_engine,
 )
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

 from diffusers import DiffusionPipeline
 from diffusers.configuration_utils import FrozenDict, deprecate
@@ -60,7 +60,7 @@ from diffusers.utils import logging
 """
 Installation instructions
 python3 -m pip install --upgrade transformers diffusers>=0.16.0
-python3 -m pip install --upgrade tensorrt-cu12==10.2.0
+python3 -m pip install --upgrade tensorrt~=10.2.0
 python3 -m pip install --upgrade polygraphy>=0.47.0 onnx-graphsurgeon --extra-index-url https://pypi.ngc.nvidia.com
 python3 -m pip install onnxruntime
 """
@@ -659,7 +659,7 @@ class TensorRTStableDiffusionImg2ImgPipeline(DiffusionPipeline):
    r"""
    Pipeline for image-to-image generation using TensorRT accelerated Stable Diffusion.

-    This model inherits from [`StableDiffusionImg2ImgPipeline`]. Check the superclass documentation for the generic methods the
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
@@ -679,7 +679,7 @@ class TensorRTStableDiffusionImg2ImgPipeline(DiffusionPipeline):
        safety_checker ([`StableDiffusionSafetyChecker`]):
            Classification module that estimates whether generated images could be considered offensive or harmful.
            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
-        feature_extractor ([`CLIPFeatureExtractor`]):
+        feature_extractor ([`CLIPImageProcessor`]):
            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
    """

@@ -693,7 +693,7 @@ class TensorRTStableDiffusionImg2ImgPipeline(DiffusionPipeline):
        unet: UNet2DConditionModel,
        scheduler: DDIMScheduler,
        safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPFeatureExtractor,
+        feature_extractor: CLIPImageProcessor,
        image_encoder: CLIPVisionModelWithProjection = None,
        requires_safety_checker: bool = True,
        stages=["clip", "unet", "vae", "vae_encoder"],
@@ -18,8 +18,7 @@
 import gc
 import os
 from collections import OrderedDict
-from copy import copy
-from typing import List, Optional, Union
+from typing import List, Optional, Tuple, Union

 import numpy as np
 import onnx
@@ -27,9 +26,11 @@ import onnx_graphsurgeon as gs
 import PIL.Image
 import tensorrt as trt
 import torch
+from cuda import cudart
 from huggingface_hub import snapshot_download
 from huggingface_hub.utils import validate_hf_hub_args
 from onnx import shape_inference
+from packaging import version
 from polygraphy import cuda
 from polygraphy.backend.common import bytes_from_path
 from polygraphy.backend.onnx.loader import fold_constants
@@ -41,24 +42,29 @@ from polygraphy.backend.trt import (
    network_from_onnx_path,
    save_engine,
 )
-from polygraphy.backend.trt import util as trt_util
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

+from diffusers import DiffusionPipeline
+from diffusers.configuration_utils import FrozenDict, deprecate
+from diffusers.image_processor import VaeImageProcessor
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.pipelines.stable_diffusion import (
-    StableDiffusionInpaintPipeline,
    StableDiffusionPipelineOutput,
    StableDiffusionSafetyChecker,
 )
-from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint import prepare_mask_and_masked_image
+from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint import (
+    prepare_mask_and_masked_image,
+    retrieve_latents,
+)
 from diffusers.schedulers import DDIMScheduler
 from diffusers.utils import logging
+from diffusers.utils.torch_utils import randn_tensor


 """
 Installation instructions
 python3 -m pip install --upgrade transformers diffusers>=0.16.0
-python3 -m pip install --upgrade tensorrt>=8.6.1
+python3 -m pip install --upgrade tensorrt~=10.2.0
 python3 -m pip install --upgrade polygraphy>=0.47.0 onnx-graphsurgeon --extra-index-url https://pypi.ngc.nvidia.com
 python3 -m pip install onnxruntime
 """
@@ -88,10 +94,6 @@ else:
 torch_to_numpy_dtype_dict = {value: key for (key, value) in numpy_to_torch_dtype_dict.items()}


-def device_view(t):
-    return cuda.DeviceView(ptr=t.data_ptr(), shape=t.shape, dtype=torch_to_numpy_dtype_dict[t.dtype])
-
-
 def preprocess_image(image):
    """
    image: torch.Tensor
@@ -125,10 +127,8 @@ class Engine:
        onnx_path,
        fp16,
        input_profile=None,
-        enable_preview=False,
        enable_all_tactics=False,
        timing_cache=None,
-        workspace_size=0,
    ):
        logger.warning(f"Building TensorRT engine for {onnx_path}: {self.engine_path}")
        p = Profile()
@@ -137,20 +137,13 @@ class Engine:
                assert len(dims) == 3
                p.add(name, min=dims[0], opt=dims[1], max=dims[2])

-        config_kwargs = {}
-
-        config_kwargs["preview_features"] = [trt.PreviewFeature.DISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805]
-        if enable_preview:
-            # Faster dynamic shapes made optional since it increases engine build time.
-            config_kwargs["preview_features"].append(trt.PreviewFeature.FASTER_DYNAMIC_SHAPES_0805)
-        if workspace_size > 0:
-            config_kwargs["memory_pool_limits"] = {trt.MemoryPoolType.WORKSPACE: workspace_size}
+        extra_build_args = {}
        if not enable_all_tactics:
-            config_kwargs["tactic_sources"] = []
+            extra_build_args["tactic_sources"] = []

        engine = engine_from_network(
            network_from_onnx_path(onnx_path, flags=[trt.OnnxParserFlag.NATIVE_INSTANCENORM]),
-            config=CreateConfig(fp16=fp16, profiles=[p], load_timing_cache=timing_cache, **config_kwargs),
+            config=CreateConfig(fp16=fp16, profiles=[p], load_timing_cache=timing_cache, **extra_build_args),
            save_timing_cache=timing_cache,
        )
        save_engine(engine, path=self.engine_path)
@@ -163,28 +156,24 @@ class Engine:
        self.context = self.engine.create_execution_context()

    def allocate_buffers(self, shape_dict=None, device="cuda"):
-        for idx in range(trt_util.get_bindings_per_profile(self.engine)):
-            binding = self.engine[idx]
-            if shape_dict and binding in shape_dict:
-                shape = shape_dict[binding]
+        for binding in range(self.engine.num_io_tensors):
+            name = self.engine.get_tensor_name(binding)
+            if shape_dict and name in shape_dict:
+                shape = shape_dict[name]
            else:
-                shape = self.engine.get_binding_shape(binding)
-            dtype = trt.nptype(self.engine.get_binding_dtype(binding))
-            if self.engine.binding_is_input(binding):
-                self.context.set_binding_shape(idx, shape)
+                shape = self.engine.get_tensor_shape(name)
+            dtype = trt.nptype(self.engine.get_tensor_dtype(name))
+            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
+                self.context.set_input_shape(name, shape)
            tensor = torch.empty(tuple(shape), dtype=numpy_to_torch_dtype_dict[dtype]).to(device=device)
-            self.tensors[binding] = tensor
-            self.buffers[binding] = cuda.DeviceView(ptr=tensor.data_ptr(), shape=shape, dtype=dtype)
+            self.tensors[name] = tensor

    def infer(self, feed_dict, stream):
-        start_binding, end_binding = trt_util.get_active_profile_bindings(self.context)
-        # shallow copy of ordered dict
-        device_buffers = copy(self.buffers)
        for name, buf in feed_dict.items():
-            assert isinstance(buf, cuda.DeviceView)
-            device_buffers[name] = buf
-        bindings = [0] * start_binding + [buf.ptr for buf in device_buffers.values()]
-        noerror = self.context.execute_async_v2(bindings=bindings, stream_handle=stream.ptr)
+            self.tensors[name].copy_(buf)
+        for name, tensor in self.tensors.items():
+            self.context.set_tensor_address(name, tensor.data_ptr())
+        noerror = self.context.execute_async_v3(stream)
        if not noerror:
            raise ValueError("ERROR: inference failed.")

@@ -325,10 +314,8 @@ def build_engines(
    force_engine_rebuild=False,
    static_batch=False,
    static_shape=True,
-    enable_preview=False,
    enable_all_tactics=False,
    timing_cache=None,
-    max_workspace_size=0,
 ):
    built_engines = {}
    if not os.path.isdir(onnx_dir):
@@ -393,9 +380,7 @@ def build_engines(
                    static_batch=static_batch,
                    static_shape=static_shape,
                ),
-                enable_preview=enable_preview,
                timing_cache=timing_cache,
-                workspace_size=max_workspace_size,
            )
        built_engines[model_name] = engine

@@ -674,11 +659,11 @@ def make_VAEEncoder(model, device, max_batch_size, embedding_dim, inpaint=False)
    return VAEEncoder(model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim)


-class TensorRTStableDiffusionInpaintPipeline(StableDiffusionInpaintPipeline):
+class TensorRTStableDiffusionInpaintPipeline(DiffusionPipeline):
    r"""
    Pipeline for inpainting using TensorRT accelerated Stable Diffusion.

-    This model inherits from [`StableDiffusionInpaintPipeline`]. Check the superclass documentation for the generic methods the
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
@@ -698,10 +683,12 @@ class TensorRTStableDiffusionInpaintPipeline(StableDiffusionInpaintPipeline):
        safety_checker ([`StableDiffusionSafetyChecker`]):
            Classification module that estimates whether generated images could be considered offensive or harmful.
            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
-        feature_extractor ([`CLIPFeatureExtractor`]):
+        feature_extractor ([`CLIPImageProcessor`]):
            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
    """

+    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+
    def __init__(
        self,
        vae: AutoencoderKL,
@@ -710,7 +697,7 @@ class TensorRTStableDiffusionInpaintPipeline(StableDiffusionInpaintPipeline):
        unet: UNet2DConditionModel,
        scheduler: DDIMScheduler,
        safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPFeatureExtractor,
+        feature_extractor: CLIPImageProcessor,
        image_encoder: CLIPVisionModelWithProjection = None,
        requires_safety_checker: bool = True,
        stages=["clip", "unet", "vae", "vae_encoder"],
@@ -722,24 +709,86 @@ class TensorRTStableDiffusionInpaintPipeline(StableDiffusionInpaintPipeline):
        onnx_dir: str = "onnx",
        # TensorRT engine build parameters
        engine_dir: str = "engine",
-        build_preview_features: bool = True,
        force_engine_rebuild: bool = False,
        timing_cache: str = "timing_cache",
    ):
-        super().__init__(
-            vae,
-            text_encoder,
-            tokenizer,
-            unet,
-            scheduler,
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
            safety_checker=safety_checker,
            feature_extractor=feature_extractor,
            image_encoder=image_encoder,
-            requires_safety_checker=requires_safety_checker,
        )

-        self.vae.forward = self.vae.decode
-
        self.stages = stages
        self.image_height, self.image_width = image_height, image_width
        self.inpaint = True
@@ -750,7 +799,6 @@ class TensorRTStableDiffusionInpaintPipeline(StableDiffusionInpaintPipeline):
        self.timing_cache = timing_cache
        self.build_static_batch = False
        self.build_dynamic_shape = False
-        self.build_preview_features = build_preview_features

        self.max_batch_size = max_batch_size
        # TODO: Restrict batch size to 4 for larger image dimensions as a WAR for TensorRT limitation.
@@ -761,6 +809,11 @@ class TensorRTStableDiffusionInpaintPipeline(StableDiffusionInpaintPipeline):
        self.models = {}  # loaded in __loadModels()
        self.engine = {}  # loaded in build_engines()

+        self.vae.forward = self.vae.decode
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
    def __loadModels(self):
        # Load pipeline models
        self.embedding_dim = self.text_encoder.config.hidden_size
@@ -779,6 +832,112 @@ class TensorRTStableDiffusionInpaintPipeline(StableDiffusionInpaintPipeline):
        if "vae_encoder" in self.stages:
            self.models["vae_encoder"] = make_VAEEncoder(self.vae, **models_args)

+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.StableDiffusionInpaintPipeline
+
+    def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
+        if isinstance(generator, list):
+            image_latents = [
+                retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                for i in range(image.shape[0])
+            ]
+            image_latents = torch.cat(image_latents, dim=0)
+        else:
+            image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+        image_latents = self.vae.config.scaling_factor * image_latents
+
+        return image_latents
+
+    def prepare_latents(
+        self,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+        image=None,
+        timestep=None,
+        is_strength_max=True,
+        return_noise=False,
+        return_image_latents=False,
+    ):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if (image is None or timestep is None) and not is_strength_max:
+            raise ValueError(
+                "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise."
+                "However, either the image or the noise timestep has not been provided."
+            )
+
+        if return_image_latents or (latents is None and not is_strength_max):
+            image = image.to(device=device, dtype=dtype)
+
+            if image.shape[1] == 4:
+                image_latents = image
+            else:
+                image_latents = self._encode_vae_image(image=image, generator=generator)
+            image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)
+
+        if latents is None:
+            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            # if strength is 1. then initialise the latents to noise, else initial to image + noise
+            latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep)
+            # if pure noise then scale the initial latents by the  Scheduler's init sigma
+            latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents
+        else:
+            noise = latents.to(device)
+            latents = noise * self.scheduler.init_noise_sigma
+
+        outputs = (latents,)
+
+        if return_noise:
+            outputs += (noise,)
+
+        if return_image_latents:
+            outputs += (image_latents,)
+
+        return outputs
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(
+        self, image: Union[torch.Tensor, PIL.Image.Image], device: torch.device, dtype: torch.dtype
+    ) -> Tuple[Union[torch.Tensor, PIL.Image.Image], Optional[bool]]:
+        r"""
+        Runs the safety checker on the given image.
+        Args:
+            image (Union[torch.Tensor, PIL.Image.Image]): The input image to be checked.
+            device (torch.device): The device to run the safety checker on.
+            dtype (torch.dtype): The data type of the input image.
+        Returns:
+            (image, has_nsfw_concept) Tuple[Union[torch.Tensor, PIL.Image.Image], Optional[bool]]: A tuple containing the processed image and
+            a boolean indicating whether the image has a NSFW (Not Safe for Work) concept.
+        """
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
    @classmethod
    @validate_hf_hub_args
    def set_cached_folder(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
@@ -826,7 +985,6 @@ class TensorRTStableDiffusionInpaintPipeline(StableDiffusionInpaintPipeline):
            force_engine_rebuild=self.force_engine_rebuild,
            static_batch=self.build_static_batch,
            static_shape=not self.build_dynamic_shape,
-            enable_preview=self.build_preview_features,
            timing_cache=self.timing_cache,
        )

@@ -850,9 +1008,7 @@ class TensorRTStableDiffusionInpaintPipeline(StableDiffusionInpaintPipeline):
        return tuple(init_images)

    def __encode_image(self, init_image):
-        init_latents = runEngine(self.engine["vae_encoder"], {"images": device_view(init_image)}, self.stream)[
-            "latent"
-        ]
+        init_latents = runEngine(self.engine["vae_encoder"], {"images": init_image}, self.stream)["latent"]
        init_latents = 0.18215 * init_latents
        return init_latents

@@ -881,9 +1037,8 @@ class TensorRTStableDiffusionInpaintPipeline(StableDiffusionInpaintPipeline):
            .to(self.torch_device)
        )

-        text_input_ids_inp = device_view(text_input_ids)
        # NOTE: output tensor for CLIP must be cloned because it will be overwritten when called again for negative prompt
-        text_embeddings = runEngine(self.engine["clip"], {"input_ids": text_input_ids_inp}, self.stream)[
+        text_embeddings = runEngine(self.engine["clip"], {"input_ids": text_input_ids}, self.stream)[
            "text_embeddings"
        ].clone()

@@ -899,8 +1054,7 @@ class TensorRTStableDiffusionInpaintPipeline(StableDiffusionInpaintPipeline):
            .input_ids.type(torch.int32)
            .to(self.torch_device)
        )
-        uncond_input_ids_inp = device_view(uncond_input_ids)
-        uncond_embeddings = runEngine(self.engine["clip"], {"input_ids": uncond_input_ids_inp}, self.stream)[
+        uncond_embeddings = runEngine(self.engine["clip"], {"input_ids": uncond_input_ids}, self.stream)[
            "text_embeddings"
        ]

@@ -924,18 +1078,15 @@ class TensorRTStableDiffusionInpaintPipeline(StableDiffusionInpaintPipeline):
            # Predict the noise residual
            timestep_float = timestep.float() if timestep.dtype != torch.float32 else timestep

-            sample_inp = device_view(latent_model_input)
-            timestep_inp = device_view(timestep_float)
-            embeddings_inp = device_view(text_embeddings)
            noise_pred = runEngine(
                self.engine["unet"],
-                {"sample": sample_inp, "timestep": timestep_inp, "encoder_hidden_states": embeddings_inp},
+                {"sample": latent_model_input, "timestep": timestep_float, "encoder_hidden_states": text_embeddings},
                self.stream,
            )["latent"]

            # Perform guidance
            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-            noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+            noise_pred = noise_pred_uncond + self._guidance_scale * (noise_pred_text - noise_pred_uncond)

            latents = self.scheduler.step(noise_pred, timestep, latents).prev_sample

@@ -943,12 +1094,12 @@ class TensorRTStableDiffusionInpaintPipeline(StableDiffusionInpaintPipeline):
        return latents

    def __decode_latent(self, latents):
-        images = runEngine(self.engine["vae"], {"latent": device_view(latents)}, self.stream)["images"]
+        images = runEngine(self.engine["vae"], {"latent": latents}, self.stream)["images"]
        images = (images / 2 + 0.5).clamp(0, 1)
        return images.cpu().permute(0, 2, 3, 1).float().numpy()

    def __loadResources(self, image_height, image_width, batch_size):
-        self.stream = cuda.Stream()
+        self.stream = cudart.cudaStreamCreate()[1]

        # Allocate buffers for TensorRT engine bindings
        for model_name, obj in self.models.items():
@@ -1112,5 +1263,6 @@ class TensorRTStableDiffusionInpaintPipeline(StableDiffusionInpaintPipeline):
            # VAE decode latent
            images = self.__decode_latent(latents)

+        images, has_nsfw_concept = self.run_safety_checker(images, self.torch_device, text_embeddings.dtype)
        images = self.numpy_to_pil(images)
-        return StableDiffusionPipelineOutput(images=images, nsfw_content_detected=None)
+        return StableDiffusionPipelineOutput(images=images, nsfw_content_detected=has_nsfw_concept)
@@ -18,17 +18,19 @@
 import gc
 import os
 from collections import OrderedDict
-from copy import copy
-from typing import List, Optional, Union
+from typing import List, Optional, Tuple, Union

 import numpy as np
 import onnx
 import onnx_graphsurgeon as gs
+import PIL.Image
 import tensorrt as trt
 import torch
+from cuda import cudart
 from huggingface_hub import snapshot_download
 from huggingface_hub.utils import validate_hf_hub_args
 from onnx import shape_inference
+from packaging import version
 from polygraphy import cuda
 from polygraphy.backend.common import bytes_from_path
 from polygraphy.backend.onnx.loader import fold_constants
@@ -40,23 +42,25 @@ from polygraphy.backend.trt import (
    network_from_onnx_path,
    save_engine,
 )
-from polygraphy.backend.trt import util as trt_util
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

+from diffusers import DiffusionPipeline
+from diffusers.configuration_utils import FrozenDict, deprecate
+from diffusers.image_processor import VaeImageProcessor
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.pipelines.stable_diffusion import (
-    StableDiffusionPipeline,
    StableDiffusionPipelineOutput,
    StableDiffusionSafetyChecker,
 )
 from diffusers.schedulers import DDIMScheduler
 from diffusers.utils import logging
+from diffusers.utils.torch_utils import randn_tensor


 """
 Installation instructions
 python3 -m pip install --upgrade transformers diffusers>=0.16.0
-python3 -m pip install --upgrade tensorrt>=8.6.1
+python3 -m pip install --upgrade tensorrt~=10.2.0
 python3 -m pip install --upgrade polygraphy>=0.47.0 onnx-graphsurgeon --extra-index-url https://pypi.ngc.nvidia.com
 python3 -m pip install onnxruntime
 """
@@ -86,10 +90,6 @@ else:
 torch_to_numpy_dtype_dict = {value: key for (key, value) in numpy_to_torch_dtype_dict.items()}


-def device_view(t):
-    return cuda.DeviceView(ptr=t.data_ptr(), shape=t.shape, dtype=torch_to_numpy_dtype_dict[t.dtype])
-
-
 class Engine:
    def __init__(self, engine_path):
        self.engine_path = engine_path
@@ -110,10 +110,8 @@ class Engine:
        onnx_path,
        fp16,
        input_profile=None,
-        enable_preview=False,
        enable_all_tactics=False,
        timing_cache=None,
-        workspace_size=0,
    ):
        logger.warning(f"Building TensorRT engine for {onnx_path}: {self.engine_path}")
        p = Profile()
@@ -122,20 +120,13 @@ class Engine:
                assert len(dims) == 3
                p.add(name, min=dims[0], opt=dims[1], max=dims[2])

-        config_kwargs = {}
-
-        config_kwargs["preview_features"] = [trt.PreviewFeature.DISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805]
-        if enable_preview:
-            # Faster dynamic shapes made optional since it increases engine build time.
-            config_kwargs["preview_features"].append(trt.PreviewFeature.FASTER_DYNAMIC_SHAPES_0805)
-        if workspace_size > 0:
-            config_kwargs["memory_pool_limits"] = {trt.MemoryPoolType.WORKSPACE: workspace_size}
+        extra_build_args = {}
        if not enable_all_tactics:
-            config_kwargs["tactic_sources"] = []
+            extra_build_args["tactic_sources"] = []

        engine = engine_from_network(
            network_from_onnx_path(onnx_path, flags=[trt.OnnxParserFlag.NATIVE_INSTANCENORM]),
-            config=CreateConfig(fp16=fp16, profiles=[p], load_timing_cache=timing_cache, **config_kwargs),
+            config=CreateConfig(fp16=fp16, profiles=[p], load_timing_cache=timing_cache, **extra_build_args),
            save_timing_cache=timing_cache,
        )
        save_engine(engine, path=self.engine_path)
@@ -148,28 +139,24 @@ class Engine:
        self.context = self.engine.create_execution_context()

    def allocate_buffers(self, shape_dict=None, device="cuda"):
-        for idx in range(trt_util.get_bindings_per_profile(self.engine)):
-            binding = self.engine[idx]
-            if shape_dict and binding in shape_dict:
-                shape = shape_dict[binding]
+        for binding in range(self.engine.num_io_tensors):
+            name = self.engine.get_tensor_name(binding)
+            if shape_dict and name in shape_dict:
+                shape = shape_dict[name]
            else:
-                shape = self.engine.get_binding_shape(binding)
-            dtype = trt.nptype(self.engine.get_binding_dtype(binding))
-            if self.engine.binding_is_input(binding):
-                self.context.set_binding_shape(idx, shape)
+                shape = self.engine.get_tensor_shape(name)
+            dtype = trt.nptype(self.engine.get_tensor_dtype(name))
+            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
+                self.context.set_input_shape(name, shape)
            tensor = torch.empty(tuple(shape), dtype=numpy_to_torch_dtype_dict[dtype]).to(device=device)
-            self.tensors[binding] = tensor
-            self.buffers[binding] = cuda.DeviceView(ptr=tensor.data_ptr(), shape=shape, dtype=dtype)
+            self.tensors[name] = tensor

    def infer(self, feed_dict, stream):
-        start_binding, end_binding = trt_util.get_active_profile_bindings(self.context)
-        # shallow copy of ordered dict
-        device_buffers = copy(self.buffers)
        for name, buf in feed_dict.items():
-            assert isinstance(buf, cuda.DeviceView)
-            device_buffers[name] = buf
-        bindings = [0] * start_binding + [buf.ptr for buf in device_buffers.values()]
-        noerror = self.context.execute_async_v2(bindings=bindings, stream_handle=stream.ptr)
+            self.tensors[name].copy_(buf)
+        for name, tensor in self.tensors.items():
+            self.context.set_tensor_address(name, tensor.data_ptr())
+        noerror = self.context.execute_async_v3(stream)
        if not noerror:
            raise ValueError("ERROR: inference failed.")

@@ -310,10 +297,8 @@ def build_engines(
    force_engine_rebuild=False,
    static_batch=False,
    static_shape=True,
-    enable_preview=False,
    enable_all_tactics=False,
    timing_cache=None,
-    max_workspace_size=0,
 ):
    built_engines = {}
    if not os.path.isdir(onnx_dir):
@@ -378,9 +363,7 @@ def build_engines(
                    static_batch=static_batch,
                    static_shape=static_shape,
                ),
-                enable_preview=enable_preview,
                timing_cache=timing_cache,
-                workspace_size=max_workspace_size,
            )
        built_engines[model_name] = engine

@@ -588,11 +571,11 @@ def make_VAE(model, device, max_batch_size, embedding_dim, inpaint=False):
    return VAE(model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim)


-class TensorRTStableDiffusionPipeline(StableDiffusionPipeline):
+class TensorRTStableDiffusionPipeline(DiffusionPipeline):
    r"""
    Pipeline for text-to-image generation using TensorRT accelerated Stable Diffusion.

-    This model inherits from [`StableDiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
@@ -612,10 +595,12 @@ class TensorRTStableDiffusionPipeline(StableDiffusionPipeline):
        safety_checker ([`StableDiffusionSafetyChecker`]):
            Classification module that estimates whether generated images could be considered offensive or harmful.
            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
-        feature_extractor ([`CLIPFeatureExtractor`]):
+        feature_extractor ([`CLIPImageProcessor`]):
            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
    """

+    _optional_components = ["safety_checker", "feature_extractor"]
+
    def __init__(
        self,
        vae: AutoencoderKL,
@@ -624,7 +609,7 @@ class TensorRTStableDiffusionPipeline(StableDiffusionPipeline):
        unet: UNet2DConditionModel,
        scheduler: DDIMScheduler,
        safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPFeatureExtractor,
+        feature_extractor: CLIPImageProcessor,
        image_encoder: CLIPVisionModelWithProjection = None,
        requires_safety_checker: bool = True,
        stages=["clip", "unet", "vae"],
@@ -632,28 +617,90 @@ class TensorRTStableDiffusionPipeline(StableDiffusionPipeline):
        image_width: int = 768,
        max_batch_size: int = 16,
        # ONNX export parameters
-        onnx_opset: int = 17,
+        onnx_opset: int = 18,
        onnx_dir: str = "onnx",
        # TensorRT engine build parameters
        engine_dir: str = "engine",
-        build_preview_features: bool = True,
        force_engine_rebuild: bool = False,
        timing_cache: str = "timing_cache",
    ):
-        super().__init__(
-            vae,
-            text_encoder,
-            tokenizer,
-            unet,
-            scheduler,
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
            safety_checker=safety_checker,
            feature_extractor=feature_extractor,
            image_encoder=image_encoder,
-            requires_safety_checker=requires_safety_checker,
        )

-        self.vae.forward = self.vae.decode
-
        self.stages = stages
        self.image_height, self.image_width = image_height, image_width
        self.inpaint = False
@@ -664,7 +711,6 @@ class TensorRTStableDiffusionPipeline(StableDiffusionPipeline):
        self.timing_cache = timing_cache
        self.build_static_batch = False
        self.build_dynamic_shape = False
-        self.build_preview_features = build_preview_features

        self.max_batch_size = max_batch_size
        # TODO: Restrict batch size to 4 for larger image dimensions as a WAR for TensorRT limitation.
@@ -675,6 +721,11 @@ class TensorRTStableDiffusionPipeline(StableDiffusionPipeline):
        self.models = {}  # loaded in __loadModels()
        self.engine = {}  # loaded in build_engines()

+        self.vae.forward = self.vae.decode
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
    def __loadModels(self):
        # Load pipeline models
        self.embedding_dim = self.text_encoder.config.hidden_size
@@ -691,6 +742,75 @@ class TensorRTStableDiffusionPipeline(StableDiffusionPipeline):
        if "vae" in self.stages:
            self.models["vae"] = make_VAE(self.vae, **models_args)

+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(
+        self,
+        batch_size: int,
+        num_channels_latents: int,
+        height: int,
+        width: int,
+        dtype: torch.dtype,
+        device: torch.device,
+        generator: Union[torch.Generator, List[torch.Generator]],
+        latents: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        r"""
+        Prepare the latent vectors for diffusion.
+        Args:
+            batch_size (int): The number of samples in the batch.
+            num_channels_latents (int): The number of channels in the latent vectors.
+            height (int): The height of the latent vectors.
+            width (int): The width of the latent vectors.
+            dtype (torch.dtype): The data type of the latent vectors.
+            device (torch.device): The device to place the latent vectors on.
+            generator (Union[torch.Generator, List[torch.Generator]]): The generator(s) to use for random number generation.
+            latents (Optional[torch.Tensor]): The pre-existing latent vectors. If None, new latent vectors will be generated.
+        Returns:
+            torch.Tensor: The prepared latent vectors.
+        """
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(
+        self, image: Union[torch.Tensor, PIL.Image.Image], device: torch.device, dtype: torch.dtype
+    ) -> Tuple[Union[torch.Tensor, PIL.Image.Image], Optional[bool]]:
+        r"""
+        Runs the safety checker on the given image.
+        Args:
+            image (Union[torch.Tensor, PIL.Image.Image]): The input image to be checked.
+            device (torch.device): The device to run the safety checker on.
+            dtype (torch.dtype): The data type of the input image.
+        Returns:
+            (image, has_nsfw_concept) Tuple[Union[torch.Tensor, PIL.Image.Image], Optional[bool]]: A tuple containing the processed image and
+            a boolean indicating whether the image has a NSFW (Not Safe for Work) concept.
+        """
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
    @classmethod
    @validate_hf_hub_args
    def set_cached_folder(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
@@ -738,7 +858,6 @@ class TensorRTStableDiffusionPipeline(StableDiffusionPipeline):
            force_engine_rebuild=self.force_engine_rebuild,
            static_batch=self.build_static_batch,
            static_shape=not self.build_dynamic_shape,
-            enable_preview=self.build_preview_features,
            timing_cache=self.timing_cache,
        )

@@ -769,9 +888,8 @@ class TensorRTStableDiffusionPipeline(StableDiffusionPipeline):
            .to(self.torch_device)
        )

-        text_input_ids_inp = device_view(text_input_ids)
        # NOTE: output tensor for CLIP must be cloned because it will be overwritten when called again for negative prompt
-        text_embeddings = runEngine(self.engine["clip"], {"input_ids": text_input_ids_inp}, self.stream)[
+        text_embeddings = runEngine(self.engine["clip"], {"input_ids": text_input_ids}, self.stream)[
            "text_embeddings"
        ].clone()

@@ -787,8 +905,7 @@ class TensorRTStableDiffusionPipeline(StableDiffusionPipeline):
            .input_ids.type(torch.int32)
            .to(self.torch_device)
        )
-        uncond_input_ids_inp = device_view(uncond_input_ids)
-        uncond_embeddings = runEngine(self.engine["clip"], {"input_ids": uncond_input_ids_inp}, self.stream)[
+        uncond_embeddings = runEngine(self.engine["clip"], {"input_ids": uncond_input_ids}, self.stream)[
            "text_embeddings"
        ]

@@ -812,18 +929,15 @@ class TensorRTStableDiffusionPipeline(StableDiffusionPipeline):
            # Predict the noise residual
            timestep_float = timestep.float() if timestep.dtype != torch.float32 else timestep

-            sample_inp = device_view(latent_model_input)
-            timestep_inp = device_view(timestep_float)
-            embeddings_inp = device_view(text_embeddings)
            noise_pred = runEngine(
                self.engine["unet"],
-                {"sample": sample_inp, "timestep": timestep_inp, "encoder_hidden_states": embeddings_inp},
+                {"sample": latent_model_input, "timestep": timestep_float, "encoder_hidden_states": text_embeddings},
                self.stream,
            )["latent"]

            # Perform guidance
            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-            noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+            noise_pred = noise_pred_uncond + self._guidance_scale * (noise_pred_text - noise_pred_uncond)

            latents = self.scheduler.step(noise_pred, timestep, latents).prev_sample

@@ -831,12 +945,12 @@ class TensorRTStableDiffusionPipeline(StableDiffusionPipeline):
        return latents

    def __decode_latent(self, latents):
-        images = runEngine(self.engine["vae"], {"latent": device_view(latents)}, self.stream)["images"]
+        images = runEngine(self.engine["vae"], {"latent": latents}, self.stream)["images"]
        images = (images / 2 + 0.5).clamp(0, 1)
        return images.cpu().permute(0, 2, 3, 1).float().numpy()

    def __loadResources(self, image_height, image_width, batch_size):
-        self.stream = cuda.Stream()
+        self.stream = cudart.cudaStreamCreate()[1]

        # Allocate buffers for TensorRT engine bindings
        for model_name, obj in self.models.items():
@@ -73,7 +73,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.30.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)

@@ -66,7 +66,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.30.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)

@@ -79,7 +79,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.30.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)

@@ -72,7 +72,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.30.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)

@@ -78,7 +78,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.30.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)

@@ -60,7 +60,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.30.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)

@@ -60,7 +60,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.30.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = logging.getLogger(__name__)

@@ -61,7 +61,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.30.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)
 if is_torch_npu_available():
@@ -63,7 +63,7 @@ from diffusers.utils.import_utils import is_xformers_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.30.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)

@@ -0,0 +1,195 @@
+# DreamBooth training example for FLUX.1 [dev]
+
+[DreamBooth](https://arxiv.org/abs/2208.12242) is a method to personalize text2image models like stable diffusion given just a few (3~5) images of a subject.
+
+The `train_dreambooth_flux.py` script shows how to implement the training procedure and adapt it for [FLUX.1 [dev]](https://blackforestlabs.ai/announcing-black-forest-labs/). We also provide a LoRA implementation in the `train_dreambooth_lora_flux.py` script.
+> [!NOTE]
+> **Memory consumption**
+>
+> Flux can be quite expensive to run on consumer hardware devices and as a result finetuning it comes with high memory requirements -
+> a LoRA with a rank of 16 (w/ all components trained) can exceed 40GB of VRAM for training.
+> For more tips & guidance on training on a resource-constrained device please visit [`@bghira`'s guide](https://github.com/bghira/SimpleTuner/blob/main/documentation/quickstart/FLUX.md)
+
+
+> [!NOTE]
+> **Gated model**
+>
+> As the model is gated, before using it with diffusers you first need to go to the [FLUX.1 [dev] Hugging Face page](https://huggingface.co/black-forest-labs/FLUX.1-dev), fill in the form and accept the gate. Once you are in, you need to log in so that your system knows you’ve accepted the gate. Use the command below to log in:
+
+```bash
+huggingface-cli login
+```
+
+This will also allow us to push the trained model parameters to the Hugging Face Hub platform.
+
+## Running locally with PyTorch
+
+### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+**Important**
+
+To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
+
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install -e .
+```
+
+Then cd in the `examples/dreambooth` folder and run
+```bash
+pip install -r requirements_flux.txt
+```
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+Or for a default accelerate configuration without answering questions about your environment
+
+```bash
+accelerate config default
+```
+
+Or if your environment doesn't support an interactive shell (e.g., a notebook)
+
+```python
+from accelerate.utils import write_basic_config
+write_basic_config()
+```
+
+When running `accelerate config`, if we specify torch compile mode to True there can be dramatic speedups.
+Note also that we use PEFT library as backend for LoRA training, make sure to have `peft>=0.6.0` installed in your environment.
+
+
+### Dog toy example
+
+Now let's get our dataset. For this example we will use some dog images: https://huggingface.co/datasets/diffusers/dog-example.
+
+Let's first download it locally:
+
+```python
+from huggingface_hub import snapshot_download
+
+local_dir = "./dog"
+snapshot_download(
+    "diffusers/dog-example",
+    local_dir=local_dir, repo_type="dataset",
+    ignore_patterns=".gitattributes",
+)
+```
+
+This will also allow us to push the trained LoRA parameters to the Hugging Face Hub platform.
+
+Now, we can launch training using:
+
+```bash
+export MODEL_NAME="black-forest-labs/FLUX.1-dev"
+export INSTANCE_DIR="dog"
+export OUTPUT_DIR="trained-flux"
+
+accelerate launch train_dreambooth_flux.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --mixed_precision="bf16" \
+  --instance_prompt="a photo of sks dog" \
+  --resolution=1024 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=4 \
+  --learning_rate=1e-4 \
+  --report_to="wandb" \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --max_train_steps=500 \
+  --validation_prompt="A photo of sks dog in a bucket" \
+  --validation_epochs=25 \
+  --seed="0" \
+  --push_to_hub
+```
+
+To better track our training experiments, we're using the following flags in the command above:
+
+* `report_to="wandb` will ensure the training runs are tracked on Weights and Biases. To use it, be sure to install `wandb` with `pip install wandb`.
+* `validation_prompt` and `validation_epochs` to allow the script to do a few validation inference runs. This allows us to qualitatively check if the training is progressing as expected.
+
+> [!NOTE]
+> If you want to train using long prompts with the T5 text encoder, you can use `--max_sequence_length` to set the token limit. The default is 77, but it can be increased to as high as 512. Note that this will use more resources and may slow down the training in some cases.
+
+> [!TIP]
+> You can pass `--use_8bit_adam` to reduce the memory requirements of training. Make sure to install `bitsandbytes` if you want to do so.
+
+## LoRA + DreamBooth
+
+[LoRA](https://huggingface.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora) is a popular parameter-efficient fine-tuning technique that allows you to achieve full-finetuning like performance but with a fraction of learnable parameters.
+
+Note also that we use PEFT library as backend for LoRA training, make sure to have `peft>=0.6.0` installed in your environment.
+
+To perform DreamBooth with LoRA, run:
+
+```bash
+export MODEL_NAME="black-forest-labs/FLUX.1-dev"
+export INSTANCE_DIR="dog"
+export OUTPUT_DIR="trained-flux-lora"
+
+accelerate launch train_dreambooth_lora_flux.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --mixed_precision="bf16" \
+  --instance_prompt="a photo of sks dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=4 \
+  --learning_rate=1e-5 \
+  --report_to="wandb" \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --max_train_steps=500 \
+  --validation_prompt="A photo of sks dog in a bucket" \
+  --validation_epochs=25 \
+  --seed="0" \
+  --push_to_hub
+```
+
+### Text Encoder Training
+
+Alongside the transformer, fine-tuning of the CLIP text encoder is also supported.
+To do so, just specify `--train_text_encoder` while launching training. Please keep the following points in mind:
+
+> [!NOTE]
+> FLUX.1 has 2 text encoders (CLIP L/14 and T5-v1.1-XXL).
+By enabling `--train_text_encoder`, fine-tuning of the **CLIP encoder** is performed.
+> At the moment, T5 fine-tuning is not supported and weights remain frozen when text encoder training is enabled.
+
+To perform DreamBooth LoRA with text-encoder training, run:
+```bash
+export MODEL_NAME="black-forest-labs/FLUX.1-dev"
+export OUTPUT_DIR="trained-flux-dev-dreambooth-lora"
+
+accelerate launch train_dreambooth_lora_flux.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --mixed_precision="bf16" \
+  --train_text_encoder\
+  --instance_prompt="a photo of sks dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=4 \
+  --learning_rate=1e-5 \
+  --report_to="wandb" \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --max_train_steps=500 \
+  --validation_prompt="A photo of sks dog in a bucket" \
+  --seed="0" \
+  --push_to_hub
+```
+
+## Other notes
+Thanks to `bghira` for their help with reviewing & insight sharing ♥️
@@ -148,12 +148,12 @@ accelerate launch train_dreambooth_lora_sd3.py \
 ```

 ### Text Encoder Training
-Alongside the transformer, LoRA fine-tuning of the CLIP text encoders is now also supported. 
+Alongside the transformer, LoRA fine-tuning of the CLIP text encoders is now also supported.
 To do so, just specify `--train_text_encoder` while launching training. Please keep the following points in mind:

 > [!NOTE]
-> SD3 has three text encoders (CLIP L/14, OpenCLIP bigG/14, and T5-v1.1-XXL). 
-By enabling `--train_text_encoder`, LoRA fine-tuning of both **CLIP encoders** is performed. At the moment, T5 fine-tuning is not supported and weights remain frozen when text encoder training is enabled. 
+> SD3 has three text encoders (CLIP L/14, OpenCLIP bigG/14, and T5-v1.1-XXL).
+By enabling `--train_text_encoder`, LoRA fine-tuning of both **CLIP encoders** is performed. At the moment, T5 fine-tuning is not supported and weights remain frozen when text encoder training is enabled.

 To perform DreamBooth LoRA with text-encoder training, run:
 ```bash
@@ -185,4 +185,4 @@ accelerate launch train_dreambooth_lora_sd3.py \

 1. We default to the "logit_normal" weighting scheme for the loss following the SD3 paper. Thanks to @bghira for helping us discover that for other weighting schemes supported from the training script, training may incur numerical instabilities.
 2. Thanks to `bghira`, `JinxuXiang`, and `bendanzzc` for helping us discover a bug in how VAE encoding was being done previously. This has been fixed in [#8917](https://github.com/huggingface/diffusers/pull/8917).
-3. Additionally, we now have the option to control if we want to apply preconditioning to the model outputs via a `--precondition_outputs` CLI arg. It affects how the model `target` is calculated as well. 
+3. Additionally, we now have the option to control if we want to apply preconditioning to the model outputs via a `--precondition_outputs` CLI arg. It affects how the model `target` is calculated as well.
@@ -0,0 +1,8 @@
+accelerate>=0.31.0
+torchvision
+transformers>=4.41.2
+ftfy
+tensorboard
+Jinja2
+peft>=0.11.1
+sentencepiece
@@ -0,0 +1,203 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import shutil
+import sys
+import tempfile
+
+from diffusers import DiffusionPipeline, FluxTransformer2DModel
+
+
+sys.path.append("..")
+from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
+
+
+logging.basicConfig(level=logging.DEBUG)
+
+logger = logging.getLogger()
+stream_handler = logging.StreamHandler(sys.stdout)
+logger.addHandler(stream_handler)
+
+
+class DreamBoothFlux(ExamplesTestsAccelerate):
+    instance_data_dir = "docs/source/en/imgs"
+    instance_prompt = "photo"
+    pretrained_model_name_or_path = "hf-internal-testing/tiny-flux-pipe"
+    script_path = "examples/dreambooth/train_dreambooth_flux.py"
+
+    def test_dreambooth(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                {self.script_path}
+                --pretrained_model_name_or_path {self.pretrained_model_name_or_path}
+                --instance_data_dir {self.instance_data_dir}
+                --instance_prompt {self.instance_prompt}
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "transformer", "diffusion_pytorch_model.safetensors")))
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "scheduler", "scheduler_config.json")))
+
+    def test_dreambooth_checkpointing(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Run training script with checkpointing
+            # max_train_steps == 4, checkpointing_steps == 2
+            # Should create checkpoints at steps 2, 4
+
+            initial_run_args = f"""
+                {self.script_path}
+                --pretrained_model_name_or_path {self.pretrained_model_name_or_path}
+                --instance_data_dir {self.instance_data_dir}
+                --instance_prompt {self.instance_prompt}
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 4
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                --checkpointing_steps=2
+                --seed=0
+                """.split()
+
+            run_command(self._launch_args + initial_run_args)
+
+            # check can run the original fully trained output pipeline
+            pipe = DiffusionPipeline.from_pretrained(tmpdir)
+            pipe(self.instance_prompt, num_inference_steps=1)
+
+            # check checkpoint directories exist
+            self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-2")))
+            self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-4")))
+
+            # check can run an intermediate checkpoint
+            transformer = FluxTransformer2DModel.from_pretrained(tmpdir, subfolder="checkpoint-2/transformer")
+            pipe = DiffusionPipeline.from_pretrained(self.pretrained_model_name_or_path, transformer=transformer)
+            pipe(self.instance_prompt, num_inference_steps=1)
+
+            # Remove checkpoint 2 so that we can check only later checkpoints exist after resuming
+            shutil.rmtree(os.path.join(tmpdir, "checkpoint-2"))
+
+            # Run training script for 7 total steps resuming from checkpoint 4
+
+            resume_run_args = f"""
+                {self.script_path}
+                --pretrained_model_name_or_path {self.pretrained_model_name_or_path}
+                --instance_data_dir {self.instance_data_dir}
+                --instance_prompt {self.instance_prompt}
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 6
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                --checkpointing_steps=2
+                --resume_from_checkpoint=checkpoint-4
+                --seed=0
+                """.split()
+
+            run_command(self._launch_args + resume_run_args)
+
+            # check can run new fully trained pipeline
+            pipe = DiffusionPipeline.from_pretrained(tmpdir)
+            pipe(self.instance_prompt, num_inference_steps=1)
+
+            # check old checkpoints do not exist
+            self.assertFalse(os.path.isdir(os.path.join(tmpdir, "checkpoint-2")))
+
+            # check new checkpoints exist
+            self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-4")))
+            self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-6")))
+
+    def test_dreambooth_checkpointing_checkpoints_total_limit(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+            {self.script_path}
+            --pretrained_model_name_or_path={self.pretrained_model_name_or_path}
+            --instance_data_dir={self.instance_data_dir}
+            --output_dir={tmpdir}
+            --instance_prompt={self.instance_prompt}
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --max_train_steps=6
+            --checkpoints_total_limit=2
+            --checkpointing_steps=2
+            """.split()
+
+            run_command(self._launch_args + test_args)
+
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-4", "checkpoint-6"},
+            )
+
+    def test_dreambooth_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+            {self.script_path}
+            --pretrained_model_name_or_path={self.pretrained_model_name_or_path}
+            --instance_data_dir={self.instance_data_dir}
+            --output_dir={tmpdir}
+            --instance_prompt={self.instance_prompt}
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --max_train_steps=4
+            --checkpointing_steps=2
+            """.split()
+
+            run_command(self._launch_args + test_args)
+
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-2", "checkpoint-4"},
+            )
+
+            resume_run_args = f"""
+            {self.script_path}
+            --pretrained_model_name_or_path={self.pretrained_model_name_or_path}
+            --instance_data_dir={self.instance_data_dir}
+            --output_dir={tmpdir}
+            --instance_prompt={self.instance_prompt}
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --max_train_steps=8
+            --checkpointing_steps=2
+            --resume_from_checkpoint=checkpoint-4
+            --checkpoints_total_limit=2
+            """.split()
+
+            run_command(self._launch_args + resume_run_args)
+
+            self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-6", "checkpoint-8"})
@@ -0,0 +1,165 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import sys
+import tempfile
+
+import safetensors
+
+
+sys.path.append("..")
+from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
+
+
+logging.basicConfig(level=logging.DEBUG)
+
+logger = logging.getLogger()
+stream_handler = logging.StreamHandler(sys.stdout)
+logger.addHandler(stream_handler)
+
+
+class DreamBoothLoRAFlux(ExamplesTestsAccelerate):
+    instance_data_dir = "docs/source/en/imgs"
+    instance_prompt = "photo"
+    pretrained_model_name_or_path = "hf-internal-testing/tiny-flux-pipe"
+    script_path = "examples/dreambooth/train_dreambooth_lora_flux.py"
+
+    def test_dreambooth_lora_flux(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                {self.script_path}
+                --pretrained_model_name_or_path {self.pretrained_model_name_or_path}
+                --instance_data_dir {self.instance_data_dir}
+                --instance_prompt {self.instance_prompt}
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
+
+            # make sure the state_dict has the correct naming in the parameters.
+            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
+            is_lora = all("lora" in k for k in lora_state_dict.keys())
+            self.assertTrue(is_lora)
+
+            # when not training the text encoder, all the parameters in the state dict should start
+            # with `"transformer"` in their names.
+            starts_with_transformer = all(key.startswith("transformer") for key in lora_state_dict.keys())
+            self.assertTrue(starts_with_transformer)
+
+    def test_dreambooth_lora_text_encoder_flux(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                {self.script_path}
+                --pretrained_model_name_or_path {self.pretrained_model_name_or_path}
+                --instance_data_dir {self.instance_data_dir}
+                --instance_prompt {self.instance_prompt}
+                --resolution 64
+                --train_batch_size 1
+                --train_text_encoder
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
+
+            # make sure the state_dict has the correct naming in the parameters.
+            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
+            is_lora = all("lora" in k for k in lora_state_dict.keys())
+            self.assertTrue(is_lora)
+
+            starts_with_expected_prefix = all(
+                (key.startswith("transformer") or key.startswith("text_encoder")) for key in lora_state_dict.keys()
+            )
+            self.assertTrue(starts_with_expected_prefix)
+
+    def test_dreambooth_lora_flux_checkpointing_checkpoints_total_limit(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+            {self.script_path}
+            --pretrained_model_name_or_path={self.pretrained_model_name_or_path}
+            --instance_data_dir={self.instance_data_dir}
+            --output_dir={tmpdir}
+            --instance_prompt={self.instance_prompt}
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --max_train_steps=6
+            --checkpoints_total_limit=2
+            --checkpointing_steps=2
+            """.split()
+
+            run_command(self._launch_args + test_args)
+
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-4", "checkpoint-6"},
+            )
+
+    def test_dreambooth_lora_flux_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+            {self.script_path}
+            --pretrained_model_name_or_path={self.pretrained_model_name_or_path}
+            --instance_data_dir={self.instance_data_dir}
+            --output_dir={tmpdir}
+            --instance_prompt={self.instance_prompt}
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --max_train_steps=4
+            --checkpointing_steps=2
+            """.split()
+
+            run_command(self._launch_args + test_args)
+
+            self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-2", "checkpoint-4"})
+
+            resume_run_args = f"""
+            {self.script_path}
+            --pretrained_model_name_or_path={self.pretrained_model_name_or_path}
+            --instance_data_dir={self.instance_data_dir}
+            --output_dir={tmpdir}
+            --instance_prompt={self.instance_prompt}
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --max_train_steps=8
+            --checkpointing_steps=2
+            --resume_from_checkpoint=checkpoint-4
+            --checkpoints_total_limit=2
+            """.split()
+
+            run_command(self._launch_args + resume_run_args)
+
+            self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-6", "checkpoint-8"})
@@ -63,7 +63,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.30.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)

@@ -35,7 +35,7 @@ from diffusers.utils import check_min_version


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.30.0.dev0")
+check_min_version("0.31.0.dev0")

 # Cache compiled models across invocations of this script.
 cc.initialize_cache(os.path.expanduser("~/.cache/jax/compilation_cache"))
@@ -70,7 +70,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.30.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)

@@ -72,7 +72,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.30.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)

@@ -1271,7 +1271,7 @@ def main(args):
        lora_state_dict = StableDiffusion3Pipeline.lora_state_dict(input_dir)

        transformer_state_dict = {
-            f'{k.replace("transformer.", "")}': v for k, v in lora_state_dict.items() if k.startswith("unet.")
+            f'{k.replace("transformer.", "")}': v for k, v in lora_state_dict.items() if k.startswith("transformer.")
        }
        transformer_state_dict = convert_unet_state_dict_to_peft(transformer_state_dict)
        incompatible_keys = set_peft_model_state_dict(transformer_, transformer_state_dict, adapter_name="default")
@@ -1454,7 +1454,7 @@ def main(args):
            )

    # Clear the memory here
-    if not args.train_text_encoder and train_dataset.custom_instance_prompts:
+    if not args.train_text_encoder and not train_dataset.custom_instance_prompts:
        del tokenizers, text_encoders
        # Explicitly delete the objects as well, otherwise only the lists are deleted and the original references remain, preventing garbage collection
        del text_encoder_one, text_encoder_two, text_encoder_three
@@ -78,7 +78,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.30.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)

@@ -64,7 +64,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.30.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)

@@ -57,7 +57,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.30.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -60,7 +60,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.30.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -52,7 +52,7 @@ if is_wandb_available():


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.30.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -46,7 +46,7 @@ from diffusers.utils import check_min_version, is_wandb_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.30.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -46,7 +46,7 @@ from diffusers.utils import check_min_version, is_wandb_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.30.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -51,7 +51,7 @@ if is_wandb_available():


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.30.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -43,7 +43,7 @@ from PIL import Image
 from torch.utils.data import default_collate
 from torchvision import transforms
 from tqdm.auto import tqdm
-from transformers import AutoTokenizer, DPTFeatureExtractor, DPTForDepthEstimation, PretrainedConfig
+from transformers import AutoTokenizer, DPTForDepthEstimation, DPTImageProcessor, PretrainedConfig
 from webdataset.tariterators import (
    base_plus_ext,
    tar_file_expander,
@@ -205,7 +205,7 @@ class Text2ImageDataset:
        pin_memory: bool = False,
        persistent_workers: bool = False,
        control_type: str = "canny",
-        feature_extractor: Optional[DPTFeatureExtractor] = None,
+        feature_extractor: Optional[DPTImageProcessor] = None,
    ):
        if not isinstance(train_shards_path_or_url, str):
            train_shards_path_or_url = [list(braceexpand(urls)) for urls in train_shards_path_or_url]
@@ -1011,7 +1011,7 @@ def main(args):
        controlnet = pre_controlnet

    if args.control_type == "depth":
-        feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-hybrid-midas")
+        feature_extractor = DPTImageProcessor.from_pretrained("Intel/dpt-hybrid-midas")
        depth_model = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas")
        depth_model.requires_grad_(False)
    else:
@@ -45,7 +45,7 @@
    "    UniPCMultistepScheduler,\n",
    "    EulerDiscreteScheduler,\n",
    ")\n",
-    "from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer\n",
+    "from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer\n",
    "# pretrained_model_name_or_path = 'masterful/gligen-1-4-generation-text-box'\n",
    "\n",
    "pretrained_model_name_or_path = '/root/data/zhizhonghuang/checkpoints/models--masterful--gligen-1-4-generation-text-box/snapshots/d2820dc1e9ba6ca082051ce79cfd3eb468ae2c83'\n",
@@ -46,5 +46,4 @@ pipe.enable_model_cpu_offload()
 # generate image
 generator = torch.manual_seed(0)
 image = pipe("a tortoise", num_inference_steps=20, generator=generator, image_pair=[image_a,image_b], image=query).images[0]
-
 ```
@@ -2051,7 +2051,7 @@ if __name__ == "__main__":
        default=512,
        type=int,
        help=(
-            "The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Siffusion v2"
+            "The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Diffusion v2"
            " Base. Use 768 for Stable Diffusion v2."
        ),
    )
@@ -1253,7 +1253,7 @@ class PromptDiffusionPipeline(
                )

                if guess_mode and self.do_classifier_free_guidance:
-                    # Infered ControlNet only for the conditional batch.
+                    # Inferred ControlNet only for the conditional batch.
                    # To apply the output of ControlNet to both the unconditional and conditional batches,
                    # add 0 to the unconditional batch to keep it unchanged.
                    down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
@@ -4,7 +4,7 @@ from typing import Callable, List, Optional, Union
 import torch
 from PIL import Image
 from retriever import Retriever, normalize_images, preprocess_images
-from transformers import CLIPFeatureExtractor, CLIPModel, CLIPTokenizer
+from transformers import CLIPImageProcessor, CLIPModel, CLIPTokenizer

 from diffusers import (
    AutoencoderKL,
@@ -47,7 +47,7 @@ class RDMPipeline(DiffusionPipeline, StableDiffusionMixin):
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
-        feature_extractor ([`CLIPFeatureExtractor`]):
+        feature_extractor ([`CLIPImageProcessor`]):
            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
    """

@@ -65,7 +65,7 @@ class RDMPipeline(DiffusionPipeline, StableDiffusionMixin):
            EulerAncestralDiscreteScheduler,
            DPMSolverMultistepScheduler,
        ],
-        feature_extractor: CLIPFeatureExtractor,
+        feature_extractor: CLIPImageProcessor,
        retriever: Optional[Retriever] = None,
    ):
        super().__init__()
@@ -6,7 +6,7 @@ import numpy as np
 import torch
 from datasets import Dataset, load_dataset
 from PIL import Image
-from transformers import CLIPFeatureExtractor, CLIPModel, PretrainedConfig
+from transformers import CLIPImageProcessor, CLIPModel, PretrainedConfig

 from diffusers import logging

@@ -20,7 +20,7 @@ def normalize_images(images: List[Image.Image]):
    return images


-def preprocess_images(images: List[np.array], feature_extractor: CLIPFeatureExtractor) -> torch.Tensor:
+def preprocess_images(images: List[np.array], feature_extractor: CLIPImageProcessor) -> torch.Tensor:
    """
    Preprocesses a list of images into a batch of tensors.

@@ -95,14 +95,12 @@ class Index:
    def build_index(
        self,
        model=None,
-        feature_extractor: CLIPFeatureExtractor = None,
+        feature_extractor: CLIPImageProcessor = None,
        torch_dtype=torch.float32,
    ):
        if not self.index_initialized:
            model = model or CLIPModel.from_pretrained(self.config.clip_name_or_path).to(dtype=torch_dtype)
-            feature_extractor = feature_extractor or CLIPFeatureExtractor.from_pretrained(
-                self.config.clip_name_or_path
-            )
+            feature_extractor = feature_extractor or CLIPImageProcessor.from_pretrained(self.config.clip_name_or_path)
            self.dataset = get_dataset_with_emb_from_clip_model(
                self.dataset,
                model,
@@ -136,7 +134,7 @@ class Retriever:
        index: Index = None,
        dataset: Dataset = None,
        model=None,
-        feature_extractor: CLIPFeatureExtractor = None,
+        feature_extractor: CLIPImageProcessor = None,
    ):
        self.config = config
        self.index = index or self._build_index(config, dataset, model=model, feature_extractor=feature_extractor)
@@ -148,7 +146,7 @@ class Retriever:
        index: Index = None,
        dataset: Dataset = None,
        model=None,
-        feature_extractor: CLIPFeatureExtractor = None,
+        feature_extractor: CLIPImageProcessor = None,
        **kwargs,
    ):
        config = kwargs.pop("config", None) or IndexConfig.from_pretrained(retriever_name_or_path, **kwargs)
@@ -156,7 +154,7 @@ class Retriever:

    @staticmethod
    def _build_index(
-        config: IndexConfig, dataset: Dataset = None, model=None, feature_extractor: CLIPFeatureExtractor = None
+        config: IndexConfig, dataset: Dataset = None, model=None, feature_extractor: CLIPImageProcessor = None
    ):
        dataset = dataset or load_dataset(config.dataset_name)
        dataset = dataset[config.dataset_set]
--- a/Show More
+++ b/Show More