up

2023-01-03 16:49:38 +01:00 · 2023-01-03 16:46:13 +01:00
239 changed files with 14878 additions and 4822 deletions
@@ -1,4 +1,4 @@
-name: Nightly integration tests
+name: Nightly tests on main

 on:
  schedule:
@@ -9,12 +9,108 @@ env:
  HF_HOME: /mnt/cache
  OMP_NUM_THREADS: 8
  MKL_NUM_THREADS: 8
-  PYTEST_TIMEOUT: 1000
+  PYTEST_TIMEOUT: 600
  RUN_SLOW: yes
+  RUN_NIGHTLY: yes

 jobs:
-  run_slow_tests_apple_m1:
-    name: Slow PyTorch MPS tests on MacOS
+  run_nightly_tests:
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+          - name: Nightly PyTorch CUDA tests on Ubuntu
+            framework: pytorch
+            runner: docker-gpu
+            image: diffusers/diffusers-pytorch-cuda
+            report: torch_cuda
+          - name: Nightly Flax TPU tests on Ubuntu
+            framework: flax
+            runner: docker-tpu
+            image: diffusers/diffusers-flax-tpu
+            report: flax_tpu
+          - name: Nightly ONNXRuntime CUDA tests on Ubuntu
+            framework: onnxruntime
+            runner: docker-gpu
+            image: diffusers/diffusers-onnxruntime-cuda
+            report: onnx_cuda
+
+    name: ${{ matrix.config.name }}
+
+    runs-on: ${{ matrix.config.runner }}
+
+    container:
+      image: ${{ matrix.config.image }}
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ ${{ matrix.config.runner == 'docker-tpu' && '--privileged' || '--gpus 0'}}
+
+    defaults:
+      run:
+        shell: bash
+
+    steps:
+      - name: Checkout diffusers
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 2
+
+      - name: NVIDIA-SMI
+        if: ${{ matrix.config.runner == 'docker-gpu' }}
+        run: |
+          nvidia-smi
+
+      - name: Install dependencies
+        run: |
+          python -m pip install -e .[quality,test]
+          python -m pip install -U git+https://github.com/huggingface/transformers
+          python -m pip install git+https://github.com/huggingface/accelerate
+
+      - name: Environment
+        run: |
+          python utils/print_env.py
+
+      - name: Run nightly PyTorch CUDA tests
+        if: ${{ matrix.config.framework == 'pytorch' }}
+        env:
+          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+        run: |
+          python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
+            -s -v -k "not Flax and not Onnx" \
+            --make-reports=tests_${{ matrix.config.report }} \
+            tests/
+
+      - name: Run nightly Flax TPU tests
+        if: ${{ matrix.config.framework == 'flax' }}
+        env:
+          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+        run: |
+          python -m pytest -n 0 \
+            -s -v -k "Flax" \
+            --make-reports=tests_${{ matrix.config.report }} \
+            tests/
+
+      - name: Run nightly ONNXRuntime CUDA tests
+        if: ${{ matrix.config.framework == 'onnxruntime' }}
+        env:
+          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+        run: |
+          python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
+            -s -v -k "Onnx" \
+            --make-reports=tests_${{ matrix.config.report }} \
+            tests/
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        run: cat reports/tests_${{ matrix.config.report }}_failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: ${{ matrix.config.report }}_test_reports
+          path: reports
+
+  run_nightly_tests_apple_m1:
+    name: Nightly PyTorch MPS tests on MacOS
    runs-on: [ self-hosted, apple-m1 ]

    steps:
@@ -46,7 +142,7 @@ jobs:
        run: |
          ${CONDA_RUN} python utils/print_env.py

-      - name: Run slow PyTorch tests on M1 (MPS)
+      - name: Run nightly PyTorch tests on M1 (MPS)
        shell: arch -arch arm64 bash {0}
        env:
          HF_HOME: /System/Volumes/Data/mnt/cache
@@ -63,4 +159,4 @@ jobs:
        uses: actions/upload-artifact@v2
        with:
          name: torch_mps_test_reports
-          path: reports
+          path: reports
@@ -1,4 +1,4 @@
-name: Run fast tests
+name: Fast tests for PRs

 on:
  pull_request:
@@ -59,8 +59,8 @@ jobs:
      run: |
        apt-get update && apt-get install libsndfile1-dev -y
        python -m pip install -e .[quality,test]
-        python -m pip install git+https://github.com/huggingface/accelerate
        python -m pip install -U git+https://github.com/huggingface/transformers
+        python -m pip install git+https://github.com/huggingface/accelerate

    - name: Environment
      run: |
@@ -1,4 +1,4 @@
-name: Run all tests
+name: Slow tests on main

 on:
  push:
@@ -10,7 +10,7 @@ env:
  HF_HOME: /mnt/cache
  OMP_NUM_THREADS: 8
  MKL_NUM_THREADS: 8
-  PYTEST_TIMEOUT: 1000
+  PYTEST_TIMEOUT: 600
  RUN_SLOW: yes

 jobs:
@@ -61,8 +61,8 @@ jobs:
    - name: Install dependencies
      run: |
        python -m pip install -e .[quality,test]
-        python -m pip install git+https://github.com/huggingface/accelerate
        python -m pip install -U git+https://github.com/huggingface/transformers
+        python -m pip install git+https://github.com/huggingface/accelerate

    - name: Environment
      run: |
@@ -153,4 +153,4 @@ jobs:
      uses: actions/upload-artifact@v2
      with:
        name: examples_test_reports
-        path: reports
+        path: reports
@@ -166,3 +166,6 @@ tags
 .DS_Store
 # RL pipelines may produce mp4 outputs
 *.mp4
+
+# dependencies
+/transformers
@@ -235,6 +235,55 @@ images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).
 images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
 ```

+Diffusers also has a Image-to-Image generation pipeline with Flax/Jax
+```python
+import jax
+import numpy as np
+import jax.numpy as jnp
+from flax.jax_utils import replicate
+from flax.training.common_utils import shard
+import requests
+from io import BytesIO
+from PIL import Image
+from diffusers import FlaxStableDiffusionImg2ImgPipeline
+
+def create_key(seed=0):
+    return jax.random.PRNGKey(seed)
+rng = create_key(0)
+
+url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
+response = requests.get(url)
+init_img = Image.open(BytesIO(response.content)).convert("RGB")
+init_img = init_img.resize((768, 512))
+
+prompts = "A fantasy landscape, trending on artstation"
+
+pipeline, params = FlaxStableDiffusionImg2ImgPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4", revision="flax",
+    dtype=jnp.bfloat16,
+)
+
+num_samples = jax.device_count()
+rng = jax.random.split(rng, jax.device_count())
+prompt_ids, processed_image = pipeline.prepare_inputs(prompt=[prompts]*num_samples, image = [init_img]*num_samples)
+p_params = replicate(params)
+prompt_ids = shard(prompt_ids)
+processed_image = shard(processed_image)
+
+output = pipeline(
+    prompt_ids=prompt_ids, 
+    image=processed_image, 
+    params=p_params, 
+    prng_seed=rng, 
+    strength=0.75, 
+    num_inference_steps=50, 
+    jit=True, 
+    height=512,
+    width=768).images
+
+output_images = pipeline.numpy_to_pil(np.asarray(output.reshape((num_samples,) + output.shape[-3:])))
+```
+
 ### Image-to-Image text-guided generation with Stable Diffusion

 The `StableDiffusionImg2ImgPipeline` lets you pass a text prompt and an initial image to condition the generation of new images.
@@ -302,11 +351,8 @@ image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0]

 ### Tweak prompts reusing seeds and latents

-You can generate your own latents to reproduce results, or tweak your prompt on a specific result you liked. [This notebook](https://github.com/pcuenca/diffusers-examples/blob/main/notebooks/stable-diffusion-seeds.ipynb) shows how to do it step by step. You can also run it in Google Colab [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pcuenca/diffusers-examples/blob/main/notebooks/stable-diffusion-seeds.ipynb).
-
-
-For more details, check out [the Stable Diffusion notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_diffusion.ipynb) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_diffusion.ipynb)
-and have a look into the [release notes](https://github.com/huggingface/diffusers/releases/tag/v0.2.0).
+You can generate your own latents to reproduce results, or tweak your prompt on a specific result you liked.
+Please have a look at [Reusing seeds for deterministic generation](https://huggingface.co/docs/diffusers/main/en/using-diffusers/reusing_seeds).

 ## Fine-Tuning Stable Diffusion

@@ -0,0 +1,271 @@
+<!---
+Copyright 2022- The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# Generating the documentation
+
+To generate the documentation, you first have to build it. Several packages are necessary to build the doc, 
+you can install them with the following command, at the root of the code repository:
+
+```bash
+pip install -e ".[docs]"
+```
+
+Then you need to install our open source documentation builder tool:
+
+```bash
+pip install git+https://github.com/huggingface/doc-builder
+```
+
+---
+**NOTE**
+
+You only need to generate the documentation to inspect it locally (if you're planning changes and want to
+check how they look before committing for instance). You don't have to commit the built documentation.
+
+---
+
+## Previewing the documentation
+
+To preview the docs, first install the `watchdog` module with:
+
+```bash
+pip install watchdog
+```
+
+Then run the following command:
+
+```bash
+doc-builder preview {package_name} {path_to_docs}
+```
+
+For example:
+
+```bash
+doc-builder preview diffusers docs/source/
+```
+
+The docs will be viewable at [http://localhost:3000](http://localhost:3000). You can also preview the docs once you have opened a PR. You will see a bot add a comment to a link where the documentation with your changes lives.
+
+---
+**NOTE**
+
+The `preview` command only works with existing doc files. When you add a completely new file, you need to update `_toctree.yml` & restart `preview` command (`ctrl-c` to stop it & call `doc-builder preview ...` again).
+
+---
+
+## Adding a new element to the navigation bar
+
+Accepted files are Markdown (.md or .mdx).
+
+Create a file with its extension and put it in the source directory. You can then link it to the toc-tree by putting
+the filename without the extension in the [`_toctree.yml`](https://github.com/huggingface/diffusers/blob/main/docs/source/_toctree.yml) file.
+
+## Renaming section headers and moving sections
+
+It helps to keep the old links working when renaming the section header and/or moving sections from one document to another. This is because the old links are likely to be used in Issues, Forums, and Social media and it'd make for a much more superior user experience if users reading those months later could still easily navigate to the originally intended information.
+
+Therefore, we simply keep a little map of moved sections at the end of the document where the original section was. The key is to preserve the original anchor.
+
+So if you renamed a section from: "Section A" to "Section B", then you can add at the end of the file:
+
+```
+Sections that were moved:
+
+[ <a href="#section-b">Section A</a><a id="section-a"></a> ]
+```
+and of course, if you moved it to another file, then:
+
+```
+Sections that were moved:
+
+[ <a href="../new-file#section-b">Section A</a><a id="section-a"></a> ]
+```
+
+Use the relative style to link to the new file so that the versioned docs continue to work.
+
+For an example of a rich moved section set please see the very end of [the transformers Trainer doc](https://github.com/huggingface/transformers/blob/main/docs/source/en/main_classes/trainer.mdx).
+
+
+## Writing Documentation - Specification
+
+The `huggingface/diffusers` documentation follows the
+[Google documentation](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) style for docstrings,
+although we can write them directly in Markdown.
+
+### Adding a new tutorial
+
+Adding a new tutorial or section is done in two steps:
+
+- Add a new file under `docs/source`. This file can either be ReStructuredText (.rst) or Markdown (.md).
+- Link that file in `docs/source/_toctree.yml` on the correct toc-tree.
+
+Make sure to put your new file under the proper section. It's unlikely to go in the first section (*Get Started*), so
+depending on the intended targets (beginners, more advanced users, or researchers) it should go in sections two, three, or four.
+
+### Adding a new pipeline/scheduler
+
+When adding a new pipeline:
+
+- create a file `xxx.mdx` under `docs/source/api/pipelines` (don't hesitate to copy an existing file as template).
+- Link that file in (*Diffusers Summary*) section in `docs/source/api/pipelines/overview.mdx`, along with the link to the paper, and a colab notebook (if available).
+- Write a short overview of the diffusion model:
+    - Overview with paper & authors
+    - Paper abstract
+    - Tips and tricks and how to use it best
+    - Possible an end-to-end example of how to use it
+- Add all the pipeline classes that should be linked in the diffusion model. These classes should be added using our Markdown syntax. By default as follows:
+
+```
+## XXXPipeline
+
+[[autodoc]] XXXPipeline
+    - all
+	- __call__
+```
+
+This will include every public method of the pipeline that is documented, as well as the  `__call__` method that is not documented by default. If you just want to add additional methods that are not documented, you can put the list of all methods to add in a list that contains `all`.
+
+```
+[[autodoc]] XXXPipeline
+    - all
+	- __call__
+	- enable_attention_slicing
+	- disable_attention_slicing
+    - enable_xformers_memory_efficient_attention 
+    - disable_xformers_memory_efficient_attention
+```
+
+You can follow the same process to create a new scheduler under the `docs/source/api/schedulers` folder
+
+### Writing source documentation
+
+Values that should be put in `code` should either be surrounded by backticks: \`like so\`. Note that argument names
+and objects like True, None, or any strings should usually be put in `code`.
+
+When mentioning a class, function, or method, it is recommended to use our syntax for internal links so that our tool
+adds a link to its documentation with this syntax: \[\`XXXClass\`\] or \[\`function\`\]. This requires the class or 
+function to be in the main package.
+
+If you want to create a link to some internal class or function, you need to
+provide its path. For instance: \[\`pipelines.ImagePipelineOutput\`\]. This will be converted into a link with
+`pipelines.ImagePipelineOutput` in the description. To get rid of the path and only keep the name of the object you are
+linking to in the description, add a ~: \[\`~pipelines.ImagePipelineOutput\`\] will generate a link with `ImagePipelineOutput` in the description.
+
+The same works for methods so you can either use \[\`XXXClass.method\`\] or \[~\`XXXClass.method\`\].
+
+#### Defining arguments in a method
+
+Arguments should be defined with the `Args:` (or `Arguments:` or `Parameters:`) prefix, followed by a line return and
+an indentation. The argument should be followed by its type, with its shape if it is a tensor, a colon, and its
+description:
+
+```
+    Args:
+        n_layers (`int`): The number of layers of the model.
+```
+
+If the description is too long to fit in one line, another indentation is necessary before writing the description
+after the argument.
+
+Here's an example showcasing everything so far:
+
+```
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AlbertTokenizer`]. See [`~PreTrainedTokenizer.encode`] and
+            [`~PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+```
+
+For optional arguments or arguments with defaults we follow the following syntax: imagine we have a function with the
+following signature:
+
+```
+def my_function(x: str = None, a: float = 1):
+```
+
+then its documentation should look like this:
+
+```
+    Args:
+        x (`str`, *optional*):
+            This argument controls ...
+        a (`float`, *optional*, defaults to 1):
+            This argument is used to ...
+```
+
+Note that we always omit the "defaults to \`None\`" when None is the default for any argument. Also note that even
+if the first line describing your argument type and its default gets long, you can't break it on several lines. You can
+however write as many lines as you want in the indented description (see the example above with `input_ids`).
+
+#### Writing a multi-line code block
+
+Multi-line code blocks can be useful for displaying examples. They are done between two lines of three backticks as usual in Markdown:
+
+
+````
+```
+# first line of code
+# second line
+# etc
+```
+````
+
+#### Writing a return block
+
+The return block should be introduced with the `Returns:` prefix, followed by a line return and an indentation.
+The first line should be the type of the return, followed by a line return. No need to indent further for the elements
+building the return.
+
+Here's an example of a single value return:
+
+```
+    Returns:
+        `List[int]`: A list of integers in the range [0, 1] --- 1 for a special token, 0 for a sequence token.
+```
+
+Here's an example of a tuple return, comprising several objects:
+
+```
+    Returns:
+        `tuple(torch.FloatTensor)` comprising various elements depending on the configuration ([`BertConfig`]) and inputs:
+        - ** loss** (*optional*, returned when `masked_lm_labels` is provided) `torch.FloatTensor` of shape `(1,)` --
+          Total loss is the sum of the masked language modeling loss and the next sequence prediction (classification) loss.
+        - **prediction_scores** (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`) --
+          Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+```
+
+#### Adding an image
+
+Due to the rapidly growing repository, it is important to make sure that no files that would significantly weigh down the repository are added. This includes images, videos, and other non-text files. We prefer to leverage a hf.co hosted `dataset` like
+the ones hosted on [`hf-internal-testing`](https://huggingface.co/hf-internal-testing) in which to place these files and reference
+them by URL. We recommend putting them in the following dataset: [huggingface/documentation-images](https://huggingface.co/datasets/huggingface/documentation-images).
+If an external contribution, feel free to add the images to your PR and ask a Hugging Face member to migrate your images
+to this dataset.
+
+## Styling the docstring
+
+We have an automatic script running with the `make style` command that will make sure that:
+- the docstrings fully take advantage of the line width
+- all code examples are formatted using black, like the code of the Transformers library
+
+This script may have some weird failures if you made a syntax mistake or if you uncover a bug. Therefore, it's
+recommended to commit your changes before running `make style`, so you can revert the changes done by that script
+easily.
+
@@ -28,6 +28,8 @@
      title: "Text-Guided Image-Inpainting"
    - local: using-diffusers/depth2img
      title: "Text-Guided Depth-to-Image"
+    - local: using-diffusers/reusing_seeds
+      title: "Reusing seeds for deterministic generation"
    - local: using-diffusers/custom_pipeline_examples
      title: "Community Pipelines"
    - local: using-diffusers/contribute_pipeline
@@ -45,6 +47,8 @@
 - sections:
  - local: optimization/fp16
    title: "Memory and Speed"
+  - local: optimization/xformers
+    title: "xFormers"
  - local: optimization/onnx
    title: "ONNX"
  - local: optimization/open_vino
@@ -78,8 +82,6 @@
  - sections:
    - local: api/models
      title: "Models"
-    - local: api/schedulers
-      title: "Schedulers"
    - local: api/diffusion_pipeline
      title: "Diffusion Pipeline"
    - local: api/logging
@@ -89,6 +91,7 @@
    - local: api/outputs
      title: "Outputs"
    title: "Main Classes"
+    
  - sections:
    - local: api/pipelines/overview
      title: "Overview"
@@ -110,7 +113,21 @@
      title: "PNDM"
    - local: api/pipelines/score_sde_ve
      title: "Score SDE VE"
-    - local: api/pipelines/stable_diffusion
+    - sections:
+      - local: api/pipelines/stable_diffusion/overview
+        title: "Overview"
+      - local: api/pipelines/stable_diffusion/text2img
+        title: "Text-to-Image"
+      - local: api/pipelines/stable_diffusion/img2img
+        title: "Image-to-Image"
+      - local: api/pipelines/stable_diffusion/inpaint
+        title: "Inpaint"
+      - local: api/pipelines/stable_diffusion/depth2img
+        title: "Depth-to-Image"
+      - local: api/pipelines/stable_diffusion/image_variation
+        title: "Image-Variation"  
+      - local: api/pipelines/stable_diffusion/upscale
+        title: "Super-Resolution"  
      title: "Stable Diffusion"
    - local: api/pipelines/stable_diffusion_2
      title: "Stable Diffusion 2"
@@ -120,6 +137,8 @@
      title: "Stochastic Karras VE"
    - local: api/pipelines/dance_diffusion
      title: "Dance Diffusion"
+    - local: api/pipelines/unclip
+      title: "UnCLIP"
    - local: api/pipelines/versatile_diffusion
      title: "Versatile Diffusion"
    - local: api/pipelines/vq_diffusion
@@ -129,6 +148,44 @@
    - local: api/pipelines/audio_diffusion
      title: "Audio Diffusion"
    title: "Pipelines"
+  - sections:
+    - local: api/schedulers/overview
+      title: "Overview"
+    - local: api/schedulers/ddim
+      title: "DDIM"
+    - local: api/schedulers/ddpm
+      title: "DDPM"
+    - local: api/schedulers/singlestep_dpm_solver
+      title: "Singlestep DPM-Solver"
+    - local: api/schedulers/multistep_dpm_solver
+      title: "Multistep DPM-Solver"
+    - local: api/schedulers/heun
+      title: "Heun Scheduler"
+    - local: api/schedulers/dpm_discrete
+      title: "DPM Discrete Scheduler"
+    - local: api/schedulers/dpm_discrete_ancestral
+      title: "DPM Discrete Scheduler with ancestral sampling"
+    - local: api/schedulers/stochastic_karras_ve
+      title: "Stochastic Kerras VE"
+    - local: api/schedulers/lms_discrete
+      title: "Linear Multistep"
+    - local: api/schedulers/pndm
+      title: "PNDM"
+    - local: api/schedulers/score_sde_ve
+      title: "VE-SDE"
+    - local: api/schedulers/ipndm
+      title: "IPNDM"
+    - local: api/schedulers/score_sde_vp
+      title: "VP-SDE"
+    - local: api/schedulers/euler
+      title: "Euler scheduler"
+    - local: api/schedulers/euler_ancestral
+      title: "Euler Ancestral Scheduler"
+    - local: api/schedulers/vq_diffusion
+      title: "VQDiffusionScheduler"
+    - local: api/schedulers/repaint
+      title: "RePaint Scheduler"      
+    title: "Schedulers"
  - sections:
    - local: api/experimental/rl
      title: "RL Planning"
@@ -30,13 +30,17 @@ Any pipeline object can be saved locally with [`~DiffusionPipeline.save_pretrain

 ## DiffusionPipeline
 [[autodoc]] DiffusionPipeline
-	- from_pretrained
-	- save_pretrained
-	- to
+	- all
+	- __call__
 	- device
-	- components
+	- to

 ## ImagePipelineOutput
 By default diffusion pipelines return an object of class

-[[autodoc]] pipeline_utils.ImagePipelineOutput
+[[autodoc]] pipelines.ImagePipelineOutput
+
+## AudioPipelineOutput
+By default diffusion pipelines return an object of class
+
+[[autodoc]] pipelines.AudioPipelineOutput
@@ -41,13 +41,13 @@ The models are built on the base class ['ModelMixin'] that is a `torch.nn.module
 [[autodoc]] models.vae.DecoderOutput

 ## VQEncoderOutput
-[[autodoc]] models.vae.VQEncoderOutput
+[[autodoc]] models.vq_model.VQEncoderOutput

 ## VQModel
 [[autodoc]] VQModel

 ## AutoencoderKLOutput
-[[autodoc]] models.vae.AutoencoderKLOutput
+[[autodoc]] models.autoencoder_kl.AutoencoderKLOutput

 ## AutoencoderKL
 [[autodoc]] AutoencoderKL
@@ -56,7 +56,13 @@ The models are built on the base class ['ModelMixin'] that is a `torch.nn.module
 [[autodoc]] Transformer2DModel

 ## Transformer2DModelOutput
-[[autodoc]] models.attention.Transformer2DModelOutput
+[[autodoc]] models.transformer_2d.Transformer2DModelOutput
+
+## PriorTransformer
+[[autodoc]] models.prior_transformer.PriorTransformer
+
+## PriorTransformerOutput
+[[autodoc]] models.prior_transformer.PriorTransformerOutput

 ## FlaxModelMixin
 [[autodoc]] FlaxModelMixin
@@ -25,7 +25,7 @@ pipeline = DDIMPipeline.from_pretrained("google/ddpm-cifar10-32")
 outputs = pipeline()
 ```

-The `outputs` object is a [`~pipeline_utils.ImagePipelineOutput`], as we can see in the
+The `outputs` object is a [`~pipelines.ImagePipelineOutput`], as we can see in the
 documentation of that class below, it means it has an image attribute.

 You can access each attribute as you would usually do, and if that attribute has not been returned by the model, you will get `None`:
@@ -28,7 +28,7 @@ The abstract of the paper is the following:

 ## Tips

- AltDiffusion is conceptually exaclty the same as [Stable Diffusion](./api/pipelines/stable_diffusion).
+- AltDiffusion is conceptually exaclty the same as [Stable Diffusion](./api/pipelines/stable_diffusion/overview).

 - *Run AltDiffusion*

@@ -69,15 +69,15 @@ If you want to use all possible use cases in a single `DiffusionPipeline` we rec

 ## AltDiffusionPipelineOutput
 [[autodoc]] pipelines.alt_diffusion.AltDiffusionPipelineOutput
+	- all
+	- __call__

 ## AltDiffusionPipeline
 [[autodoc]] AltDiffusionPipeline
+	- all
 	- __call__
-	- enable_attention_slicing
-	- disable_attention_slicing

 ## AltDiffusionImg2ImgPipeline
 [[autodoc]] AltDiffusionImg2ImgPipeline
+	- all
 	- __call__
-	- enable_attention_slicing
-	- disable_attention_slicing
@@ -91,12 +91,8 @@ display(Audio(output.audios[0], rate=pipe.mel.get_sample_rate()))

 ## AudioDiffusionPipeline
 [[autodoc]] AudioDiffusionPipeline
-    - __call__
-    - encode
-    - slerp
-
+	- all
+	- __call__

 ## Mel
 [[autodoc]] Mel
-    - audio_slice_to_image
-    - image_to_audio
@@ -96,4 +96,5 @@ image.save("black_to_blue.png")

 ## CycleDiffusionPipeline
 [[autodoc]] CycleDiffusionPipeline
+	- all
 	- __call__
@@ -30,4 +30,5 @@ The original codebase of this implementation can be found [here](https://github.

 ## DanceDiffusionPipeline
 [[autodoc]] DanceDiffusionPipeline
-    - __call__
+	- all
+	- __call__
@@ -32,4 +32,5 @@ For questions, feel free to contact the author on [tsong.me](https://tsong.me/).

 ## DDIMPipeline
 [[autodoc]] DDIMPipeline
-    - __call__
+	- all
+	- __call__
@@ -33,4 +33,5 @@ The original codebase of this paper can be found [here](https://github.com/hojon

 # DDPMPipeline
 [[autodoc]] DDPMPipeline
-    - __call__
+	- all
+	- __call__
@@ -40,8 +40,10 @@ The original codebase can be found [here](https://github.com/CompVis/latent-diff

 ## LDMTextToImagePipeline
 [[autodoc]] LDMTextToImagePipeline
-    - __call__
+	- all
+	- __call__

 ## LDMSuperResolutionPipeline
 [[autodoc]] LDMSuperResolutionPipeline
-    - __call__
+	- all
+	- __call__
@@ -38,4 +38,5 @@ The original codebase can be found [here](https://github.com/CompVis/latent-diff

 ## LDMPipeline
 [[autodoc]] LDMPipeline
-    - __call__
+	- all
+	- __call__
@@ -44,31 +44,32 @@ available a colab notebook to directly try them out.

 | Pipeline | Paper | Tasks | Colab
 |---|---|:---:|:---:|
-| [alt_diffusion](./api/pipelines/alt_diffusion) | [**AltDiffusion**](https://arxiv.org/abs/2211.06679) | Image-to-Image Text-Guided Generation | -
-| [audio_diffusion](./api/pipelines/audio_diffusion) | [**Audio Diffusion**](https://github.com/teticio/audio_diffusion.git) | Unconditional Audio Generation |
-| [cycle_diffusion](./api/pipelines/cycle_diffusion) | [**Cycle Diffusion**](https://arxiv.org/abs/2210.05559) | Image-to-Image Text-Guided Generation |
-| [dance_diffusion](./api/pipelines/dance_diffusion) | [**Dance Diffusion**](https://github.com/williamberman/diffusers.git) | Unconditional Audio Generation |
-| [ddpm](./api/pipelines/ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation |
-| [ddim](./api/pipelines/ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) | Unconditional Image Generation |
-| [latent_diffusion](./api/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Text-to-Image Generation | 
-| [latent_diffusion](./api/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Super Resolution Image-to-Image | 
-| [latent_diffusion_uncond](./api/pipelines/latent_diffusion_uncond) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | Unconditional Image Generation | 
-| [paint_by_example](./api/pipelines/paint_by_example) | [**Paint by Example: Exemplar-based Image Editing with Diffusion Models**](https://arxiv.org/abs/2211.13227) | Image-Guided Image Inpainting | 
-| [pndm](./api/pipelines/pndm) | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778) | Unconditional Image Generation | 
-| [score_sde_ve](./api/pipelines/score_sde_ve) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation | 
-| [score_sde_vp](./api/pipelines/score_sde_vp) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation | 
-| [stable_diffusion](./api/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-to-Image Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
-| [stable_diffusion](./api/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Image-to-Image Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
-| [stable_diffusion](./api/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
-| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-to-Image Generation | 
-| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Image Inpainting | 
-| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Super Resolution Image-to-Image |
-| [stable_diffusion_safe](./api/pipelines/stable_diffusion_safe) | [**Safe Stable Diffusion**](https://arxiv.org/abs/2211.05105) | Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ml-research/safe-latent-diffusion/blob/main/examples/Safe%20Latent%20Diffusion.ipynb)
-| [stochastic_karras_ve](./api/pipelines/stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation |
-| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Text-to-Image Generation | 
-| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Image Variations Generation | 
-| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Dual Image and Text Guided Generation | 
-| [vq_diffusion](./api/pipelines/vq_diffusion) | [Vector Quantized Diffusion Model for Text-to-Image Synthesis](https://arxiv.org/abs/2111.14822) | Text-to-Image Generation | 
+| [alt_diffusion](./alt_diffusion) | [**AltDiffusion**](https://arxiv.org/abs/2211.06679) | Image-to-Image Text-Guided Generation | -
+| [audio_diffusion](./audio_diffusion) | [**Audio Diffusion**](https://github.com/teticio/audio_diffusion.git) | Unconditional Audio Generation |
+| [cycle_diffusion](./cycle_diffusion) | [**Cycle Diffusion**](https://arxiv.org/abs/2210.05559) | Image-to-Image Text-Guided Generation |
+| [dance_diffusion](./dance_diffusion) | [**Dance Diffusion**](https://github.com/williamberman/diffusers.git) | Unconditional Audio Generation |
+| [ddpm](./ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation |
+| [ddim](./ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) | Unconditional Image Generation |
+| [latent_diffusion](./latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Text-to-Image Generation | 
+| [latent_diffusion](./latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Super Resolution Image-to-Image | 
+| [latent_diffusion_uncond](./latent_diffusion_uncond) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | Unconditional Image Generation | 
+| [paint_by_example](./paint_by_example) | [**Paint by Example: Exemplar-based Image Editing with Diffusion Models**](https://arxiv.org/abs/2211.13227) | Image-Guided Image Inpainting | 
+| [pndm](./pndm) | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778) | Unconditional Image Generation | 
+| [score_sde_ve](./score_sde_ve) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation | 
+| [score_sde_vp](./score_sde_vp) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation | 
+| [stable_diffusion](./stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-to-Image Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
+| [stable_diffusion](./stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Image-to-Image Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
+| [stable_diffusion](./stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
+| [stable_diffusion_2](./stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-to-Image Generation | 
+| [stable_diffusion_2](./stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Image Inpainting | 
+| [stable_diffusion_2](./stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Super Resolution Image-to-Image |
+| [stable_diffusion_safe](./stable_diffusion_safe) | [**Safe Stable Diffusion**](https://arxiv.org/abs/2211.05105) | Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ml-research/safe-latent-diffusion/blob/main/examples/Safe%20Latent%20Diffusion.ipynb)
+| [stochastic_karras_ve](./stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation |
+| [unclip](./unclip) | [Hierarchical Text-Conditional Image Generation with CLIP Latents](https://arxiv.org/abs/2204.06125) | Text-to-Image Generation |
+| [versatile_diffusion](./versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Text-to-Image Generation | 
+| [versatile_diffusion](./versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Image Variations Generation | 
+| [versatile_diffusion](./versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Dual Image and Text Guided Generation | 
+| [vq_diffusion](./vq_diffusion) | [Vector Quantized Diffusion Model for Text-to-Image Synthesis](https://arxiv.org/abs/2111.14822) | Text-to-Image Generation | 


 **Note**: Pipelines are simple examples of how to play around with the diffusion systems as described in the corresponding papers. 
@@ -138,9 +139,9 @@ from diffusers import StableDiffusionImg2ImgPipeline

 # load the pipeline
 device = "cuda"
-pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
-    "runwayml/stable-diffusion-v1-5", revision="fp16", torch_dtype=torch.float16
-).to(device)
+pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to(
+    device
+)

 # let's download an initial image
 url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
@@ -188,7 +189,6 @@ mask_image = download_image(mask_url).resize((512, 512))

 pipe = StableDiffusionInpaintPipeline.from_pretrained(
    "runwayml/stable-diffusion-inpainting",
-    revision="fp16",
    torch_dtype=torch.float16,
 )
 pipe = pipe.to("cuda")
@@ -69,5 +69,6 @@ image
 ```

 ## PaintByExamplePipeline
-[[autodoc]] pipelines.paint_by_example.pipeline_paint_by_example.PaintByExamplePipeline
-    - __call__
+[[autodoc]] PaintByExamplePipeline
+	- all
+	- __call__
@@ -30,6 +30,6 @@ The original codebase can be found [here](https://github.com/luping-liu/PNDM).


 ## PNDMPipeline
-[[autodoc]] pipelines.pndm.pipeline_pndm.PNDMPipeline
-    - __call__
-
+[[autodoc]] PNDMPipeline
+	- all
+	- __call__
@@ -72,6 +72,6 @@ inpainted_image = output.images[0]
 ```

 ## RePaintPipeline
-[[autodoc]] pipelines.repaint.pipeline_repaint.RePaintPipeline
-    - __call__
-
+[[autodoc]] RePaintPipeline
+	- all
+	- __call__
@@ -32,5 +32,5 @@ This pipeline implements the Variance Expanding (VE) variant of the method.

 ## ScoreSdeVePipeline
 [[autodoc]] ScoreSdeVePipeline
-    - __call__
-
+	- all
+	- __call__
@@ -0,0 +1,33 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Depth-to-Image Generation
+
+## StableDiffusionDepth2ImgPipeline
+
+The depth-guided stable diffusion model was created by the researchers and engineers from [CompVis](https://github.com/CompVis), [Stability AI](https://stability.ai/), and [LAION](https://laion.ai/), as part of Stable Diffusion 2.0. It uses [MiDas](https://github.com/isl-org/MiDaS) to infer depth based on an image.
+
+[`StableDiffusionDepth2ImgPipeline`] lets you pass a text prompt and an initial image to condition the generation of new images as well as a `depth_map` to preserve the images’ structure. 
+
+The original codebase can be found here: 
+- *Stable Diffusion v2*: [Stability-AI/stablediffusion](https://github.com/Stability-AI/stablediffusion#depth-conditional-stable-diffusion)
+
+Available Checkpoints are:
+- *stable-diffusion-2-depth*: [stabilityai/stable-diffusion-2-depth](https://huggingface.co/stabilityai/stable-diffusion-2-depth)
+
+[[autodoc]] StableDiffusionDepth2ImgPipeline
+	- all
+	- __call__
+	- enable_attention_slicing
+	- disable_attention_slicing
+	- enable_xformers_memory_efficient_attention
+	- disable_xformers_memory_efficient_attention
@@ -0,0 +1,31 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Image Variation
+
+## StableDiffusionImageVariationPipeline
+
+[`StableDiffusionImageVariationPipeline`] lets you generate variations from an input image using Stable Diffusion. It uses a fine-tuned version of Stable Diffusion model, trained by  [Justin Pinkney](https://www.justinpinkney.com/) (@Buntworthy) at [Lambda](https://lambdalabs.com/)
+
+The original codebase can be found here:
+[Stable Diffusion Image Variations](https://github.com/LambdaLabsML/lambda-diffusers#stable-diffusion-image-variations)
+
+Available Checkpoints are:
+- *sd-image-variations-diffusers*: [lambdalabs/sd-image-variations-diffusers](https://huggingface.co/lambdalabs/sd-image-variations-diffusers)
+
+[[autodoc]] StableDiffusionImageVariationPipeline
+	- all
+	- __call__
+	- enable_attention_slicing
+	- disable_attention_slicing
+	- enable_xformers_memory_efficient_attention
+	- disable_xformers_memory_efficient_attention
@@ -0,0 +1,29 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Image-to-Image Generation
+
+## StableDiffusionImg2ImgPipeline
+
+The Stable Diffusion model was created by the researchers and engineers from [CompVis](https://github.com/CompVis), [Stability AI](https://stability.ai/), [runway](https://github.com/runwayml), and [LAION](https://laion.ai/). The [`StableDiffusionImg2ImgPipeline`] lets you pass a text prompt and an initial image to condition the generation of new images using Stable Diffusion.
+
+The original codebase can be found here: [CampVis/stable-diffusion](https://github.com/CompVis/stable-diffusion/blob/main/scripts/img2img.py) 
+
+[`StableDiffusionImg2ImgPipeline`] is compatible with all Stable Diffusion checkpoints for [Text-to-Image](./text2img) 
+
+[[autodoc]] StableDiffusionImg2ImgPipeline
+	- all
+	- __call__
+	- enable_attention_slicing
+	- disable_attention_slicing
+	- enable_xformers_memory_efficient_attention
+	- disable_xformers_memory_efficient_attention
@@ -0,0 +1,33 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Text-Guided Image Inpainting
+
+## StableDiffusionInpaintPipeline
+
+The Stable Diffusion model was created by the researchers and engineers from [CompVis](https://github.com/CompVis), [Stability AI](https://stability.ai/), [runway](https://github.com/runwayml), and [LAION](https://laion.ai/). The [`StableDiffusionInpaintPipeline`] lets you edit specific parts of an image by providing a mask and a text prompt using Stable Diffusion.
+
+The original codebase can be found here: 
+- *Stable Diffusion V1*: [CampVis/stable-diffusion](https://github.com/runwayml/stable-diffusion#inpainting-with-stable-diffusion)
+- *Stable Diffusion V2*: [Stability-AI/stablediffusion](https://github.com/Stability-AI/stablediffusion#image-inpainting-with-stable-diffusion)
+
+Available checkpoints are:
+- *stable-diffusion-inpainting (512x512 resolution)*: [runwayml/stable-diffusion-inpainting](https://huggingface.co/runwayml/stable-diffusion-inpainting)
+- *stable-diffusion-2-inpainting (512x512 resolution)*: [stabilityai/stable-diffusion-2-inpainting](https://huggingface.co/stabilityai/stable-diffusion-2-inpainting)
+
+[[autodoc]] StableDiffusionInpaintPipeline
+	- all
+	- __call__
+	- enable_attention_slicing
+	- disable_attention_slicing
+	- enable_xformers_memory_efficient_attention
+	- disable_xformers_memory_efficient_attention
@@ -25,9 +25,14 @@ For more details about how Stable Diffusion works and how it differs from the ba

 | Pipeline | Tasks | Colab | Demo
 |---|---|:---:|:---:|
-| [pipeline_stable_diffusion.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py) | *Text-to-Image Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_diffusion.ipynb) | [🤗 Stable Diffusion](https://huggingface.co/spaces/stabilityai/stable-diffusion)
-| [pipeline_stable_diffusion_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py) | *Image-to-Image Text-Guided Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb) | [🤗 Diffuse the Rest](https://huggingface.co/spaces/huggingface/diffuse-the-rest)
-| [pipeline_stable_diffusion_inpaint.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py) | **Experimental** – *Text-Guided Image Inpainting* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb) | Coming soon
+| [StableDiffusionPipeline](./text2img) | *Text-to-Image Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_diffusion.ipynb) | [🤗 Stable Diffusion](https://huggingface.co/spaces/stabilityai/stable-diffusion)
+| [StableDiffusionImg2ImgPipeline](./img2img) | *Image-to-Image Text-Guided Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb) | [🤗 Diffuse the Rest](https://huggingface.co/spaces/huggingface/diffuse-the-rest)
+| [StableDiffusionInpaintPipeline](./inpaint) | **Experimental** – *Text-Guided Image Inpainting* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb) | Coming soon
+| [StableDiffusionDepth2ImgPipeline](./depth2img) | **Experimental** – *Depth-to-Image Text-Guided Generation * | | Coming soon
+| [StableDiffusionImageVariationPipeline](./image_variation) | **Experimental** – *Image Variation Generation * | | [🤗 Stable Diffusion Image Variations](https://huggingface.co/spaces/lambdalabs/stable-diffusion-image-variations)
+| [StableDiffusionUpscalePipeline](./upscale) | **Experimental** – *Text-Guided Image Super-Resolution * | | Coming soon
+
+

 ## Tips

@@ -73,16 +78,18 @@ If you want to use all possible use cases in a single `DiffusionPipeline` you ca

 ## StableDiffusionPipeline
 [[autodoc]] StableDiffusionPipeline
+	- all
 	- __call__
 	- enable_attention_slicing
 	- disable_attention_slicing
-	- enable_vae_slicing
-	- disable_vae_slicing
 	- enable_xformers_memory_efficient_attention
 	- disable_xformers_memory_efficient_attention

+
+
 ## StableDiffusionImg2ImgPipeline
 [[autodoc]] StableDiffusionImg2ImgPipeline
+	- all
 	- __call__
 	- enable_attention_slicing
 	- disable_attention_slicing
@@ -91,6 +98,7 @@ If you want to use all possible use cases in a single `DiffusionPipeline` you ca

 ## StableDiffusionInpaintPipeline
 [[autodoc]] StableDiffusionInpaintPipeline
+	- all
 	- __call__
 	- enable_attention_slicing
 	- disable_attention_slicing
@@ -99,6 +107,7 @@ If you want to use all possible use cases in a single `DiffusionPipeline` you ca

 ## StableDiffusionDepth2ImgPipeline
 [[autodoc]] StableDiffusionDepth2ImgPipeline
+	- all
 	- __call__
 	- enable_attention_slicing
 	- disable_attention_slicing
@@ -107,15 +116,16 @@ If you want to use all possible use cases in a single `DiffusionPipeline` you ca

 ## StableDiffusionImageVariationPipeline
 [[autodoc]] StableDiffusionImageVariationPipeline
+	- all
 	- __call__
 	- enable_attention_slicing
 	- disable_attention_slicing
 	- enable_xformers_memory_efficient_attention
 	- disable_xformers_memory_efficient_attention

-
 ## StableDiffusionUpscalePipeline
 [[autodoc]] StableDiffusionUpscalePipeline
+	- all
 	- __call__
 	- enable_attention_slicing
 	- disable_attention_slicing
@@ -0,0 +1,39 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Text-to-Image Generation
+
+## StableDiffusionPipeline
+
+The Stable Diffusion model was created by the researchers and engineers from [CompVis](https://github.com/CompVis), [Stability AI](https://stability.ai/), [runway](https://github.com/runwayml), and [LAION](https://laion.ai/). The [`StableDiffusionPipeline`] is capable of generating photo-realistic images given any text input using Stable Diffusion.
+
+The original codebase can be found here: 
+- *Stable Diffusion V1*: [CampVis/stable-diffusion](https://github.com/CompVis/stable-diffusion)
+- *Stable Diffusion v2*: [Stability-AI/stablediffusion](https://github.com/Stability-AI/stablediffusion)
+
+Available Checkpoints are:
+- *stable-diffusion-v1-4 (512x512 resolution)* [CompVis/stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4)
+- *stable-diffusion-v1-5 (512x512 resolution)* [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5)
+- *stable-diffusion-2-base (512x512 resolution)*: [stabilityai/stable-diffusion-2-base](https://huggingface.co/stabilityai/stable-diffusion-2-base)
+- *stable-diffusion-2 (768x768 resolution)*: [stabilityai/stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 
+- *stable-diffusion-2-1-base (512x512 resolution)* [stabilityai/stable-diffusion-2-1-base](https://huggingface.co/stabilityai/stable-diffusion-2-1-base)
+- *stable-diffusion-2-1 (768x768 resolution)*: [stabilityai/stable-diffusion-2-1](https://huggingface.co/stabilityai/stable-diffusion-2-1)
+
+[[autodoc]] StableDiffusionPipeline
+	- all
+	- __call__
+	- enable_attention_slicing
+	- disable_attention_slicing
+	- enable_vae_slicing
+	- disable_vae_slicing
+	- enable_xformers_memory_efficient_attention
+	- disable_xformers_memory_efficient_attention
@@ -0,0 +1,32 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Super-Resolution
+
+## StableDiffusionUpscalePipeline
+
+The upscaler diffusion model was created by the researchers and engineers from [CompVis](https://github.com/CompVis), [Stability AI](https://stability.ai/), and [LAION](https://laion.ai/), as part of Stable Diffusion 2.0. [`StableDiffusionUpscalePipeline`] can be used to enhance the resolution of input images by a factor of 4.
+
+The original codebase can be found here: 
+- *Stable Diffusion v2*: [Stability-AI/stablediffusion](https://github.com/Stability-AI/stablediffusion#image-upscaling-with-stable-diffusion)
+
+Available Checkpoints are:
+- *stabilityai/stable-diffusion-x4-upscaler (x4 resolution resolution)*: [stable-diffusion-x4-upscaler](https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler)
+
+
+[[autodoc]] StableDiffusionUpscalePipeline
+	- all
+	- __call__
+	- enable_attention_slicing
+	- disable_attention_slicing
+	- enable_xformers_memory_efficient_attention
+	- disable_xformers_memory_efficient_attention
@@ -24,17 +24,20 @@ For more details about how Stable Diffusion 2 works and how it differs from Stab

 ### Available checkpoints:

-Note that the architecture is more or less identical to [Stable Diffusion 1](./api/pipelines/stable_diffusion) so please refer to [this page](./api/pipelines/stable_diffusion) for API documentation.
+Note that the architecture is more or less identical to [Stable Diffusion 1](./stable_diffusion/overview) so please refer to [this page](./stable_diffusion/overview) for API documentation.

 - *Text-to-Image (512x512 resolution)*: [stabilityai/stable-diffusion-2-base](https://huggingface.co/stabilityai/stable-diffusion-2-base) with [`StableDiffusionPipeline`]
 - *Text-to-Image (768x768 resolution)*: [stabilityai/stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) with [`StableDiffusionPipeline`]
 - *Image Inpainting (512x512 resolution)*: [stabilityai/stable-diffusion-2-inpainting](https://huggingface.co/stabilityai/stable-diffusion-2-inpainting) with [`StableDiffusionInpaintPipeline`]
- *Image Upscaling (x4 resolution resolution)*: [stable-diffusion-x4-upscaler](https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler) [`StableDiffusionUpscalePipeline`]
+- *Super-Resolution (x4 resolution resolution)*: [stable-diffusion-x4-upscaler](https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler) [`StableDiffusionUpscalePipeline`]
 - *Depth-to-Image (512x512 resolution)*: [stabilityai/stable-diffusion-2-depth](https://huggingface.co/stabilityai/stable-diffusion-2-depth) with [`StableDiffusionDepth2ImagePipeline`]

 We recommend using the [`DPMSolverMultistepScheduler`] as it's currently the fastest scheduler there is.

- *Text-to-Image (512x512 resolution)*:
+
+### Text-to-Image
+
+- *Text-to-Image (512x512 resolution)*: [stabilityai/stable-diffusion-2-base](https://huggingface.co/stabilityai/stable-diffusion-2-base) with [`StableDiffusionPipeline`]

 ```python
 from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
@@ -51,7 +54,7 @@ image = pipe(prompt, num_inference_steps=25).images[0]
 image.save("astronaut.png")
 ```

- *Text-to-Image (768x768 resolution)*:
+- *Text-to-Image (768x768 resolution)*: [stabilityai/stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) with [`StableDiffusionPipeline`]

 ```python
 from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
@@ -68,7 +71,9 @@ image = pipe(prompt, guidance_scale=9, num_inference_steps=25).images[0]
 image.save("astronaut.png")
 ```

- *Image Inpainting (512x512 resolution)*:
+### Image Inpainting
+
+- *Image Inpainting (512x512 resolution)*: [stabilityai/stable-diffusion-2-inpainting](https://huggingface.co/stabilityai/stable-diffusion-2-inpainting) with [`StableDiffusionInpaintPipeline`]

 ```python
 import PIL
@@ -102,7 +107,10 @@ image = pipe(prompt=prompt, image=init_image, mask_image=mask_image, num_inferen
 image.save("yellow_cat.png")
 ```

- *Image Upscaling (x4 resolution resolution)*: [stable-diffusion-x4-upscaler](https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler) [`StableDiffusionUpscalePipeline`]
+### Super-Resolution
+
+- *Image Upscaling (x4 resolution resolution)*: [stable-diffusion-x4-upscaler](https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler) with [`StableDiffusionUpscalePipeline`]
+

 ```python
 import requests
@@ -113,7 +121,7 @@ import torch

 # load model and scheduler
 model_id = "stabilityai/stable-diffusion-x4-upscaler"
-pipeline = StableDiffusionUpscalePipeline.from_pretrained(model_id, revision="fp16", torch_dtype=torch.float16)
+pipeline = StableDiffusionUpscalePipeline.from_pretrained(model_id, torch_dtype=torch.float16)
 pipeline = pipeline.to("cuda")

 # let's download an  image
@@ -126,16 +134,10 @@ upscaled_image = pipeline(prompt=prompt, image=low_res_img).images[0]
 upscaled_image.save("upsampled_cat.png")
 ```

+### Depth-to-Image
+
 - *Depth-Guided Text-to-Image*: [stabilityai/stable-diffusion-2-depth](https://huggingface.co/stabilityai/stable-diffusion-2-depth) [`StableDiffusionDepth2ImagePipeline`]

-**Installation**
-
-```bash
-!pip install -U git+https://github.com/huggingface/transformers.git
-!pip install diffusers[torch]
-```
-
-**Example**

 ```python
 import torch
@@ -28,7 +28,7 @@ The abstract of the paper is the following:

 ## Tips

- Safe Stable Diffusion may also be used with weights of [Stable Diffusion](./api/pipelines/stable_diffusion).
+- Safe Stable Diffusion may also be used with weights of [Stable Diffusion](./api/pipelines/stable_diffusion/text2img).

 ### Run Safe Stable Diffusion

@@ -81,10 +81,10 @@ To use a different scheduler, you can either change it via the [`ConfigMixin.fro

 ## StableDiffusionSafePipelineOutput
 [[autodoc]] pipelines.stable_diffusion_safe.StableDiffusionSafePipelineOutput
+	- all
+	- __call__

 ## StableDiffusionPipelineSafe
 [[autodoc]] StableDiffusionPipelineSafe
+	- all
 	- __call__
-	- enable_attention_slicing
-	- disable_attention_slicing
-
@@ -32,4 +32,5 @@ This pipeline implements the Stochastic sampling tailored to the Variance-Expand

 ## KarrasVePipeline
 [[autodoc]] KarrasVePipeline
-    - __call__
+	- all
+	- __call__
@@ -0,0 +1,37 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# unCLIP
+
+## Overview
+
+[Hierarchical Text-Conditional Image Generation with CLIP Latents](https://arxiv.org/abs/2204.06125) by Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, Mark Chen
+
+The abstract of the paper is the following:
+
+Contrastive models like CLIP have been shown to learn robust representations of images that capture both semantics and style. To leverage these representations for image generation, we propose a two-stage model: a prior that generates a CLIP image embedding given a text caption, and a decoder that generates an image conditioned on the image embedding. We show that explicitly generating image representations improves image diversity with minimal loss in photorealism and caption similarity. Our decoders conditioned on image representations can also produce variations of an image that preserve both its semantics and style, while varying the non-essential details absent from the image representation. Moreover, the joint embedding space of CLIP enables language-guided image manipulations in a zero-shot fashion. We use diffusion models for the decoder and experiment with both autoregressive and diffusion models for the prior, finding that the latter are computationally more efficient and produce higher-quality samples.
+
+The unCLIP model in diffusers comes from kakaobrain's karlo and the original codebase can be found [here](https://github.com/kakaobrain/karlo). Additionally, lucidrains has a DALL-E 2 recreation [here](https://github.com/lucidrains/DALLE2-pytorch).
+
+## Available Pipelines:
+
+| Pipeline | Tasks | Colab
+|---|---|:---:|
+| [pipeline_unclip.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/unclip/pipeline_unclip.py) | *Text-to-Image Generation* | - |
+| [pipeline_unclip_image_variation.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py) | *Image-Guided Image Generation* | - |
+
+
+## UnCLIPPipeline
+[[autodoc]] UnCLIPPipeline
+	- all
+	- __call__
+
+[[autodoc]] UnCLIPImageVariationPipeline
+	- all
+	- __call__
@@ -20,7 +20,7 @@ The abstract of the paper is the following:

 ## Tips

- VersatileDiffusion is conceptually very similar as [Stable Diffusion](./api/pipelines/stable_diffusion),  but instead of providing just a image data stream conditioned on text, VersatileDiffusion provides both a image and text data stream and can be conditioned on both text and image.
+- VersatileDiffusion is conceptually very similar as [Stable Diffusion](./api/pipelines/stable_diffusion/overview),  but instead of providing just a image data stream conditioned on text, VersatileDiffusion provides both a image and text data stream and can be conditioned on both text and image.

 ### *Run VersatileDiffusion*

@@ -56,18 +56,15 @@ To use a different scheduler, you can either change it via the [`ConfigMixin.fro

 ## VersatileDiffusionTextToImagePipeline
 [[autodoc]] VersatileDiffusionTextToImagePipeline
+	- all
 	- __call__
-	- enable_attention_slicing
-	- disable_attention_slicing

 ## VersatileDiffusionImageVariationPipeline
 [[autodoc]] VersatileDiffusionImageVariationPipeline
+	- all
 	- __call__
-	- enable_attention_slicing
-	- disable_attention_slicing

 ## VersatileDiffusionDualGuidedPipeline
 [[autodoc]] VersatileDiffusionDualGuidedPipeline
+	- all
 	- __call__
-	- enable_attention_slicing
-	- disable_attention_slicing
@@ -30,5 +30,6 @@ The original codebase can be found [here](https://github.com/microsoft/VQ-Diffus


 ## VQDiffusionPipeline
-[[autodoc]] pipelines.vq_diffusion.pipeline_vq_diffusion.VQDiffusionPipeline
-    - __call__
+[[autodoc]] VQDiffusionPipeline
+	- all
+	- __call__
@@ -1,183 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Schedulers
-
-Diffusers contains multiple pre-built schedule functions for the diffusion process.
-
-## What is a scheduler?
-
-The schedule functions, denoted *Schedulers* in the library take in the output of a trained model, a sample which the diffusion process is iterating on, and a timestep to return a denoised sample. That's why schedulers may also be called *Samplers* in other diffusion models implementations.
-
- Schedulers define the methodology for iteratively adding noise to an image or for updating a sample based on model outputs.
-    - adding noise in different manners represent the algorithmic processes to train a diffusion model by adding noise to images.
-    - for inference, the scheduler defines how to update a sample based on an output from a pretrained model.
- Schedulers are often defined by a *noise schedule* and an *update rule* to solve the differential equation solution.
-
-### Discrete versus continuous schedulers
-
-All schedulers take in a timestep to predict the updated version of the sample being diffused.
-The timesteps dictate where in the diffusion process the step is, where data is generated by iterating forward in time and inference is executed by propagating backwards through timesteps.
-Different algorithms use timesteps that both discrete (accepting `int` inputs), such as the [`DDPMScheduler`] or [`PNDMScheduler`], and continuous (accepting `float` inputs), such as the score-based schedulers [`ScoreSdeVeScheduler`] or [`ScoreSdeVpScheduler`].
-
-## Designing Re-usable schedulers
-
-The core design principle between the schedule functions is to be model, system, and framework independent.
-This allows for rapid experimentation and cleaner abstractions in the code, where the model prediction is separated from the sample update.
-To this end, the design of schedulers is such that:
-
- Schedulers can be used interchangeably between diffusion models in inference to find the preferred trade-off between speed and generation quality.
- Schedulers are currently by default in PyTorch, but are designed to be framework independent (partial Jax support currently exists).
-
-
-## API
-
-The core API for any new scheduler must follow a limited structure.
- Schedulers should provide one or more `def step(...)` functions that should be called to update the generated sample iteratively.
- Schedulers should provide a `set_timesteps(...)` method that configures the parameters of a schedule function for a specific inference task.
- Schedulers should be framework-specific.
-
-The base class [`SchedulerMixin`] implements low level utilities used by multiple schedulers.
-
-### SchedulerMixin
-[[autodoc]] SchedulerMixin
-
-### SchedulerOutput
-The class [`SchedulerOutput`] contains the outputs from any schedulers `step(...)` call.
-
-[[autodoc]] schedulers.scheduling_utils.SchedulerOutput
-
-### Implemented Schedulers
-
-#### Denoising diffusion implicit models (DDIM)
-
-Original paper can be found here.
-
-[[autodoc]] DDIMScheduler
-
-#### Denoising diffusion probabilistic models (DDPM)
-
-Original paper can be found [here](https://arxiv.org/abs/2010.02502).
-
-[[autodoc]] DDPMScheduler
-
-#### Singlestep DPM-Solver
-
-Original paper can be found [here](https://arxiv.org/abs/2206.00927) and the [improved version](https://arxiv.org/abs/2211.01095). The original implementation can be found [here](https://github.com/LuChengTHU/dpm-solver).
-
-[[autodoc]] DPMSolverSinglestepScheduler
-
-#### Multistep DPM-Solver
-
-Original paper can be found [here](https://arxiv.org/abs/2206.00927) and the [improved version](https://arxiv.org/abs/2211.01095). The original implementation can be found [here](https://github.com/LuChengTHU/dpm-solver).
-
-[[autodoc]] DPMSolverMultistepScheduler
-
-#### Heun scheduler inspired by Karras et. al paper
-
-Algorithm 1 of [Karras et. al](https://arxiv.org/abs/2206.00364).
-Scheduler ported from @crowsonkb's https://github.com/crowsonkb/k-diffusion library:
-
-All credit for making this scheduler work goes to [Katherine Crowson](https://github.com/crowsonkb/)
-
-[[autodoc]] HeunDiscreteScheduler
-
-#### DPM Discrete Scheduler inspired by Karras et. al paper
-
-Inspired by [Karras et. al](https://arxiv.org/abs/2206.00364).
-Scheduler ported from @crowsonkb's https://github.com/crowsonkb/k-diffusion library:
-
-All credit for making this scheduler work goes to [Katherine Crowson](https://github.com/crowsonkb/)
-
-[[autodoc]] KDPM2DiscreteScheduler
-
-#### DPM Discrete Scheduler with ancestral sampling inspired by Karras et. al paper
-
-Inspired by [Karras et. al](https://arxiv.org/abs/2206.00364).
-Scheduler ported from @crowsonkb's https://github.com/crowsonkb/k-diffusion library:
-
-All credit for making this scheduler work goes to [Katherine Crowson](https://github.com/crowsonkb/)
-
-[[autodoc]] KDPM2AncestralDiscreteScheduler
-
-#### Variance exploding, stochastic sampling from Karras et. al
-
-Original paper can be found [here](https://arxiv.org/abs/2006.11239).
-
-[[autodoc]] KarrasVeScheduler
-
-#### Linear multistep scheduler for discrete beta schedules
-
-Original implementation can be found [here](https://arxiv.org/abs/2206.00364).
-
-[[autodoc]] LMSDiscreteScheduler
-
-#### Pseudo numerical methods for diffusion models (PNDM)
-
-Original implementation can be found [here](https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L181).
-
-[[autodoc]] PNDMScheduler
-
-#### variance exploding stochastic differential equation (VE-SDE) scheduler
-
-Original paper can be found [here](https://arxiv.org/abs/2011.13456).
-
-[[autodoc]] ScoreSdeVeScheduler
-
-#### improved pseudo numerical methods for diffusion models (iPNDM)
-
-Original implementation can be found [here](https://github.com/crowsonkb/v-diffusion-pytorch/blob/987f8985e38208345c1959b0ea767a625831cc9b/diffusion/sampling.py#L296).
-
-[[autodoc]] IPNDMScheduler
-
-#### variance preserving stochastic differential equation (VP-SDE) scheduler
-
-Original paper can be found [here](https://arxiv.org/abs/2011.13456).
-
-<Tip warning={true}>
-
-Score SDE-VP is under construction.
-
-</Tip>
-
-[[autodoc]] schedulers.scheduling_sde_vp.ScoreSdeVpScheduler
-
-#### Euler scheduler
-
-Euler scheduler (Algorithm 2) from the paper [Elucidating the Design Space of Diffusion-Based Generative Models](https://arxiv.org/abs/2206.00364) by Karras et al. (2022). Based on the original [k-diffusion](https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L51) implementation by Katherine Crowson.
-Fast scheduler which often times generates good outputs with 20-30 steps.
-
-[[autodoc]] EulerDiscreteScheduler
-
-
-#### Euler Ancestral scheduler
-
-Ancestral sampling with Euler method steps. Based on the original (k-diffusion)[https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L72] implementation by Katherine Crowson.
-Fast scheduler which often times generates good outputs with 20-30 steps.
-
-[[autodoc]] EulerAncestralDiscreteScheduler
-
-
-#### VQDiffusionScheduler
-
-Original paper can be found [here](https://arxiv.org/abs/2111.14822)
-
-[[autodoc]] VQDiffusionScheduler
-
-#### RePaint scheduler
-
-DDPM-based inpainting scheduler for unsupervised inpainting with extreme masks. 
-Intended for use with [`RePaintPipeline`].
-Based on the paper [RePaint: Inpainting using Denoising Diffusion Probabilistic Models](https://arxiv.org/abs/2201.09865) 
-and the original implementation by Andreas Lugmayr et al.: https://github.com/andreas128/RePaint
-
-[[autodoc]] RePaintScheduler
@@ -0,0 +1,27 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Denoising diffusion implicit models (DDIM)
+
+## Overview
+
+[Denoising Diffusion Implicit Models](https://arxiv.org/abs/2010.02502) (DDIM) by Jiaming Song, Chenlin Meng and Stefano Ermon.
+
+The abstract of the paper is the following:
+
+Denoising diffusion probabilistic models (DDPMs) have achieved high quality image generation without adversarial training, yet they require simulating a Markov chain for many steps to produce a sample. To accelerate sampling, we present denoising diffusion implicit models (DDIMs), a more efficient class of iterative implicit probabilistic models with the same training procedure as DDPMs. In DDPMs, the generative process is defined as the reverse of a Markovian diffusion process. We construct a class of non-Markovian diffusion processes that lead to the same training objective, but whose reverse process can be much faster to sample from. We empirically demonstrate that DDIMs can produce high quality samples 10× to 50× faster in terms of wall-clock time compared to DDPMs, allow us to trade off computation for sample quality, and can perform semantically meaningful image interpolation directly in the latent space.
+
+The original codebase of this paper can be found here: [ermongroup/ddim](https://github.com/ermongroup/ddim).
+For questions, feel free to contact the author on [tsong.me](https://tsong.me/).
+
+## DDIMScheduler
+[[autodoc]] DDIMScheduler
@@ -0,0 +1,27 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Denoising diffusion probabilistic models (DDPM)
+
+## Overview
+
+[Denoising Diffusion Probabilistic Models](https://arxiv.org/abs/2006.11239) 
+ (DDPM) by Jonathan Ho, Ajay Jain and Pieter Abbeel proposes the diffusion based model of the same name, but in the context of the 🤗 Diffusers library, DDPM refers to the discrete denoising scheduler from the paper as well as the pipeline.
+
+The abstract of the paper is the following:
+
+We present high quality image synthesis results using diffusion probabilistic models, a class of latent variable models inspired by considerations from nonequilibrium thermodynamics. Our best results are obtained by training on a weighted variational bound designed according to a novel connection between diffusion probabilistic models and denoising score matching with Langevin dynamics, and our models naturally admit a progressive lossy decompression scheme that can be interpreted as a generalization of autoregressive decoding. On the unconditional CIFAR10 dataset, we obtain an Inception score of 9.46 and a state-of-the-art FID score of 3.17. On 256x256 LSUN, we obtain sample quality similar to ProgressiveGAN.
+
+The original paper can be found [here](https://arxiv.org/abs/2010.02502).
+
+## DDPMScheduler
+[[autodoc]] DDPMScheduler
@@ -0,0 +1,22 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# DPM Discrete Scheduler inspired by Karras et. al paper
+
+## Overview
+
+Inspired by [Karras et. al](https://arxiv.org/abs/2206.00364). Scheduler ported from @crowsonkb's https://github.com/crowsonkb/k-diffusion library:
+
+All credit for making this scheduler work goes to [Katherine Crowson](https://github.com/crowsonkb/)
+
+## KDPM2DiscreteScheduler
+[[autodoc]] KDPM2DiscreteScheduler
@@ -0,0 +1,22 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# DPM Discrete Scheduler with ancestral sampling inspired by Karras et. al paper
+
+## Overview
+
+Inspired by [Karras et. al](https://arxiv.org/abs/2206.00364). Scheduler ported from @crowsonkb's https://github.com/crowsonkb/k-diffusion library:
+
+All credit for making this scheduler work goes to [Katherine Crowson](https://github.com/crowsonkb/)
+
+## KDPM2AncestralDiscreteScheduler
+[[autodoc]] KDPM2AncestralDiscreteScheduler
@@ -0,0 +1,21 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Euler scheduler
+
+## Overview
+
+Euler scheduler (Algorithm 2) from the paper [Elucidating the Design Space of Diffusion-Based Generative Models](https://arxiv.org/abs/2206.00364) by Karras et al. (2022). Based on the original [k-diffusion](https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L51) implementation by Katherine Crowson.
+Fast scheduler which often times generates good outputs with 20-30 steps.
+
+## EulerDiscreteScheduler
+[[autodoc]] EulerDiscreteScheduler
@@ -0,0 +1,21 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Euler Ancestral scheduler
+
+## Overview
+
+Ancestral sampling with Euler method steps. Based on the original (k-diffusion)[https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L72] implementation by Katherine Crowson.
+Fast scheduler which often times generates good outputs with 20-30 steps.
+
+## EulerAncestralDiscreteScheduler
+[[autodoc]] EulerAncestralDiscreteScheduler
@@ -0,0 +1,23 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Heun scheduler inspired by Karras et. al paper
+
+## Overview
+
+Algorithm 1 of [Karras et. al](https://arxiv.org/abs/2206.00364).
+Scheduler ported from @crowsonkb's https://github.com/crowsonkb/k-diffusion library:
+
+All credit for making this scheduler work goes to [Katherine Crowson](https://github.com/crowsonkb/)
+
+## HeunDiscreteScheduler
+[[autodoc]] HeunDiscreteScheduler
@@ -0,0 +1,20 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# improved pseudo numerical methods for diffusion models (iPNDM)
+
+## Overview
+
+Original implementation can be found [here](https://github.com/crowsonkb/v-diffusion-pytorch/blob/987f8985e38208345c1959b0ea767a625831cc9b/diffusion/sampling.py#L296).
+
+## IPNDMScheduler
+[[autodoc]] IPNDMScheduler
@@ -0,0 +1,20 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Linear multistep scheduler for discrete beta schedules
+
+## Overview
+
+Original implementation can be found [here](https://arxiv.org/abs/2206.00364).
+
+## LMSDiscreteScheduler
+[[autodoc]] LMSDiscreteScheduler
@@ -0,0 +1,20 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Multistep DPM-Solver
+
+## Overview
+
+Original paper can be found [here](https://arxiv.org/abs/2206.00927) and the [improved version](https://arxiv.org/abs/2211.01095). The original implementation can be found [here](https://github.com/LuChengTHU/dpm-solver).
+
+## DPMSolverMultistepScheduler
+[[autodoc]] DPMSolverMultistepScheduler
@@ -0,0 +1,83 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Schedulers
+
+Diffusers contains multiple pre-built schedule functions for the diffusion process.
+
+## What is a scheduler?
+
+The schedule functions, denoted *Schedulers* in the library take in the output of a trained model, a sample which the diffusion process is iterating on, and a timestep to return a denoised sample. That's why schedulers may also be called *Samplers* in other diffusion models implementations.
+
+- Schedulers define the methodology for iteratively adding noise to an image or for updating a sample based on model outputs.
+    - adding noise in different manners represent the algorithmic processes to train a diffusion model by adding noise to images.
+    - for inference, the scheduler defines how to update a sample based on an output from a pretrained model.
+- Schedulers are often defined by a *noise schedule* and an *update rule* to solve the differential equation solution.
+
+### Discrete versus continuous schedulers
+
+All schedulers take in a timestep to predict the updated version of the sample being diffused.
+The timesteps dictate where in the diffusion process the step is, where data is generated by iterating forward in time and inference is executed by propagating backwards through timesteps.
+Different algorithms use timesteps that can be discrete (accepting `int` inputs), such as the [`DDPMScheduler`] or [`PNDMScheduler`], or continuous (accepting `float` inputs), such as the score-based schedulers [`ScoreSdeVeScheduler`] or [`ScoreSdeVpScheduler`].
+
+## Designing Re-usable schedulers
+
+The core design principle between the schedule functions is to be model, system, and framework independent.
+This allows for rapid experimentation and cleaner abstractions in the code, where the model prediction is separated from the sample update.
+To this end, the design of schedulers is such that:
+
+- Schedulers can be used interchangeably between diffusion models in inference to find the preferred trade-off between speed and generation quality.
+- Schedulers are currently by default in PyTorch, but are designed to be framework independent (partial Jax support currently exists).
+
+## Schedulers Summary
+
+The following table summarizes all officially supported schedulers, their corresponding paper
+
+
+| Scheduler | Paper |
+|---|---|
+| [ddim](./ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) |
+| [ddpm](./ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) |
+| [singlestep_dpm_solver](./singlestep_dpm_solver) | [**Singlestep DPM-Solver**](https://arxiv.org/abs/2206.00927) |
+| [multistep_dpm_solver](./multistep_dpm_solver) | [**Multistep DPM-Solver**](https://arxiv.org/abs/2206.00927) |
+| [heun](./heun) | [**Heun scheduler inspired by Karras et. al paper**](https://arxiv.org/abs/2206.00364) |
+| [dpm_discrete](./dpm_discrete) | [**DPM Discrete Scheduler inspired by Karras et. al paper**](https://arxiv.org/abs/2206.00364) |
+| [dpm_discrete_ancestral](./dpm_discrete_ancestral) | [**DPM Discrete Scheduler with ancestral sampling inspired by Karras et. al paper**](https://arxiv.org/abs/2206.00364) |
+| [stochastic_karras_ve](./stochastic_karras_ve) | [**Variance exploding, stochastic sampling from Karras et. al**](https://arxiv.org/abs/2206.00364) |
+| [lms_discrete](./lms_discrete) | [**Linear multistep scheduler for discrete beta schedules**](https://arxiv.org/abs/2206.00364) |
+| [pndm](./pndm) | [**Pseudo numerical methods for diffusion models (PNDM)**](https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L181) |
+| [score_sde_ve](./score_sde_ve) | [**variance exploding stochastic differential equation (VE-SDE) scheduler**](https://arxiv.org/abs/2011.13456) |
+| [ipndm](./ipndm) | [**improved pseudo numerical methods for diffusion models (iPNDM)**](https://github.com/crowsonkb/v-diffusion-pytorch/blob/987f8985e38208345c1959b0ea767a625831cc9b/diffusion/sampling.py#L296) |
+| [score_sde_vp](./score_sde_vp) | [**Variance preserving stochastic differential equation (VP-SDE) scheduler**](https://arxiv.org/abs/2011.13456) |
+| [euler](./euler) | [**Euler scheduler**](https://arxiv.org/abs/2206.00364) |
+| [euler_ancestral](./euler_ancestral) | [**Euler Ancestral scheduler**](https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L72) |
+| [vq_diffusion](./vq_diffusion) | [**VQDiffusionScheduler**](https://arxiv.org/abs/2111.14822) |
+| [repaint](./repaint) | [**RePaint scheduler**](https://arxiv.org/abs/2201.09865) |
+
+## API
+
+The core API for any new scheduler must follow a limited structure.
+- Schedulers should provide one or more `def step(...)` functions that should be called to update the generated sample iteratively.
+- Schedulers should provide a `set_timesteps(...)` method that configures the parameters of a schedule function for a specific inference task.
+- Schedulers should be framework-specific.
+
+The base class [`SchedulerMixin`] implements low level utilities used by multiple schedulers.
+
+### SchedulerMixin
+[[autodoc]] SchedulerMixin
+
+### SchedulerOutput
+The class [`SchedulerOutput`] contains the outputs from any schedulers `step(...)` call.
+
+[[autodoc]] schedulers.scheduling_utils.SchedulerOutput
+
+
@@ -0,0 +1,20 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Pseudo numerical methods for diffusion models (PNDM)
+
+## Overview
+
+Original implementation can be found [here](https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L181).
+
+## PNDMScheduler
+[[autodoc]] PNDMScheduler
@@ -0,0 +1,23 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# RePaint scheduler
+
+## Overview
+
+DDPM-based inpainting scheduler for unsupervised inpainting with extreme masks. 
+Intended for use with [`RePaintPipeline`].
+Based on the paper [RePaint: Inpainting using Denoising Diffusion Probabilistic Models](https://arxiv.org/abs/2201.09865) 
+and the original implementation by Andreas Lugmayr et al.: https://github.com/andreas128/RePaint
+
+## RePaintScheduler
+[[autodoc]] RePaintScheduler
@@ -0,0 +1,20 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# variance exploding stochastic differential equation (VE-SDE) scheduler
+
+## Overview
+
+Original paper can be found [here](https://arxiv.org/abs/2011.13456).
+
+## ScoreSdeVeScheduler
+[[autodoc]] ScoreSdeVeScheduler
@@ -0,0 +1,26 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Variance preserving stochastic differential equation (VP-SDE) scheduler
+
+## Overview
+
+Original paper can be found [here](https://arxiv.org/abs/2011.13456).
+
+<Tip warning={true}>
+
+Score SDE-VP is under construction.
+
+</Tip>
+
+## ScoreSdeVpScheduler
+[[autodoc]] schedulers.scheduling_sde_vp.ScoreSdeVpScheduler
@@ -0,0 +1,20 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Singlestep DPM-Solver
+
+## Overview
+
+Original paper can be found [here](https://arxiv.org/abs/2206.00927) and the [improved version](https://arxiv.org/abs/2211.01095). The original implementation can be found [here](https://github.com/LuChengTHU/dpm-solver).
+
+## DPMSolverSinglestepScheduler
+[[autodoc]] DPMSolverSinglestepScheduler
@@ -0,0 +1,20 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Variance exploding, stochastic sampling from Karras et. al
+
+## Overview
+
+Original paper can be found [here](https://arxiv.org/abs/2206.00364).
+
+## KarrasVeScheduler
+[[autodoc]] KarrasVeScheduler
@@ -0,0 +1,20 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# VQDiffusionScheduler
+
+## Overview
+
+Original paper can be found [here](https://arxiv.org/abs/2111.14822)
+
+## VQDiffusionScheduler
+[[autodoc]] VQDiffusionScheduler
@@ -23,7 +23,7 @@ specific language governing permissions and limitations under the License.
 More precisely, 🤗 Diffusers offers:

 - State-of-the-art diffusion pipelines that can be run in inference with just a couple of lines of code (see [**Using Diffusers**](./using-diffusers/conditional_image_generation)) or have a look at [**Pipelines**](#pipelines) to get an overview of all supported pipelines and their corresponding papers.
- Various noise schedulers that can be used interchangeably for the preferred speed vs. quality trade-off in inference. For more information see [**Schedulers**](./api/schedulers).
+- Various noise schedulers that can be used interchangeably for the preferred speed vs. quality trade-off in inference. For more information see [**Schedulers**](./api/schedulers/overview).
 - Multiple types of models, such as UNet, can be used as building blocks in an end-to-end diffusion system. See [**Models**](./api/models) for more details 
 - Training examples to show how to train the most popular diffusion model tasks. For more information see [**Training**](./training/overview).

@@ -47,14 +47,15 @@ available a colab notebook to directly try them out.
 | [pndm](./api/pipelines/pndm) | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778) | Unconditional Image Generation | 
 | [score_sde_ve](./api/pipelines/score_sde_ve) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation | 
 | [score_sde_vp](./api/pipelines/score_sde_vp) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation | 
-| [stable_diffusion](./api/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-to-Image Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
-| [stable_diffusion](./api/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Image-to-Image Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
-| [stable_diffusion](./api/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
+| [stable_diffusion](./api/pipelines/stable_diffusion/text2img) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-to-Image Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
+| [stable_diffusion](./api/pipelines/stable_diffusion/img2img) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Image-to-Image Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
+| [stable_diffusion](./api/pipelines/stable_diffusion/inpaint) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
 | [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-to-Image Generation | 
 | [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Image Inpainting | 
 | [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Super Resolution Image-to-Image |
 | [stable_diffusion_safe](./api/pipelines/stable_diffusion_safe) | [**Safe Stable Diffusion**](https://arxiv.org/abs/2211.05105) | Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ml-research/safe-latent-diffusion/blob/main/examples/Safe%20Latent%20Diffusion.ipynb)
 | [stochastic_karras_ve](./api/pipelines/stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation | 
+| [unclip](./api/pipelines/unclip) | [Hierarchical Text-Conditional Image Generation with CLIP Latents](https://arxiv.org/abs/2204.06125) | Text-to-Image Generation |
 | [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Text-to-Image Generation | 
 | [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Image Variations Generation | 
 | [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Dual Image and Text Guided Generation | 
@@ -12,7 +12,9 @@ specific language governing permissions and limitations under the License.

 # Memory and speed

-We present some techniques and ideas to optimize 🤗 Diffusers _inference_ for memory or speed.
+We present some techniques and ideas to optimize 🤗 Diffusers _inference_ for memory or speed. As a general rule, we recommend the use of [xFormers](https://github.com/facebookresearch/xformers) for memory efficient attention, please see the recommended [installation instructions](xformers).
+
+We'll discuss how the following settings impact performance and memory.

 |                  | Latency | Speedup |
 | ---------------- | ------- | ------- |
@@ -77,7 +79,7 @@ To save more GPU memory and get even more speed, you can load and run the model
 ```Python
 pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
-    revision="fp16",
+    
    torch_dtype=torch.float16,
 )
 pipe = pipe.to("cuda")
@@ -105,7 +107,7 @@ from diffusers import StableDiffusionPipeline

 pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
-    revision="fp16",
+    
    torch_dtype=torch.float16,
 )
 pipe = pipe.to("cuda")
@@ -132,7 +134,7 @@ from diffusers import StableDiffusionPipeline

 pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
-    revision="fp16",
+    
    torch_dtype=torch.float16,
 )
 pipe = pipe.to("cuda")
@@ -157,7 +159,7 @@ from diffusers import StableDiffusionPipeline

 pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
-    revision="fp16",
+    
    torch_dtype=torch.float16,
 )
 pipe = pipe.to("cuda")
@@ -177,7 +179,7 @@ from diffusers import StableDiffusionPipeline

 pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
-    revision="fp16",
+    
    torch_dtype=torch.float16,
 )
 pipe = pipe.to("cuda")
@@ -232,7 +234,6 @@ def generate_inputs():

 pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
-    revision="fp16",
    torch_dtype=torch.float16,
 ).to("cuda")
 unet = pipe.unet
@@ -296,7 +297,6 @@ class UNet2DConditionOutput:

 pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
-    revision="fp16",
    torch_dtype=torch.float16,
 ).to("cuda")

@@ -322,7 +322,9 @@ with torch.inference_mode():


 ## Memory Efficient Attention
-Recent work on optimizing the bandwitdh in the attention block have generated huge speed ups and gains in GPU memory usage. The most recent being Flash Attention (from @tridao, [code](https://github.com/HazyResearch/flash-attention), [paper](https://arxiv.org/pdf/2205.14135.pdf)) .
+
+Recent work on optimizing the bandwitdh in the attention block has generated huge speed ups and gains in GPU memory usage. The most recent being Flash Attention from @tridao: [code](https://github.com/HazyResearch/flash-attention), [paper](https://arxiv.org/pdf/2205.14135.pdf).
+
 Here are the speedups we obtain on a few Nvidia GPUs when running the inference at 512x512 with a batch size of 1 (one prompt):

 | GPU              	| Base Attention FP16 	| Memory Efficient Attention FP16 	|
@@ -338,14 +340,13 @@ Here are the speedups we obtain on a few Nvidia GPUs when running the inference
 To leverage it just make sure you have: 
 - PyTorch > 1.12
 - Cuda available
- - Installed the [xformers](https://github.com/facebookresearch/xformers) library
+ - [Installed the xformers library](xformers).
 ```python
 from diffusers import StableDiffusionPipeline
 import torch

 pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
-    revision="fp16",
    torch_dtype=torch.float16,
 ).to("cuda")

@@ -0,0 +1,26 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Installing xFormers
+
+We recommend the use of [xFormers](https://github.com/facebookresearch/xformers) for both inference and training. In our tests, the optimizations performed in the attention blocks allow for both faster speed and reduced memory consumption.
+
+Installing xFormers has historically been a bit involved, as binary distributions were not always up to date. Fortunately, the project has [very recently](https://github.com/facebookresearch/xformers/pull/591) integrated a process to build pip wheels as part of the project's continuous integration, so this should improve a lot starting from xFormers version 0.0.16.
+
+Until xFormers 0.0.16 is deployed, you can install pip wheels using [`TestPyPI`](https://test.pypi.org/project/formers/). These are the steps that worked for us in a Linux computer to install xFormers version 0.0.15:
+
+```bash
+pip install pyre-extensions==0.0.23
+pip install -i https://test.pypi.org/simple/ formers==0.0.15.dev376
+```
+
+We'll update these instructions when the wheels are published to the official PyPI repository.
@@ -22,7 +22,7 @@ pip install --upgrade diffusers accelerate transformers
 ```

 - [`accelerate`](https://huggingface.co/docs/accelerate/index) speeds up model loading for inference and training
- [`transformers`](https://huggingface.co/docs/transformers/index) is required to run the most popular diffusion models, such as [Stable Diffusion](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion)
+- [`transformers`](https://huggingface.co/docs/transformers/index) is required to run the most popular diffusion models, such as [Stable Diffusion](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/overview)

 ## DiffusionPipeline

@@ -97,7 +97,7 @@ Running the pipeline is then identical to the code above as it's the same model
 >>> image.save("image_of_squirrel_painting.png")
 ```

-Diffusion systems can be used with multiple different [schedulers](./api/schedulers) each with their
+Diffusion systems can be used with multiple different [schedulers](./api/schedulers/overview) each with their
 pros and cons. By default, Stable Diffusion runs with [`PNDMScheduler`], but it's very simple to 
 use a different scheduler. *E.g.* if you would instead like to use the [`EulerDiscreteScheduler`] scheduler,
 you could use it as follows:
@@ -21,8 +21,6 @@ The [Dreambooth training script](https://github.com/huggingface/diffusers/tree/m

 <Tip warning={true}>

-<!-- TODO: replace with our blog when it's done -->
-
 Dreambooth fine-tuning is very sensitive to hyperparameters and easy to overfit. We recommend you take a look at our [in-depth analysis](https://huggingface.co/blog/dreambooth) with recommended settings for different subjects, and go from there.

 </Tip>
@@ -38,23 +36,17 @@ pip install git+https://github.com/huggingface/diffusers
 pip install -U -r diffusers/examples/dreambooth/requirements.txt
 ```

-Then initialize and configure a [🤗 Accelerate](https://github.com/huggingface/accelerate/) environment with:
+xFormers is not part of the training requirements, but [we recommend you install it if you can](../optimization/xformers). It could make your training faster and less memory intensive.
+
+After all dependencies have been set up you can configure a [🤗 Accelerate](https://github.com/huggingface/accelerate/) environment with:

 ```bash
 accelerate config
 ```

-You need to accept the model license before downloading or using the weights. In this example we'll use model version `v1-4`, so you'll need to visit [its card](https://huggingface.co/CompVis/stable-diffusion-v1-4), read the license and tick the checkbox if you agree. 
+In this example we'll use model version `v1-4`, so please visit [its card](https://huggingface.co/CompVis/stable-diffusion-v1-4) and carefully read the license before proceeding. 

-You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens).
-
-Run the following command to authenticate your token
-
-```bash
-huggingface-cli login
-```
-
-If you have already cloned the repo, then you won't need to go through these steps. Instead, you can pass the path to your local checkout to the training script and it will be loaded from there.
+The command below will download and cache the model weights from the Hub because we use the model's Hub id `CompVis/stable-diffusion-v1-4`. You may also clone the repo locally and use the local path in your system where the checkout was saved.

 ### Dog toy example

@@ -111,6 +103,59 @@ accelerate launch train_dreambooth.py \
  --max_train_steps=800
 ```

+### Saving checkpoints while training
+
+It's easy to overfit while training with Dreambooth, so sometimes it's useful to save regular checkpoints during the process. One of the intermediate checkpoints might work better than the final model! To use this feature you need to pass the following argument to the training script:
+
+```bash
+  --checkpointing_steps=500
+```
+
+This will save the full training state in subfolders of your `output_dir`. Subfolder names begin with the prefix `checkpoint-`, and then the number of steps performed so far; for example: `checkpoint-1500` would be a checkpoint saved after 1500 training steps.
+
+#### Resuming training from a saved checkpoint
+
+If you want to resume training from any of the saved checkpoints, you can pass the argument `--resume_from_checkpoint` and then indicate the name of the checkpoint you want to use. You can also use the special string `"latest"` to resume from the last checkpoint saved (i.e., the one with the largest number of steps). For example, the following would resume training from the checkpoint saved after 1500 steps:
+
+```bash
+  --resume_from_checkpoint="checkpoint-1500"
+```
+
+This would be a good opportunity to tweak some of your hyperparameters if you wish.
+
+#### Performing inference using a saved checkpoint
+
+Saved checkpoints are stored in a format suitable for resuming training. They not only include the model weights, but also the state of the optimizer, data loaders and learning rate.
+
+You can use a checkpoint for inference, but first you need to convert it to an inference pipeline. This is how you could do it:
+
+```python
+from accelerate import Accelerator
+from diffusers import DiffusionPipeline
+
+# Load the pipeline with the same arguments (model, revision) that were used for training
+model_id = "CompVis/stable-diffusion-v1-4"
+pipeline = DiffusionPipeline.from_pretrained(model_id)
+
+accelerator = Accelerator()
+
+# Use text_encoder if `--train_text_encoder` was used for the initial training
+unet, text_encoder = accelerator.prepare(pipeline.unet, pipeline.text_encoder)
+
+# Restore state from a checkpoint path. You have to use the absolute path here.
+accelerator.load_state("/sddata/dreambooth/daruma-v2-1/checkpoint-100")
+
+# Rebuild the pipeline with the unwrapped models (assignment to .unet and .text_encoder should work too)
+pipeline = DiffusionPipeline.from_pretrained(
+    model_id,
+    unet=accelerator.unwrap_model(unet),
+    text_encoder=accelerator.unwrap_model(text_encoder),
+)
+
+# Perform inference, or save, or push to the hub
+pipeline.save_pretrained("dreambooth-pipeline")
+```
+
 ### Training on a 16GB GPU

 With the help of gradient checkpointing and the 8-bit optimizer from [bitsandbytes](https://github.com/TimDettmers/bitsandbytes), it's possible to train dreambooth on a 16GB GPU.
@@ -238,3 +283,5 @@ image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]

 image.save("dog-bucket.png")
 ```
+
+You may also run inference from [any of the saved training checkpoints](#performing-inference-using-a-saved-checkpoint).
@@ -38,6 +38,7 @@ Training examples show how to pretrain or fine-tune diffusion models for a varie
 - [Text Inversion](./text_inversion)
 - [Dreambooth](./dreambooth)

+If possible, please [install xFormers](../optimization/xformers) for memory efficient attention. This could help make your training faster and less memory intensive.

 | Task | 🤗 Accelerate | 🤗 Datasets | Colab
 |---|---|:---:|:---:|
@@ -58,7 +58,6 @@ guided_pipeline = DiffusionPipeline.from_pretrained(
    custom_pipeline="clip_guided_stable_diffusion",
    clip_model=clip_model,
    feature_extractor=feature_extractor,
-    revision="fp16",
    torch_dtype=torch.float16,
 )
 guided_pipeline.enable_attention_slicing()
@@ -113,7 +112,6 @@ import torch

 pipe = DiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
-    revision="fp16",
    torch_dtype=torch.float16,
    safety_checker=None,  # Very important for videos...lots of false positives while interpolating
    custom_pipeline="interpolate_stable_diffusion",
@@ -159,7 +157,6 @@ pipe = DiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    custom_pipeline="stable_diffusion_mega",
    torch_dtype=torch.float16,
-    revision="fp16",
 )
 pipe.to("cuda")
 pipe.enable_attention_slicing()
@@ -204,7 +201,7 @@ from diffusers import DiffusionPipeline
 import torch

 pipe = DiffusionPipeline.from_pretrained(
-    "hakurei/waifu-diffusion", custom_pipeline="lpw_stable_diffusion", revision="fp16", torch_dtype=torch.float16
+    "hakurei/waifu-diffusion", custom_pipeline="lpw_stable_diffusion", torch_dtype=torch.float16
 )
 pipe = pipe.to("cuda")

@@ -268,7 +265,7 @@ diffuser_pipeline = DiffusionPipeline.from_pretrained(
    custom_pipeline="speech_to_image_diffusion",
    speech_model=model,
    speech_processor=processor,
-    revision="fp16",
+    
    torch_dtype=torch.float16,
 )

@@ -24,9 +24,9 @@ from diffusers import StableDiffusionImg2ImgPipeline

 # load the pipeline
 device = "cuda"
-pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
-    "runwayml/stable-diffusion-v1-5", revision="fp16", torch_dtype=torch.float16
-).to(device)
+pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to(
+    device
+)

 # let's download an initial image
 url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
@@ -42,7 +42,6 @@ mask_image = download_image(mask_url).resize((512, 512))

 pipe = StableDiffusionInpaintPipeline.from_pretrained(
    "runwayml/stable-diffusion-inpainting",
-    revision="fp16",
    torch_dtype=torch.float16,
 )
 pipe = pipe.to("cuda")
@@ -0,0 +1,73 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Re-using seeds for fast prompt engineering
+
+A common use case when generating images is to generate a batch of images, select one image and improve it with a better, more detailed prompt in a second run.
+To do this, one needs to make each generated image of the batch deterministic.
+Images are generated by denoising gaussian random noise which can be instantiated by passing a [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html#generator).
+
+Now, for batched generation, we need to make sure that every single generated image in the batch is tied exactly to one seed. In 🧨 Diffusers, this can be achieved by not passing one `generator`, but a list 
+of `generators` to the pipeline.
+
+Let's go through an example using [`runwayml/stable-diffusion-v1-5`](runwayml/stable-diffusion-v1-5).
+We want to generate several versions of the prompt:
+
+```py
+prompt = "Labrador in the style of Vermeer"
+```
+
+Let's load the pipeline
+
+```python
+>>> from diffusers import DiffusionPipeline
+
+>>> pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
+>>> pipe = pipe.to("cuda")
+```
+
+Now, let's define 4 different generators, since we would like to reproduce a certain image. We'll use seeds `0` to `3` to create our generators.
+
+```python
+>>> import torch
+
+>>> generator = [torch.Generator(device="cuda").manual_seed(i) for i in range(4)]
+```
+
+Let's generate 4 images:
+
+```python
+>>> images = pipe(prompt, generator=generator, num_images_per_prompt=4).images
+>>> images
+```
+
+![img](https://huggingface.co/datasets/diffusers/diffusers-images-docs/resolve/main/reusabe_seeds.jpg)
+
+Ok, the last images has some double eyes, but the first image looks good! 
+Let's try to make the prompt a bit better **while keeping the first seed** 
+so that the images are similar to the first image.
+
+```python
+prompt = [prompt + t for t in [", highly realistic", ", artsy", ", trending", ", colorful"]]
+generator = [torch.Generator(device="cuda").manual_seed(0) for i in range(4)]
+```
+
+We create 4 generators with seed `0`, which is the first seed we used before.
+
+Let's run the pipeline again.
+
+```python
+>>> images = pipe(prompt, generator=generator).images
+>>> images
+```
+
+![img](https://huggingface.co/datasets/diffusers/diffusers-images-docs/resolve/main/reusabe_seeds_2.jpg)
@@ -13,7 +13,7 @@ specific language governing permissions and limitations under the License.
 # Schedulers

 Diffusion pipelines are inherently a collection of diffusion models and schedulers that are partly independent from each other. This means that one is able to switch out parts of the pipeline to better customize 
-a pipeline to one's use case. The best example of this are the [Schedulers](../api/schedulers.mdx).
+a pipeline to one's use case. The best example of this are the [Schedulers](../api/schedulers/overview.mdx).

 Whereas diffusion models usually simply define the forward pass from noise to a less noisy sample, 
 schedulers define the whole denoising process, *i.e.*:
@@ -52,6 +52,10 @@ For such examples, we are more lenient regarding the philosophy defined above an
 Examples that are useful for the community, but are either not yet deemed popular or not yet following our above philosophy should go into the [community examples](https://github.com/huggingface/diffusers/tree/main/examples/community) folder. The community folder therefore includes training examples and inference pipelines.
 **Note**: Community examples can be a [great first contribution](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22) to show to the community how you like to use `diffusers` 🪄.

+## Research Projects
+
+We also provide **research_projects** examples that are maintained by the community as defined in the respective research project folders. These examples are useful and offer the extended capabilities which are complementary to the official examples. You may refer to [research_projects](https://github.com/huggingface/diffusers/tree/main/examples/research_projects) for details.
+
 ## Important note

 To make sure you can successfully run the latest versions of the example scripts, you have to **install the library from source** and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
@@ -25,6 +25,7 @@ If a community doesn't work as expected, please open an issue and ping the autho
 | K-Diffusion Stable Diffusion | Run Stable Diffusion with any of [K-Diffusion's samplers](https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/sampling.py) | [Stable Diffusion with K Diffusion](#stable-diffusion-with-k-diffusion) | -  | [Patrick von Platen](https://github.com/patrickvonplaten/) |
 | Checkpoint Merger Pipeline | Diffusion Pipeline that enables merging of saved model checkpoints | [Checkpoint Merger Pipeline](#checkpoint-merger-pipeline)                   | -                                                                                                                                                                                                                  | [Naga Sai Abhinay Devarinti](https://github.com/Abhinay1997/) | 
 Stable Diffusion v1.1-1.4 Comparison | Run all 4 model checkpoints for Stable Diffusion and compare their results together | [Stable Diffusion Comparison](#stable-diffusion-comparisons) | - | [Suvaditya Mukherjee](https://github.com/suvadityamuk) |
+MagicMix | Diffusion Pipeline for semantic mixing of an image and a text prompt | [MagicMix](#magic-mix) | - | [Partho Das](https://github.com/daspartho) |



@@ -57,7 +58,7 @@ guided_pipeline = DiffusionPipeline.from_pretrained(
    custom_pipeline="clip_guided_stable_diffusion",
    clip_model=clip_model,
    feature_extractor=feature_extractor,
-    revision="fp16",
+    
    torch_dtype=torch.float16,
 )
 guided_pipeline.enable_attention_slicing()
@@ -208,7 +209,7 @@ import torch
 pipe = DiffusionPipeline.from_pretrained(
    'hakurei/waifu-diffusion',
    custom_pipeline="lpw_stable_diffusion",
-    revision="fp16",
+    
    torch_dtype=torch.float16
 )
 pipe=pipe.to("cuda")
@@ -275,7 +276,7 @@ diffuser_pipeline = DiffusionPipeline.from_pretrained(
    custom_pipeline="speech_to_image_diffusion",
    speech_model=model,
    speech_processor=processor,
-    revision="fp16",
+    
    torch_dtype=torch.float16,
 )

@@ -333,7 +334,7 @@ import torch
 pipe = DiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    custom_pipeline="wildcard_stable_diffusion",
-    revision="fp16",
+    
    torch_dtype=torch.float16,
 )
 prompt = "__animal__ sitting on a __object__ wearing a __clothing__"
@@ -355,43 +356,45 @@ out = pipe(
 import torch as th
 import numpy as np
 import torchvision.utils as tvu
+
 from diffusers import DiffusionPipeline

+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--prompt", type=str, default="mystical trees | A magical pond | dark",
+                    help="use '|' as the delimiter to compose separate sentences.")
+parser.add_argument("--steps", type=int, default=50)
+parser.add_argument("--scale", type=float, default=7.5)
+parser.add_argument("--weights", type=str, default="7.5 | 7.5 | -7.5")
+parser.add_argument("--seed", type=int, default=2)
+parser.add_argument("--model_path", type=str, default="CompVis/stable-diffusion-v1-4")
+parser.add_argument("--num_images", type=int, default=1)
+args = parser.parse_args()
+
 has_cuda = th.cuda.is_available()
 device = th.device('cpu' if not has_cuda else 'cuda')

+prompt = args.prompt
+scale = args.scale
+steps = args.steps
+
 pipe = DiffusionPipeline.from_pretrained(
-    "CompVis/stable-diffusion-v1-4",
-    use_auth_token=True,
+    args.model_path,
    custom_pipeline="composable_stable_diffusion",
 ).to(device)

-
-def dummy(images, **kwargs):
-    return images, False
-
-pipe.safety_checker = dummy
+pipe.safety_checker = None

 images = []
-generator = torch.Generator("cuda").manual_seed(0)
+generator = th.Generator("cuda").manual_seed(args.seed)
+for i in range(args.num_images):
+    image = pipe(prompt, guidance_scale=scale, num_inference_steps=steps,
+                 weights=args.weights, generator=generator).images[0]
+    images.append(th.from_numpy(np.array(image)).permute(2, 0, 1) / 255.)
+grid = tvu.make_grid(th.stack(images, dim=0), nrow=4, padding=0)
+tvu.save_image(grid, f'{prompt}_{args.weights}' + '.png')

-seed = 0
-prompt = "a forest | a camel"
-weights = " 1 | 1"  # Equal weight to each prompt. Can be negative
-
-images = []
-for i in range(4):
-    res = pipe(
-        prompt,
-        guidance_scale=7.5,
-        num_inference_steps=50,
-        weights=weights,
-        generator=generator)
-    image = res.images[0]
-    images.append(image)
-
-for i, img in enumerate(images):
-    img.save(f"./composable_diffusion/image_{i}.png")
 ```

 ### Imagic Stable Diffusion
@@ -567,7 +570,7 @@ diffuser_pipeline = DiffusionPipeline.from_pretrained(
    detection_pipeline=language_detection_pipeline,
    translation_model=trans_model,
    translation_tokenizer=trans_tokenizer,
-    revision="fp16",
+    
    torch_dtype=torch.float16,
 )

@@ -615,7 +618,7 @@ mask_image = PIL.Image.open(mask_path).convert("RGB").resize((512, 512))
 pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-inpainting",
    custom_pipeline="img2img_inpainting",
-    revision="fp16",
+    
    torch_dtype=torch.float16
 )
 pipe = pipe.to("cuda")
@@ -813,6 +816,50 @@ plt.title('Stable Diffusion v1.4')
 plt.axis('off')

 plt.show()
-```python
+```

-As a result, you can look at a grid of all 4 generated images being shown together, that captures a difference the advancement of the training between the 4 checkpoints.
+As a result, you can look at a grid of all 4 generated images being shown together, that captures a difference the advancement of the training between the 4 checkpoints.
+
+### Magic Mix
+
+Implementation of the [MagicMix: Semantic Mixing with Diffusion Models](https://arxiv.org/abs/2210.16056) paper. This is a Diffusion Pipeline for semantic mixing of an image and a text prompt to create a new concept while preserving the spatial layout and geometry of the subject in the image. The pipeline takes an image that provides the layout semantics and a prompt that provides the content semantics for the mixing process.
+
+There are 3 parameters for the method-
+- `mix_factor`: It is the interpolation constant used in the layout generation phase. The greater the value of `mix_factor`, the greater the influence of the prompt on the layout generation process.
+- `kmax` and `kmin`: These determine the range for the layout and content generation process. A higher value of kmax results in loss of more information about the layout of the original image and a higher value of kmin results in more steps for content generation process.
+
+Here is an example usage-
+
+```python
+from diffusers import DiffusionPipeline, DDIMScheduler
+from PIL import Image
+
+pipe = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    custom_pipeline="magic_mix",
+    scheduler = DDIMScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler"),
+).to('cuda')
+
+img = Image.open('phone.jpg')
+mix_img = pipe(
+    img, 
+    prompt = 'bed', 
+    kmin = 0.3,
+    kmax = 0.5,
+    mix_factor = 0.5,
+    )
+mix_img.save('phone_bed_mix.jpg')
+```
+The `mix_img` is a PIL image that can be saved locally or displayed directly in a google colab. Generated image is a mix of the layout semantics of the given image and the content semantics of the prompt.
+
+E.g. the above script generates the following image:
+
+`phone.jpg`
+
+![206903102-34e79b9f-9ed2-4fac-bb38-82871343c655](https://user-images.githubusercontent.com/59410571/209578593-141467c7-d831-4792-8b9a-b17dc5e47816.jpg)
+
+`phone_bed_mix.jpg`
+
+![206903104-913a671d-ef53-4ae4-919d-64c3059c8f67](https://user-images.githubusercontent.com/59410571/209578602-70f323fa-05b7-4dd6-b055-e40683e37914.jpg)
+
+For more example generations check out this [demo notebook](https://github.com/daspartho/MagicMix/blob/main/demo.ipynb).
@@ -2,8 +2,7 @@ from typing import Optional, Tuple, Union

 import torch

-from diffusers import DDIMScheduler, DDPMScheduler, DiffusionPipeline, UNet2DConditionModel
-from diffusers.pipeline_utils import ImagePipelineOutput
+from diffusers import DDIMScheduler, DDPMScheduler, DiffusionPipeline, ImagePipelineOutput, UNet2DConditionModel
 from diffusers.schedulers.scheduling_ddim import DDIMSchedulerOutput
 from diffusers.schedulers.scheduling_ddpm import DDPMSchedulerOutput
 from einops import rearrange, reduce
@@ -5,13 +5,7 @@ from typing import Dict, List, Union
 import torch

 from diffusers import DiffusionPipeline, __version__
-from diffusers.pipeline_utils import (
-    CONFIG_NAME,
-    DIFFUSERS_CACHE,
-    ONNX_WEIGHTS_NAME,
-    SCHEDULER_CONFIG_NAME,
-    WEIGHTS_NAME,
-)
+from diffusers.utils import CONFIG_NAME, DIFFUSERS_CACHE, ONNX_WEIGHTS_NAME, SCHEDULER_CONFIG_NAME, WEIGHTS_NAME
 from huggingface_hub import snapshot_download


@@ -1,25 +1,52 @@
-"""
-    modified based on diffusion library from Huggingface: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
-"""
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import inspect
-import warnings
-from typing import List, Optional, Union
+from typing import Callable, List, Optional, Union

 import torch

+from diffusers import DiffusionPipeline
+from diffusers.configuration_utils import FrozenDict
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
-from diffusers.pipeline_utils import DiffusionPipeline
-from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
-from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
-from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from diffusers.schedulers import (
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+)
+from diffusers.utils import is_accelerate_available
+from packaging import version
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer

+from ...utils import deprecate, logging
+from . import StableDiffusionPipelineOutput
+from .safety_checker import StableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+

 class ComposableStableDiffusionPipeline(DiffusionPipeline):
    r"""
    Pipeline for text-to-image generation using Stable Diffusion.
+
    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
@@ -35,11 +62,12 @@ class ComposableStableDiffusionPipeline(DiffusionPipeline):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
        safety_checker ([`StableDiffusionSafetyChecker`]):
-            Classification module that estimates whether generated images could be considered offsensive or harmful.
-            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
        feature_extractor ([`CLIPFeatureExtractor`]):
            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
    """
+    _optional_components = ["safety_checker", "feature_extractor"]

    def __init__(
        self,
@@ -47,11 +75,84 @@ class ComposableStableDiffusionPipeline(DiffusionPipeline):
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
        unet: UNet2DConditionModel,
-        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        scheduler: Union[
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+        ],
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPFeatureExtractor,
+        requires_safety_checker: bool = True,
    ):
        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
        self.register_modules(
            vae=vae,
            text_encoder=text_encoder,
@@ -61,56 +162,265 @@ class ComposableStableDiffusionPipeline(DiffusionPipeline):
            safety_checker=safety_checker,
            feature_extractor=feature_extractor,
        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)

-    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
+    def enable_vae_slicing(self):
        r"""
-        Enable sliced attention computation.
-        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
-        in several steps. This is useful to save some memory in exchange for a small speed decrease.
+        Enable sliced VAE decoding.
+
+        When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several
+        steps. This is useful to save some memory and allow larger batch sizes.
+        """
+        self.vae.enable_slicing()
+
+    def disable_vae_slicing(self):
+        r"""
+        Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_slicing()
+
+    def enable_sequential_cpu_offload(self, gpu_id=0):
+        r"""
+        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
+        `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
+        """
+        if is_accelerate_available():
+            from accelerate import cpu_offload
+        else:
+            raise ImportError("Please install accelerate via `pip install accelerate`")
+
+        device = torch.device(f"cuda:{gpu_id}")
+
+        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
+            if cpu_offloaded_model is not None:
+                cpu_offload(cpu_offloaded_model, device)
+
+        if self.safety_checker is not None:
+            # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate
+            # fix by only offloading self.safety_checker for now
+            cpu_offload(self.safety_checker.vision_model, device)
+
+    @property
+    def _execution_device(self):
+        r"""
+        Returns the device on which the pipeline's models will be executed. After calling
+        `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
+        hooks.
+        """
+        if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
+            return self.device
+        for module in self.unet.modules():
+            if (
+                hasattr(module, "_hf_hook")
+                and hasattr(module._hf_hook, "execution_device")
+                and module._hf_hook.execution_device is not None
+            ):
+                return torch.device(module._hf_hook.execution_device)
+        return self.device
+
+    def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
        Args:
-            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
-                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
-                a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
-                `attention_head_dim` must be a multiple of `slice_size`.
+            prompt (`str` or `list(int)`):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
        """
-        if slice_size == "auto":
-            # half the attention head size is usually a good trade-off between
-            # speed and memory
-            slice_size = self.unet.config.attention_head_dim // 2
-        self.unet.set_attention_slice(slice_size)
+        batch_size = len(prompt) if isinstance(prompt, list) else 1

-    def disable_attention_slicing(self):
-        r"""
-        Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
-        back to computing attention in one step.
-        """
-        # set slice_size = `None` to disable `attention slicing`
-        self.enable_attention_slicing(None)
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+
+        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+            attention_mask = text_inputs.attention_mask.to(device)
+        else:
+            attention_mask = None
+
+        text_embeddings = self.text_encoder(
+            text_input_ids.to(device),
+            attention_mask=attention_mask,
+        )
+        text_embeddings = text_embeddings[0]
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            uncond_embeddings = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            uncond_embeddings = uncond_embeddings[0]
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        return text_embeddings
+
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        else:
+            has_nsfw_concept = None
+        return image, has_nsfw_concept
+
+    def decode_latents(self, latents):
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(self, prompt, height, width, callback_steps):
+        if not isinstance(prompt, str) and not isinstance(prompt, list):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+        if latents is None:
+            if device.type == "mps":
+                # randn does not work reproducibly on mps
+                latents = torch.randn(shape, generator=generator, device="cpu", dtype=dtype).to(device)
+            else:
+                latents = torch.randn(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents

    @torch.no_grad()
    def __call__(
        self,
        prompt: Union[str, List[str]],
-        height: Optional[int] = 512,
-        width: Optional[int] = 512,
-        num_inference_steps: Optional[int] = 50,
-        guidance_scale: Optional[float] = 7.5,
-        eta: Optional[float] = 0.0,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
        generator: Optional[torch.Generator] = None,
        latents: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: Optional[int] = 1,
        weights: Optional[str] = "",
-        **kwargs,
    ):
        r"""
        Function invoked when calling the pipeline for generation.
+
        Args:
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide the image generation.
-            height (`int`, *optional*, defaults to 512):
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to 512):
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The width in pixels of the generated image.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -121,6 +431,11 @@ class ComposableStableDiffusionPipeline(DiffusionPipeline):
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                [`schedulers.DDIMScheduler`], will be ignored for others.
@@ -137,6 +452,13 @@ class ComposableStableDiffusionPipeline(DiffusionPipeline):
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
        Returns:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
@@ -144,182 +466,113 @@ class ComposableStableDiffusionPipeline(DiffusionPipeline):
            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
            (nsfw) content, according to the `safety_checker`.
        """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor

-        if "torch_device" in kwargs:
-            device = kwargs.pop("torch_device")
-            warnings.warn(
-                "`torch_device` is deprecated as an input argument to `__call__` and will be removed in v0.3.0."
-                " Consider using `pipe.to(torch_device)` instead."
-            )
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(prompt, height, width, callback_steps)

-            # Set device as before (to be removed in 0.3.0)
-            if device is None:
-                device = "cuda" if torch.cuda.is_available() else "cpu"
-            self.to(device)
-
-        if isinstance(prompt, str):
-            batch_size = 1
-        elif isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-        if height % 8 != 0 or width % 8 != 0:
-            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+        # 2. Define call parameters
+        batch_size = 1 if isinstance(prompt, str) else len(prompt)
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0

        if "|" in prompt:
            prompt = [x.strip() for x in prompt.split("|")]
            print(f"composing {prompt}...")

-        # get prompt text embeddings
-        text_input = self.tokenizer(
-            prompt,
-            padding="max_length",
-            max_length=self.tokenizer.model_max_length,
-            truncation=True,
-            return_tensors="pt",
-        )
-        text_embeddings = self.text_encoder(text_input.input_ids.to(self.device))[0]
-
-        if not weights:
-            # specify weights for prompts (excluding the unconditional score)
-            print("using equal weights for all prompts...")
-            pos_weights = torch.tensor(
-                [1 / (text_embeddings.shape[0] - 1)] * (text_embeddings.shape[0] - 1), device=self.device
-            ).reshape(-1, 1, 1, 1)
-            neg_weights = torch.tensor([1.0], device=self.device).reshape(-1, 1, 1, 1)
-            mask = torch.tensor([False] + [True] * pos_weights.shape[0], dtype=torch.bool)
-        else:
-            # set prompt weight for each
-            num_prompts = len(prompt) if isinstance(prompt, list) else 1
-            weights = [float(w.strip()) for w in weights.split("|")]
-            if len(weights) < num_prompts:
-                weights.append(1.0)
-            weights = torch.tensor(weights, device=self.device)
-            assert len(weights) == text_embeddings.shape[0], "weights specified are not equal to the number of prompts"
-            pos_weights = []
-            neg_weights = []
-            mask = []  # first one is unconditional score
-            for w in weights:
-                if w > 0:
-                    pos_weights.append(w)
-                    mask.append(True)
-                else:
-                    neg_weights.append(abs(w))
-                    mask.append(False)
-            # normalize the weights
-            pos_weights = torch.tensor(pos_weights, device=self.device).reshape(-1, 1, 1, 1)
-            pos_weights = pos_weights / pos_weights.sum()
-            neg_weights = torch.tensor(neg_weights, device=self.device).reshape(-1, 1, 1, 1)
-            neg_weights = neg_weights / neg_weights.sum()
-            mask = torch.tensor(mask, device=self.device, dtype=torch.bool)
-
-        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
-        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
-        # corresponds to doing no classifier free guidance.
-        do_classifier_free_guidance = guidance_scale > 1.0
-        # get unconditional embeddings for classifier free guidance
-        if do_classifier_free_guidance:
-            max_length = text_input.input_ids.shape[-1]
-
-            if torch.all(mask):
-                # no negative prompts, so we use empty string as the negative prompt
-                uncond_input = self.tokenizer(
-                    [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt"
-                )
-                uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
-
-                # For classifier free guidance, we need to do two forward passes.
-                # Here we concatenate the unconditional and text embeddings into a single batch
-                # to avoid doing two forward passes
-                text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
-
-                # update negative weights
-                neg_weights = torch.tensor([1.0], device=self.device)
-                mask = torch.tensor([False] + mask.detach().tolist(), device=self.device, dtype=torch.bool)
-
-        # get the initial random noise unless the user supplied it
-
-        # Unlike in other pipelines, latents need to be generated in the target device
-        # for 1-to-1 results reproducibility with the CompVis implementation.
-        # However this currently doesn't work in `mps`.
-        latents_device = "cpu" if self.device.type == "mps" else self.device
-        latents_shape = (batch_size, self.unet.in_channels, height // 8, width // 8)
-        if latents is None:
-            latents = torch.randn(
-                latents_shape,
-                generator=generator,
-                device=latents_device,
-            )
-        else:
-            if latents.shape != latents_shape:
-                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
-        latents = latents.to(self.device)
-
-        # set timesteps
-        accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys())
-        extra_set_kwargs = {}
-        if accepts_offset:
-            extra_set_kwargs["offset"] = 1
-
-        self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
-
-        # if we use LMSDiscreteScheduler, let's make sure latents are multiplied by sigmas
-        if isinstance(self.scheduler, LMSDiscreteScheduler):
-            latents = latents * self.scheduler.sigmas[0]
-
-        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
-        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
-        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
-        # and should be between [0, 1]
-        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
-        extra_step_kwargs = {}
-        if accepts_eta:
-            extra_step_kwargs["eta"] = eta
-
-        for i, t in enumerate(self.progress_bar(self.scheduler.timesteps)):
-            # expand the latents if we are doing classifier free guidance
-            latent_model_input = (
-                torch.cat([latents] * text_embeddings.shape[0]) if do_classifier_free_guidance else latents
-            )
-            if isinstance(self.scheduler, LMSDiscreteScheduler):
-                sigma = self.scheduler.sigmas[i]
-                # the model input needs to be scaled to match the continuous ODE formulation in K-LMS
-                latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)
-
-            # reduce memory by predicting each score sequentially
-            noise_preds = []
-            # predict the noise residual
-            for latent_in, text_embedding_in in zip(
-                torch.chunk(latent_model_input, chunks=latent_model_input.shape[0], dim=0),
-                torch.chunk(text_embeddings, chunks=text_embeddings.shape[0], dim=0),
-            ):
-                noise_preds.append(self.unet(latent_in, t, encoder_hidden_states=text_embedding_in).sample)
-            noise_preds = torch.cat(noise_preds, dim=0)
-
-            # perform guidance
-            if do_classifier_free_guidance:
-                noise_pred_uncond = (noise_preds[~mask] * neg_weights).sum(dim=0, keepdims=True)
-                noise_pred_text = (noise_preds[mask] * pos_weights).sum(dim=0, keepdims=True)
-                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-
-            # compute the previous noisy sample x_t -> x_t-1
-            if isinstance(self.scheduler, LMSDiscreteScheduler):
-                latents = self.scheduler.step(noise_pred, i, latents, **extra_step_kwargs).prev_sample
+            if not weights:
+                # specify weights for prompts (excluding the unconditional score)
+                print("using equal positive weights (conjunction) for all prompts...")
+                weights = torch.tensor([guidance_scale] * len(prompt), device=self.device).reshape(-1, 1, 1, 1)
            else:
+                # set prompt weight for each
+                num_prompts = len(prompt) if isinstance(prompt, list) else 1
+                weights = [float(w.strip()) for w in weights.split("|")]
+                # guidance scale as the default
+                if len(weights) < num_prompts:
+                    weights.append(guidance_scale)
+                else:
+                    weights = weights[:num_prompts]
+                assert len(weights) == len(prompt), "weights specified are not equal to the number of prompts"
+                weights = torch.tensor(weights, device=self.device).reshape(-1, 1, 1, 1)
+        else:
+            weights = guidance_scale
+
+        # 3. Encode input prompt
+        text_embeddings = self._encode_prompt(
+            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+        )
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            text_embeddings.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # composable diffusion
+        if isinstance(prompt, list) and batch_size == 1:
+            # remove extra unconditional embedding
+            # N = one unconditional embed + conditional embeds
+            text_embeddings = text_embeddings[len(prompt) - 1 :]
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = []
+                for j in range(text_embeddings.shape[0]):
+                    noise_pred.append(
+                        self.unet(latent_model_input[:1], t, encoder_hidden_states=text_embeddings[j : j + 1]).sample
+                    )
+                noise_pred = torch.cat(noise_pred, dim=0)
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred[:1], noise_pred[1:]
+                    noise_pred = noise_pred_uncond + (weights * (noise_pred_text - noise_pred_uncond)).sum(
+                        dim=0, keepdims=True
+                    )
+
+                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

-        # scale and decode the image latents with vae
-        latents = 1 / 0.18215 * latents
-        image = self.vae.decode(latents).sample
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        callback(i, t, latents)

-        image = (image / 2 + 0.5).clamp(0, 1)
-        image = image.cpu().permute(0, 2, 3, 1).numpy()
+        # 8. Post-processing
+        image = self.decode_latents(latents)

-        # run safety checker
-        safety_cheker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(self.device)
-        image, has_nsfw_concept = self.safety_checker(images=image, clip_input=safety_cheker_input.pixel_values)
+        # 9. Run safety checker
+        image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype)

+        # 10. Convert to PIL
        if output_type == "pil":
            image = self.numpy_to_pil(image)

@@ -12,8 +12,8 @@ import torch.nn.functional as F

 import PIL
 from accelerate import Accelerator
+from diffusers import DiffusionPipeline
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
-from diffusers.pipeline_utils import DiffusionPipeline
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
 from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
@@ -185,7 +185,7 @@ class ImagicStableDiffusionPipeline(DiffusionPipeline):
            (nsfw) content, according to the `safety_checker`.
        """
        message = "Please use `image` instead of `init_image`."
-        init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
+        init_image = deprecate("init_image", "0.13.0", message, take_from=kwargs)
        image = init_image or image

        accelerator = Accelerator(
@@ -5,9 +5,9 @@ import numpy as np
 import torch

 import PIL
+from diffusers import DiffusionPipeline
 from diffusers.configuration_utils import FrozenDict
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
-from diffusers.pipeline_utils import DiffusionPipeline
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
 from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
@@ -6,9 +6,9 @@ from typing import Callable, List, Optional, Union
 import numpy as np
 import torch

+from diffusers import DiffusionPipeline
 from diffusers.configuration_utils import FrozenDict
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
-from diffusers.pipeline_utils import DiffusionPipeline
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
 from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
@@ -759,7 +759,7 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
            (nsfw) content, according to the `safety_checker`.
        """
        message = "Please use `image` instead of `init_image`."
-        init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
+        init_image = deprecate("init_image", "0.13.0", message, take_from=kwargs)
        image = init_image or image

        # 0. Default height and width to unet
@@ -7,8 +7,7 @@ import torch

 import diffusers
 import PIL
-from diffusers import OnnxStableDiffusionPipeline, SchedulerMixin
-from diffusers.onnx_utils import OnnxRuntimeModel
+from diffusers import OnnxRuntimeModel, OnnxStableDiffusionPipeline, SchedulerMixin
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
 from diffusers.utils import deprecate, logging
 from packaging import version
@@ -16,7 +15,7 @@ from transformers import CLIPFeatureExtractor, CLIPTokenizer


 try:
-    from diffusers.onnx_utils import ORT_TO_NP_TYPE
+    from diffusers.pipelines.onnx_utils import ORT_TO_NP_TYPE
 except ImportError:
    ORT_TO_NP_TYPE = {
        "tensor(bool)": np.bool_,
@@ -746,7 +745,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(OnnxStableDiffusionPipeline
            (nsfw) content, according to the `safety_checker`.
        """
        message = "Please use `image` instead of `init_image`."
-        init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
+        init_image = deprecate("init_image", "0.13.0", message, take_from=kwargs)
        image = init_image or image

        # 0. Default height and width to unet
@@ -0,0 +1,152 @@
+from typing import Union
+
+import torch
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    DiffusionPipeline,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    UNet2DConditionModel,
+)
+from PIL import Image
+from torchvision import transforms as tfms
+from tqdm.auto import tqdm
+from transformers import CLIPTextModel, CLIPTokenizer
+
+
+class MagicMixPipeline(DiffusionPipeline):
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[PNDMScheduler, LMSDiscreteScheduler, DDIMScheduler],
+    ):
+        super().__init__()
+
+        self.register_modules(vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, unet=unet, scheduler=scheduler)
+
+    # convert PIL image to latents
+    def encode(self, img):
+        with torch.no_grad():
+            latent = self.vae.encode(tfms.ToTensor()(img).unsqueeze(0).to(self.device) * 2 - 1)
+            latent = 0.18215 * latent.latent_dist.sample()
+        return latent
+
+    # convert latents to PIL image
+    def decode(self, latent):
+        latent = (1 / 0.18215) * latent
+        with torch.no_grad():
+            img = self.vae.decode(latent).sample
+        img = (img / 2 + 0.5).clamp(0, 1)
+        img = img.detach().cpu().permute(0, 2, 3, 1).numpy()
+        img = (img * 255).round().astype("uint8")
+        return Image.fromarray(img[0])
+
+    # convert prompt into text embeddings, also unconditional embeddings
+    def prep_text(self, prompt):
+        text_input = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+
+        text_embedding = self.text_encoder(text_input.input_ids.to(self.device))[0]
+
+        uncond_input = self.tokenizer(
+            "",
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+
+        uncond_embedding = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+
+        return torch.cat([uncond_embedding, text_embedding])
+
+    def __call__(
+        self,
+        img: Image.Image,
+        prompt: str,
+        kmin: float = 0.3,
+        kmax: float = 0.6,
+        mix_factor: float = 0.5,
+        seed: int = 42,
+        steps: int = 50,
+        guidance_scale: float = 7.5,
+    ) -> Image.Image:
+        tmin = steps - int(kmin * steps)
+        tmax = steps - int(kmax * steps)
+
+        text_embeddings = self.prep_text(prompt)
+
+        self.scheduler.set_timesteps(steps)
+
+        width, height = img.size
+        encoded = self.encode(img)
+
+        torch.manual_seed(seed)
+        noise = torch.randn(
+            (1, self.unet.in_channels, height // 8, width // 8),
+        ).to(self.device)
+
+        latents = self.scheduler.add_noise(
+            encoded,
+            noise,
+            timesteps=self.scheduler.timesteps[tmax],
+        )
+
+        input = torch.cat([latents] * 2)
+
+        input = self.scheduler.scale_model_input(input, self.scheduler.timesteps[tmax])
+
+        with torch.no_grad():
+            pred = self.unet(
+                input,
+                self.scheduler.timesteps[tmax],
+                encoder_hidden_states=text_embeddings,
+            ).sample
+
+        pred_uncond, pred_text = pred.chunk(2)
+        pred = pred_uncond + guidance_scale * (pred_text - pred_uncond)
+
+        latents = self.scheduler.step(pred, self.scheduler.timesteps[tmax], latents).prev_sample
+
+        for i, t in enumerate(tqdm(self.scheduler.timesteps)):
+            if i > tmax:
+                if i < tmin:  # layout generation phase
+                    orig_latents = self.scheduler.add_noise(
+                        encoded,
+                        noise,
+                        timesteps=t,
+                    )
+
+                    input = (mix_factor * latents) + (
+                        1 - mix_factor
+                    ) * orig_latents  # interpolating between layout noise and conditionally generated noise to preserve layout sematics
+                    input = torch.cat([input] * 2)
+
+                else:  # content generation phase
+                    input = torch.cat([latents] * 2)
+
+                input = self.scheduler.scale_model_input(input, t)
+
+                with torch.no_grad():
+                    pred = self.unet(
+                        input,
+                        t,
+                        encoder_hidden_states=text_embeddings,
+                    ).sample
+
+                pred_uncond, pred_text = pred.chunk(2)
+                pred = pred_uncond + guidance_scale * (pred_text - pred_uncond)
+
+                latents = self.scheduler.step(pred, t, latents).prev_sample
+
+        return self.decode(latents)
@@ -3,9 +3,9 @@ from typing import Callable, List, Optional, Union

 import torch

+from diffusers import DiffusionPipeline
 from diffusers.configuration_utils import FrozenDict
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
-from diffusers.pipeline_utils import DiffusionPipeline
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
 from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
@@ -18,8 +18,7 @@ from typing import Callable, List, Optional, Union

 import torch

-from diffusers import LMSDiscreteScheduler
-from diffusers.pipeline_utils import DiffusionPipeline
+from diffusers import DiffusionPipeline, LMSDiscreteScheduler
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
 from diffusers.utils import is_accelerate_available, logging
 from k_diffusion.external import CompVisDenoiser, CompVisVDenoiser
@@ -6,8 +6,8 @@ from typing import Callable, List, Optional, Union

 import torch

+from diffusers import DiffusionPipeline
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
-from diffusers.pipeline_utils import DiffusionPipeline
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
 from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
@@ -3,9 +3,9 @@ from typing import Callable, List, Optional, Union
 import torch

 import PIL
+from diffusers import DiffusionPipeline
 from diffusers.configuration_utils import FrozenDict
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
-from diffusers.pipeline_utils import DiffusionPipeline
 from diffusers.pipelines.stable_diffusion import StableDiffusionInpaintPipeline
 from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
@@ -7,9 +7,9 @@ from typing import Callable, Dict, List, Optional, Union

 import torch

+from diffusers import DiffusionPipeline
 from diffusers.configuration_utils import FrozenDict
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
-from diffusers.pipeline_utils import DiffusionPipeline
 from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput
 from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
@@ -68,7 +68,7 @@ class WildcardStableDiffusionPipeline(DiffusionPipeline):
    Example Usage:
        pipe = WildcardStableDiffusionPipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4",
-            revision="fp16",
+
            torch_dtype=torch.float16,
        )
        prompt = "__animal__ sitting on a __object__ wearing a __clothing__"
@@ -44,20 +44,6 @@ write_basic_config()

 ### Dog toy example

-You need to accept the model license before downloading or using the weights. In this example we'll use model version `v1-4`, so you'll need to visit [its card](https://huggingface.co/CompVis/stable-diffusion-v1-4), read the license and tick the checkbox if you agree. 
-
-You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens).
-
-Run the following command to authenticate your token
-
-```bash
-huggingface-cli login
-```
-
-If you have already cloned the repo, then you won't need to go through these steps.
-
-<br>
-
 Now let's get our dataset. Download images from [here](https://drive.google.com/drive/folders/1BO_dyz-p65qhBRRMRA4TbZ8qW4rB99JZ) and save them in a directory. This will be our training data.

 And launch the training using
@@ -246,8 +232,11 @@ image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]
 image.save("dog-bucket.png")
 ```

+### Inference from a training checkpoint

-## Running with Flax/JAX
+You can also perform inference from one of the checkpoints saved during the training process, if you used the `--checkpointing_steps` argument. Please, refer to [the documentation](https://huggingface.co/docs/diffusers/main/en/training/dreambooth#performing-inference-using-a-saved-checkpoint) to see how to do it.
+
+## Training with Flax/JAX

 For faster training on TPUs and GPUs you can leverage the flax training example. Follow the instructions above to get the model and dataset before running the script.

@@ -328,96 +317,7 @@ python train_dreambooth_flax.py \
  --max_train_steps=800
 ```

-### Training with prior-preservation loss
+### Training with xformers:
+You can enable memory efficient attention by [installing xFormers](https://github.com/facebookresearch/xformers#installing-xformers) and padding the `--enable_xformers_memory_efficient_attention` argument to the script. This is not available with the Flax/JAX implementation.

-Prior-preservation is used to avoid overfitting and language-drift. Refer to the paper to learn more about it. For prior-preservation we first generate images using the model with a class prompt and then use those during training along with our data.
-According to the paper, it's recommended to generate `num_epochs * num_samples` images for prior-preservation. 200-300 works well for most cases.
-
-```bash
-export MODEL_NAME="runwayml/stable-diffusion-inpainting"
-export INSTANCE_DIR="path-to-instance-images"
-export CLASS_DIR="path-to-class-images"
-export OUTPUT_DIR="path-to-save-model"
-
-accelerate launch train_dreambooth_inpaint.py \
-  --pretrained_model_name_or_path=$MODEL_NAME  \
-  --instance_data_dir=$INSTANCE_DIR \
-  --class_data_dir=$CLASS_DIR \
-  --output_dir=$OUTPUT_DIR \
-  --with_prior_preservation --prior_loss_weight=1.0 \
-  --instance_prompt="a photo of sks dog" \
-  --class_prompt="a photo of dog" \
-  --resolution=512 \
-  --train_batch_size=1 \
-  --gradient_accumulation_steps=1 \
-  --learning_rate=5e-6 \
-  --lr_scheduler="constant" \
-  --lr_warmup_steps=0 \
-  --num_class_images=200 \
-  --max_train_steps=800
-```
-
-
-### Training with gradient checkpointing and 8-bit optimizer:
-
-With the help of gradient checkpointing and the 8-bit optimizer from bitsandbytes it's possible to run train dreambooth on a 16GB GPU.
-
-To install `bitandbytes` please refer to this [readme](https://github.com/TimDettmers/bitsandbytes#requirements--installation).
-
-```bash
-export MODEL_NAME="runwayml/stable-diffusion-inpainting"
-export INSTANCE_DIR="path-to-instance-images"
-export CLASS_DIR="path-to-class-images"
-export OUTPUT_DIR="path-to-save-model"
-
-accelerate launch train_dreambooth_inpaint.py \
-  --pretrained_model_name_or_path=$MODEL_NAME  \
-  --instance_data_dir=$INSTANCE_DIR \
-  --class_data_dir=$CLASS_DIR \
-  --output_dir=$OUTPUT_DIR \
-  --with_prior_preservation --prior_loss_weight=1.0 \
-  --instance_prompt="a photo of sks dog" \
-  --class_prompt="a photo of dog" \
-  --resolution=512 \
-  --train_batch_size=1 \
-  --gradient_accumulation_steps=2 --gradient_checkpointing \
-  --use_8bit_adam \
-  --learning_rate=5e-6 \
-  --lr_scheduler="constant" \
-  --lr_warmup_steps=0 \
-  --num_class_images=200 \
-  --max_train_steps=800
-```
-
-### Fine-tune text encoder with the UNet.
-
-The script also allows to fine-tune the `text_encoder` along with the `unet`. It's been observed experimentally that fine-tuning `text_encoder` gives much better results especially on faces. 
-Pass the `--train_text_encoder` argument to the script to enable training `text_encoder`.
-
-___Note: Training text encoder requires more memory, with this option the training won't fit on 16GB GPU. It needs at least 24GB VRAM.___
-
-```bash
-export MODEL_NAME="runwayml/stable-diffusion-inpainting"
-export INSTANCE_DIR="path-to-instance-images"
-export CLASS_DIR="path-to-class-images"
-export OUTPUT_DIR="path-to-save-model"
-
-accelerate launch train_dreambooth_inpaint.py \
-  --pretrained_model_name_or_path=$MODEL_NAME  \
-  --train_text_encoder \
-  --instance_data_dir=$INSTANCE_DIR \
-  --class_data_dir=$CLASS_DIR \
-  --output_dir=$OUTPUT_DIR \
-  --with_prior_preservation --prior_loss_weight=1.0 \
-  --instance_prompt="a photo of sks dog" \
-  --class_prompt="a photo of dog" \
-  --resolution=512 \
-  --train_batch_size=1 \
-  --use_8bit_adam \
-  --gradient_checkpointing \
-  --learning_rate=2e-6 \
-  --lr_scheduler="constant" \
-  --lr_warmup_steps=0 \
-  --num_class_images=200 \
-  --max_train_steps=800
-```
+You can also use Dreambooth to train the specialized in-painting model. See [the script in the research folder for details](https://github.com/huggingface/diffusers/tree/main/examples/research_projects/dreambooth_inpaint).
@@ -1,6 +1,7 @@
 import argparse
 import hashlib
 import itertools
+import logging
 import math
 import os
 import warnings
@@ -12,6 +13,9 @@ import torch.nn.functional as F
 import torch.utils.checkpoint
 from torch.utils.data import Dataset

+import datasets
+import diffusers
+import transformers
 from accelerate import Accelerator
 from accelerate.logging import get_logger
 from accelerate.utils import set_seed
@@ -155,7 +159,8 @@ def parse_args(input_args=None):
        type=int,
        default=500,
        help=(
-            "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
+            "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
+            " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
            " training using `--resume_from_checkpoint`."
        ),
    )
@@ -203,6 +208,13 @@ def parse_args(input_args=None):
    parser.add_argument(
        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
    )
+    parser.add_argument(
+        "--lr_num_cycles",
+        type=int,
+        default=1,
+        help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
+    )
+    parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.")
    parser.add_argument(
        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
    )
@@ -228,6 +240,23 @@ def parse_args(input_args=None):
            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
        ),
    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
    parser.add_argument(
        "--mixed_precision",
        type=str,
@@ -239,7 +268,20 @@ def parse_args(input_args=None):
            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
        ),
    )
+    parser.add_argument(
+        "--prior_generation_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp32", "fp16", "bf16"],
+        help=(
+            "Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to  fp16 if a GPU is available else fp32."
+        ),
+    )
    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )

    if input_args is not None:
        args = parser.parse_args(input_args)
@@ -401,7 +443,7 @@ def main(args):
    accelerator = Accelerator(
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        mixed_precision=args.mixed_precision,
-        log_with="tensorboard",
+        log_with=args.report_to,
        logging_dir=logging_dir,
    )

@@ -414,9 +456,27 @@ def main(args):
            "Please set gradient_accumulation_steps to 1. This feature will be supported in the future."
        )

+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

+    # Generate class images if prior preservation is enabled.
    if args.with_prior_preservation:
        class_images_dir = Path(args.class_data_dir)
        if not class_images_dir.exists():
@@ -425,6 +485,12 @@ def main(args):

        if cur_class_images < args.num_class_images:
            torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32
+            if args.prior_generation_precision == "fp32":
+                torch_dtype = torch.float32
+            elif args.prior_generation_precision == "fp16":
+                torch_dtype = torch.float16
+            elif args.prior_generation_precision == "bf16":
+                torch_dtype = torch.bfloat16
            pipeline = DiffusionPipeline.from_pretrained(
                args.pretrained_model_name_or_path,
                torch_dtype=torch_dtype,
@@ -475,11 +541,7 @@ def main(args):

    # Load the tokenizer
    if args.tokenizer_name:
-        tokenizer = AutoTokenizer.from_pretrained(
-            args.tokenizer_name,
-            revision=args.revision,
-            use_fast=False,
-        )
+        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, revision=args.revision, use_fast=False)
    elif args.pretrained_model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            args.pretrained_model_name_or_path,
@@ -491,41 +553,36 @@ def main(args):
    # import correct text encoder class
    text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision)

-    # Load models and create wrapper for stable diffusion
+    # Load scheduler and models
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
    text_encoder = text_encoder_cls.from_pretrained(
-        args.pretrained_model_name_or_path,
-        subfolder="text_encoder",
-        revision=args.revision,
-    )
-    vae = AutoencoderKL.from_pretrained(
-        args.pretrained_model_name_or_path,
-        subfolder="vae",
-        revision=args.revision,
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
    )
+    vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision)
    unet = UNet2DConditionModel.from_pretrained(
-        args.pretrained_model_name_or_path,
-        subfolder="unet",
-        revision=args.revision,
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
    )

-    if is_xformers_available():
-        try:
-            unet.enable_xformers_memory_efficient_attention()
-        except Exception as e:
-            logger.warning(
-                "Could not enable memory efficient attention. Make sure xformers is installed"
-                f" correctly and a GPU is available: {e}"
-            )
-
    vae.requires_grad_(False)
    if not args.train_text_encoder:
        text_encoder.requires_grad_(False)

+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
    if args.gradient_checkpointing:
        unet.enable_gradient_checkpointing()
        if args.train_text_encoder:
            text_encoder.gradient_checkpointing_enable()

+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
    if args.scale_lr:
        args.learning_rate = (
            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
@@ -544,6 +601,7 @@ def main(args):
    else:
        optimizer_class = torch.optim.AdamW

+    # Optimizer creation
    params_to_optimize = (
        itertools.chain(unet.parameters(), text_encoder.parameters()) if args.train_text_encoder else unet.parameters()
    )
@@ -555,8 +613,7 @@ def main(args):
        eps=args.adam_epsilon,
    )

-    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
-
+    # Dataset and DataLoaders creation:
    train_dataset = DreamBoothDataset(
        instance_data_root=args.instance_data_dir,
        instance_prompt=args.instance_prompt,
@@ -587,8 +644,11 @@ def main(args):
        optimizer=optimizer,
        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
+        num_cycles=args.lr_num_cycles,
+        power=args.lr_power,
    )

+    # Prepare everything with our `accelerator`.
    if args.train_text_encoder:
        unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
            unet, text_encoder, optimizer, train_dataloader, lr_scheduler
@@ -597,17 +657,16 @@ def main(args):
        unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
            unet, optimizer, train_dataloader, lr_scheduler
        )
-    accelerator.register_for_checkpointing(lr_scheduler)

+    # For mixed precision training we cast the text_encoder and vae weights to half-precision
+    # as these models are only used for inference, keeping weights in full precision is not required.
    weight_dtype = torch.float32
    if accelerator.mixed_precision == "fp16":
        weight_dtype = torch.float16
    elif accelerator.mixed_precision == "bf16":
        weight_dtype = torch.bfloat16

-    # Move text_encode and vae to gpu.
-    # For mixed precision training we cast the text_encoder and vae weights to half-precision
-    # as these models are only used for inference, keeping weights in full precision is not required.
+    # Move vae and text_encoder to device and cast to weight_dtype
    vae.to(accelerator.device, dtype=weight_dtype)
    if not args.train_text_encoder:
        text_encoder.to(accelerator.device, dtype=weight_dtype)
@@ -638,6 +697,7 @@ def main(args):
    global_step = 0
    first_epoch = 0

+    # Potentially load in the weights and states from a previous save
    if args.resume_from_checkpoint:
        if args.resume_from_checkpoint != "latest":
            path = os.path.basename(args.resume_from_checkpoint)
@@ -706,7 +766,7 @@ def main(args):
                    target, target_prior = torch.chunk(target, 2, dim=0)

                    # Compute instance loss
-                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="none").mean([1, 2, 3]).mean()
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")

                    # Compute prior loss
                    prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean")
@@ -746,9 +806,8 @@ def main(args):
            if global_step >= args.max_train_steps:
                break

-        accelerator.wait_for_everyone()
-
    # Create the pipeline using using the trained modules and save it.
+    accelerator.wait_for_everyone()
    if accelerator.is_main_process:
        pipeline = DiffusionPipeline.from_pretrained(
            args.pretrained_model_name_or_path,
@@ -142,15 +142,6 @@ def parse_args():
        default=False,
        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
    )
-    parser.add_argument(
-        "--lr_scheduler",
-        type=str,
-        default="constant",
-        help=(
-            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
-            ' "constant", "constant_with_warmup"]'
-        ),
-    )
    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
@@ -429,6 +420,13 @@ def main():
        return batch

    total_train_batch_size = args.train_batch_size * jax.local_device_count()
+    if len(train_dataset) < total_train_batch_size:
+        raise ValueError(
+            f"Training batch size is {total_train_batch_size}, but your dataset only contains"
+            f" {len(train_dataset)} images. Please, use a larger dataset or reduce the effective batch size. Note that"
+            f" there are {jax.local_device_count()} parallel devices, so your batch size can't be smaller than that."
+        )
+
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=total_train_batch_size, shuffle=True, collate_fn=collate_fn, drop_last=True
    )
@@ -477,6 +475,7 @@ def main():
    noise_scheduler = FlaxDDPMScheduler(
        beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000
    )
+    noise_scheduler_state = noise_scheduler.create_state()

    # Initialize our training
    train_rngs = jax.random.split(rng, jax.local_device_count())
@@ -513,7 +512,7 @@ def main():

            # Add noise to the latents according to the noise magnitude at each timestep
            # (this is the forward diffusion process)
-            noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+            noisy_latents = noise_scheduler.add_noise(noise_scheduler_state, latents, noise, timesteps)

            # Get the text embedding for conditioning
            if args.train_text_encoder:
@@ -23,4 +23,96 @@ accelerate launch train_dreambooth_inpaint.py \
  --max_train_steps=400
 ```

-The script is also compatible with prior preservation loss and gradient checkpointing
+### Training with prior-preservation loss
+
+Prior-preservation is used to avoid overfitting and language-drift. Refer to the paper to learn more about it. For prior-preservation we first generate images using the model with a class prompt and then use those during training along with our data.
+According to the paper, it's recommended to generate `num_epochs * num_samples` images for prior-preservation. 200-300 works well for most cases.
+
+```bash
+export MODEL_NAME="runwayml/stable-diffusion-inpainting"
+export INSTANCE_DIR="path-to-instance-images"
+export CLASS_DIR="path-to-class-images"
+export OUTPUT_DIR="path-to-save-model"
+
+accelerate launch train_dreambooth_inpaint.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --class_data_dir=$CLASS_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --with_prior_preservation --prior_loss_weight=1.0 \
+  --instance_prompt="a photo of sks dog" \
+  --class_prompt="a photo of dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=1 \
+  --learning_rate=5e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --num_class_images=200 \
+  --max_train_steps=800
+```
+
+
+### Training with gradient checkpointing and 8-bit optimizer:
+
+With the help of gradient checkpointing and the 8-bit optimizer from bitsandbytes it's possible to run train dreambooth on a 16GB GPU.
+
+To install `bitandbytes` please refer to this [readme](https://github.com/TimDettmers/bitsandbytes#requirements--installation).
+
+```bash
+export MODEL_NAME="runwayml/stable-diffusion-inpainting"
+export INSTANCE_DIR="path-to-instance-images"
+export CLASS_DIR="path-to-class-images"
+export OUTPUT_DIR="path-to-save-model"
+
+accelerate launch train_dreambooth_inpaint.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --class_data_dir=$CLASS_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --with_prior_preservation --prior_loss_weight=1.0 \
+  --instance_prompt="a photo of sks dog" \
+  --class_prompt="a photo of dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=2 --gradient_checkpointing \
+  --use_8bit_adam \
+  --learning_rate=5e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --num_class_images=200 \
+  --max_train_steps=800
+```
+
+### Fine-tune text encoder with the UNet.
+
+The script also allows to fine-tune the `text_encoder` along with the `unet`. It's been observed experimentally that fine-tuning `text_encoder` gives much better results especially on faces. 
+Pass the `--train_text_encoder` argument to the script to enable training `text_encoder`.
+
+___Note: Training text encoder requires more memory, with this option the training won't fit on 16GB GPU. It needs at least 24GB VRAM.___
+
+```bash
+export MODEL_NAME="runwayml/stable-diffusion-inpainting"
+export INSTANCE_DIR="path-to-instance-images"
+export CLASS_DIR="path-to-class-images"
+export OUTPUT_DIR="path-to-save-model"
+
+accelerate launch train_dreambooth_inpaint.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --train_text_encoder \
+  --instance_data_dir=$INSTANCE_DIR \
+  --class_data_dir=$CLASS_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --with_prior_preservation --prior_loss_weight=1.0 \
+  --instance_prompt="a photo of sks dog" \
+  --class_prompt="a photo of dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --use_8bit_adam \
+  --gradient_checkpointing \
+  --learning_rate=2e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --num_class_images=200 \
+  --max_train_steps=800
+```
@@ -242,6 +242,25 @@ def parse_args():
        ),
    )
    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
+            " checkpoints in case they are better than the last checkpoint and are suitable for resuming training"
+            " using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )

    args = parser.parse_args()
    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
@@ -591,6 +610,7 @@ def main():
        unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
            unet, optimizer, train_dataloader, lr_scheduler
        )
+    accelerator.register_for_checkpointing(lr_scheduler)

    weight_dtype = torch.float32
    if args.mixed_precision == "fp16":
@@ -628,14 +648,39 @@ def main():
    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")
-    # Only show the progress bar once on each machine.
-    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
-    progress_bar.set_description("Steps")
    global_step = 0
+    first_epoch = 0

-    for epoch in range(args.num_train_epochs):
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1]
+        accelerator.print(f"Resuming from checkpoint {path}")
+        accelerator.load_state(os.path.join(args.output_dir, path))
+        global_step = int(path.split("-")[1])
+
+        resume_global_step = global_step * args.gradient_accumulation_steps
+        first_epoch = resume_global_step // num_update_steps_per_epoch
+        resume_step = resume_global_step % num_update_steps_per_epoch
+
+    # Only show the progress bar once on each machine.
+    progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)
+    progress_bar.set_description("Steps")
+
+    for epoch in range(first_epoch, args.num_train_epochs):
        unet.train()
        for step, batch in enumerate(train_dataloader):
+            # Skip steps until we reach the resumed step
+            if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step:
+                if step % args.gradient_accumulation_steps == 0:
+                    progress_bar.update(1)
+                continue
+
            with accelerator.accumulate(unet):
                # Convert images to latent space

@@ -719,6 +764,12 @@ def main():
                progress_bar.update(1)
                global_step += 1

+                if global_step % args.checkpointing_steps == 0:
+                    if accelerator.is_main_process:
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
            progress_bar.set_postfix(**logs)
            accelerator.log(logs, step=global_step)
@@ -0,0 +1,17 @@
+## Diffusers examples with Intel optimizations
+
+**This research project is not actively maintained by the diffusers team. For any questions or comments, please make sure to tag @hshen14 .**
+
+This aims to provide diffusers examples with Intel optimizations such as Bfloat16 for training/fine-tuning acceleration and 8-bit integer (INT8) for inference acceleration on Intel platforms.
+
+## Accelerating the fine-tuning for textual inversion
+
+We accelereate the fine-tuning for textual inversion with Intel Extension for PyTorch. The [examples](textual_inversion) enable both single node and multi-node distributed training with Bfloat16 support on Intel Xeon Scalable Processor.
+
+## Accelerating the inference for Stable Diffusion using Bfloat16
+
+We start the inference acceleration with Bfloat16 using Intel Extension for PyTorch. The [script](inference_bf16.py) is generally designed to support standard Stable Diffusion models with Bfloat16 support.
+
+## Accelerating the inference for Stable Diffusion using INT8
+
+Coming soon ...
@@ -0,0 +1,49 @@
+import torch
+
+import intel_extension_for_pytorch as ipex
+from diffusers import StableDiffusionPipeline
+from PIL import Image
+
+
+def image_grid(imgs, rows, cols):
+    assert len(imgs) == rows * cols
+
+    w, h = imgs[0].size
+    grid = Image.new("RGB", size=(cols * w, rows * h))
+    grid_w, grid_h = grid.size
+
+    for i, img in enumerate(imgs):
+        grid.paste(img, box=(i % cols * w, i // cols * h))
+    return grid
+
+
+prompt = ["a lovely <dicoo> in red dress and hat, in the snowly and brightly night, with many brighly buildings"]
+batch_size = 8
+prompt = prompt * batch_size
+
+device = "cpu"
+model_id = "path-to-your-trained-model"
+model = StableDiffusionPipeline.from_pretrained(model_id)
+model = model.to(device)
+
+# to channels last
+model.unet = model.unet.to(memory_format=torch.channels_last)
+model.vae = model.vae.to(memory_format=torch.channels_last)
+model.text_encoder = model.text_encoder.to(memory_format=torch.channels_last)
+model.safety_checker = model.safety_checker.to(memory_format=torch.channels_last)
+
+# optimize with ipex
+model.unet = ipex.optimize(model.unet.eval(), dtype=torch.bfloat16, inplace=True)
+model.vae = ipex.optimize(model.vae.eval(), dtype=torch.bfloat16, inplace=True)
+model.text_encoder = ipex.optimize(model.text_encoder.eval(), dtype=torch.bfloat16, inplace=True)
+model.safety_checker = ipex.optimize(model.safety_checker.eval(), dtype=torch.bfloat16, inplace=True)
+
+# compute
+seed = 666
+generator = torch.Generator(device).manual_seed(seed)
+with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
+    images = model(prompt, guidance_scale=7.5, num_inference_steps=50, generator=generator).images
+
+    # save image
+    grid = image_grid(images, rows=2, cols=4)
+    grid.save(model_id + ".png")
@@ -0,0 +1,68 @@
+## Textual Inversion fine-tuning example
+
+[Textual inversion](https://arxiv.org/abs/2208.01618) is a method to personalize text2image models like stable diffusion on your own images using just 3-5 examples.
+The `textual_inversion.py` script shows how to implement the training procedure and adapt it for stable diffusion.
+
+## Training with Intel Extension for PyTorch
+
+Intel Extension for PyTorch provides the optimizations for faster training and inference on CPUs. You can leverage the training example "textual_inversion.py". Follow the [instructions](https://github.com/huggingface/diffusers/tree/main/examples/textual_inversion) to get the model and [dataset](https://huggingface.co/sd-concepts-library/dicoo2) before running the script.
+
+The example supports both single node and multi-node distributed training:
+
+### Single node training
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export DATA_DIR="path-to-dir-containing-dicoo-images"
+
+python textual_inversion.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --train_data_dir=$DATA_DIR \
+  --learnable_property="object" \
+  --placeholder_token="<dicoo>" --initializer_token="toy" \
+  --seed=7 \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=1 \
+  --max_train_steps=3000 \
+  --learning_rate=2.5e-03 --scale_lr \
+  --output_dir="textual_inversion_dicoo"
+```
+
+Note: Bfloat16 is available on Intel Xeon Scalable Processors Cooper Lake or Sapphire Rapids. You may not get performance speedup without Bfloat16 support.
+
+### Multi-node distributed training
+
+Before running the scripts, make sure to install the library's training dependencies successfully:
+
+```bash
+python -m pip install oneccl_bind_pt==1.13 -f https://developer.intel.com/ipex-whl-stable-cpu
+```
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export DATA_DIR="path-to-dir-containing-dicoo-images"
+
+oneccl_bindings_for_pytorch_path=$(python -c "from oneccl_bindings_for_pytorch import cwd; print(cwd)")
+source $oneccl_bindings_for_pytorch_path/env/setvars.sh
+
+python -m intel_extension_for_pytorch.cpu.launch --distributed \
+  --hostfile hostfile --nnodes 2 --nproc_per_node 2 textual_inversion.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --train_data_dir=$DATA_DIR \
+  --learnable_property="object" \
+  --placeholder_token="<dicoo>" --initializer_token="toy" \
+  --seed=7 \ 
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=1 \
+  --max_train_steps=750 \
+  --learning_rate=2.5e-03 --scale_lr \
+  --output_dir="textual_inversion_dicoo"
+```
+The above is a simple distributed training usage on 2 nodes with 2 processes on each node. Add the right hostname or ip address in the "hostfile" and make sure these 2 nodes are reachable from each other. For more details, please refer to the [user guide](https://github.com/intel/torch-ccl).
+
+
+### Reference
+
+We publish a [Medium blog](https://medium.com/intel-analytics-software/personalized-stable-diffusion-with-few-shot-fine-tuning-on-a-single-cpu-f01a3316b13) on how to create your own Stable Diffusion model on CPUs using textual inversion. Try it out now, if you have interests.
@@ -0,0 +1,7 @@
+accelerate
+torchvision
+transformers>=4.21.0
+ftfy
+tensorboard
+modelcards
+intel_extension_for_pytorch>=1.13
@@ -0,0 +1,645 @@
+import argparse
+import itertools
+import math
+import os
+import random
+from pathlib import Path
+from typing import Optional
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch.utils.data import Dataset
+
+import intel_extension_for_pytorch as ipex
+import PIL
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import set_seed
+from diffusers import AutoencoderKL, DDPMScheduler, PNDMScheduler, StableDiffusionPipeline, UNet2DConditionModel
+from diffusers.optimization import get_scheduler
+from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
+from diffusers.utils import check_min_version
+from huggingface_hub import HfFolder, Repository, whoami
+
+# TODO: remove and import from diffusers.utils when the new version of diffusers is released
+from packaging import version
+from PIL import Image
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+
+
+if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
+    PIL_INTERPOLATION = {
+        "linear": PIL.Image.Resampling.BILINEAR,
+        "bilinear": PIL.Image.Resampling.BILINEAR,
+        "bicubic": PIL.Image.Resampling.BICUBIC,
+        "lanczos": PIL.Image.Resampling.LANCZOS,
+        "nearest": PIL.Image.Resampling.NEAREST,
+    }
+else:
+    PIL_INTERPOLATION = {
+        "linear": PIL.Image.LINEAR,
+        "bilinear": PIL.Image.BILINEAR,
+        "bicubic": PIL.Image.BICUBIC,
+        "lanczos": PIL.Image.LANCZOS,
+        "nearest": PIL.Image.NEAREST,
+    }
+# ------------------------------------------------------------------------------
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.10.0.dev0")
+
+
+logger = get_logger(__name__)
+
+
+def save_progress(text_encoder, placeholder_token_id, accelerator, args, save_path):
+    logger.info("Saving embeddings")
+    learned_embeds = accelerator.unwrap_model(text_encoder).get_input_embeddings().weight[placeholder_token_id]
+    learned_embeds_dict = {args.placeholder_token: learned_embeds.detach().cpu()}
+    torch.save(learned_embeds_dict, save_path)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--save_steps",
+        type=int,
+        default=500,
+        help="Save learned_embeds.bin every X updates steps.",
+    )
+    parser.add_argument(
+        "--only_save_embeds",
+        action="store_true",
+        default=False,
+        help="Save only the embeddings for the new concept.",
+    )
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        type=str,
+        default=None,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--train_data_dir", type=str, default=None, required=True, help="A folder containing the training data."
+    )
+    parser.add_argument(
+        "--placeholder_token",
+        type=str,
+        default=None,
+        required=True,
+        help="A token to use as a placeholder for the concept.",
+    )
+    parser.add_argument(
+        "--initializer_token", type=str, default=None, required=True, help="A token to use as initializer word."
+    )
+    parser.add_argument("--learnable_property", type=str, default="object", help="Choose between 'object' and 'style'")
+    parser.add_argument("--repeats", type=int, default=100, help="How many times to repeat the training data.")
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="text-inversion-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution"
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=5000,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=True,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default="no",
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose"
+            "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
+            "and an Nvidia Ampere GPU."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.train_data_dir is None:
+        raise ValueError("You must specify a train data directory.")
+
+    return args
+
+
+imagenet_templates_small = [
+    "a photo of a {}",
+    "a rendering of a {}",
+    "a cropped photo of the {}",
+    "the photo of a {}",
+    "a photo of a clean {}",
+    "a photo of a dirty {}",
+    "a dark photo of the {}",
+    "a photo of my {}",
+    "a photo of the cool {}",
+    "a close-up photo of a {}",
+    "a bright photo of the {}",
+    "a cropped photo of a {}",
+    "a photo of the {}",
+    "a good photo of the {}",
+    "a photo of one {}",
+    "a close-up photo of the {}",
+    "a rendition of the {}",
+    "a photo of the clean {}",
+    "a rendition of a {}",
+    "a photo of a nice {}",
+    "a good photo of a {}",
+    "a photo of the nice {}",
+    "a photo of the small {}",
+    "a photo of the weird {}",
+    "a photo of the large {}",
+    "a photo of a cool {}",
+    "a photo of a small {}",
+]
+
+imagenet_style_templates_small = [
+    "a painting in the style of {}",
+    "a rendering in the style of {}",
+    "a cropped painting in the style of {}",
+    "the painting in the style of {}",
+    "a clean painting in the style of {}",
+    "a dirty painting in the style of {}",
+    "a dark painting in the style of {}",
+    "a picture in the style of {}",
+    "a cool painting in the style of {}",
+    "a close-up painting in the style of {}",
+    "a bright painting in the style of {}",
+    "a cropped painting in the style of {}",
+    "a good painting in the style of {}",
+    "a close-up painting in the style of {}",
+    "a rendition in the style of {}",
+    "a nice painting in the style of {}",
+    "a small painting in the style of {}",
+    "a weird painting in the style of {}",
+    "a large painting in the style of {}",
+]
+
+
+class TextualInversionDataset(Dataset):
+    def __init__(
+        self,
+        data_root,
+        tokenizer,
+        learnable_property="object",  # [object, style]
+        size=512,
+        repeats=100,
+        interpolation="bicubic",
+        flip_p=0.5,
+        set="train",
+        placeholder_token="*",
+        center_crop=False,
+    ):
+        self.data_root = data_root
+        self.tokenizer = tokenizer
+        self.learnable_property = learnable_property
+        self.size = size
+        self.placeholder_token = placeholder_token
+        self.center_crop = center_crop
+        self.flip_p = flip_p
+
+        self.image_paths = [os.path.join(self.data_root, file_path) for file_path in os.listdir(self.data_root)]
+
+        self.num_images = len(self.image_paths)
+        self._length = self.num_images
+
+        if set == "train":
+            self._length = self.num_images * repeats
+
+        self.interpolation = {
+            "linear": PIL_INTERPOLATION["linear"],
+            "bilinear": PIL_INTERPOLATION["bilinear"],
+            "bicubic": PIL_INTERPOLATION["bicubic"],
+            "lanczos": PIL_INTERPOLATION["lanczos"],
+        }[interpolation]
+
+        self.templates = imagenet_style_templates_small if learnable_property == "style" else imagenet_templates_small
+        self.flip_transform = transforms.RandomHorizontalFlip(p=self.flip_p)
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, i):
+        example = {}
+        image = Image.open(self.image_paths[i % self.num_images])
+
+        if not image.mode == "RGB":
+            image = image.convert("RGB")
+
+        placeholder_string = self.placeholder_token
+        text = random.choice(self.templates).format(placeholder_string)
+
+        example["input_ids"] = self.tokenizer(
+            text,
+            padding="max_length",
+            truncation=True,
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        ).input_ids[0]
+
+        # default to score-sde preprocessing
+        img = np.array(image).astype(np.uint8)
+
+        if self.center_crop:
+            crop = min(img.shape[0], img.shape[1])
+            (h, w,) = (
+                img.shape[0],
+                img.shape[1],
+            )
+            img = img[(h - crop) // 2 : (h + crop) // 2, (w - crop) // 2 : (w + crop) // 2]
+
+        image = Image.fromarray(img)
+        image = image.resize((self.size, self.size), resample=self.interpolation)
+
+        image = self.flip_transform(image)
+        image = np.array(image).astype(np.uint8)
+        image = (image / 127.5 - 1.0).astype(np.float32)
+
+        example["pixel_values"] = torch.from_numpy(image).permute(2, 0, 1)
+        return example
+
+
+def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
+    if token is None:
+        token = HfFolder.get_token()
+    if organization is None:
+        username = whoami(token)["name"]
+        return f"{username}/{model_id}"
+    else:
+        return f"{organization}/{model_id}"
+
+
+def freeze_params(params):
+    for param in params:
+        param.requires_grad = False
+
+
+def main():
+    args = parse_args()
+    logging_dir = os.path.join(args.output_dir, args.logging_dir)
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with="tensorboard",
+        logging_dir=logging_dir,
+    )
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.push_to_hub:
+            if args.hub_model_id is None:
+                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
+            else:
+                repo_name = args.hub_model_id
+            repo = Repository(args.output_dir, clone_from=repo_name)
+
+            with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
+                if "step_*" not in gitignore:
+                    gitignore.write("step_*\n")
+                if "epoch_*" not in gitignore:
+                    gitignore.write("epoch_*\n")
+        elif args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+    # Load the tokenizer and add the placeholder token as a additional special token
+    if args.tokenizer_name:
+        tokenizer = CLIPTokenizer.from_pretrained(args.tokenizer_name)
+    elif args.pretrained_model_name_or_path:
+        tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer")
+
+    # Add the placeholder token in tokenizer
+    num_added_tokens = tokenizer.add_tokens(args.placeholder_token)
+    if num_added_tokens == 0:
+        raise ValueError(
+            f"The tokenizer already contains the token {args.placeholder_token}. Please pass a different"
+            " `placeholder_token` that is not already in the tokenizer."
+        )
+
+    # Convert the initializer_token, placeholder_token to ids
+    token_ids = tokenizer.encode(args.initializer_token, add_special_tokens=False)
+    # Check if initializer_token is a single token or a sequence of tokens
+    if len(token_ids) > 1:
+        raise ValueError("The initializer token must be a single token.")
+
+    initializer_token_id = token_ids[0]
+    placeholder_token_id = tokenizer.convert_tokens_to_ids(args.placeholder_token)
+
+    # Load models and create wrapper for stable diffusion
+    text_encoder = CLIPTextModel.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="text_encoder",
+        revision=args.revision,
+    )
+    vae = AutoencoderKL.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="vae",
+        revision=args.revision,
+    )
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="unet",
+        revision=args.revision,
+    )
+
+    # Resize the token embeddings as we are adding new special tokens to the tokenizer
+    text_encoder.resize_token_embeddings(len(tokenizer))
+
+    # Initialise the newly added placeholder token with the embeddings of the initializer token
+    token_embeds = text_encoder.get_input_embeddings().weight.data
+    token_embeds[placeholder_token_id] = token_embeds[initializer_token_id]
+
+    # Freeze vae and unet
+    freeze_params(vae.parameters())
+    freeze_params(unet.parameters())
+    # Freeze all parameters except for the token embeddings in text encoder
+    params_to_freeze = itertools.chain(
+        text_encoder.text_model.encoder.parameters(),
+        text_encoder.text_model.final_layer_norm.parameters(),
+        text_encoder.text_model.embeddings.position_embedding.parameters(),
+    )
+    freeze_params(params_to_freeze)
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Initialize the optimizer
+    optimizer = torch.optim.AdamW(
+        text_encoder.get_input_embeddings().parameters(),  # only optimize the embeddings
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+
+    train_dataset = TextualInversionDataset(
+        data_root=args.train_data_dir,
+        tokenizer=tokenizer,
+        size=args.resolution,
+        placeholder_token=args.placeholder_token,
+        repeats=args.repeats,
+        learnable_property=args.learnable_property,
+        center_crop=args.center_crop,
+        set="train",
+    )
+    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=args.train_batch_size, shuffle=True)
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
+    )
+
+    text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        text_encoder, optimizer, train_dataloader, lr_scheduler
+    )
+
+    # Move vae and unet to device
+    vae.to(accelerator.device)
+    unet.to(accelerator.device)
+
+    # Keep vae and unet in eval model as we don't train these
+    vae.eval()
+    unet.eval()
+
+    unet = ipex.optimize(unet, dtype=torch.bfloat16, inplace=True)
+    vae = ipex.optimize(vae, dtype=torch.bfloat16, inplace=True)
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        accelerator.init_trackers("textual_inversion", config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    # Only show the progress bar once on each machine.
+    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
+    progress_bar.set_description("Steps")
+    global_step = 0
+
+    text_encoder.train()
+    text_encoder, optimizer = ipex.optimize(text_encoder, optimizer=optimizer, dtype=torch.bfloat16)
+
+    for epoch in range(args.num_train_epochs):
+        for step, batch in enumerate(train_dataloader):
+            with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
+                with accelerator.accumulate(text_encoder):
+                    # Convert images to latent space
+                    latents = vae.encode(batch["pixel_values"]).latent_dist.sample().detach()
+                    latents = latents * 0.18215
+
+                    # Sample noise that we'll add to the latents
+                    noise = torch.randn(latents.shape).to(latents.device)
+                    bsz = latents.shape[0]
+                    # Sample a random timestep for each image
+                    timesteps = torch.randint(
+                        0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device
+                    ).long()
+
+                    # Add noise to the latents according to the noise magnitude at each timestep
+                    # (this is the forward diffusion process)
+                    noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                    # Get the text embedding for conditioning
+                    encoder_hidden_states = text_encoder(batch["input_ids"])[0]
+
+                    # Predict the noise residual
+                    model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
+
+                    # Get the target for loss depending on the prediction type
+                    if noise_scheduler.config.prediction_type == "epsilon":
+                        target = noise
+                    elif noise_scheduler.config.prediction_type == "v_prediction":
+                        target = noise_scheduler.get_velocity(latents, noise, timesteps)
+                    else:
+                        raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                    loss = F.mse_loss(model_pred, target, reduction="none").mean([1, 2, 3]).mean()
+                    accelerator.backward(loss)
+
+                    # Zero out the gradients for all token embeddings except the newly added
+                    # embeddings for the concept, as we only want to optimize the concept embeddings
+                    if accelerator.num_processes > 1:
+                        grads = text_encoder.module.get_input_embeddings().weight.grad
+                    else:
+                        grads = text_encoder.get_input_embeddings().weight.grad
+                    # Get the index for tokens that we want to zero the grads for
+                    index_grads_to_zero = torch.arange(len(tokenizer)) != placeholder_token_id
+                    grads.data[index_grads_to_zero, :] = grads.data[index_grads_to_zero, :].fill_(0)
+
+                    optimizer.step()
+                    lr_scheduler.step()
+                    optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+                if global_step % args.save_steps == 0:
+                    save_path = os.path.join(args.output_dir, f"learned_embeds-steps-{global_step}.bin")
+                    save_progress(text_encoder, placeholder_token_id, accelerator, args, save_path)
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        accelerator.wait_for_everyone()
+
+    # Create the pipeline using using the trained modules and save it.
+    if accelerator.is_main_process:
+        if args.push_to_hub and args.only_save_embeds:
+            logger.warn("Enabling full model saving because --push_to_hub=True was specified.")
+            save_full_model = True
+        else:
+            save_full_model = not args.only_save_embeds
+        if save_full_model:
+            pipeline = StableDiffusionPipeline(
+                text_encoder=accelerator.unwrap_model(text_encoder),
+                vae=vae,
+                unet=unet,
+                tokenizer=tokenizer,
+                scheduler=PNDMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler"),
+                safety_checker=StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker"),
+                feature_extractor=CLIPFeatureExtractor.from_pretrained("openai/clip-vit-base-patch32"),
+            )
+            pipeline.save_pretrained(args.output_dir)
+        # Save the newly trained embeddings
+        save_path = os.path.join(args.output_dir, "learned_embeds.bin")
+        save_progress(text_encoder, placeholder_token_id, accelerator, args, save_path)
+
+        if args.push_to_hub:
+            repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True)
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    main()
@@ -160,3 +160,6 @@ python train_text_to_image_flax.py \
  --max_grad_norm=1 \
  --output_dir="sd-pokemon-model"
 ```
+
+### Training with xformers:
+You can enable memory efficient attention by [installing xFormers](https://github.com/facebookresearch/xformers#installing-xformers) and padding the `--enable_xformers_memory_efficient_attention` argument to the script. This is not available with the Flax/JAX implementation.
@@ -1,4 +1,5 @@
 import argparse
+import copy
 import logging
 import math
 import os
@@ -11,6 +12,9 @@ import torch
 import torch.nn.functional as F
 import torch.utils.checkpoint

+import datasets
+import diffusers
+import transformers
 from accelerate import Accelerator
 from accelerate.logging import get_logger
 from accelerate.utils import set_seed
@@ -28,7 +32,7 @@ from transformers import CLIPTextModel, CLIPTokenizer
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
 check_min_version("0.10.0.dev0")

-logger = get_logger(__name__)
+logger = get_logger(__name__, log_level="INFO")


 def parse_args():
@@ -171,7 +175,25 @@ def parse_args():
    parser.add_argument(
        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
    parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.")
+    parser.add_argument(
+        "--non_ema_revision",
+        type=str,
+        default=None,
+        required=False,
+        help=(
+            "Revision of pretrained non-ema model identifier. Must be a branch, tag or git identifier of the local or"
+            " remote repository specified with --pretrained_model_name_or_path."
+        ),
+    )
    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
@@ -210,12 +232,32 @@ def parse_args():
        type=str,
        default="tensorboard",
        help=(
-            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,'
-            ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.'
-            "Only applicable when `--with_tracking` is passed."
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
        ),
    )
    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )

    args = parser.parse_args()
    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
@@ -226,6 +268,10 @@ def parse_args():
    if args.dataset_name is None and args.train_data_dir is None:
        raise ValueError("Need either a dataset name or a training folder.")

+    # default to using the same revision for the non-ema model if not specified
+    if args.non_ema_revision is None:
+        args.non_ema_revision = args.revision
+
    return args


@@ -254,27 +300,24 @@ class EMAModel:
        parameters = list(parameters)
        self.shadow_params = [p.clone().detach() for p in parameters]

+        self.collected_params = None
+
        self.decay = decay
        self.optimization_step = 0

-    def get_decay(self, optimization_step):
-        """
-        Compute the decay factor for the exponential moving average.
-        """
-        value = (1 + optimization_step) / (10 + optimization_step)
-        return 1 - min(self.decay, value)
-
    @torch.no_grad()
    def step(self, parameters):
        parameters = list(parameters)

        self.optimization_step += 1
-        self.decay = self.get_decay(self.optimization_step)
+
+        # Compute the decay factor for the exponential moving average.
+        value = (1 + self.optimization_step) / (10 + self.optimization_step)
+        one_minus_decay = 1 - min(self.decay, value)

        for s_param, param in zip(self.shadow_params, parameters):
            if param.requires_grad:
-                tmp = self.decay * (s_param - param)
-                s_param.sub_(tmp)
+                s_param.sub_(one_minus_decay * (s_param - param))
            else:
                s_param.copy_(param)

@@ -306,6 +349,55 @@ class EMAModel:
            for p in self.shadow_params
        ]

+    def state_dict(self) -> dict:
+        r"""
+        Returns the state of the ExponentialMovingAverage as a dict.
+        This method is used by accelerate during checkpointing to save the ema state dict.
+        """
+        # Following PyTorch conventions, references to tensors are returned:
+        # "returns a reference to the state and not its copy!" -
+        # https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict
+        return {
+            "decay": self.decay,
+            "optimization_step": self.optimization_step,
+            "shadow_params": self.shadow_params,
+            "collected_params": self.collected_params,
+        }
+
+    def load_state_dict(self, state_dict: dict) -> None:
+        r"""
+        Loads the ExponentialMovingAverage state.
+        This method is used by accelerate during checkpointing to save the ema state dict.
+        Args:
+            state_dict (dict): EMA state. Should be an object returned
+                from a call to :meth:`state_dict`.
+        """
+        # deepcopy, to be consistent with module API
+        state_dict = copy.deepcopy(state_dict)
+
+        self.decay = state_dict["decay"]
+        if self.decay < 0.0 or self.decay > 1.0:
+            raise ValueError("Decay must be between 0 and 1")
+
+        self.optimization_step = state_dict["optimization_step"]
+        if not isinstance(self.optimization_step, int):
+            raise ValueError("Invalid optimization_step")
+
+        self.shadow_params = state_dict["shadow_params"]
+        if not isinstance(self.shadow_params, list):
+            raise ValueError("shadow_params must be a list")
+        if not all(isinstance(p, torch.Tensor) for p in self.shadow_params):
+            raise ValueError("shadow_params must all be Tensors")
+
+        self.collected_params = state_dict["collected_params"]
+        if self.collected_params is not None:
+            if not isinstance(self.collected_params, list):
+                raise ValueError("collected_params must be a list")
+            if not all(isinstance(p, torch.Tensor) for p in self.collected_params):
+                raise ValueError("collected_params must all be Tensors")
+            if len(self.collected_params) != len(self.shadow_params):
+                raise ValueError("collected_params and shadow_params must have the same length")
+

 def main():
    args = parse_args()
@@ -318,11 +410,21 @@ def main():
        logging_dir=logging_dir,
    )

+    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
@@ -345,42 +447,44 @@ def main():
        elif args.output_dir is not None:
            os.makedirs(args.output_dir, exist_ok=True)

-    # Load models and create wrapper for stable diffusion
+    # Load scheduler, tokenizer and models.
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
    tokenizer = CLIPTokenizer.from_pretrained(
        args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision
    )
    text_encoder = CLIPTextModel.from_pretrained(
-        args.pretrained_model_name_or_path,
-        subfolder="text_encoder",
-        revision=args.revision,
-    )
-    vae = AutoencoderKL.from_pretrained(
-        args.pretrained_model_name_or_path,
-        subfolder="vae",
-        revision=args.revision,
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
    )
+    vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision)
    unet = UNet2DConditionModel.from_pretrained(
-        args.pretrained_model_name_or_path,
-        subfolder="unet",
-        revision=args.revision,
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.non_ema_revision
    )

-    if is_xformers_available():
-        try:
-            unet.enable_xformers_memory_efficient_attention()
-        except Exception as e:
-            logger.warning(
-                "Could not enable memory efficient attention. Make sure xformers is installed"
-                f" correctly and a GPU is available: {e}"
-            )
-
    # Freeze vae and text_encoder
    vae.requires_grad_(False)
    text_encoder.requires_grad_(False)

+    # Create EMA for the unet.
+    if args.use_ema:
+        ema_unet = UNet2DConditionModel.from_pretrained(
+            args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
+        )
+        ema_unet = EMAModel(ema_unet.parameters())
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
    if args.gradient_checkpointing:
        unet.enable_gradient_checkpointing()

+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
    if args.scale_lr:
        args.learning_rate = (
            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
@@ -406,7 +510,6 @@ def main():
        weight_decay=args.adam_weight_decay,
        eps=args.adam_epsilon,
    )
-    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")

    # Get the datasets: you can either provide your own training and evaluation files (see below)
    # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
@@ -469,13 +572,15 @@ def main():
                raise ValueError(
                    f"Caption column `{caption_column}` should contain either strings or lists of strings."
                )
-        inputs = tokenizer(captions, max_length=tokenizer.model_max_length, padding="do_not_pad", truncation=True)
-        input_ids = inputs.input_ids
-        return input_ids
+        inputs = tokenizer(
+            captions, max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt"
+        )
+        return inputs.input_ids

+    # Preprocessing the datasets.
    train_transforms = transforms.Compose(
        [
-            transforms.Resize((args.resolution, args.resolution), interpolation=transforms.InterpolationMode.BILINEAR),
+            transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
            transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution),
            transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x),
            transforms.ToTensor(),
@@ -487,7 +592,6 @@ def main():
        images = [image.convert("RGB") for image in examples[image_column]]
        examples["pixel_values"] = [train_transforms(image) for image in images]
        examples["input_ids"] = tokenize_captions(examples)
-
        return examples

    with accelerator.main_process_first():
@@ -499,14 +603,10 @@ def main():
    def collate_fn(examples):
        pixel_values = torch.stack([example["pixel_values"] for example in examples])
        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
-        input_ids = [example["input_ids"] for example in examples]
-        padded_tokens = tokenizer.pad({"input_ids": input_ids}, padding=True, return_tensors="pt")
-        return {
-            "pixel_values": pixel_values,
-            "input_ids": padded_tokens.input_ids,
-            "attention_mask": padded_tokens.attention_mask,
-        }
+        input_ids = torch.stack([example["input_ids"] for example in examples])
+        return {"pixel_values": pixel_values, "input_ids": input_ids}

+    # DataLoaders creation:
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, shuffle=True, collate_fn=collate_fn, batch_size=args.train_batch_size
    )
@@ -525,25 +625,26 @@ def main():
        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
    )

+    # Prepare everything with our `accelerator`.
    unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
        unet, optimizer, train_dataloader, lr_scheduler
    )
+    if args.use_ema:
+        accelerator.register_for_checkpointing(ema_unet)

+    # For mixed precision training we cast the text_encoder and vae weights to half-precision
+    # as these models are only used for inference, keeping weights in full precision is not required.
    weight_dtype = torch.float32
    if accelerator.mixed_precision == "fp16":
        weight_dtype = torch.float16
    elif accelerator.mixed_precision == "bf16":
        weight_dtype = torch.bfloat16

-    # Move text_encode and vae to gpu.
-    # For mixed precision training we cast the text_encoder and vae weights to half-precision
-    # as these models are only used for inference, keeping weights in full precision is not required.
+    # Move text_encode and vae to gpu and cast to weight_dtype
    text_encoder.to(accelerator.device, dtype=weight_dtype)
    vae.to(accelerator.device, dtype=weight_dtype)
-
-    # Create EMA for the unet.
    if args.use_ema:
-        ema_unet = EMAModel(unet.parameters())
+        ema_unet.to(accelerator.device)

    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
@@ -567,16 +668,41 @@ def main():
    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1]
+        accelerator.print(f"Resuming from checkpoint {path}")
+        accelerator.load_state(os.path.join(args.output_dir, path))
+        global_step = int(path.split("-")[1])
+
+        resume_global_step = global_step * args.gradient_accumulation_steps
+        first_epoch = resume_global_step // num_update_steps_per_epoch
+        resume_step = resume_global_step % num_update_steps_per_epoch

    # Only show the progress bar once on each machine.
-    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
+    progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)
    progress_bar.set_description("Steps")
-    global_step = 0

-    for epoch in range(args.num_train_epochs):
+    for epoch in range(first_epoch, args.num_train_epochs):
        unet.train()
        train_loss = 0.0
        for step, batch in enumerate(train_dataloader):
+            # Skip steps until we reach the resumed step
+            if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step:
+                if step % args.gradient_accumulation_steps == 0:
+                    progress_bar.update(1)
+                continue
+
            with accelerator.accumulate(unet):
                # Convert images to latent space
                latents = vae.encode(batch["pixel_values"].to(weight_dtype)).latent_dist.sample()
@@ -629,6 +755,12 @@ def main():
                accelerator.log({"train_loss": train_loss}, step=global_step)
                train_loss = 0.0

+                if global_step % args.checkpointing_steps == 0:
+                    if accelerator.is_main_process:
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
            logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
            progress_bar.set_postfix(**logs)

@@ -182,9 +182,8 @@ def parse_args():
        type=str,
        default="tensorboard",
        help=(
-            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,'
-            ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.'
-            "Only applicable when `--with_tracking` is passed."
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
        ),
    )
    parser.add_argument(
@@ -333,7 +332,7 @@ def main():

    train_transforms = transforms.Compose(
        [
-            transforms.Resize((args.resolution, args.resolution), interpolation=transforms.InterpolationMode.BILINEAR),
+            transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
            transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution),
            transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x),
            transforms.ToTensor(),
@@ -417,6 +416,7 @@ def main():
    noise_scheduler = FlaxDDPMScheduler(
        beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000
    )
+    noise_scheduler_state = noise_scheduler.create_state()

    # Initialize our training
    rng = jax.random.PRNGKey(args.seed)
@@ -449,7 +449,7 @@ def main():

            # Add noise to the latents according to the noise magnitude at each timestep
            # (this is the forward diffusion process)
-            noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+            noisy_latents = noise_scheduler.add_noise(noise_scheduler_state, latents, noise, timesteps)

            # Get the text embedding for conditioning
            encoder_hidden_states = text_encoder(
@@ -124,3 +124,6 @@ python textual_inversion_flax.py \
  --output_dir="textual_inversion_cat"
 ```
 It should be at least 70% faster than the PyTorch script with the same configuration.
+
+### Training with xformers:
+You can enable memory efficient attention by [installing xFormers](https://github.com/facebookresearch/xformers#installing-xformers) and padding the `--enable_xformers_memory_efficient_attention` argument to the script. This is not available with the Flax/JAX implementation.
@@ -1,5 +1,5 @@
 import argparse
-import itertools
+import logging
 import math
 import os
 import random
@@ -12,13 +12,15 @@ import torch.nn.functional as F
 import torch.utils.checkpoint
 from torch.utils.data import Dataset

+import datasets
+import diffusers
 import PIL
+import transformers
 from accelerate import Accelerator
 from accelerate.logging import get_logger
 from accelerate.utils import set_seed
-from diffusers import AutoencoderKL, DDPMScheduler, PNDMScheduler, StableDiffusionPipeline, UNet2DConditionModel
+from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel
 from diffusers.optimization import get_scheduler
-from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
 from diffusers.utils import check_min_version
 from diffusers.utils.import_utils import is_xformers_available
 from huggingface_hub import HfFolder, Repository, whoami
@@ -28,7 +30,7 @@ from packaging import version
 from PIL import Image
 from torchvision import transforms
 from tqdm.auto import tqdm
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPTextModel, CLIPTokenizer


 if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
@@ -148,6 +150,11 @@ def parse_args():
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
    parser.add_argument(
        "--learning_rate",
        type=float,
@@ -204,7 +211,45 @@ def parse_args():
            "and an Nvidia Ampere GPU."
        ),
    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )

    args = parser.parse_args()
    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
@@ -336,7 +381,7 @@ class TextualInversionDataset(Dataset):

        if self.center_crop:
            crop = min(img.shape[0], img.shape[1])
-            h, w, = (
+            (h, w,) = (
                img.shape[0],
                img.shape[1],
            )
@@ -363,11 +408,6 @@ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token:
        return f"{organization}/{model_id}"


-def freeze_params(params):
-    for param in params:
-        param.requires_grad = False
-
-
 def main():
    args = parse_args()
    logging_dir = os.path.join(args.output_dir, args.logging_dir)
@@ -375,10 +415,26 @@ def main():
    accelerator = Accelerator(
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        mixed_precision=args.mixed_precision,
-        log_with="tensorboard",
+        log_with=args.report_to,
        logging_dir=logging_dir,
    )

+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)
@@ -400,12 +456,22 @@ def main():
        elif args.output_dir is not None:
            os.makedirs(args.output_dir, exist_ok=True)

-    # Load the tokenizer and add the placeholder token as a additional special token
+    # Load tokenizer
    if args.tokenizer_name:
        tokenizer = CLIPTokenizer.from_pretrained(args.tokenizer_name)
    elif args.pretrained_model_name_or_path:
        tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer")

+    # Load scheduler and models
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    text_encoder = CLIPTextModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
+    )
+    vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision)
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
+    )
+
    # Add the placeholder token in tokenizer
    num_added_tokens = tokenizer.add_tokens(args.placeholder_token)
    if num_added_tokens == 0:
@@ -423,32 +489,6 @@ def main():
    initializer_token_id = token_ids[0]
    placeholder_token_id = tokenizer.convert_tokens_to_ids(args.placeholder_token)

-    # Load models and create wrapper for stable diffusion
-    text_encoder = CLIPTextModel.from_pretrained(
-        args.pretrained_model_name_or_path,
-        subfolder="text_encoder",
-        revision=args.revision,
-    )
-    vae = AutoencoderKL.from_pretrained(
-        args.pretrained_model_name_or_path,
-        subfolder="vae",
-        revision=args.revision,
-    )
-    unet = UNet2DConditionModel.from_pretrained(
-        args.pretrained_model_name_or_path,
-        subfolder="unet",
-        revision=args.revision,
-    )
-
-    if is_xformers_available():
-        try:
-            unet.enable_xformers_memory_efficient_attention()
-        except Exception as e:
-            logger.warning(
-                "Could not enable memory efficient attention. Make sure xformers is installed"
-                f" correctly and a GPU is available: {e}"
-            )
-
    # Resize the token embeddings as we are adding new special tokens to the tokenizer
    text_encoder.resize_token_embeddings(len(tokenizer))

@@ -457,15 +497,30 @@ def main():
    token_embeds[placeholder_token_id] = token_embeds[initializer_token_id]

    # Freeze vae and unet
-    freeze_params(vae.parameters())
-    freeze_params(unet.parameters())
+    vae.requires_grad_(False)
+    unet.requires_grad_(False)
    # Freeze all parameters except for the token embeddings in text encoder
-    params_to_freeze = itertools.chain(
-        text_encoder.text_model.encoder.parameters(),
-        text_encoder.text_model.final_layer_norm.parameters(),
-        text_encoder.text_model.embeddings.position_embedding.parameters(),
-    )
-    freeze_params(params_to_freeze)
+    text_encoder.text_model.encoder.requires_grad_(False)
+    text_encoder.text_model.final_layer_norm.requires_grad_(False)
+    text_encoder.text_model.embeddings.position_embedding.requires_grad_(False)
+
+    if args.gradient_checkpointing:
+        # Keep unet in train mode if we are using gradient checkpointing to save memory.
+        # The dropout cannot be != 0 so it doesn't matter if we are in eval or train mode.
+        unet.train()
+        text_encoder.gradient_checkpointing_enable()
+        unet.enable_gradient_checkpointing()
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True

    if args.scale_lr:
        args.learning_rate = (
@@ -481,8 +536,7 @@ def main():
        eps=args.adam_epsilon,
    )

-    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
-
+    # Dataset and DataLoaders creation:
    train_dataset = TextualInversionDataset(
        data_root=args.train_data_dir,
        tokenizer=tokenizer,
@@ -509,17 +563,22 @@ def main():
        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
    )

+    # Prepare everything with our `accelerator`.
    text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
        text_encoder, optimizer, train_dataloader, lr_scheduler
    )

-    # Move vae and unet to device
-    vae.to(accelerator.device)
-    unet.to(accelerator.device)
+    # For mixed precision training we cast the text_encoder and vae weights to half-precision
+    # as these models are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16

-    # Keep vae and unet in eval model as we don't train these
-    vae.eval()
-    unet.eval()
+    # Move vae and unet to device and cast to weight_dtype
+    unet.to(accelerator.device, dtype=weight_dtype)
+    vae.to(accelerator.device, dtype=weight_dtype)

    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
@@ -543,36 +602,61 @@ def main():
    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")
-    # Only show the progress bar once on each machine.
-    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
-    progress_bar.set_description("Steps")
    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1]
+        accelerator.print(f"Resuming from checkpoint {path}")
+        accelerator.load_state(os.path.join(args.output_dir, path))
+        global_step = int(path.split("-")[1])
+
+        resume_global_step = global_step * args.gradient_accumulation_steps
+        first_epoch = resume_global_step // num_update_steps_per_epoch
+        resume_step = resume_global_step % num_update_steps_per_epoch
+
+    # Only show the progress bar once on each machine.
+    progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)
+    progress_bar.set_description("Steps")

    # keep original embeddings as reference
-    orig_embeds_params = text_encoder.get_input_embeddings().weight.data.clone()
+    orig_embeds_params = accelerator.unwrap_model(text_encoder).get_input_embeddings().weight.data.clone()

-    for epoch in range(args.num_train_epochs):
+    for epoch in range(first_epoch, args.num_train_epochs):
        text_encoder.train()
        for step, batch in enumerate(train_dataloader):
+            # Skip steps until we reach the resumed step
+            if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step:
+                if step % args.gradient_accumulation_steps == 0:
+                    progress_bar.update(1)
+                continue
+
            with accelerator.accumulate(text_encoder):
                # Convert images to latent space
-                latents = vae.encode(batch["pixel_values"]).latent_dist.sample().detach()
+                latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample().detach()
                latents = latents * 0.18215

                # Sample noise that we'll add to the latents
-                noise = torch.randn(latents.shape).to(latents.device)
+                noise = torch.randn_like(latents)
                bsz = latents.shape[0]
                # Sample a random timestep for each image
-                timesteps = torch.randint(
-                    0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device
-                ).long()
+                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
+                timesteps = timesteps.long()

                # Add noise to the latents according to the noise magnitude at each timestep
                # (this is the forward diffusion process)
                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

                # Get the text embedding for conditioning
-                encoder_hidden_states = text_encoder(batch["input_ids"])[0]
+                encoder_hidden_states = text_encoder(batch["input_ids"])[0].to(dtype=weight_dtype)

                # Predict the noise residual
                model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
@@ -585,7 +669,8 @@ def main():
                else:
                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")

-                loss = F.mse_loss(model_pred, target, reduction="none").mean([1, 2, 3]).mean()
+                loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+
                accelerator.backward(loss)

                optimizer.step()
@@ -595,7 +680,9 @@ def main():
                # Let's make sure we don't update any embedding weights besides the newly added token
                index_no_updates = torch.arange(len(tokenizer)) != placeholder_token_id
                with torch.no_grad():
-                    text_encoder.get_input_embeddings().weight[index_no_updates] = orig_embeds_params[index_no_updates]
+                    accelerator.unwrap_model(text_encoder).get_input_embeddings().weight[
+                        index_no_updates
+                    ] = orig_embeds_params[index_no_updates]

            # Checks if the accelerator has performed an optimization step behind the scenes
            if accelerator.sync_gradients:
@@ -605,6 +692,12 @@ def main():
                    save_path = os.path.join(args.output_dir, f"learned_embeds-steps-{global_step}.bin")
                    save_progress(text_encoder, placeholder_token_id, accelerator, args, save_path)

+                if global_step % args.checkpointing_steps == 0:
+                    if accelerator.is_main_process:
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
            progress_bar.set_postfix(**logs)
            accelerator.log(logs, step=global_step)
@@ -612,9 +705,8 @@ def main():
            if global_step >= args.max_train_steps:
                break

-        accelerator.wait_for_everyone()
-
    # Create the pipeline using using the trained modules and save it.
+    accelerator.wait_for_everyone()
    if accelerator.is_main_process:
        if args.push_to_hub and args.only_save_embeds:
            logger.warn("Enabling full model saving because --push_to_hub=True was specified.")
@@ -622,14 +714,12 @@ def main():
        else:
            save_full_model = not args.only_save_embeds
        if save_full_model:
-            pipeline = StableDiffusionPipeline(
+            pipeline = StableDiffusionPipeline.from_pretrained(
+                args.pretrained_model_name_or_path,
                text_encoder=accelerator.unwrap_model(text_encoder),
                vae=vae,
                unet=unet,
                tokenizer=tokenizer,
-                scheduler=PNDMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler"),
-                safety_checker=StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker"),
-                feature_extractor=CLIPFeatureExtractor.from_pretrained("openai/clip-vit-base-patch32"),
            )
            pipeline.save_pretrained(args.output_dir)
        # Save the newly trained embeddings
@@ -306,7 +306,7 @@ class TextualInversionDataset(Dataset):

        if self.center_crop:
            crop = min(img.shape[0], img.shape[1])
-            h, w, = (
+            (h, w,) = (
                img.shape[0],
                img.shape[1],
            )
@@ -505,6 +505,7 @@ def main():
    noise_scheduler = FlaxDDPMScheduler(
        beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000
    )
+    noise_scheduler_state = noise_scheduler.create_state()

    # Initialize our training
    train_rngs = jax.random.split(rng, jax.local_device_count())
@@ -531,7 +532,7 @@ def main():
                0,
                noise_scheduler.config.num_train_timesteps,
            )
-            noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+            noisy_latents = noise_scheduler.add_noise(noise_scheduler_state, latents, noise, timesteps)
            encoder_hidden_states = state.apply_fn(
                batch["input_ids"], params=params, dropout_rng=dropout_rng, train=True
            )[0]
@@ -173,6 +173,16 @@ def parse_args():
    parser.add_argument(
        "--hub_private_repo", action="store_true", help="Whether or not to create a private repository."
    )
+    parser.add_argument(
+        "--logger",
+        type=str,
+        default="tensorboard",
+        choices=["tensorboard", "wandb"],
+        help=(
+            "Whether to use [tensorboard](https://www.tensorflow.org/tensorboard) or [wandb](https://www.wandb.ai)"
+            " for experiment tracking and logging of model metrics and model checkpoints"
+        ),
+    )
    parser.add_argument(
        "--logging_dir",
        type=str,
@@ -194,7 +204,6 @@ def parse_args():
            "and an Nvidia Ampere GPU."
        ),
    )
-
    parser.add_argument(
        "--prediction_type",
        type=str,
@@ -202,9 +211,26 @@ def parse_args():
        choices=["epsilon", "sample"],
        help="Whether the model should predict the 'epsilon'/noise error or directly the reconstructed image 'x0'.",
    )
-
    parser.add_argument("--ddpm_num_steps", type=int, default=1000)
    parser.add_argument("--ddpm_beta_schedule", type=str, default="linear")
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )

    args = parser.parse_args()
    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
@@ -232,7 +258,7 @@ def main(args):
    accelerator = Accelerator(
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        mixed_precision=args.mixed_precision,
-        log_with="tensorboard",
+        log_with=args.logger,
        logging_dir=logging_dir,
    )

@@ -319,6 +345,7 @@ def main(args):
    model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, lr_scheduler
    )
+    accelerator.register_for_checkpointing(lr_scheduler)

    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)

@@ -351,11 +378,36 @@ def main(args):
        accelerator.init_trackers(run)

    global_step = 0
-    for epoch in range(args.num_epochs):
+    first_epoch = 0
+
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1]
+        accelerator.print(f"Resuming from checkpoint {path}")
+        accelerator.load_state(os.path.join(args.output_dir, path))
+        global_step = int(path.split("-")[1])
+
+        resume_global_step = global_step * args.gradient_accumulation_steps
+        first_epoch = resume_global_step // num_update_steps_per_epoch
+        resume_step = resume_global_step % num_update_steps_per_epoch
+
+    for epoch in range(first_epoch, args.num_epochs):
        model.train()
        progress_bar = tqdm(total=num_update_steps_per_epoch, disable=not accelerator.is_local_main_process)
        progress_bar.set_description(f"Epoch {epoch}")
        for step, batch in enumerate(train_dataloader):
+            # Skip steps until we reach the resumed step
+            if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step:
+                if step % args.gradient_accumulation_steps == 0:
+                    progress_bar.update(1)
+                continue
+
            clean_images = batch["input"]
            # Sample noise that we'll add to the images
            noise = torch.randn(clean_images.shape).to(clean_images.device)
@@ -402,6 +454,12 @@ def main(args):
                progress_bar.update(1)
                global_step += 1

+                if global_step % args.checkpointing_steps == 0:
+                    if accelerator.is_main_process:
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0], "step": global_step}
            if args.use_ema:
                logs["ema_decay"] = ema_model.decay
@@ -429,9 +487,11 @@ def main(args):

                # denormalize the images and save to tensorboard
                images_processed = (images * 255).round().astype("uint8")
-                accelerator.trackers[0].writer.add_images(
-                    "test_samples", images_processed.transpose(0, 3, 1, 2), epoch
-                )
+
+                if args.logger == "tensorboard":
+                    accelerator.get_tracker("tensorboard").add_images(
+                        "test_samples", images_processed.transpose(0, 3, 1, 2), epoch
+                    )

            if epoch % args.save_model_epochs == 0 or epoch == args.num_epochs - 1:
                # save the model
@@ -174,6 +174,16 @@ def parse_args():
    parser.add_argument(
        "--hub_private_repo", action="store_true", help="Whether or not to create a private repository."
    )
+    parser.add_argument(
+        "--logger",
+        type=str,
+        default="tensorboard",
+        choices=["tensorboard", "wandb"],
+        help=(
+            "Whether to use [tensorboard](https://www.tensorflow.org/tensorboard) or [wandb](https://www.wandb.ai)"
+            " for experiment tracking and logging of model metrics and model checkpoints"
+        ),
+    )
    parser.add_argument(
        "--logging_dir",
        type=str,
@@ -195,7 +205,6 @@ def parse_args():
            "and an Nvidia Ampere GPU."
        ),
    )
-
    parser.add_argument(
        "--prediction_type",
        type=str,
@@ -206,6 +215,24 @@ def parse_args():

    parser.add_argument("--ddpm_num_steps", type=int, default=1000)
    parser.add_argument("--ddpm_beta_schedule", type=str, default="linear")
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )

    args = parser.parse_args()
    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
@@ -233,7 +260,7 @@ def main(args):
    accelerator = Accelerator(
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        mixed_precision=args.mixed_precision,
-        log_with="tensorboard",
+        log_with=args.logger,
        logging_dir=logging_dir,
    )

@@ -260,7 +287,6 @@ def main(args):
            "UpBlock2D",
        ),
    )
-    model = ORTModule(model)
    accepts_prediction_type = "prediction_type" in set(inspect.signature(DDPMScheduler.__init__).parameters.keys())

    if accepts_prediction_type:
@@ -321,6 +347,7 @@ def main(args):
    model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, lr_scheduler
    )
+    accelerator.register_for_checkpointing(lr_scheduler)

    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)

@@ -331,6 +358,8 @@ def main(args):
        max_value=args.ema_max_decay,
    )

+    model = ORTModule(model)
+
    # Handle the repository creation
    if accelerator.is_main_process:
        if args.push_to_hub:
@@ -353,11 +382,34 @@ def main(args):
        accelerator.init_trackers(run)

    global_step = 0
-    for epoch in range(args.num_epochs):
+    first_epoch = 0
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1]
+        accelerator.print(f"Resuming from checkpoint {path}")
+        accelerator.load_state(os.path.join(args.output_dir, path))
+        global_step = int(path.split("-")[1])
+        resume_global_step = global_step * args.gradient_accumulation_steps
+        first_epoch = resume_global_step // num_update_steps_per_epoch
+        resume_step = resume_global_step % num_update_steps_per_epoch
+
+    for epoch in range(first_epoch, args.num_epochs):
        model.train()
        progress_bar = tqdm(total=num_update_steps_per_epoch, disable=not accelerator.is_local_main_process)
        progress_bar.set_description(f"Epoch {epoch}")
        for step, batch in enumerate(train_dataloader):
+            # Skip steps until we reach the resumed step
+            if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step:
+                if step % args.gradient_accumulation_steps == 0:
+                    progress_bar.update(1)
+                continue
+
            clean_images = batch["input"]
            # Sample noise that we'll add to the images
            noise = torch.randn(clean_images.shape).to(clean_images.device)
@@ -373,7 +425,7 @@ def main(args):

            with accelerator.accumulate(model):
                # Predict the noise residual
-                model_output = model(noisy_images, timesteps, return_dict=True)[0]
+                model_output = model(noisy_images, timesteps, return_dict=False)[0]

                if args.prediction_type == "epsilon":
                    loss = F.mse_loss(model_output, noise)  # this could have different weights!
@@ -404,6 +456,12 @@ def main(args):
                progress_bar.update(1)
                global_step += 1

+                if global_step % args.checkpointing_steps == 0:
+                    if accelerator.is_main_process:
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0], "step": global_step}
            if args.use_ema:
                logs["ema_decay"] = ema_model.decay
@@ -431,9 +489,11 @@ def main(args):

                # denormalize the images and save to tensorboard
                images_processed = (images * 255).round().astype("uint8")
-                accelerator.trackers[0].writer.add_images(
-                    "test_samples", images_processed.transpose(0, 3, 1, 2), epoch
-                )
+
+                if args.logger == "tensorboard":
+                    accelerator.get_tracker("tensorboard").add_images(
+                        "test_samples", images_processed.transpose(0, 3, 1, 2), epoch
+                    )

            if epoch % args.save_model_epochs == 0 or epoch == args.num_epochs - 1:
                # save the model
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Patrick von Platen	6530f8d592	up	2023-01-03 16:49:38 +01:00
Patrick von Platen	6cc1f9137e	up	2023-01-03 16:46:13 +01:00