Compare commits
37 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| c83683fdd1 | |||
| 6d6d19d7fc | |||
| baab065679 | |||
| 509741aea7 | |||
| e1df77ee1e | |||
| fdb1baa05c | |||
| 6529ee67ec | |||
| df2bc5ef28 | |||
| a7bf77fc28 | |||
| 0f0defdb65 | |||
| 19df9f3ec0 | |||
| d6ca120987 | |||
| fb7ae0184f | |||
| 70f8d4b488 | |||
| 6c60e430ee | |||
| 1221b28eac | |||
| 746f603b20 | |||
| 2afea72d29 | |||
| 0f111ab794 | |||
| 4dd7aaa06f | |||
| d27e996ccd | |||
| 72780ff5b1 | |||
| 69fdb8720f | |||
| b2140a895b | |||
| e0e8c58f64 | |||
| cbea5d1725 | |||
| a1245c2c61 | |||
| cdda94f412 | |||
| 5b830aa356 | |||
| 9e7bae9881 | |||
| b41ce1e090 | |||
| 95d3748453 | |||
| 44aa9e566d | |||
| fdb05f54ef | |||
| 98ba18ba55 | |||
| 5bb38586a9 | |||
| ec9e88139a |
@@ -39,7 +39,7 @@ jobs:
|
||||
python utils/print_env.py
|
||||
- name: Diffusers Benchmarking
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.DIFFUSERS_BOT_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.DIFFUSERS_BOT_TOKEN }}
|
||||
BASE_PATH: benchmark_outputs
|
||||
run: |
|
||||
export TOTAL_GPU_MEMORY=$(python -c "import torch; print(torch.cuda.get_device_properties(0).total_memory / (1024**3))")
|
||||
|
||||
@@ -90,24 +90,11 @@ jobs:
|
||||
|
||||
- name: Post to a Slack channel
|
||||
id: slack
|
||||
uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
|
||||
uses: huggingface/hf-workflows/.github/actions/post-slack@main
|
||||
with:
|
||||
# Slack channel id, channel name, or user id to post message.
|
||||
# See also: https://api.slack.com/methods/chat.postMessage#channels
|
||||
channel-id: ${{ env.CI_SLACK_CHANNEL }}
|
||||
# For posting a rich message using Block Kit
|
||||
payload: |
|
||||
{
|
||||
"text": "${{ matrix.image-name }} Docker Image build result: ${{ job.status }}\n${{ github.event.head_commit.url }}",
|
||||
"blocks": [
|
||||
{
|
||||
"type": "section",
|
||||
"text": {
|
||||
"type": "mrkdwn",
|
||||
"text": "${{ matrix.image-name }} Docker Image build result: ${{ job.status }}\n${{ github.event.head_commit.url }}"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
slack_channel: ${{ env.CI_SLACK_CHANNEL }}
|
||||
title: "🤗 Results of the ${{ matrix.image-name }} Docker Image build"
|
||||
status: ${{ job.status }}
|
||||
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
@@ -81,7 +81,7 @@ jobs:
|
||||
|
||||
- name: Nightly PyTorch CUDA checkpoint (pipelines) tests
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
@@ -141,7 +141,7 @@ jobs:
|
||||
- name: Run nightly PyTorch CUDA tests for non-pipeline modules
|
||||
if: ${{ matrix.module != 'examples'}}
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
@@ -154,7 +154,7 @@ jobs:
|
||||
- name: Run nightly example tests with Torch
|
||||
if: ${{ matrix.module == 'examples' }}
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
@@ -211,7 +211,7 @@ jobs:
|
||||
|
||||
- name: Run nightly LoRA tests with PEFT and Torch
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
@@ -269,7 +269,7 @@ jobs:
|
||||
|
||||
- name: Run nightly Flax TPU tests
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
run: |
|
||||
python -m pytest -n 0 \
|
||||
-s -v -k "Flax" \
|
||||
@@ -324,7 +324,7 @@ jobs:
|
||||
|
||||
- name: Run nightly ONNXRuntime CUDA tests
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
run: |
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "Onnx" \
|
||||
@@ -390,7 +390,7 @@ jobs:
|
||||
shell: arch -arch arm64 bash {0}
|
||||
env:
|
||||
HF_HOME: /System/Volumes/Data/mnt/cache
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
run: |
|
||||
${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps \
|
||||
--report-log=tests_torch_mps.log \
|
||||
|
||||
@@ -156,7 +156,7 @@ jobs:
|
||||
if: ${{ matrix.config.framework == 'pytorch_examples' }}
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install peft
|
||||
python -m uv pip install peft timm
|
||||
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
examples
|
||||
|
||||
@@ -87,7 +87,7 @@ jobs:
|
||||
python utils/print_env.py
|
||||
- name: Slow PyTorch CUDA checkpoint tests on Ubuntu
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
@@ -144,7 +144,7 @@ jobs:
|
||||
|
||||
- name: Run slow PyTorch CUDA tests
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
@@ -194,7 +194,7 @@ jobs:
|
||||
|
||||
- name: Run slow PEFT CUDA tests
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
@@ -243,7 +243,7 @@ jobs:
|
||||
|
||||
- name: Run slow Flax TPU tests
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
run: |
|
||||
python -m pytest -n 0 \
|
||||
-s -v -k "Flax" \
|
||||
@@ -290,7 +290,7 @@ jobs:
|
||||
|
||||
- name: Run slow ONNXRuntime CUDA tests
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
run: |
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "Onnx" \
|
||||
@@ -337,7 +337,7 @@ jobs:
|
||||
python utils/print_env.py
|
||||
- name: Run example tests on GPU
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
run: |
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
|
||||
- name: Failure short reports
|
||||
@@ -378,7 +378,7 @@ jobs:
|
||||
python utils/print_env.py
|
||||
- name: Run example tests on GPU
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
run: |
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "xformers" --make-reports=tests_torch_xformers_cuda tests/
|
||||
- name: Failure short reports
|
||||
@@ -423,9 +423,10 @@ jobs:
|
||||
|
||||
- name: Run example tests on GPU
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install timm
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
|
||||
|
||||
- name: Failure short reports
|
||||
|
||||
@@ -107,7 +107,7 @@ jobs:
|
||||
if: ${{ matrix.config.framework == 'pytorch_examples' }}
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install peft
|
||||
python -m uv pip install peft timm
|
||||
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
examples
|
||||
|
||||
@@ -23,7 +23,7 @@ concurrency:
|
||||
jobs:
|
||||
run_fast_tests_apple_m1:
|
||||
name: Fast PyTorch MPS tests on MacOS
|
||||
runs-on: [ self-hosted, apple-m1 ]
|
||||
runs-on: macos-13-xlarge
|
||||
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
@@ -59,7 +59,7 @@ jobs:
|
||||
shell: arch -arch arm64 bash {0}
|
||||
env:
|
||||
HF_HOME: /System/Volumes/Data/mnt/cache
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
run: |
|
||||
${CONDA_RUN} python -m pytest -n 0 -s -v --make-reports=tests_torch_mps tests/
|
||||
|
||||
|
||||
@@ -25,6 +25,6 @@ jobs:
|
||||
|
||||
- name: Update metadata
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.SAYAK_HF_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.SAYAK_HF_TOKEN }}
|
||||
run: |
|
||||
python utils/update_metadata.py --commit_sha ${{ github.sha }}
|
||||
|
||||
+1
-1
@@ -355,7 +355,7 @@ You will need basic `git` proficiency to be able to contribute to
|
||||
manual. Type `git --help` in a shell and enjoy. If you prefer books, [Pro
|
||||
Git](https://git-scm.com/book/en/v2) is a very good reference.
|
||||
|
||||
Follow these steps to start contributing ([supported Python versions](https://github.com/huggingface/diffusers/blob/main/setup.py#L265)):
|
||||
Follow these steps to start contributing ([supported Python versions](https://github.com/huggingface/diffusers/blob/42f25d601a910dceadaee6c44345896b4cfa9928/setup.py#L270)):
|
||||
|
||||
1. Fork the [repository](https://github.com/huggingface/diffusers) by
|
||||
clicking on the 'Fork' button on the repository's page. This creates a copy of the code
|
||||
|
||||
@@ -305,6 +305,8 @@
|
||||
title: Personalized Image Animator (PIA)
|
||||
- local: api/pipelines/pixart
|
||||
title: PixArt-α
|
||||
- local: api/pipelines/pixart_sigma
|
||||
title: PixArt-Σ
|
||||
- local: api/pipelines/self_attention_guidance
|
||||
title: Self-Attention Guidance
|
||||
- local: api/pipelines/semantic_stable_diffusion
|
||||
|
||||
@@ -31,7 +31,7 @@ Some notes about this pipeline:
|
||||
|
||||
<Tip>
|
||||
|
||||
Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
|
||||
Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
|
||||
|
||||
</Tip>
|
||||
|
||||
|
||||
@@ -0,0 +1,151 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# PixArt-Σ
|
||||
|
||||

|
||||
|
||||
[PixArt-Σ: Weak-to-Strong Training of Diffusion Transformer for 4K Text-to-Image Generation](https://huggingface.co/papers/2403.04692) is Junsong Chen, Jincheng Yu, Chongjian Ge, Lewei Yao, Enze Xie, Yue Wu, Zhongdao Wang, James Kwok, Ping Luo, Huchuan Lu, and Zhenguo Li.
|
||||
|
||||
The abstract from the paper is:
|
||||
|
||||
*In this paper, we introduce PixArt-Σ, a Diffusion Transformer model (DiT) capable of directly generating images at 4K resolution. PixArt-Σ represents a significant advancement over its predecessor, PixArt-α, offering images of markedly higher fidelity and improved alignment with text prompts. A key feature of PixArt-Σ is its training efficiency. Leveraging the foundational pre-training of PixArt-α, it evolves from the ‘weaker’ baseline to a ‘stronger’ model via incorporating higher quality data, a process we term “weak-to-strong training”. The advancements in PixArt-Σ are twofold: (1) High-Quality Training Data: PixArt-Σ incorporates superior-quality image data, paired with more precise and detailed image captions. (2) Efficient Token Compression: we propose a novel attention module within the DiT framework that compresses both keys and values, significantly improving efficiency and facilitating ultra-high-resolution image generation. Thanks to these improvements, PixArt-Σ achieves superior image quality and user prompt adherence capabilities with significantly smaller model size (0.6B parameters) than existing text-to-image diffusion models, such as SDXL (2.6B parameters) and SD Cascade (5.1B parameters). Moreover, PixArt-Σ’s capability to generate 4K images supports the creation of high-resolution posters and wallpapers, efficiently bolstering the production of highquality visual content in industries such as film and gaming.*
|
||||
|
||||
You can find the original codebase at [PixArt-alpha/PixArt-sigma](https://github.com/PixArt-alpha/PixArt-sigma) and all the available checkpoints at [PixArt-alpha](https://huggingface.co/PixArt-alpha).
|
||||
|
||||
Some notes about this pipeline:
|
||||
|
||||
* It uses a Transformer backbone (instead of a UNet) for denoising. As such it has a similar architecture as [DiT](https://hf.co/docs/transformers/model_doc/dit).
|
||||
* It was trained using text conditions computed from T5. This aspect makes the pipeline better at following complex text prompts with intricate details.
|
||||
* It is good at producing high-resolution images at different aspect ratios. To get the best results, the authors recommend some size brackets which can be found [here](https://github.com/PixArt-alpha/PixArt-sigma/blob/master/diffusion/data/datasets/utils.py).
|
||||
* It rivals the quality of state-of-the-art text-to-image generation systems (as of this writing) such as PixArt-α, Stable Diffusion XL, Playground V2.0 and DALL-E 3, while being more efficient than them.
|
||||
* It shows the ability of generating super high resolution images, such as 2048px or even 4K.
|
||||
* It shows that text-to-image models can grow from a weak model to a stronger one through several improvements (VAEs, datasets, and so on.)
|
||||
|
||||
<Tip>
|
||||
|
||||
Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
|
||||
|
||||
</Tip>
|
||||
|
||||
## Inference with under 8GB GPU VRAM
|
||||
|
||||
Run the [`PixArtSigmaPipeline`] with under 8GB GPU VRAM by loading the text encoder in 8-bit precision. Let's walk through a full-fledged example.
|
||||
|
||||
First, install the [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) library:
|
||||
|
||||
```bash
|
||||
pip install -U bitsandbytes
|
||||
```
|
||||
|
||||
Then load the text encoder in 8-bit:
|
||||
|
||||
```python
|
||||
from transformers import T5EncoderModel
|
||||
from diffusers import PixArtSigmaPipeline
|
||||
import torch
|
||||
|
||||
text_encoder = T5EncoderModel.from_pretrained(
|
||||
"PixArt-alpha/PixArt-Sigma-XL-2-1024-MS",
|
||||
subfolder="text_encoder",
|
||||
load_in_8bit=True,
|
||||
device_map="auto",
|
||||
|
||||
)
|
||||
pipe = PixArtSigmaPipeline.from_pretrained(
|
||||
"PixArt-alpha/PixArt-Sigma-XL-2-1024-MS",
|
||||
text_encoder=text_encoder,
|
||||
transformer=None,
|
||||
device_map="balanced"
|
||||
)
|
||||
```
|
||||
|
||||
Now, use the `pipe` to encode a prompt:
|
||||
|
||||
```python
|
||||
with torch.no_grad():
|
||||
prompt = "cute cat"
|
||||
prompt_embeds, prompt_attention_mask, negative_embeds, negative_prompt_attention_mask = pipe.encode_prompt(prompt)
|
||||
```
|
||||
|
||||
Since text embeddings have been computed, remove the `text_encoder` and `pipe` from the memory, and free up som GPU VRAM:
|
||||
|
||||
```python
|
||||
import gc
|
||||
|
||||
def flush():
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
del text_encoder
|
||||
del pipe
|
||||
flush()
|
||||
```
|
||||
|
||||
Then compute the latents with the prompt embeddings as inputs:
|
||||
|
||||
```python
|
||||
pipe = PixArtSigmaPipeline.from_pretrained(
|
||||
"PixArt-alpha/PixArt-Sigma-XL-2-1024-MS",
|
||||
text_encoder=None,
|
||||
torch_dtype=torch.float16,
|
||||
).to("cuda")
|
||||
|
||||
latents = pipe(
|
||||
negative_prompt=None,
|
||||
prompt_embeds=prompt_embeds,
|
||||
negative_prompt_embeds=negative_embeds,
|
||||
prompt_attention_mask=prompt_attention_mask,
|
||||
negative_prompt_attention_mask=negative_prompt_attention_mask,
|
||||
num_images_per_prompt=1,
|
||||
output_type="latent",
|
||||
).images
|
||||
|
||||
del pipe.transformer
|
||||
flush()
|
||||
```
|
||||
|
||||
<Tip>
|
||||
|
||||
Notice that while initializing `pipe`, you're setting `text_encoder` to `None` so that it's not loaded.
|
||||
|
||||
</Tip>
|
||||
|
||||
Once the latents are computed, pass it off to the VAE to decode into a real image:
|
||||
|
||||
```python
|
||||
with torch.no_grad():
|
||||
image = pipe.vae.decode(latents / pipe.vae.config.scaling_factor, return_dict=False)[0]
|
||||
image = pipe.image_processor.postprocess(image, output_type="pil")[0]
|
||||
image.save("cat.png")
|
||||
```
|
||||
|
||||
By deleting components you aren't using and flushing the GPU VRAM, you should be able to run [`PixArtSigmaPipeline`] with under 8GB GPU VRAM.
|
||||
|
||||

|
||||
|
||||
If you want a report of your memory-usage, run this [script](https://gist.github.com/sayakpaul/3ae0f847001d342af27018a96f467e4e).
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Text embeddings computed in 8-bit can impact the quality of the generated images because of the information loss in the representation space caused by the reduced precision. It's recommended to compare the outputs with and without 8-bit.
|
||||
|
||||
</Tip>
|
||||
|
||||
While loading the `text_encoder`, you set `load_in_8bit` to `True`. You could also specify `load_in_4bit` to bring your memory requirements down even further to under 7GB.
|
||||
|
||||
## PixArtSigmaPipeline
|
||||
|
||||
[[autodoc]] PixArtSigmaPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
@@ -12,4 +12,10 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# Video Processor
|
||||
|
||||
The `VideoProcessor` provides a unified API for video pipelines to prepare inputs for VAE encoding and post-processing outputs once they're decoded. The class inherits [`VaeImageProcessor`] so it includes transformations such as resizing, normalization, and conversion between PIL Image, PyTorch, and NumPy arrays.
|
||||
The [`VideoProcessor`] provides a unified API for video pipelines to prepare inputs for VAE encoding and post-processing outputs once they're decoded. The class inherits [`VaeImageProcessor`] so it includes transformations such as resizing, normalization, and conversion between PIL Image, PyTorch, and NumPy arrays.
|
||||
|
||||
## VideoProcessor
|
||||
|
||||
[[autodoc]] video_processor.VideoProcessor.preprocess_video
|
||||
|
||||
[[autodoc]] video_processor.VideoProcessor.postprocess_video
|
||||
|
||||
@@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# Speed up inference
|
||||
|
||||
There are several ways to optimize Diffusers for inference speed, such as reducing the computational burden by lowering the data precision or using a lightweight distilled model. There are also memory-efficient attention implementations, [xFormers](xformers) and [scaled dot product attetntion](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) in PyTorch 2.0, that reduce memory usage which also indirectly speeds up inference. Different speed optimizations can be stacked together to get the fastest inference times.
|
||||
There are several ways to optimize Diffusers for inference speed, such as reducing the computational burden by lowering the data precision or using a lightweight distilled model. There are also memory-efficient attention implementations, [xFormers](xformers) and [scaled dot product attention](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) in PyTorch 2.0, that reduce memory usage which also indirectly speeds up inference. Different speed optimizations can be stacked together to get the fastest inference times.
|
||||
|
||||
> [!TIP]
|
||||
> Optimizing for inference speed or reduced memory usage can lead to improved performance in the other category, so you should try to optimize for both whenever you can. This guide focuses on inference speed, but you can learn more about lowering memory usage in the [Reduce memory usage](memory) guide.
|
||||
|
||||
@@ -19,13 +19,74 @@ The denoising loop of a pipeline can be modified with custom defined functions u
|
||||
|
||||
This guide will demonstrate how callbacks work by a few features you can implement with them.
|
||||
|
||||
## Official callbacks
|
||||
|
||||
We provide a list of callbacks you can plug into an existing pipeline and modify the denoising loop. This is the current list of official callbacks:
|
||||
|
||||
- `SDCFGCutoffCallback`: Disables the CFG after a certain number of steps for all SD 1.5 pipelines, including text-to-image, image-to-image, inpaint, and controlnet.
|
||||
- `SDXLCFGCutoffCallback`: Disables the CFG after a certain number of steps for all SDXL pipelines, including text-to-image, image-to-image, inpaint, and controlnet.
|
||||
- `IPAdapterScaleCutoffCallback`: Disables the IP Adapter after a certain number of steps for all pipelines supporting IP-Adapter.
|
||||
|
||||
> [!TIP]
|
||||
> If you want to add a new official callback, feel free to open a [feature request](https://github.com/huggingface/diffusers/issues/new/choose) or [submit a PR](https://huggingface.co/docs/diffusers/main/en/conceptual/contribution#how-to-open-a-pr).
|
||||
|
||||
To set up a callback, you need to specify the number of denoising steps after which the callback comes into effect. You can do so by using either one of these two arguments
|
||||
|
||||
- `cutoff_step_ratio`: Float number with the ratio of the steps.
|
||||
- `cutoff_step_index`: Integer number with the exact number of the step.
|
||||
|
||||
```python
|
||||
import torch
|
||||
|
||||
from diffusers import DPMSolverMultistepScheduler, StableDiffusionXLPipeline
|
||||
from diffusers.callbacks import SDXLCFGCutoffCallback
|
||||
|
||||
|
||||
callback = SDXLCFGCutoffCallback(cutoff_step_ratio=0.4)
|
||||
# can also be used with cutoff_step_index
|
||||
# callback = SDXLCFGCutoffCallback(cutoff_step_ratio=None, cutoff_step_index=10)
|
||||
|
||||
pipeline = StableDiffusionXLPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
).to("cuda")
|
||||
pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, use_karras_sigmas=True)
|
||||
|
||||
prompt = "a sports car at the road, best quality, high quality, high detail, 8k resolution"
|
||||
|
||||
generator = torch.Generator(device="cpu").manual_seed(2628670641)
|
||||
|
||||
out = pipeline(
|
||||
prompt=prompt,
|
||||
negative_prompt="",
|
||||
guidance_scale=6.5,
|
||||
num_inference_steps=25,
|
||||
generator=generator,
|
||||
callback_on_step_end=callback,
|
||||
)
|
||||
|
||||
out.images[0].save("official_callback.png")
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/without_cfg_callback.png" alt="generated image of a sports car at the road" />
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">without SDXLCFGCutoffCallback</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/with_cfg_callback.png" alt="generated image of a a sports car at the road with cfg callback" />
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">with SDXLCFGCutoffCallback</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
## Dynamic classifier-free guidance
|
||||
|
||||
Dynamic classifier-free guidance (CFG) is a feature that allows you to disable CFG after a certain number of inference steps which can help you save compute with minimal cost to performance. The callback function for this should have the following arguments:
|
||||
|
||||
* `pipeline` (or the pipeline instance) provides access to important properties such as `num_timesteps` and `guidance_scale`. You can modify these properties by updating the underlying attributes. For this example, you'll disable CFG by setting `pipeline._guidance_scale=0.0`.
|
||||
* `step_index` and `timestep` tell you where you are in the denoising loop. Use `step_index` to turn off CFG after reaching 40% of `num_timesteps`.
|
||||
* `callback_kwargs` is a dict that contains tensor variables you can modify during the denoising loop. It only includes variables specified in the `callback_on_step_end_tensor_inputs` argument, which is passed to the pipeline's `__call__` method. Different pipelines may use different sets of variables, so please check a pipeline's `_callback_tensor_inputs` attribute for the list of variables you can modify. Some common variables include `latents` and `prompt_embeds`. For this function, change the batch size of `prompt_embeds` after setting `guidance_scale=0.0` in order for it to work properly.
|
||||
- `pipeline` (or the pipeline instance) provides access to important properties such as `num_timesteps` and `guidance_scale`. You can modify these properties by updating the underlying attributes. For this example, you'll disable CFG by setting `pipeline._guidance_scale=0.0`.
|
||||
- `step_index` and `timestep` tell you where you are in the denoising loop. Use `step_index` to turn off CFG after reaching 40% of `num_timesteps`.
|
||||
- `callback_kwargs` is a dict that contains tensor variables you can modify during the denoising loop. It only includes variables specified in the `callback_on_step_end_tensor_inputs` argument, which is passed to the pipeline's `__call__` method. Different pipelines may use different sets of variables, so please check a pipeline's `_callback_tensor_inputs` attribute for the list of variables you can modify. Some common variables include `latents` and `prompt_embeds`. For this function, change the batch size of `prompt_embeds` after setting `guidance_scale=0.0` in order for it to work properly.
|
||||
|
||||
Your callback function should look something like this:
|
||||
|
||||
|
||||
@@ -981,7 +981,7 @@ def collate_fn(examples, with_prior_preservation=False):
|
||||
|
||||
|
||||
class PromptDataset(Dataset):
|
||||
"A simple dataset to prepare the prompts to generate class images on multiple GPUs."
|
||||
"""A simple dataset to prepare the prompts to generate class images on multiple GPUs."""
|
||||
|
||||
def __init__(self, prompt, num_samples):
|
||||
self.prompt = prompt
|
||||
|
||||
@@ -1136,7 +1136,7 @@ def collate_fn(examples, with_prior_preservation=False):
|
||||
|
||||
|
||||
class PromptDataset(Dataset):
|
||||
"A simple dataset to prepare the prompts to generate class images on multiple GPUs."
|
||||
"""A simple dataset to prepare the prompts to generate class images on multiple GPUs."""
|
||||
|
||||
def __init__(self, prompt, num_samples):
|
||||
self.prompt = prompt
|
||||
|
||||
@@ -68,6 +68,7 @@ Please also check out our [Community Scripts](https://github.com/huggingface/dif
|
||||
| InstantID Pipeline | Stable Diffusion XL Pipeline that supports InstantID | [InstantID Pipeline](#instantid-pipeline) | [](https://huggingface.co/spaces/InstantX/InstantID) | [Haofan Wang](https://github.com/haofanwang) |
|
||||
| UFOGen Scheduler | Scheduler for UFOGen Model (compatible with Stable Diffusion pipelines) | [UFOGen Scheduler](#ufogen-scheduler) | - | [dg845](https://github.com/dg845) |
|
||||
| Stable Diffusion XL IPEX Pipeline | Accelerate Stable Diffusion XL inference pipeline with BF16/FP32 precision on Intel Xeon CPUs with [IPEX](https://github.com/intel/intel-extension-for-pytorch) | [Stable Diffusion XL on IPEX](#stable-diffusion-xl-on-ipex) | - | [Dan Li](https://github.com/ustcuna/) |
|
||||
| Stable Diffusion BoxDiff Pipeline | Training-free controlled generation with bounding boxes using [BoxDiff](https://github.com/showlab/BoxDiff) | [Stable Diffusion BoxDiff Pipeline](#stable-diffusion-boxdiff) | - | [Jingyang Zhang](https://github.com/zjysteven/) |
|
||||
|
||||
To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly.
|
||||
|
||||
@@ -1676,6 +1677,68 @@ image = pipe(prompt, image=input_image, strength=0.75,).images[0]
|
||||
image.save('tensorrt_img2img_new_zealand_hills.png')
|
||||
```
|
||||
|
||||
### Stable Diffusion BoxDiff
|
||||
BoxDiff is a training-free method for controlled generation with bounding box coordinates. It shoud work with any Stable Diffusion model. Below shows an example with `stable-diffusion-2-1-base`.
|
||||
```py
|
||||
import torch
|
||||
from PIL import Image, ImageDraw
|
||||
from copy import deepcopy
|
||||
|
||||
from examples.community.pipeline_stable_diffusion_boxdiff import StableDiffusionBoxDiffPipeline
|
||||
|
||||
def draw_box_with_text(img, boxes, names):
|
||||
colors = ["red", "olive", "blue", "green", "orange", "brown", "cyan", "purple"]
|
||||
img_new = deepcopy(img)
|
||||
draw = ImageDraw.Draw(img_new)
|
||||
|
||||
W, H = img.size
|
||||
for bid, box in enumerate(boxes):
|
||||
draw.rectangle([box[0] * W, box[1] * H, box[2] * W, box[3] * H], outline=colors[bid % len(colors)], width=4)
|
||||
draw.text((box[0] * W, box[1] * H), names[bid], fill=colors[bid % len(colors)])
|
||||
return img_new
|
||||
|
||||
pipe = StableDiffusionBoxDiffPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-2-1-base",
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
pipe.to("cuda")
|
||||
|
||||
# example 1
|
||||
prompt = "as the aurora lights up the sky, a herd of reindeer leisurely wanders on the grassy meadow, admiring the breathtaking view, a serene lake quietly reflects the magnificent display, and in the distance, a snow-capped mountain stands majestically, fantasy, 8k, highly detailed"
|
||||
phrases = [
|
||||
"aurora",
|
||||
"reindeer",
|
||||
"meadow",
|
||||
"lake",
|
||||
"mountain"
|
||||
]
|
||||
boxes = [[1,3,512,202], [75,344,421,495], [1,327,508,507], [2,217,507,341], [1,135,509,242]]
|
||||
|
||||
# example 2
|
||||
# prompt = "A rabbit wearing sunglasses looks very proud"
|
||||
# phrases = ["rabbit", "sunglasses"]
|
||||
# boxes = [[67,87,366,512], [66,130,364,262]]
|
||||
|
||||
boxes = [[x / 512 for x in box] for box in boxes]
|
||||
|
||||
images = pipe(
|
||||
prompt,
|
||||
boxdiff_phrases=phrases,
|
||||
boxdiff_boxes=boxes,
|
||||
boxdiff_kwargs={
|
||||
"attention_res": 16,
|
||||
"normalize_eot": True
|
||||
},
|
||||
num_inference_steps=50,
|
||||
guidance_scale=7.5,
|
||||
generator=torch.manual_seed(42),
|
||||
safety_checker=None
|
||||
).images
|
||||
|
||||
draw_box_with_text(images[0], boxes, phrases).save("output.png")
|
||||
```
|
||||
|
||||
|
||||
### Stable Diffusion Reference
|
||||
|
||||
This pipeline uses the Reference Control. Refer to the [sd-webui-controlnet discussion: Reference-only Control](https://github.com/Mikubill/sd-webui-controlnet/discussions/1236)[sd-webui-controlnet discussion: Reference-adain Control](https://github.com/Mikubill/sd-webui-controlnet/discussions/1280).
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -460,7 +460,7 @@ class StableDiffusionUpscaleLDM3DPipeline(
|
||||
)
|
||||
|
||||
# verify batch size of prompt and image are same if image is a list or tensor or numpy array
|
||||
if isinstance(image, list) or isinstance(image, torch.Tensor) or isinstance(image, np.ndarray):
|
||||
if isinstance(image, (list, np.ndarray, torch.Tensor)):
|
||||
if prompt is not None and isinstance(prompt, str):
|
||||
batch_size = 1
|
||||
elif prompt is not None and isinstance(prompt, list):
|
||||
|
||||
@@ -78,7 +78,7 @@ def torch_dfs(model: torch.nn.Module):
|
||||
class StableDiffusionReferencePipeline(
|
||||
DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin
|
||||
):
|
||||
r""" "
|
||||
r"""
|
||||
Pipeline for Stable Diffusion Reference.
|
||||
|
||||
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
|
||||
|
||||
@@ -1358,7 +1358,7 @@ def main(args):
|
||||
# estimates to predict the data point in the augmented PF-ODE trajectory corresponding to the next ODE
|
||||
# solver timestep.
|
||||
with torch.no_grad():
|
||||
if torch.backends.mps.is_available() or "playground" in args.pretrained_model_name_or_path:
|
||||
if torch.backends.mps.is_available() or "playground" in args.pretrained_teacher_model:
|
||||
autocast_ctx = nullcontext()
|
||||
else:
|
||||
autocast_ctx = torch.autocast(accelerator.device.type)
|
||||
|
||||
@@ -152,7 +152,7 @@ def collate_fn(examples, with_prior_preservation):
|
||||
|
||||
|
||||
class PromptDataset(Dataset):
|
||||
"A simple dataset to prepare the prompts to generate class images on multiple GPUs."
|
||||
"""A simple dataset to prepare the prompts to generate class images on multiple GPUs."""
|
||||
|
||||
def __init__(self, prompt, num_samples):
|
||||
self.prompt = prompt
|
||||
|
||||
@@ -742,7 +742,7 @@ def collate_fn(examples, with_prior_preservation=False):
|
||||
|
||||
|
||||
class PromptDataset(Dataset):
|
||||
"A simple dataset to prepare the prompts to generate class images on multiple GPUs."
|
||||
"""A simple dataset to prepare the prompts to generate class images on multiple GPUs."""
|
||||
|
||||
def __init__(self, prompt, num_samples):
|
||||
self.prompt = prompt
|
||||
@@ -759,7 +759,7 @@ class PromptDataset(Dataset):
|
||||
|
||||
|
||||
def model_has_vae(args):
|
||||
config_file_name = os.path.join("vae", AutoencoderKL.config_name)
|
||||
config_file_name = Path("vae", AutoencoderKL.config_name).as_posix()
|
||||
if os.path.isdir(args.pretrained_model_name_or_path):
|
||||
config_file_name = os.path.join(args.pretrained_model_name_or_path, config_file_name)
|
||||
return os.path.isfile(config_file_name)
|
||||
|
||||
@@ -301,7 +301,7 @@ class DreamBoothDataset(Dataset):
|
||||
|
||||
|
||||
class PromptDataset(Dataset):
|
||||
"A simple dataset to prepare the prompts to generate class images on multiple GPUs."
|
||||
"""A simple dataset to prepare the prompts to generate class images on multiple GPUs."""
|
||||
|
||||
def __init__(self, prompt, num_samples):
|
||||
self.prompt = prompt
|
||||
|
||||
@@ -680,7 +680,7 @@ def collate_fn(examples, with_prior_preservation=False):
|
||||
|
||||
|
||||
class PromptDataset(Dataset):
|
||||
"A simple dataset to prepare the prompts to generate class images on multiple GPUs."
|
||||
"""A simple dataset to prepare the prompts to generate class images on multiple GPUs."""
|
||||
|
||||
def __init__(self, prompt, num_samples):
|
||||
self.prompt = prompt
|
||||
|
||||
@@ -903,7 +903,7 @@ def collate_fn(examples, with_prior_preservation=False):
|
||||
|
||||
|
||||
class PromptDataset(Dataset):
|
||||
"A simple dataset to prepare the prompts to generate class images on multiple GPUs."
|
||||
"""A simple dataset to prepare the prompts to generate class images on multiple GPUs."""
|
||||
|
||||
def __init__(self, prompt, num_samples):
|
||||
self.prompt = prompt
|
||||
|
||||
@@ -327,7 +327,7 @@ class DreamBoothDataset(Dataset):
|
||||
|
||||
|
||||
class PromptDataset(Dataset):
|
||||
"A simple dataset to prepare the prompts to generate class images on multiple GPUs."
|
||||
"""A simple dataset to prepare the prompts to generate class images on multiple GPUs."""
|
||||
|
||||
def __init__(self, prompt, num_samples):
|
||||
self.prompt = prompt
|
||||
|
||||
@@ -385,7 +385,7 @@ class DreamBoothDataset(Dataset):
|
||||
|
||||
|
||||
class PromptDataset(Dataset):
|
||||
"A simple dataset to prepare the prompts to generate class images on multiple GPUs."
|
||||
"""A simple dataset to prepare the prompts to generate class images on multiple GPUs."""
|
||||
|
||||
def __init__(self, prompt, num_samples):
|
||||
self.prompt = prompt
|
||||
|
||||
@@ -384,7 +384,7 @@ class DreamBoothDataset(Dataset):
|
||||
|
||||
|
||||
class PromptDataset(Dataset):
|
||||
"A simple dataset to prepare the prompts to generate class images on multiple GPUs."
|
||||
"""A simple dataset to prepare the prompts to generate class images on multiple GPUs."""
|
||||
|
||||
def __init__(self, prompt, num_samples):
|
||||
self.prompt = prompt
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
diffusers==0.20.1
|
||||
accelerate==0.23.0
|
||||
transformers==4.36.0
|
||||
transformers==4.38.0
|
||||
peft==0.5.0
|
||||
torch==2.0.1
|
||||
torchvision>=0.16
|
||||
|
||||
+1
-1
@@ -762,7 +762,7 @@ def collate_fn(examples, with_prior_preservation=False):
|
||||
|
||||
|
||||
class PromptDataset(Dataset):
|
||||
"A simple dataset to prepare the prompts to generate class images on multiple GPUs."
|
||||
"""A simple dataset to prepare the prompts to generate class images on multiple GPUs."""
|
||||
|
||||
def __init__(self, prompt, num_samples):
|
||||
self.prompt = prompt
|
||||
|
||||
+1
-1
@@ -700,7 +700,7 @@ def collate_fn(examples, with_prior_preservation=False):
|
||||
|
||||
|
||||
class PromptDataset(Dataset):
|
||||
"A simple dataset to prepare the prompts to generate class images on multiple GPUs."
|
||||
"""A simple dataset to prepare the prompts to generate class images on multiple GPUs."""
|
||||
|
||||
def __init__(self, prompt, num_samples):
|
||||
self.prompt = prompt
|
||||
|
||||
+1
-1
@@ -922,7 +922,7 @@ def collate_fn(examples, with_prior_preservation=False):
|
||||
|
||||
|
||||
class PromptDataset(Dataset):
|
||||
"A simple dataset to prepare the prompts to generate class images on multiple GPUs."
|
||||
"""A simple dataset to prepare the prompts to generate class images on multiple GPUs."""
|
||||
|
||||
def __init__(self, prompt, num_samples):
|
||||
self.prompt = prompt
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
accelerate>=0.16.0
|
||||
torchvision
|
||||
transformers>=4.25.1
|
||||
datasets
|
||||
datasets>=2.19.1
|
||||
ftfy
|
||||
tensorboard
|
||||
Jinja2
|
||||
peft==0.7.0
|
||||
peft==0.7.0
|
||||
|
||||
@@ -50,7 +50,7 @@ from diffusers.optimization import get_scheduler
|
||||
from diffusers.training_utils import EMAModel, compute_snr
|
||||
from diffusers.utils import check_min_version, is_wandb_available
|
||||
from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
|
||||
from diffusers.utils.import_utils import is_xformers_available
|
||||
from diffusers.utils.import_utils import is_torch_npu_available, is_xformers_available
|
||||
from diffusers.utils.torch_utils import is_compiled_module
|
||||
|
||||
|
||||
@@ -58,7 +58,8 @@ from diffusers.utils.torch_utils import is_compiled_module
|
||||
check_min_version("0.28.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
if is_torch_npu_available():
|
||||
torch.npu.config.allow_internal_format = False
|
||||
|
||||
DATASET_NAME_MAPPING = {
|
||||
"lambdalabs/naruto-blip-captions": ("image", "text"),
|
||||
@@ -460,6 +461,9 @@ def parse_args(input_args=None):
|
||||
),
|
||||
)
|
||||
parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
|
||||
parser.add_argument(
|
||||
"--enable_npu_flash_attention", action="store_true", help="Whether or not to use npu flash attention."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
|
||||
)
|
||||
@@ -716,7 +720,12 @@ def main(args):
|
||||
args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
|
||||
)
|
||||
ema_unet = EMAModel(ema_unet.parameters(), model_cls=UNet2DConditionModel, model_config=ema_unet.config)
|
||||
|
||||
if args.enable_npu_flash_attention:
|
||||
if is_torch_npu_available():
|
||||
logger.info("npu flash attention enabled.")
|
||||
unet.enable_npu_flash_attention()
|
||||
else:
|
||||
raise ValueError("npu flash attention requires torch_npu extensions and is supported only on npu devices.")
|
||||
if args.enable_xformers_memory_efficient_attention:
|
||||
if is_xformers_available():
|
||||
import xformers
|
||||
|
||||
@@ -0,0 +1,127 @@
|
||||
## Training an VQGAN VAE
|
||||
VQVAEs were first introduced in [Neural Discrete Representation Learning](https://arxiv.org/abs/1711.00937) and was combined with a GAN in the paper [Taming Transformers for High-Resolution Image Synthesis](https://arxiv.org/abs/2012.09841). The basic idea of a VQVAE is it's a type of a variational auto encoder with tokens as the latent space similar to tokens for LLMs. This script was adapted from a [pr to huggingface's open-muse project](https://github.com/huggingface/open-muse/pull/52) with general code following [lucidrian's implementation of the vqgan training script](https://github.com/lucidrains/muse-maskgit-pytorch/blob/main/muse_maskgit_pytorch/trainers.py) but both of these implementation follow from the [taming transformer repo](https://github.com/CompVis/taming-transformers?tab=readme-ov-file).
|
||||
|
||||
|
||||
Creating a training image set is [described in a different document](https://huggingface.co/docs/datasets/image_process#image-datasets).
|
||||
|
||||
### Installing the dependencies
|
||||
|
||||
Before running the scripts, make sure to install the library's training dependencies:
|
||||
|
||||
**Important**
|
||||
|
||||
To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
|
||||
```bash
|
||||
git clone https://github.com/huggingface/diffusers
|
||||
cd diffusers
|
||||
pip install .
|
||||
```
|
||||
|
||||
Then cd in the example folder and run
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
|
||||
And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
|
||||
|
||||
```bash
|
||||
accelerate config
|
||||
```
|
||||
|
||||
### Training on CIFAR10
|
||||
|
||||
The command to train a VQGAN model on cifar10 dataset:
|
||||
|
||||
```bash
|
||||
accelerate launch train_vqgan.py \
|
||||
--dataset_name=cifar10 \
|
||||
--image_column=img \
|
||||
--validation_images images/bird.jpg images/car.jpg images/dog.jpg images/frog.jpg \
|
||||
--resolution=128 \
|
||||
--train_batch_size=2 \
|
||||
--gradient_accumulation_steps=8 \
|
||||
--report_to=wandb
|
||||
```
|
||||
|
||||
An example training run is [here](https://wandb.ai/sayakpaul/vqgan-training/runs/0m5kzdfp) by @sayakpaul and a lower scale one [here](https://wandb.ai/dsbuddy27/vqgan-training/runs/eqd6xi4n?nw=nwuserisamu). The validation images can be obtained from [here](https://huggingface.co/datasets/diffusers/docs-images/tree/main/vqgan_validation_images).
|
||||
The simplest way to improve the quality of a VQGAN model is to maximize the amount of information present in the bottleneck. The easiest way to do this is increasing the image resolution. However, other ways include, but not limited to, lowering compression by downsampling fewer times or increasing the vocaburary size which at most can be around 16384. How to do this is shown below.
|
||||
|
||||
# Modifying the architecture
|
||||
|
||||
To modify the architecture of the vqgan model you can save the config taken from [here](https://huggingface.co/kandinsky-community/kandinsky-2-2-decoder/blob/main/movq/config.json) and then provide that to the script with the option --model_config_name_or_path. This config is below
|
||||
```
|
||||
{
|
||||
"_class_name": "VQModel",
|
||||
"_diffusers_version": "0.17.0.dev0",
|
||||
"act_fn": "silu",
|
||||
"block_out_channels": [
|
||||
128,
|
||||
256,
|
||||
256,
|
||||
512
|
||||
],
|
||||
"down_block_types": [
|
||||
"DownEncoderBlock2D",
|
||||
"DownEncoderBlock2D",
|
||||
"DownEncoderBlock2D",
|
||||
"AttnDownEncoderBlock2D"
|
||||
],
|
||||
"in_channels": 3,
|
||||
"latent_channels": 4,
|
||||
"layers_per_block": 2,
|
||||
"norm_num_groups": 32,
|
||||
"norm_type": "spatial",
|
||||
"num_vq_embeddings": 16384,
|
||||
"out_channels": 3,
|
||||
"sample_size": 32,
|
||||
"scaling_factor": 0.18215,
|
||||
"up_block_types": [
|
||||
"AttnUpDecoderBlock2D",
|
||||
"UpDecoderBlock2D",
|
||||
"UpDecoderBlock2D",
|
||||
"UpDecoderBlock2D"
|
||||
],
|
||||
"vq_embed_dim": 4
|
||||
}
|
||||
```
|
||||
To lower the amount of layers in a VQGan, you can remove layers by modifying the block_out_channels, down_block_types, and up_block_types like below
|
||||
```
|
||||
{
|
||||
"_class_name": "VQModel",
|
||||
"_diffusers_version": "0.17.0.dev0",
|
||||
"act_fn": "silu",
|
||||
"block_out_channels": [
|
||||
128,
|
||||
256,
|
||||
256,
|
||||
],
|
||||
"down_block_types": [
|
||||
"DownEncoderBlock2D",
|
||||
"DownEncoderBlock2D",
|
||||
"DownEncoderBlock2D",
|
||||
],
|
||||
"in_channels": 3,
|
||||
"latent_channels": 4,
|
||||
"layers_per_block": 2,
|
||||
"norm_num_groups": 32,
|
||||
"norm_type": "spatial",
|
||||
"num_vq_embeddings": 16384,
|
||||
"out_channels": 3,
|
||||
"sample_size": 32,
|
||||
"scaling_factor": 0.18215,
|
||||
"up_block_types": [
|
||||
"UpDecoderBlock2D",
|
||||
"UpDecoderBlock2D",
|
||||
"UpDecoderBlock2D"
|
||||
],
|
||||
"vq_embed_dim": 4
|
||||
}
|
||||
```
|
||||
For increasing the size of the vocaburaries you can increase num_vq_embeddings. However, [some research](https://magvit.cs.cmu.edu/v2/) shows that the representation of VQGANs start degrading after 2^14~16384 vq embeddings so it's not recommended to go past that.
|
||||
|
||||
## Extra training tips/ideas
|
||||
During logging take care to make sure data_time is low. data_time is the amount spent loading the data and where the GPU is not active. So essentially, it's the time wasted. The easiest way to lower data time is to increase the --dataloader_num_workers to a higher number like 4. Due to a bug in Pytorch, this only works on linux based systems. For more details check [here](https://github.com/huggingface/diffusers/issues/7646)
|
||||
Secondly, training should seem to be done when both the discriminator and the generator loss converges.
|
||||
Thirdly, another low hanging fruit is just using ema using the --use_ema parameter. This tends to make the output images smoother. This has a con where you have to lower your batch size by 1 but it may be worth it.
|
||||
Another more experimental low hanging fruit is changing from the vgg19 to different models for the lpips loss using the --timm_model_backend. If you do this, I recommend also changing the timm_model_layers parameter to the layer in your model which you think is best for representation. However, becareful with the feature map norms since this can easily overdominate the loss.
|
||||
@@ -0,0 +1,48 @@
|
||||
"""
|
||||
Ported from Paella
|
||||
"""
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from diffusers.configuration_utils import ConfigMixin, register_to_config
|
||||
from diffusers.models.modeling_utils import ModelMixin
|
||||
|
||||
|
||||
# Discriminator model ported from Paella https://github.com/dome272/Paella/blob/main/src_distributed/vqgan.py
|
||||
class Discriminator(ModelMixin, ConfigMixin):
|
||||
@register_to_config
|
||||
def __init__(self, in_channels=3, cond_channels=0, hidden_channels=512, depth=6):
|
||||
super().__init__()
|
||||
d = max(depth - 3, 3)
|
||||
layers = [
|
||||
nn.utils.spectral_norm(
|
||||
nn.Conv2d(in_channels, hidden_channels // (2**d), kernel_size=3, stride=2, padding=1)
|
||||
),
|
||||
nn.LeakyReLU(0.2),
|
||||
]
|
||||
for i in range(depth - 1):
|
||||
c_in = hidden_channels // (2 ** max((d - i), 0))
|
||||
c_out = hidden_channels // (2 ** max((d - 1 - i), 0))
|
||||
layers.append(nn.utils.spectral_norm(nn.Conv2d(c_in, c_out, kernel_size=3, stride=2, padding=1)))
|
||||
layers.append(nn.InstanceNorm2d(c_out))
|
||||
layers.append(nn.LeakyReLU(0.2))
|
||||
self.encoder = nn.Sequential(*layers)
|
||||
self.shuffle = nn.Conv2d(
|
||||
(hidden_channels + cond_channels) if cond_channels > 0 else hidden_channels, 1, kernel_size=1
|
||||
)
|
||||
self.logits = nn.Sigmoid()
|
||||
|
||||
def forward(self, x, cond=None):
|
||||
x = self.encoder(x)
|
||||
if cond is not None:
|
||||
cond = cond.view(
|
||||
cond.size(0),
|
||||
cond.size(1),
|
||||
1,
|
||||
1,
|
||||
).expand(-1, -1, x.size(-2), x.size(-1))
|
||||
x = torch.cat([x, cond], dim=1)
|
||||
x = self.shuffle(x)
|
||||
x = self.logits(x)
|
||||
return x
|
||||
@@ -0,0 +1,8 @@
|
||||
accelerate>=0.16.0
|
||||
torchvision
|
||||
transformers>=4.25.1
|
||||
datasets
|
||||
timm
|
||||
numpy
|
||||
tqdm
|
||||
tensorboard
|
||||
@@ -0,0 +1,395 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
import tempfile
|
||||
|
||||
import torch
|
||||
|
||||
from diffusers import VQModel
|
||||
from diffusers.utils.testing_utils import require_timm
|
||||
|
||||
|
||||
sys.path.append("..")
|
||||
from test_examples_utils import ExamplesTestsAccelerate, run_command # noqa: E402
|
||||
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
logger = logging.getLogger()
|
||||
stream_handler = logging.StreamHandler(sys.stdout)
|
||||
logger.addHandler(stream_handler)
|
||||
|
||||
|
||||
@require_timm
|
||||
class TextToImage(ExamplesTestsAccelerate):
|
||||
@property
|
||||
def test_vqmodel_config(self):
|
||||
return {
|
||||
"_class_name": "VQModel",
|
||||
"_diffusers_version": "0.17.0.dev0",
|
||||
"act_fn": "silu",
|
||||
"block_out_channels": [
|
||||
32,
|
||||
],
|
||||
"down_block_types": [
|
||||
"DownEncoderBlock2D",
|
||||
],
|
||||
"in_channels": 3,
|
||||
"latent_channels": 4,
|
||||
"layers_per_block": 2,
|
||||
"norm_num_groups": 32,
|
||||
"norm_type": "spatial",
|
||||
"num_vq_embeddings": 32,
|
||||
"out_channels": 3,
|
||||
"sample_size": 32,
|
||||
"scaling_factor": 0.18215,
|
||||
"up_block_types": [
|
||||
"UpDecoderBlock2D",
|
||||
],
|
||||
"vq_embed_dim": 4,
|
||||
}
|
||||
|
||||
@property
|
||||
def test_discriminator_config(self):
|
||||
return {
|
||||
"_class_name": "Discriminator",
|
||||
"_diffusers_version": "0.27.0.dev0",
|
||||
"in_channels": 3,
|
||||
"cond_channels": 0,
|
||||
"hidden_channels": 8,
|
||||
"depth": 4,
|
||||
}
|
||||
|
||||
def get_vq_and_discriminator_configs(self, tmpdir):
|
||||
vqmodel_config_path = os.path.join(tmpdir, "vqmodel.json")
|
||||
discriminator_config_path = os.path.join(tmpdir, "discriminator.json")
|
||||
with open(vqmodel_config_path, "w") as fp:
|
||||
json.dump(self.test_vqmodel_config, fp)
|
||||
with open(discriminator_config_path, "w") as fp:
|
||||
json.dump(self.test_discriminator_config, fp)
|
||||
return vqmodel_config_path, discriminator_config_path
|
||||
|
||||
def test_vqmodel(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
vqmodel_config_path, discriminator_config_path = self.get_vq_and_discriminator_configs(tmpdir)
|
||||
test_args = f"""
|
||||
examples/vqgan/train_vqgan.py
|
||||
--dataset_name hf-internal-testing/dummy_image_text_data
|
||||
--resolution 32
|
||||
--image_column image
|
||||
--train_batch_size 1
|
||||
--gradient_accumulation_steps 1
|
||||
--max_train_steps 2
|
||||
--learning_rate 5.0e-04
|
||||
--scale_lr
|
||||
--lr_scheduler constant
|
||||
--lr_warmup_steps 0
|
||||
--model_config_name_or_path {vqmodel_config_path}
|
||||
--discriminator_config_name_or_path {discriminator_config_path}
|
||||
--output_dir {tmpdir}
|
||||
""".split()
|
||||
|
||||
run_command(self._launch_args + test_args)
|
||||
# save_pretrained smoke test
|
||||
self.assertTrue(
|
||||
os.path.isfile(os.path.join(tmpdir, "discriminator", "diffusion_pytorch_model.safetensors"))
|
||||
)
|
||||
self.assertTrue(os.path.isfile(os.path.join(tmpdir, "vqmodel", "diffusion_pytorch_model.safetensors")))
|
||||
|
||||
def test_vqmodel_checkpointing(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
vqmodel_config_path, discriminator_config_path = self.get_vq_and_discriminator_configs(tmpdir)
|
||||
# Run training script with checkpointing
|
||||
# max_train_steps == 4, checkpointing_steps == 2
|
||||
# Should create checkpoints at steps 2, 4
|
||||
|
||||
initial_run_args = f"""
|
||||
examples/vqgan/train_vqgan.py
|
||||
--dataset_name hf-internal-testing/dummy_image_text_data
|
||||
--resolution 32
|
||||
--image_column image
|
||||
--train_batch_size 1
|
||||
--gradient_accumulation_steps 1
|
||||
--max_train_steps 4
|
||||
--learning_rate 5.0e-04
|
||||
--scale_lr
|
||||
--lr_scheduler constant
|
||||
--lr_warmup_steps 0
|
||||
--model_config_name_or_path {vqmodel_config_path}
|
||||
--discriminator_config_name_or_path {discriminator_config_path}
|
||||
--checkpointing_steps=2
|
||||
--output_dir {tmpdir}
|
||||
--seed=0
|
||||
""".split()
|
||||
|
||||
run_command(self._launch_args + initial_run_args)
|
||||
|
||||
# check checkpoint directories exist
|
||||
self.assertEqual(
|
||||
{x for x in os.listdir(tmpdir) if "checkpoint" in x},
|
||||
{"checkpoint-2", "checkpoint-4"},
|
||||
)
|
||||
|
||||
# check can run an intermediate checkpoint
|
||||
model = VQModel.from_pretrained(tmpdir, subfolder="checkpoint-2/vqmodel")
|
||||
image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
|
||||
_ = model(image)
|
||||
|
||||
# Remove checkpoint 2 so that we can check only later checkpoints exist after resuming
|
||||
shutil.rmtree(os.path.join(tmpdir, "checkpoint-2"))
|
||||
self.assertEqual(
|
||||
{x for x in os.listdir(tmpdir) if "checkpoint" in x},
|
||||
{"checkpoint-4"},
|
||||
)
|
||||
|
||||
# Run training script for 2 total steps resuming from checkpoint 4
|
||||
|
||||
resume_run_args = f"""
|
||||
examples/vqgan/train_vqgan.py
|
||||
--dataset_name hf-internal-testing/dummy_image_text_data
|
||||
--resolution 32
|
||||
--image_column image
|
||||
--train_batch_size 1
|
||||
--gradient_accumulation_steps 1
|
||||
--max_train_steps 6
|
||||
--learning_rate 5.0e-04
|
||||
--scale_lr
|
||||
--lr_scheduler constant
|
||||
--lr_warmup_steps 0
|
||||
--model_config_name_or_path {vqmodel_config_path}
|
||||
--discriminator_config_name_or_path {discriminator_config_path}
|
||||
--checkpointing_steps=1
|
||||
--resume_from_checkpoint={os.path.join(tmpdir, 'checkpoint-4')}
|
||||
--output_dir {tmpdir}
|
||||
--seed=0
|
||||
""".split()
|
||||
|
||||
run_command(self._launch_args + resume_run_args)
|
||||
|
||||
# check can run new fully trained pipeline
|
||||
model = VQModel.from_pretrained(tmpdir, subfolder="vqmodel")
|
||||
image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
|
||||
_ = model(image)
|
||||
|
||||
# no checkpoint-2 -> check old checkpoints do not exist
|
||||
# check new checkpoints exist
|
||||
# In the current script, checkpointing_steps 1 is equivalent to checkpointing_steps 2 as after the generator gets trained for one step,
|
||||
# the discriminator gets trained and loss and saving happens after that. Thus we do not expect to get a checkpoint-5
|
||||
self.assertEqual(
|
||||
{x for x in os.listdir(tmpdir) if "checkpoint" in x},
|
||||
{"checkpoint-4", "checkpoint-6"},
|
||||
)
|
||||
|
||||
def test_vqmodel_checkpointing_use_ema(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
vqmodel_config_path, discriminator_config_path = self.get_vq_and_discriminator_configs(tmpdir)
|
||||
# Run training script with checkpointing
|
||||
# max_train_steps == 4, checkpointing_steps == 2
|
||||
# Should create checkpoints at steps 2, 4
|
||||
|
||||
initial_run_args = f"""
|
||||
examples/vqgan/train_vqgan.py
|
||||
--dataset_name hf-internal-testing/dummy_image_text_data
|
||||
--resolution 32
|
||||
--image_column image
|
||||
--train_batch_size 1
|
||||
--gradient_accumulation_steps 1
|
||||
--max_train_steps 4
|
||||
--learning_rate 5.0e-04
|
||||
--scale_lr
|
||||
--lr_scheduler constant
|
||||
--lr_warmup_steps 0
|
||||
--model_config_name_or_path {vqmodel_config_path}
|
||||
--discriminator_config_name_or_path {discriminator_config_path}
|
||||
--checkpointing_steps=2
|
||||
--output_dir {tmpdir}
|
||||
--use_ema
|
||||
--seed=0
|
||||
""".split()
|
||||
|
||||
run_command(self._launch_args + initial_run_args)
|
||||
|
||||
model = VQModel.from_pretrained(tmpdir, subfolder="vqmodel")
|
||||
image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
|
||||
_ = model(image)
|
||||
|
||||
# check checkpoint directories exist
|
||||
self.assertEqual(
|
||||
{x for x in os.listdir(tmpdir) if "checkpoint" in x},
|
||||
{"checkpoint-2", "checkpoint-4"},
|
||||
)
|
||||
|
||||
# check can run an intermediate checkpoint
|
||||
model = VQModel.from_pretrained(tmpdir, subfolder="checkpoint-2/vqmodel")
|
||||
image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
|
||||
_ = model(image)
|
||||
|
||||
# Remove checkpoint 2 so that we can check only later checkpoints exist after resuming
|
||||
shutil.rmtree(os.path.join(tmpdir, "checkpoint-2"))
|
||||
|
||||
# Run training script for 2 total steps resuming from checkpoint 4
|
||||
|
||||
resume_run_args = f"""
|
||||
examples/vqgan/train_vqgan.py
|
||||
--dataset_name hf-internal-testing/dummy_image_text_data
|
||||
--resolution 32
|
||||
--image_column image
|
||||
--train_batch_size 1
|
||||
--gradient_accumulation_steps 1
|
||||
--max_train_steps 6
|
||||
--learning_rate 5.0e-04
|
||||
--scale_lr
|
||||
--lr_scheduler constant
|
||||
--lr_warmup_steps 0
|
||||
--model_config_name_or_path {vqmodel_config_path}
|
||||
--discriminator_config_name_or_path {discriminator_config_path}
|
||||
--checkpointing_steps=1
|
||||
--resume_from_checkpoint={os.path.join(tmpdir, 'checkpoint-4')}
|
||||
--output_dir {tmpdir}
|
||||
--use_ema
|
||||
--seed=0
|
||||
""".split()
|
||||
|
||||
run_command(self._launch_args + resume_run_args)
|
||||
|
||||
# check can run new fully trained pipeline
|
||||
model = VQModel.from_pretrained(tmpdir, subfolder="vqmodel")
|
||||
image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
|
||||
_ = model(image)
|
||||
|
||||
# no checkpoint-2 -> check old checkpoints do not exist
|
||||
# check new checkpoints exist
|
||||
self.assertEqual(
|
||||
{x for x in os.listdir(tmpdir) if "checkpoint" in x},
|
||||
{"checkpoint-4", "checkpoint-6"},
|
||||
)
|
||||
|
||||
def test_vqmodel_checkpointing_checkpoints_total_limit(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
vqmodel_config_path, discriminator_config_path = self.get_vq_and_discriminator_configs(tmpdir)
|
||||
# Run training script with checkpointing
|
||||
# max_train_steps == 6, checkpointing_steps == 2, checkpoints_total_limit == 2
|
||||
# Should create checkpoints at steps 2, 4, 6
|
||||
# with checkpoint at step 2 deleted
|
||||
|
||||
initial_run_args = f"""
|
||||
examples/vqgan/train_vqgan.py
|
||||
--dataset_name hf-internal-testing/dummy_image_text_data
|
||||
--resolution 32
|
||||
--image_column image
|
||||
--train_batch_size 1
|
||||
--gradient_accumulation_steps 1
|
||||
--max_train_steps 6
|
||||
--learning_rate 5.0e-04
|
||||
--scale_lr
|
||||
--lr_scheduler constant
|
||||
--lr_warmup_steps 0
|
||||
--model_config_name_or_path {vqmodel_config_path}
|
||||
--discriminator_config_name_or_path {discriminator_config_path}
|
||||
--output_dir {tmpdir}
|
||||
--checkpointing_steps=2
|
||||
--checkpoints_total_limit=2
|
||||
--seed=0
|
||||
""".split()
|
||||
|
||||
run_command(self._launch_args + initial_run_args)
|
||||
|
||||
model = VQModel.from_pretrained(tmpdir, subfolder="vqmodel")
|
||||
image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
|
||||
_ = model(image)
|
||||
|
||||
# check checkpoint directories exist
|
||||
# checkpoint-2 should have been deleted
|
||||
self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-4", "checkpoint-6"})
|
||||
|
||||
def test_vqmodel_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
vqmodel_config_path, discriminator_config_path = self.get_vq_and_discriminator_configs(tmpdir)
|
||||
# Run training script with checkpointing
|
||||
# max_train_steps == 4, checkpointing_steps == 2
|
||||
# Should create checkpoints at steps 2, 4
|
||||
|
||||
initial_run_args = f"""
|
||||
examples/vqgan/train_vqgan.py
|
||||
--dataset_name hf-internal-testing/dummy_image_text_data
|
||||
--resolution 32
|
||||
--image_column image
|
||||
--train_batch_size 1
|
||||
--gradient_accumulation_steps 1
|
||||
--max_train_steps 4
|
||||
--learning_rate 5.0e-04
|
||||
--scale_lr
|
||||
--lr_scheduler constant
|
||||
--lr_warmup_steps 0
|
||||
--model_config_name_or_path {vqmodel_config_path}
|
||||
--discriminator_config_name_or_path {discriminator_config_path}
|
||||
--checkpointing_steps=2
|
||||
--output_dir {tmpdir}
|
||||
--seed=0
|
||||
""".split()
|
||||
|
||||
run_command(self._launch_args + initial_run_args)
|
||||
|
||||
model = VQModel.from_pretrained(tmpdir, subfolder="vqmodel")
|
||||
image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
|
||||
_ = model(image)
|
||||
|
||||
# check checkpoint directories exist
|
||||
self.assertEqual(
|
||||
{x for x in os.listdir(tmpdir) if "checkpoint" in x},
|
||||
{"checkpoint-2", "checkpoint-4"},
|
||||
)
|
||||
|
||||
# resume and we should try to checkpoint at 6, where we'll have to remove
|
||||
# checkpoint-2 and checkpoint-4 instead of just a single previous checkpoint
|
||||
|
||||
resume_run_args = f"""
|
||||
examples/vqgan/train_vqgan.py
|
||||
--dataset_name hf-internal-testing/dummy_image_text_data
|
||||
--resolution 32
|
||||
--image_column image
|
||||
--train_batch_size 1
|
||||
--gradient_accumulation_steps 1
|
||||
--max_train_steps 8
|
||||
--learning_rate 5.0e-04
|
||||
--scale_lr
|
||||
--lr_scheduler constant
|
||||
--lr_warmup_steps 0
|
||||
--model_config_name_or_path {vqmodel_config_path}
|
||||
--discriminator_config_name_or_path {discriminator_config_path}
|
||||
--output_dir {tmpdir}
|
||||
--checkpointing_steps=2
|
||||
--resume_from_checkpoint={os.path.join(tmpdir, 'checkpoint-4')}
|
||||
--checkpoints_total_limit=2
|
||||
--seed=0
|
||||
""".split()
|
||||
|
||||
run_command(self._launch_args + resume_run_args)
|
||||
|
||||
model = VQModel.from_pretrained(tmpdir, subfolder="vqmodel")
|
||||
image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
|
||||
_ = model(image)
|
||||
|
||||
# check checkpoint directories exist
|
||||
self.assertEqual(
|
||||
{x for x in os.listdir(tmpdir) if "checkpoint" in x},
|
||||
{"checkpoint-6", "checkpoint-8"},
|
||||
)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,156 @@
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from .configuration_utils import ConfigMixin, register_to_config
|
||||
from .utils import CONFIG_NAME
|
||||
|
||||
|
||||
class PipelineCallback(ConfigMixin):
|
||||
"""
|
||||
Base class for all the official callbacks used in a pipeline. This class provides a structure for implementing
|
||||
custom callbacks and ensures that all callbacks have a consistent interface.
|
||||
|
||||
Please implement the following:
|
||||
`tensor_inputs`: This should return a list of tensor inputs specific to your callback. You will only be able to
|
||||
include
|
||||
variables listed in the `._callback_tensor_inputs` attribute of your pipeline class.
|
||||
`callback_fn`: This method defines the core functionality of your callback.
|
||||
"""
|
||||
|
||||
config_name = CONFIG_NAME
|
||||
|
||||
@register_to_config
|
||||
def __init__(self, cutoff_step_ratio=1.0, cutoff_step_index=None):
|
||||
super().__init__()
|
||||
|
||||
if (cutoff_step_ratio is None and cutoff_step_index is None) or (
|
||||
cutoff_step_ratio is not None and cutoff_step_index is not None
|
||||
):
|
||||
raise ValueError("Either cutoff_step_ratio or cutoff_step_index should be provided, not both or none.")
|
||||
|
||||
if cutoff_step_ratio is not None and (
|
||||
not isinstance(cutoff_step_ratio, float) or not (0.0 <= cutoff_step_ratio <= 1.0)
|
||||
):
|
||||
raise ValueError("cutoff_step_ratio must be a float between 0.0 and 1.0.")
|
||||
|
||||
@property
|
||||
def tensor_inputs(self) -> List[str]:
|
||||
raise NotImplementedError(f"You need to set the attribute `tensor_inputs` for {self.__class__}")
|
||||
|
||||
def callback_fn(self, pipeline, step_index, timesteps, callback_kwargs) -> Dict[str, Any]:
|
||||
raise NotImplementedError(f"You need to implement the method `callback_fn` for {self.__class__}")
|
||||
|
||||
def __call__(self, pipeline, step_index, timestep, callback_kwargs) -> Dict[str, Any]:
|
||||
return self.callback_fn(pipeline, step_index, timestep, callback_kwargs)
|
||||
|
||||
|
||||
class MultiPipelineCallbacks:
|
||||
"""
|
||||
This class is designed to handle multiple pipeline callbacks. It accepts a list of PipelineCallback objects and
|
||||
provides a unified interface for calling all of them.
|
||||
"""
|
||||
|
||||
def __init__(self, callbacks: List[PipelineCallback]):
|
||||
self.callbacks = callbacks
|
||||
|
||||
@property
|
||||
def tensor_inputs(self) -> List[str]:
|
||||
return [input for callback in self.callbacks for input in callback.tensor_inputs]
|
||||
|
||||
def __call__(self, pipeline, step_index, timestep, callback_kwargs) -> Dict[str, Any]:
|
||||
"""
|
||||
Calls all the callbacks in order with the given arguments and returns the final callback_kwargs.
|
||||
"""
|
||||
for callback in self.callbacks:
|
||||
callback_kwargs = callback(pipeline, step_index, timestep, callback_kwargs)
|
||||
|
||||
return callback_kwargs
|
||||
|
||||
|
||||
class SDCFGCutoffCallback(PipelineCallback):
|
||||
"""
|
||||
Callback function for Stable Diffusion Pipelines. After certain number of steps (set by `cutoff_step_ratio` or
|
||||
`cutoff_step_index`), this callback will disable the CFG.
|
||||
|
||||
Note: This callback mutates the pipeline by changing the `_guidance_scale` attribute to 0.0 after the cutoff step.
|
||||
"""
|
||||
|
||||
tensor_inputs = ["prompt_embeds"]
|
||||
|
||||
def callback_fn(self, pipeline, step_index, timestep, callback_kwargs) -> Dict[str, Any]:
|
||||
cutoff_step_ratio = self.config.cutoff_step_ratio
|
||||
cutoff_step_index = self.config.cutoff_step_index
|
||||
|
||||
# Use cutoff_step_index if it's not None, otherwise use cutoff_step_ratio
|
||||
cutoff_step = (
|
||||
cutoff_step_index if cutoff_step_index is not None else int(pipeline.num_timesteps * cutoff_step_ratio)
|
||||
)
|
||||
|
||||
if step_index == cutoff_step:
|
||||
prompt_embeds = callback_kwargs[self.tensor_inputs[0]]
|
||||
prompt_embeds = prompt_embeds[-1:] # "-1" denotes the embeddings for conditional text tokens.
|
||||
|
||||
pipeline._guidance_scale = 0.0
|
||||
|
||||
callback_kwargs[self.tensor_inputs[0]] = prompt_embeds
|
||||
return callback_kwargs
|
||||
|
||||
|
||||
class SDXLCFGCutoffCallback(PipelineCallback):
|
||||
"""
|
||||
Callback function for Stable Diffusion XL Pipelines. After certain number of steps (set by `cutoff_step_ratio` or
|
||||
`cutoff_step_index`), this callback will disable the CFG.
|
||||
|
||||
Note: This callback mutates the pipeline by changing the `_guidance_scale` attribute to 0.0 after the cutoff step.
|
||||
"""
|
||||
|
||||
tensor_inputs = ["prompt_embeds", "add_text_embeds", "add_time_ids"]
|
||||
|
||||
def callback_fn(self, pipeline, step_index, timestep, callback_kwargs) -> Dict[str, Any]:
|
||||
cutoff_step_ratio = self.config.cutoff_step_ratio
|
||||
cutoff_step_index = self.config.cutoff_step_index
|
||||
|
||||
# Use cutoff_step_index if it's not None, otherwise use cutoff_step_ratio
|
||||
cutoff_step = (
|
||||
cutoff_step_index if cutoff_step_index is not None else int(pipeline.num_timesteps * cutoff_step_ratio)
|
||||
)
|
||||
|
||||
if step_index == cutoff_step:
|
||||
prompt_embeds = callback_kwargs[self.tensor_inputs[0]]
|
||||
prompt_embeds = prompt_embeds[-1:] # "-1" denotes the embeddings for conditional text tokens.
|
||||
|
||||
add_text_embeds = callback_kwargs[self.tensor_inputs[1]]
|
||||
add_text_embeds = add_text_embeds[-1:] # "-1" denotes the embeddings for conditional pooled text tokens
|
||||
|
||||
add_time_ids = callback_kwargs[self.tensor_inputs[2]]
|
||||
add_time_ids = add_time_ids[-1:] # "-1" denotes the embeddings for conditional added time vector
|
||||
|
||||
pipeline._guidance_scale = 0.0
|
||||
|
||||
callback_kwargs[self.tensor_inputs[0]] = prompt_embeds
|
||||
callback_kwargs[self.tensor_inputs[1]] = add_text_embeds
|
||||
callback_kwargs[self.tensor_inputs[2]] = add_time_ids
|
||||
return callback_kwargs
|
||||
|
||||
|
||||
class IPAdapterScaleCutoffCallback(PipelineCallback):
|
||||
"""
|
||||
Callback function for any pipeline that inherits `IPAdapterMixin`. After certain number of steps (set by
|
||||
`cutoff_step_ratio` or `cutoff_step_index`), this callback will set the IP Adapter scale to `0.0`.
|
||||
|
||||
Note: This callback mutates the IP Adapter attention processors by setting the scale to 0.0 after the cutoff step.
|
||||
"""
|
||||
|
||||
tensor_inputs = []
|
||||
|
||||
def callback_fn(self, pipeline, step_index, timestep, callback_kwargs) -> Dict[str, Any]:
|
||||
cutoff_step_ratio = self.config.cutoff_step_ratio
|
||||
cutoff_step_index = self.config.cutoff_step_index
|
||||
|
||||
# Use cutoff_step_index if it's not None, otherwise use cutoff_step_ratio
|
||||
cutoff_step = (
|
||||
cutoff_step_index if cutoff_step_index is not None else int(pipeline.num_timesteps * cutoff_step_ratio)
|
||||
)
|
||||
|
||||
if step_index == cutoff_step:
|
||||
pipeline.set_ip_adapter_scale(0.0)
|
||||
return callback_kwargs
|
||||
@@ -13,12 +13,25 @@
|
||||
# limitations under the License.
|
||||
|
||||
import platform
|
||||
import subprocess
|
||||
from argparse import ArgumentParser
|
||||
|
||||
import huggingface_hub
|
||||
|
||||
from .. import __version__ as version
|
||||
from ..utils import is_accelerate_available, is_torch_available, is_transformers_available, is_xformers_available
|
||||
from ..utils import (
|
||||
is_accelerate_available,
|
||||
is_bitsandbytes_available,
|
||||
is_flax_available,
|
||||
is_google_colab,
|
||||
is_notebook,
|
||||
is_peft_available,
|
||||
is_safetensors_available,
|
||||
is_torch_available,
|
||||
is_transformers_available,
|
||||
is_xformers_available,
|
||||
)
|
||||
from ..utils.testing_utils import get_python_version
|
||||
from . import BaseDiffusersCLICommand
|
||||
|
||||
|
||||
@@ -28,13 +41,19 @@ def info_command_factory(_):
|
||||
|
||||
class EnvironmentCommand(BaseDiffusersCLICommand):
|
||||
@staticmethod
|
||||
def register_subcommand(parser: ArgumentParser):
|
||||
def register_subcommand(parser: ArgumentParser) -> None:
|
||||
download_parser = parser.add_parser("env")
|
||||
download_parser.set_defaults(func=info_command_factory)
|
||||
|
||||
def run(self):
|
||||
def run(self) -> dict:
|
||||
hub_version = huggingface_hub.__version__
|
||||
|
||||
safetensors_version = "not installed"
|
||||
if is_safetensors_available():
|
||||
import safetensors
|
||||
|
||||
safetensors_version = safetensors.__version__
|
||||
|
||||
pt_version = "not installed"
|
||||
pt_cuda_available = "NA"
|
||||
if is_torch_available():
|
||||
@@ -43,6 +62,20 @@ class EnvironmentCommand(BaseDiffusersCLICommand):
|
||||
pt_version = torch.__version__
|
||||
pt_cuda_available = torch.cuda.is_available()
|
||||
|
||||
flax_version = "not installed"
|
||||
jax_version = "not installed"
|
||||
jaxlib_version = "not installed"
|
||||
jax_backend = "NA"
|
||||
if is_flax_available():
|
||||
import flax
|
||||
import jax
|
||||
import jaxlib
|
||||
|
||||
flax_version = flax.__version__
|
||||
jax_version = jax.__version__
|
||||
jaxlib_version = jaxlib.__version__
|
||||
jax_backend = jax.lib.xla_bridge.get_backend().platform
|
||||
|
||||
transformers_version = "not installed"
|
||||
if is_transformers_available():
|
||||
import transformers
|
||||
@@ -55,21 +88,92 @@ class EnvironmentCommand(BaseDiffusersCLICommand):
|
||||
|
||||
accelerate_version = accelerate.__version__
|
||||
|
||||
peft_version = "not installed"
|
||||
if is_peft_available():
|
||||
import peft
|
||||
|
||||
peft_version = peft.__version__
|
||||
|
||||
bitsandbytes_version = "not installed"
|
||||
if is_bitsandbytes_available():
|
||||
import bitsandbytes
|
||||
|
||||
bitsandbytes_version = bitsandbytes.__version__
|
||||
|
||||
xformers_version = "not installed"
|
||||
if is_xformers_available():
|
||||
import xformers
|
||||
|
||||
xformers_version = xformers.__version__
|
||||
|
||||
if get_python_version() >= (3, 10):
|
||||
platform_info = f"{platform.freedesktop_os_release().get('PRETTY_NAME', None)} - {platform.platform()}"
|
||||
else:
|
||||
platform_info = platform.platform()
|
||||
|
||||
is_notebook_str = "Yes" if is_notebook() else "No"
|
||||
|
||||
is_google_colab_str = "Yes" if is_google_colab() else "No"
|
||||
|
||||
accelerator = "NA"
|
||||
if platform.system() in {"Linux", "Windows"}:
|
||||
try:
|
||||
sp = subprocess.Popen(
|
||||
["nvidia-smi", "--query-gpu=gpu_name,memory.total", "--format=csv,noheader"],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
)
|
||||
out_str, _ = sp.communicate()
|
||||
out_str = out_str.decode("utf-8")
|
||||
|
||||
if len(out_str) > 0:
|
||||
accelerator = out_str.strip() + " VRAM"
|
||||
except FileNotFoundError:
|
||||
pass
|
||||
elif platform.system() == "Darwin": # Mac OS
|
||||
try:
|
||||
sp = subprocess.Popen(
|
||||
["system_profiler", "SPDisplaysDataType"],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
)
|
||||
out_str, _ = sp.communicate()
|
||||
out_str = out_str.decode("utf-8")
|
||||
|
||||
start = out_str.find("Chipset Model:")
|
||||
if start != -1:
|
||||
start += len("Chipset Model:")
|
||||
end = out_str.find("\n", start)
|
||||
accelerator = out_str[start:end].strip()
|
||||
|
||||
start = out_str.find("VRAM (Total):")
|
||||
if start != -1:
|
||||
start += len("VRAM (Total):")
|
||||
end = out_str.find("\n", start)
|
||||
accelerator += " VRAM: " + out_str[start:end].strip()
|
||||
except FileNotFoundError:
|
||||
pass
|
||||
else:
|
||||
print("It seems you are running an unusual OS. Could you fill in the accelerator manually?")
|
||||
|
||||
info = {
|
||||
"`diffusers` version": version,
|
||||
"Platform": platform.platform(),
|
||||
"🤗 Diffusers version": version,
|
||||
"Platform": platform_info,
|
||||
"Running on a notebook?": is_notebook_str,
|
||||
"Running on Google Colab?": is_google_colab_str,
|
||||
"Python version": platform.python_version(),
|
||||
"PyTorch version (GPU?)": f"{pt_version} ({pt_cuda_available})",
|
||||
"Flax version (CPU?/GPU?/TPU?)": f"{flax_version} ({jax_backend})",
|
||||
"Jax version": jax_version,
|
||||
"JaxLib version": jaxlib_version,
|
||||
"Huggingface_hub version": hub_version,
|
||||
"Transformers version": transformers_version,
|
||||
"Accelerate version": accelerate_version,
|
||||
"PEFT version": peft_version,
|
||||
"Bitsandbytes version": bitsandbytes_version,
|
||||
"Safetensors version": safetensors_version,
|
||||
"xFormers version": xformers_version,
|
||||
"Accelerator": accelerator,
|
||||
"Using GPU in script?": "<fill in>",
|
||||
"Using distributed or parallel set-up in script?": "<fill in>",
|
||||
}
|
||||
@@ -80,5 +184,5 @@ class EnvironmentCommand(BaseDiffusersCLICommand):
|
||||
return info
|
||||
|
||||
@staticmethod
|
||||
def format_dict(d):
|
||||
def format_dict(d: dict) -> str:
|
||||
return "\n".join([f"- {prop}: {val}" for prop, val in d.items()]) + "\n"
|
||||
|
||||
@@ -363,7 +363,7 @@ class LoraLoaderMixin:
|
||||
is_model_cpu_offload = False
|
||||
is_sequential_cpu_offload = False
|
||||
|
||||
if _pipeline is not None:
|
||||
if _pipeline is not None and _pipeline.hf_device_map is None:
|
||||
for _, component in _pipeline.components.items():
|
||||
if isinstance(component, nn.Module) and hasattr(component, "_hf_hook"):
|
||||
if not is_model_cpu_offload:
|
||||
|
||||
@@ -826,8 +826,8 @@ def convert_ldm_unet_checkpoint(checkpoint, config, extract_ema=False, **kwargs)
|
||||
|
||||
# at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
|
||||
if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema:
|
||||
logger.warninging("Checkpoint has both EMA and non-EMA weights.")
|
||||
logger.warninging(
|
||||
logger.warning("Checkpoint has both EMA and non-EMA weights.")
|
||||
logger.warning(
|
||||
"In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
|
||||
" weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
|
||||
)
|
||||
@@ -837,7 +837,7 @@ def convert_ldm_unet_checkpoint(checkpoint, config, extract_ema=False, **kwargs)
|
||||
unet_state_dict[key.replace(unet_key, "")] = checkpoint.get(flat_ema_key)
|
||||
else:
|
||||
if sum(k.startswith("model_ema") for k in keys) > 100:
|
||||
logger.warninging(
|
||||
logger.warning(
|
||||
"In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
|
||||
" weights (usually better for inference), please make sure to add the `--extract_ema` flag."
|
||||
)
|
||||
|
||||
@@ -419,19 +419,20 @@ class TextualInversionLoaderMixin:
|
||||
# 7.1 Offload all hooks in case the pipeline was cpu offloaded before make sure, we offload and onload again
|
||||
is_model_cpu_offload = False
|
||||
is_sequential_cpu_offload = False
|
||||
for _, component in self.components.items():
|
||||
if isinstance(component, nn.Module):
|
||||
if hasattr(component, "_hf_hook"):
|
||||
is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload)
|
||||
is_sequential_cpu_offload = (
|
||||
isinstance(getattr(component, "_hf_hook"), AlignDevicesHook)
|
||||
or hasattr(component._hf_hook, "hooks")
|
||||
and isinstance(component._hf_hook.hooks[0], AlignDevicesHook)
|
||||
)
|
||||
logger.info(
|
||||
"Accelerate hooks detected. Since you have called `load_textual_inversion()`, the previous hooks will be first removed. Then the textual inversion parameters will be loaded and the hooks will be applied again."
|
||||
)
|
||||
remove_hook_from_module(component, recurse=is_sequential_cpu_offload)
|
||||
if self.hf_device_map is None:
|
||||
for _, component in self.components.items():
|
||||
if isinstance(component, nn.Module):
|
||||
if hasattr(component, "_hf_hook"):
|
||||
is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload)
|
||||
is_sequential_cpu_offload = (
|
||||
isinstance(getattr(component, "_hf_hook"), AlignDevicesHook)
|
||||
or hasattr(component._hf_hook, "hooks")
|
||||
and isinstance(component._hf_hook.hooks[0], AlignDevicesHook)
|
||||
)
|
||||
logger.info(
|
||||
"Accelerate hooks detected. Since you have called `load_textual_inversion()`, the previous hooks will be first removed. Then the textual inversion parameters will be loaded and the hooks will be applied again."
|
||||
)
|
||||
remove_hook_from_module(component, recurse=is_sequential_cpu_offload)
|
||||
|
||||
# 7.2 save expected device and dtype
|
||||
device = text_encoder.device
|
||||
|
||||
@@ -300,7 +300,7 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin):
|
||||
decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)]
|
||||
decoded = torch.cat(decoded_slices)
|
||||
else:
|
||||
decoded = self._decode(z).sample
|
||||
decoded = self._decode(z, return_dict=False)[0]
|
||||
|
||||
if not return_dict:
|
||||
return (decoded,)
|
||||
|
||||
@@ -41,6 +41,7 @@ class DecoderOutput(BaseOutput):
|
||||
"""
|
||||
|
||||
sample: torch.Tensor
|
||||
commit_loss: Optional[torch.FloatTensor] = None
|
||||
|
||||
|
||||
class Encoder(nn.Module):
|
||||
|
||||
@@ -0,0 +1,149 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2024 The HuggingFace Inc. team.
|
||||
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import inspect
|
||||
import os
|
||||
from collections import OrderedDict
|
||||
from typing import List, Optional, Union
|
||||
|
||||
import safetensors
|
||||
import torch
|
||||
|
||||
from ..utils import (
|
||||
SAFETENSORS_FILE_EXTENSION,
|
||||
is_accelerate_available,
|
||||
is_torch_version,
|
||||
logging,
|
||||
)
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
if is_accelerate_available():
|
||||
from accelerate import infer_auto_device_map
|
||||
from accelerate.utils import get_balanced_memory, get_max_memory, set_module_tensor_to_device
|
||||
|
||||
|
||||
# Adapted from `transformers` (see modeling_utils.py)
|
||||
def _determine_device_map(model: torch.nn.Module, device_map, max_memory, torch_dtype):
|
||||
if isinstance(device_map, str):
|
||||
no_split_modules = model._get_no_split_modules(device_map)
|
||||
device_map_kwargs = {"no_split_module_classes": no_split_modules}
|
||||
|
||||
if device_map != "sequential":
|
||||
max_memory = get_balanced_memory(
|
||||
model,
|
||||
dtype=torch_dtype,
|
||||
low_zero=(device_map == "balanced_low_0"),
|
||||
max_memory=max_memory,
|
||||
**device_map_kwargs,
|
||||
)
|
||||
else:
|
||||
max_memory = get_max_memory(max_memory)
|
||||
|
||||
device_map_kwargs["max_memory"] = max_memory
|
||||
device_map = infer_auto_device_map(model, dtype=torch_dtype, **device_map_kwargs)
|
||||
|
||||
return device_map
|
||||
|
||||
|
||||
def load_state_dict(checkpoint_file: Union[str, os.PathLike], variant: Optional[str] = None):
|
||||
"""
|
||||
Reads a checkpoint file, returning properly formatted errors if they arise.
|
||||
"""
|
||||
try:
|
||||
file_extension = os.path.basename(checkpoint_file).split(".")[-1]
|
||||
if file_extension == SAFETENSORS_FILE_EXTENSION:
|
||||
return safetensors.torch.load_file(checkpoint_file, device="cpu")
|
||||
else:
|
||||
weights_only_kwarg = {"weights_only": True} if is_torch_version(">=", "1.13") else {}
|
||||
return torch.load(
|
||||
checkpoint_file,
|
||||
map_location="cpu",
|
||||
**weights_only_kwarg,
|
||||
)
|
||||
except Exception as e:
|
||||
try:
|
||||
with open(checkpoint_file) as f:
|
||||
if f.read().startswith("version"):
|
||||
raise OSError(
|
||||
"You seem to have cloned a repository without having git-lfs installed. Please install "
|
||||
"git-lfs and run `git lfs install` followed by `git lfs pull` in the folder "
|
||||
"you cloned."
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Unable to locate the file {checkpoint_file} which is necessary to load this pretrained "
|
||||
"model. Make sure you have saved the model properly."
|
||||
) from e
|
||||
except (UnicodeDecodeError, ValueError):
|
||||
raise OSError(
|
||||
f"Unable to load weights from checkpoint file for '{checkpoint_file}' " f"at '{checkpoint_file}'. "
|
||||
)
|
||||
|
||||
|
||||
def load_model_dict_into_meta(
|
||||
model,
|
||||
state_dict: OrderedDict,
|
||||
device: Optional[Union[str, torch.device]] = None,
|
||||
dtype: Optional[Union[str, torch.dtype]] = None,
|
||||
model_name_or_path: Optional[str] = None,
|
||||
) -> List[str]:
|
||||
device = device or torch.device("cpu")
|
||||
dtype = dtype or torch.float32
|
||||
|
||||
accepts_dtype = "dtype" in set(inspect.signature(set_module_tensor_to_device).parameters.keys())
|
||||
|
||||
unexpected_keys = []
|
||||
empty_state_dict = model.state_dict()
|
||||
for param_name, param in state_dict.items():
|
||||
if param_name not in empty_state_dict:
|
||||
unexpected_keys.append(param_name)
|
||||
continue
|
||||
|
||||
if empty_state_dict[param_name].shape != param.shape:
|
||||
model_name_or_path_str = f"{model_name_or_path} " if model_name_or_path is not None else ""
|
||||
raise ValueError(
|
||||
f"Cannot load {model_name_or_path_str}because {param_name} expected shape {empty_state_dict[param_name]}, but got {param.shape}. If you want to instead overwrite randomly initialized weights, please make sure to pass both `low_cpu_mem_usage=False` and `ignore_mismatched_sizes=True`. For more information, see also: https://github.com/huggingface/diffusers/issues/1619#issuecomment-1345604389 as an example."
|
||||
)
|
||||
|
||||
if accepts_dtype:
|
||||
set_module_tensor_to_device(model, param_name, device, value=param, dtype=dtype)
|
||||
else:
|
||||
set_module_tensor_to_device(model, param_name, device, value=param)
|
||||
return unexpected_keys
|
||||
|
||||
|
||||
def _load_state_dict_into_model(model_to_load, state_dict: OrderedDict) -> List[str]:
|
||||
# Convert old format to new format if needed from a PyTorch state_dict
|
||||
# copy state_dict so _load_from_state_dict can modify it
|
||||
state_dict = state_dict.copy()
|
||||
error_msgs = []
|
||||
|
||||
# PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants
|
||||
# so we need to apply the function recursively.
|
||||
def load(module: torch.nn.Module, prefix: str = ""):
|
||||
args = (state_dict, prefix, {}, True, [], [], error_msgs)
|
||||
module._load_from_state_dict(*args)
|
||||
|
||||
for name, child in module._modules.items():
|
||||
if child is not None:
|
||||
load(child, prefix + name + ".")
|
||||
|
||||
load(model_to_load)
|
||||
|
||||
return error_msgs
|
||||
@@ -33,7 +33,6 @@ from .. import __version__
|
||||
from ..utils import (
|
||||
CONFIG_NAME,
|
||||
FLAX_WEIGHTS_NAME,
|
||||
SAFETENSORS_FILE_EXTENSION,
|
||||
SAFETENSORS_WEIGHTS_NAME,
|
||||
WEIGHTS_NAME,
|
||||
_add_variant,
|
||||
@@ -44,6 +43,12 @@ from ..utils import (
|
||||
logging,
|
||||
)
|
||||
from ..utils.hub_utils import PushToHubMixin, load_or_create_model_card, populate_model_card
|
||||
from .model_loading_utils import (
|
||||
_determine_device_map,
|
||||
_load_state_dict_into_model,
|
||||
load_model_dict_into_meta,
|
||||
load_state_dict,
|
||||
)
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
@@ -57,9 +62,6 @@ else:
|
||||
|
||||
if is_accelerate_available():
|
||||
import accelerate
|
||||
from accelerate import infer_auto_device_map
|
||||
from accelerate.utils import get_balanced_memory, get_max_memory, set_module_tensor_to_device
|
||||
from accelerate.utils.versions import is_torch_version
|
||||
|
||||
|
||||
def get_parameter_device(parameter: torch.nn.Module) -> torch.device:
|
||||
@@ -100,117 +102,6 @@ def get_parameter_dtype(parameter: torch.nn.Module) -> torch.dtype:
|
||||
return first_tuple[1].dtype
|
||||
|
||||
|
||||
# Adapted from `transformers` (see modeling_utils.py)
|
||||
def _determine_device_map(model: "ModelMixin", device_map, max_memory, torch_dtype):
|
||||
if isinstance(device_map, str):
|
||||
no_split_modules = model._get_no_split_modules(device_map)
|
||||
device_map_kwargs = {"no_split_module_classes": no_split_modules}
|
||||
|
||||
if device_map != "sequential":
|
||||
max_memory = get_balanced_memory(
|
||||
model,
|
||||
dtype=torch_dtype,
|
||||
low_zero=(device_map == "balanced_low_0"),
|
||||
max_memory=max_memory,
|
||||
**device_map_kwargs,
|
||||
)
|
||||
else:
|
||||
max_memory = get_max_memory(max_memory)
|
||||
|
||||
device_map_kwargs["max_memory"] = max_memory
|
||||
device_map = infer_auto_device_map(model, dtype=torch_dtype, **device_map_kwargs)
|
||||
|
||||
return device_map
|
||||
|
||||
|
||||
def load_state_dict(checkpoint_file: Union[str, os.PathLike], variant: Optional[str] = None):
|
||||
"""
|
||||
Reads a checkpoint file, returning properly formatted errors if they arise.
|
||||
"""
|
||||
try:
|
||||
file_extension = os.path.basename(checkpoint_file).split(".")[-1]
|
||||
if file_extension == SAFETENSORS_FILE_EXTENSION:
|
||||
return safetensors.torch.load_file(checkpoint_file, device="cpu")
|
||||
else:
|
||||
weights_only_kwarg = {"weights_only": True} if is_torch_version(">=", "1.13") else {}
|
||||
return torch.load(
|
||||
checkpoint_file,
|
||||
map_location="cpu",
|
||||
**weights_only_kwarg,
|
||||
)
|
||||
except Exception as e:
|
||||
try:
|
||||
with open(checkpoint_file) as f:
|
||||
if f.read().startswith("version"):
|
||||
raise OSError(
|
||||
"You seem to have cloned a repository without having git-lfs installed. Please install "
|
||||
"git-lfs and run `git lfs install` followed by `git lfs pull` in the folder "
|
||||
"you cloned."
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Unable to locate the file {checkpoint_file} which is necessary to load this pretrained "
|
||||
"model. Make sure you have saved the model properly."
|
||||
) from e
|
||||
except (UnicodeDecodeError, ValueError):
|
||||
raise OSError(
|
||||
f"Unable to load weights from checkpoint file for '{checkpoint_file}' " f"at '{checkpoint_file}'. "
|
||||
)
|
||||
|
||||
|
||||
def load_model_dict_into_meta(
|
||||
model,
|
||||
state_dict: OrderedDict,
|
||||
device: Optional[Union[str, torch.device]] = None,
|
||||
dtype: Optional[Union[str, torch.dtype]] = None,
|
||||
model_name_or_path: Optional[str] = None,
|
||||
) -> List[str]:
|
||||
device = device or torch.device("cpu")
|
||||
dtype = dtype or torch.float32
|
||||
|
||||
accepts_dtype = "dtype" in set(inspect.signature(set_module_tensor_to_device).parameters.keys())
|
||||
|
||||
unexpected_keys = []
|
||||
empty_state_dict = model.state_dict()
|
||||
for param_name, param in state_dict.items():
|
||||
if param_name not in empty_state_dict:
|
||||
unexpected_keys.append(param_name)
|
||||
continue
|
||||
|
||||
if empty_state_dict[param_name].shape != param.shape:
|
||||
model_name_or_path_str = f"{model_name_or_path} " if model_name_or_path is not None else ""
|
||||
raise ValueError(
|
||||
f"Cannot load {model_name_or_path_str}because {param_name} expected shape {empty_state_dict[param_name]}, but got {param.shape}. If you want to instead overwrite randomly initialized weights, please make sure to pass both `low_cpu_mem_usage=False` and `ignore_mismatched_sizes=True`. For more information, see also: https://github.com/huggingface/diffusers/issues/1619#issuecomment-1345604389 as an example."
|
||||
)
|
||||
|
||||
if accepts_dtype:
|
||||
set_module_tensor_to_device(model, param_name, device, value=param, dtype=dtype)
|
||||
else:
|
||||
set_module_tensor_to_device(model, param_name, device, value=param)
|
||||
return unexpected_keys
|
||||
|
||||
|
||||
def _load_state_dict_into_model(model_to_load, state_dict: OrderedDict) -> List[str]:
|
||||
# Convert old format to new format if needed from a PyTorch state_dict
|
||||
# copy state_dict so _load_from_state_dict can modify it
|
||||
state_dict = state_dict.copy()
|
||||
error_msgs = []
|
||||
|
||||
# PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants
|
||||
# so we need to apply the function recursively.
|
||||
def load(module: torch.nn.Module, prefix: str = ""):
|
||||
args = (state_dict, prefix, {}, True, [], [], error_msgs)
|
||||
module._load_from_state_dict(*args)
|
||||
|
||||
for name, child in module._modules.items():
|
||||
if child is not None:
|
||||
load(child, prefix + name + ".")
|
||||
|
||||
load(model_to_load)
|
||||
|
||||
return error_msgs
|
||||
|
||||
|
||||
class ModelMixin(torch.nn.Module, PushToHubMixin):
|
||||
r"""
|
||||
Base class for all models.
|
||||
|
||||
@@ -685,7 +685,7 @@ class UNet2DConditionModel(
|
||||
positive_len = 768
|
||||
if isinstance(cross_attention_dim, int):
|
||||
positive_len = cross_attention_dim
|
||||
elif isinstance(cross_attention_dim, tuple) or isinstance(cross_attention_dim, list):
|
||||
elif isinstance(cross_attention_dim, (list, tuple)):
|
||||
positive_len = cross_attention_dim[0]
|
||||
|
||||
feature_type = "text-only" if attention_type == "gated" else "text-image"
|
||||
|
||||
@@ -15,6 +15,7 @@ from typing import Any, Dict, Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torch.utils.checkpoint
|
||||
|
||||
from ...configuration_utils import ConfigMixin, FrozenDict, register_to_config
|
||||
@@ -27,6 +28,9 @@ from ..attention_processor import (
|
||||
AttentionProcessor,
|
||||
AttnAddedKVProcessor,
|
||||
AttnProcessor,
|
||||
AttnProcessor2_0,
|
||||
IPAdapterAttnProcessor,
|
||||
IPAdapterAttnProcessor2_0,
|
||||
)
|
||||
from ..embeddings import TimestepEmbedding, Timesteps
|
||||
from ..modeling_utils import ModelMixin
|
||||
@@ -490,6 +494,36 @@ class UNetMotionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
|
||||
model.time_proj.load_state_dict(unet.time_proj.state_dict())
|
||||
model.time_embedding.load_state_dict(unet.time_embedding.state_dict())
|
||||
|
||||
if any(
|
||||
isinstance(proc, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0))
|
||||
for proc in unet.attn_processors.values()
|
||||
):
|
||||
attn_procs = {}
|
||||
for name, processor in unet.attn_processors.items():
|
||||
if name.endswith("attn1.processor"):
|
||||
attn_processor_class = (
|
||||
AttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else AttnProcessor
|
||||
)
|
||||
attn_procs[name] = attn_processor_class()
|
||||
else:
|
||||
attn_processor_class = (
|
||||
IPAdapterAttnProcessor2_0
|
||||
if hasattr(F, "scaled_dot_product_attention")
|
||||
else IPAdapterAttnProcessor
|
||||
)
|
||||
attn_procs[name] = attn_processor_class(
|
||||
hidden_size=processor.hidden_size,
|
||||
cross_attention_dim=processor.cross_attention_dim,
|
||||
scale=processor.scale,
|
||||
num_tokens=processor.num_tokens,
|
||||
)
|
||||
for name, processor in model.attn_processors.items():
|
||||
if name not in attn_procs:
|
||||
attn_procs[name] = processor.__class__()
|
||||
model.set_attn_processor(attn_procs)
|
||||
model.config.encoder_hid_dim_type = "ip_image_proj"
|
||||
model.encoder_hid_proj = unet.encoder_hid_proj
|
||||
|
||||
for i, down_block in enumerate(unet.down_blocks):
|
||||
model.down_blocks[i].resnets.load_state_dict(down_block.resnets.state_dict())
|
||||
if hasattr(model.down_blocks[i], "attentions"):
|
||||
|
||||
@@ -142,18 +142,20 @@ class VQModel(ModelMixin, ConfigMixin):
|
||||
) -> Union[DecoderOutput, torch.Tensor]:
|
||||
# also go through quantization layer
|
||||
if not force_not_quantize:
|
||||
quant, _, _ = self.quantize(h)
|
||||
quant, commit_loss, _ = self.quantize(h)
|
||||
elif self.config.lookup_from_codebook:
|
||||
quant = self.quantize.get_codebook_entry(h, shape)
|
||||
commit_loss = torch.zeros((h.shape[0])).to(h.device, dtype=h.dtype)
|
||||
else:
|
||||
quant = h
|
||||
commit_loss = torch.zeros((h.shape[0])).to(h.device, dtype=h.dtype)
|
||||
quant2 = self.post_quant_conv(quant)
|
||||
dec = self.decoder(quant2, quant if self.config.norm_type == "spatial" else None)
|
||||
|
||||
if not return_dict:
|
||||
return (dec,)
|
||||
return dec, commit_loss
|
||||
|
||||
return DecoderOutput(sample=dec)
|
||||
return DecoderOutput(sample=dec, commit_loss=commit_loss)
|
||||
|
||||
def forward(
|
||||
self, sample: torch.Tensor, return_dict: bool = True
|
||||
@@ -173,9 +175,8 @@ class VQModel(ModelMixin, ConfigMixin):
|
||||
"""
|
||||
|
||||
h = self.encode(sample).latents
|
||||
dec = self.decode(h).sample
|
||||
dec = self.decode(h)
|
||||
|
||||
if not return_dict:
|
||||
return (dec,)
|
||||
|
||||
return DecoderOutput(sample=dec)
|
||||
return dec.sample, dec.commit_loss
|
||||
return dec
|
||||
|
||||
@@ -13,7 +13,7 @@ class AnimateDiffPipelineOutput(BaseOutput):
|
||||
r"""
|
||||
Output class for AnimateDiff pipelines.
|
||||
|
||||
Args:
|
||||
Args:
|
||||
frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
|
||||
List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
|
||||
denoised
|
||||
|
||||
@@ -100,20 +100,16 @@ class MultiControlNetModel(ModelMixin):
|
||||
variant (`str`, *optional*):
|
||||
If specified, weights are saved in the format pytorch_model.<variant>.bin.
|
||||
"""
|
||||
idx = 0
|
||||
model_path_to_save = save_directory
|
||||
for controlnet in self.nets:
|
||||
for idx, controlnet in enumerate(self.nets):
|
||||
suffix = "" if idx == 0 else f"_{idx}"
|
||||
controlnet.save_pretrained(
|
||||
model_path_to_save,
|
||||
save_directory + suffix,
|
||||
is_main_process=is_main_process,
|
||||
save_function=save_function,
|
||||
safe_serialization=safe_serialization,
|
||||
variant=variant,
|
||||
)
|
||||
|
||||
idx += 1
|
||||
model_path_to_save = model_path_to_save + f"_{idx}"
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_path: Optional[Union[str, os.PathLike]], **kwargs):
|
||||
r"""
|
||||
|
||||
@@ -22,6 +22,7 @@ import torch
|
||||
import torch.nn.functional as F
|
||||
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
|
||||
|
||||
from ...callbacks import MultiPipelineCallbacks, PipelineCallback
|
||||
from ...image_processor import PipelineImageInput, VaeImageProcessor
|
||||
from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
|
||||
from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel
|
||||
@@ -926,7 +927,9 @@ class StableDiffusionControlNetPipeline(
|
||||
control_guidance_start: Union[float, List[float]] = 0.0,
|
||||
control_guidance_end: Union[float, List[float]] = 1.0,
|
||||
clip_skip: Optional[int] = None,
|
||||
callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
|
||||
callback_on_step_end: Optional[
|
||||
Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
|
||||
] = None,
|
||||
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
|
||||
**kwargs,
|
||||
):
|
||||
@@ -1019,11 +1022,11 @@ class StableDiffusionControlNetPipeline(
|
||||
clip_skip (`int`, *optional*):
|
||||
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
|
||||
the output of the pre-final layer will be used for computing the prompt embeddings.
|
||||
callback_on_step_end (`Callable`, *optional*):
|
||||
A function that calls at the end of each denoising steps during the inference. The function is called
|
||||
with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
|
||||
callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
|
||||
`callback_on_step_end_tensor_inputs`.
|
||||
callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
|
||||
A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
|
||||
each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
|
||||
DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
|
||||
list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
@@ -1055,6 +1058,9 @@ class StableDiffusionControlNetPipeline(
|
||||
"Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
|
||||
)
|
||||
|
||||
if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
|
||||
callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
|
||||
|
||||
controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
|
||||
|
||||
# align format for control guidance
|
||||
|
||||
@@ -21,6 +21,7 @@ import torch
|
||||
import torch.nn.functional as F
|
||||
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
|
||||
|
||||
from ...callbacks import MultiPipelineCallbacks, PipelineCallback
|
||||
from ...image_processor import PipelineImageInput, VaeImageProcessor
|
||||
from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
|
||||
from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel
|
||||
@@ -917,7 +918,9 @@ class StableDiffusionControlNetImg2ImgPipeline(
|
||||
control_guidance_start: Union[float, List[float]] = 0.0,
|
||||
control_guidance_end: Union[float, List[float]] = 1.0,
|
||||
clip_skip: Optional[int] = None,
|
||||
callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
|
||||
callback_on_step_end: Optional[
|
||||
Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
|
||||
] = None,
|
||||
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
|
||||
**kwargs,
|
||||
):
|
||||
@@ -1004,11 +1007,11 @@ class StableDiffusionControlNetImg2ImgPipeline(
|
||||
clip_skip (`int`, *optional*):
|
||||
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
|
||||
the output of the pre-final layer will be used for computing the prompt embeddings.
|
||||
callback_on_step_end (`Callable`, *optional*):
|
||||
A function that calls at the end of each denoising steps during the inference. The function is called
|
||||
with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
|
||||
callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
|
||||
`callback_on_step_end_tensor_inputs`.
|
||||
callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
|
||||
A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
|
||||
each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
|
||||
DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
|
||||
list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
@@ -1040,6 +1043,9 @@ class StableDiffusionControlNetImg2ImgPipeline(
|
||||
"Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
|
||||
)
|
||||
|
||||
if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
|
||||
callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
|
||||
|
||||
controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
|
||||
|
||||
# align format for control guidance
|
||||
|
||||
@@ -23,6 +23,7 @@ import torch
|
||||
import torch.nn.functional as F
|
||||
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
|
||||
|
||||
from ...callbacks import MultiPipelineCallbacks, PipelineCallback
|
||||
from ...image_processor import PipelineImageInput, VaeImageProcessor
|
||||
from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
|
||||
from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel
|
||||
@@ -1134,7 +1135,9 @@ class StableDiffusionControlNetInpaintPipeline(
|
||||
control_guidance_start: Union[float, List[float]] = 0.0,
|
||||
control_guidance_end: Union[float, List[float]] = 1.0,
|
||||
clip_skip: Optional[int] = None,
|
||||
callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
|
||||
callback_on_step_end: Optional[
|
||||
Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
|
||||
] = None,
|
||||
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
|
||||
**kwargs,
|
||||
):
|
||||
@@ -1239,11 +1242,11 @@ class StableDiffusionControlNetInpaintPipeline(
|
||||
clip_skip (`int`, *optional*):
|
||||
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
|
||||
the output of the pre-final layer will be used for computing the prompt embeddings.
|
||||
callback_on_step_end (`Callable`, *optional*):
|
||||
A function that calls at the end of each denoising steps during the inference. The function is called
|
||||
with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
|
||||
callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
|
||||
`callback_on_step_end_tensor_inputs`.
|
||||
callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
|
||||
A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
|
||||
each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
|
||||
DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
|
||||
list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
@@ -1275,6 +1278,9 @@ class StableDiffusionControlNetInpaintPipeline(
|
||||
"Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
|
||||
)
|
||||
|
||||
if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
|
||||
callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
|
||||
|
||||
controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
|
||||
|
||||
# align format for control guidance
|
||||
|
||||
@@ -27,6 +27,7 @@ from transformers import (
|
||||
CLIPVisionModelWithProjection,
|
||||
)
|
||||
|
||||
from ...callbacks import MultiPipelineCallbacks, PipelineCallback
|
||||
from ...image_processor import PipelineImageInput, VaeImageProcessor
|
||||
from ...loaders import (
|
||||
FromSingleFileMixin,
|
||||
@@ -197,8 +198,26 @@ class StableDiffusionXLControlNetInpaintPipeline(
|
||||
"""
|
||||
|
||||
model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
|
||||
_optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2"]
|
||||
_callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
|
||||
|
||||
_optional_components = [
|
||||
"tokenizer",
|
||||
"tokenizer_2",
|
||||
"text_encoder",
|
||||
"text_encoder_2",
|
||||
"image_encoder",
|
||||
"feature_extractor",
|
||||
]
|
||||
_callback_tensor_inputs = [
|
||||
"latents",
|
||||
"prompt_embeds",
|
||||
"negative_prompt_embeds",
|
||||
"add_text_embeds",
|
||||
"add_time_ids",
|
||||
"negative_pooled_prompt_embeds",
|
||||
"add_neg_time_ids",
|
||||
"mask",
|
||||
"masked_image_latents",
|
||||
]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -208,7 +227,7 @@ class StableDiffusionXLControlNetInpaintPipeline(
|
||||
tokenizer: CLIPTokenizer,
|
||||
tokenizer_2: CLIPTokenizer,
|
||||
unet: UNet2DConditionModel,
|
||||
controlnet: ControlNetModel,
|
||||
controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
|
||||
scheduler: KarrasDiffusionSchedulers,
|
||||
requires_aesthetics_score: bool = False,
|
||||
force_zeros_for_empty_prompt: bool = True,
|
||||
@@ -1178,7 +1197,9 @@ class StableDiffusionXLControlNetInpaintPipeline(
|
||||
aesthetic_score: float = 6.0,
|
||||
negative_aesthetic_score: float = 2.5,
|
||||
clip_skip: Optional[int] = None,
|
||||
callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
|
||||
callback_on_step_end: Optional[
|
||||
Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
|
||||
] = None,
|
||||
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
|
||||
**kwargs,
|
||||
):
|
||||
@@ -1317,11 +1338,11 @@ class StableDiffusionXLControlNetInpaintPipeline(
|
||||
clip_skip (`int`, *optional*):
|
||||
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
|
||||
the output of the pre-final layer will be used for computing the prompt embeddings.
|
||||
callback_on_step_end (`Callable`, *optional*):
|
||||
A function that calls at the end of each denoising steps during the inference. The function is called
|
||||
with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
|
||||
callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
|
||||
`callback_on_step_end_tensor_inputs`.
|
||||
callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
|
||||
A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
|
||||
each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
|
||||
DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
|
||||
list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
@@ -1351,6 +1372,9 @@ class StableDiffusionXLControlNetInpaintPipeline(
|
||||
"Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
|
||||
)
|
||||
|
||||
if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
|
||||
callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
|
||||
|
||||
controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
|
||||
|
||||
# align format for control guidance
|
||||
@@ -1730,7 +1754,7 @@ class StableDiffusionXLControlNetInpaintPipeline(
|
||||
down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
|
||||
mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])
|
||||
|
||||
if ip_adapter_image is not None:
|
||||
if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
|
||||
added_cond_kwargs["image_embeds"] = image_embeds
|
||||
|
||||
if num_channels_unet == 9:
|
||||
|
||||
@@ -30,6 +30,7 @@ from transformers import (
|
||||
|
||||
from diffusers.utils.import_utils import is_invisible_watermark_available
|
||||
|
||||
from ...callbacks import MultiPipelineCallbacks, PipelineCallback
|
||||
from ...image_processor import PipelineImageInput, VaeImageProcessor
|
||||
from ...loaders import (
|
||||
FromSingleFileMixin,
|
||||
@@ -235,7 +236,15 @@ class StableDiffusionXLControlNetPipeline(
|
||||
"feature_extractor",
|
||||
"image_encoder",
|
||||
]
|
||||
_callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
|
||||
_callback_tensor_inputs = [
|
||||
"latents",
|
||||
"prompt_embeds",
|
||||
"negative_prompt_embeds",
|
||||
"add_text_embeds",
|
||||
"add_time_ids",
|
||||
"negative_pooled_prompt_embeds",
|
||||
"negative_add_time_ids",
|
||||
]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -1031,7 +1040,9 @@ class StableDiffusionXLControlNetPipeline(
|
||||
negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
|
||||
negative_target_size: Optional[Tuple[int, int]] = None,
|
||||
clip_skip: Optional[int] = None,
|
||||
callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
|
||||
callback_on_step_end: Optional[
|
||||
Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
|
||||
] = None,
|
||||
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
|
||||
**kwargs,
|
||||
):
|
||||
@@ -1169,11 +1180,11 @@ class StableDiffusionXLControlNetPipeline(
|
||||
clip_skip (`int`, *optional*):
|
||||
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
|
||||
the output of the pre-final layer will be used for computing the prompt embeddings.
|
||||
callback_on_step_end (`Callable`, *optional*):
|
||||
A function that calls at the end of each denoising steps during the inference. The function is called
|
||||
with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
|
||||
callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
|
||||
`callback_on_step_end_tensor_inputs`.
|
||||
callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
|
||||
A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
|
||||
each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
|
||||
DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
|
||||
list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
@@ -1203,6 +1214,9 @@ class StableDiffusionXLControlNetPipeline(
|
||||
"Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
|
||||
)
|
||||
|
||||
if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
|
||||
callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
|
||||
|
||||
controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
|
||||
|
||||
# align format for control guidance
|
||||
@@ -1522,6 +1536,12 @@ class StableDiffusionXLControlNetPipeline(
|
||||
latents = callback_outputs.pop("latents", latents)
|
||||
prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
|
||||
negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
|
||||
add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
|
||||
negative_pooled_prompt_embeds = callback_outputs.pop(
|
||||
"negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
|
||||
)
|
||||
add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
|
||||
negative_add_time_ids = callback_outputs.pop("negative_add_time_ids", negative_add_time_ids)
|
||||
|
||||
# call the callback, if provided
|
||||
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
|
||||
|
||||
@@ -30,6 +30,7 @@ from transformers import (
|
||||
|
||||
from diffusers.utils.import_utils import is_invisible_watermark_available
|
||||
|
||||
from ...callbacks import MultiPipelineCallbacks, PipelineCallback
|
||||
from ...image_processor import PipelineImageInput, VaeImageProcessor
|
||||
from ...loaders import (
|
||||
FromSingleFileMixin,
|
||||
@@ -227,7 +228,15 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
|
||||
"feature_extractor",
|
||||
"image_encoder",
|
||||
]
|
||||
_callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
|
||||
_callback_tensor_inputs = [
|
||||
"latents",
|
||||
"prompt_embeds",
|
||||
"negative_prompt_embeds",
|
||||
"add_text_embeds",
|
||||
"add_time_ids",
|
||||
"negative_pooled_prompt_embeds",
|
||||
"add_neg_time_ids",
|
||||
]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -1105,7 +1114,9 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
|
||||
aesthetic_score: float = 6.0,
|
||||
negative_aesthetic_score: float = 2.5,
|
||||
clip_skip: Optional[int] = None,
|
||||
callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
|
||||
callback_on_step_end: Optional[
|
||||
Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
|
||||
] = None,
|
||||
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
|
||||
**kwargs,
|
||||
):
|
||||
@@ -1254,11 +1265,11 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
|
||||
clip_skip (`int`, *optional*):
|
||||
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
|
||||
the output of the pre-final layer will be used for computing the prompt embeddings.
|
||||
callback_on_step_end (`Callable`, *optional*):
|
||||
A function that calls at the end of each denoising steps during the inference. The function is called
|
||||
with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
|
||||
callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
|
||||
`callback_on_step_end_tensor_inputs`.
|
||||
callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
|
||||
A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
|
||||
each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
|
||||
DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
|
||||
list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
@@ -1288,6 +1299,9 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
|
||||
"Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
|
||||
)
|
||||
|
||||
if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
|
||||
callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
|
||||
|
||||
controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
|
||||
|
||||
# align format for control guidance
|
||||
@@ -1578,6 +1592,12 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
|
||||
latents = callback_outputs.pop("latents", latents)
|
||||
prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
|
||||
negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
|
||||
add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
|
||||
negative_pooled_prompt_embeds = callback_outputs.pop(
|
||||
"negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
|
||||
)
|
||||
add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
|
||||
add_neg_time_ids = callback_outputs.pop("add_neg_time_ids", add_neg_time_ids)
|
||||
|
||||
# call the callback, if provided
|
||||
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
|
||||
|
||||
@@ -21,6 +21,7 @@ import torch
|
||||
import torch.nn.functional as F
|
||||
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
|
||||
|
||||
from ...callbacks import MultiPipelineCallbacks, PipelineCallback
|
||||
from ...image_processor import PipelineImageInput, VaeImageProcessor
|
||||
from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
|
||||
from ...models import AutoencoderKL, ControlNetXSAdapter, UNet2DConditionModel, UNetControlNetXSModel
|
||||
@@ -648,7 +649,9 @@ class StableDiffusionControlNetXSPipeline(
|
||||
control_guidance_start: float = 0.0,
|
||||
control_guidance_end: float = 1.0,
|
||||
clip_skip: Optional[int] = None,
|
||||
callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
|
||||
callback_on_step_end: Optional[
|
||||
Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
|
||||
] = None,
|
||||
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
|
||||
):
|
||||
r"""
|
||||
@@ -715,11 +718,11 @@ class StableDiffusionControlNetXSPipeline(
|
||||
clip_skip (`int`, *optional*):
|
||||
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
|
||||
the output of the pre-final layer will be used for computing the prompt embeddings.
|
||||
callback_on_step_end (`Callable`, *optional*):
|
||||
A function that calls at the end of each denoising steps during the inference. The function is called
|
||||
with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
|
||||
callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
|
||||
`callback_on_step_end_tensor_inputs`.
|
||||
callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
|
||||
A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
|
||||
each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
|
||||
DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
|
||||
list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
@@ -734,6 +737,9 @@ class StableDiffusionControlNetXSPipeline(
|
||||
"not-safe-for-work" (nsfw) content.
|
||||
"""
|
||||
|
||||
if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
|
||||
callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
|
||||
|
||||
unet = self.unet._orig_mod if is_compiled_module(self.unet) else self.unet
|
||||
|
||||
# 1. Check inputs. Raise error if not correct
|
||||
|
||||
@@ -28,6 +28,7 @@ from transformers import (
|
||||
|
||||
from diffusers.utils.import_utils import is_invisible_watermark_available
|
||||
|
||||
from ...callbacks import MultiPipelineCallbacks, PipelineCallback
|
||||
from ...image_processor import PipelineImageInput, VaeImageProcessor
|
||||
from ...loaders import FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin
|
||||
from ...models import AutoencoderKL, ControlNetXSAdapter, UNet2DConditionModel, UNetControlNetXSModel
|
||||
@@ -157,7 +158,15 @@ class StableDiffusionXLControlNetXSPipeline(
|
||||
"text_encoder_2",
|
||||
"feature_extractor",
|
||||
]
|
||||
_callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
|
||||
_callback_tensor_inputs = [
|
||||
"latents",
|
||||
"prompt_embeds",
|
||||
"negative_prompt_embeds",
|
||||
"add_text_embeds",
|
||||
"add_time_ids",
|
||||
"negative_pooled_prompt_embeds",
|
||||
"negative_add_time_ids",
|
||||
]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -739,7 +748,9 @@ class StableDiffusionXLControlNetXSPipeline(
|
||||
negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
|
||||
negative_target_size: Optional[Tuple[int, int]] = None,
|
||||
clip_skip: Optional[int] = None,
|
||||
callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
|
||||
callback_on_step_end: Optional[
|
||||
Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
|
||||
] = None,
|
||||
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
|
||||
):
|
||||
r"""
|
||||
@@ -851,11 +862,11 @@ class StableDiffusionXLControlNetXSPipeline(
|
||||
clip_skip (`int`, *optional*):
|
||||
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
|
||||
the output of the pre-final layer will be used for computing the prompt embeddings.
|
||||
callback_on_step_end (`Callable`, *optional*):
|
||||
A function that calls at the end of each denoising steps during the inference. The function is called
|
||||
with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
|
||||
callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
|
||||
`callback_on_step_end_tensor_inputs`.
|
||||
callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
|
||||
A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
|
||||
each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
|
||||
DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
|
||||
list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
@@ -869,6 +880,9 @@ class StableDiffusionXLControlNetXSPipeline(
|
||||
returned, otherwise a `tuple` is returned containing the output images.
|
||||
"""
|
||||
|
||||
if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
|
||||
callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
|
||||
|
||||
unet = self.unet._orig_mod if is_compiled_module(self.unet) else self.unet
|
||||
|
||||
# 1. Check inputs. Raise error if not correct
|
||||
|
||||
@@ -817,7 +817,7 @@ class UNetFlatConditionModel(ModelMixin, ConfigMixin):
|
||||
positive_len = 768
|
||||
if isinstance(cross_attention_dim, int):
|
||||
positive_len = cross_attention_dim
|
||||
elif isinstance(cross_attention_dim, tuple) or isinstance(cross_attention_dim, list):
|
||||
elif isinstance(cross_attention_dim, (list, tuple)):
|
||||
positive_len = cross_attention_dim[0]
|
||||
|
||||
feature_type = "text-only" if attention_type == "gated" else "text-image"
|
||||
|
||||
@@ -76,7 +76,7 @@ class I2VGenXLPipelineOutput(BaseOutput):
|
||||
r"""
|
||||
Output class for image-to-video pipeline.
|
||||
|
||||
Args:
|
||||
Args:
|
||||
frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
|
||||
List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
|
||||
denoised
|
||||
|
||||
@@ -1216,7 +1216,7 @@ class LEditsPPPipelineStableDiffusion(
|
||||
Paper](https://arxiv.org/abs/2301.12247). If the scheduler is set to [`~schedulers.DDIMScheduler`] the
|
||||
inversion proposed by [edit-friendly DPDM](https://arxiv.org/abs/2304.06140) will be performed instead.
|
||||
|
||||
Args:
|
||||
Args:
|
||||
image (`PipelineImageInput`):
|
||||
Input for the image(s) that are to be edited. Multiple input images have to default to the same aspect
|
||||
ratio.
|
||||
|
||||
@@ -1449,7 +1449,7 @@ class LEditsPPPipelineStableDiffusionXL(
|
||||
Paper](https://arxiv.org/abs/2301.12247). If the scheduler is set to [`~schedulers.DDIMScheduler`] the
|
||||
inversion proposed by [edit-friendly DPDM](https://arxiv.org/abs/2304.06140) will be performed instead.
|
||||
|
||||
Args:
|
||||
Args:
|
||||
image (`PipelineImageInput`):
|
||||
Input for the image(s) that are to be edited. Multiple input images have to default to the same aspect
|
||||
ratio.
|
||||
|
||||
@@ -366,7 +366,7 @@ class PixArtAlphaPipeline(DiffusionPipeline):
|
||||
):
|
||||
removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1])
|
||||
logger.warning(
|
||||
"The following part of your input was truncated because CLIP can only handle sequences up to"
|
||||
"The following part of your input was truncated because T5 can only handle sequences up to"
|
||||
f" {max_length} tokens: {removed_text}"
|
||||
)
|
||||
|
||||
|
||||
@@ -23,7 +23,7 @@ from transformers import T5EncoderModel, T5Tokenizer
|
||||
|
||||
from ...image_processor import PixArtImageProcessor
|
||||
from ...models import AutoencoderKL, Transformer2DModel
|
||||
from ...schedulers import DPMSolverMultistepScheduler
|
||||
from ...schedulers import KarrasDiffusionSchedulers
|
||||
from ...utils import (
|
||||
BACKENDS_MAPPING,
|
||||
deprecate,
|
||||
@@ -203,7 +203,7 @@ class PixArtSigmaPipeline(DiffusionPipeline):
|
||||
text_encoder: T5EncoderModel,
|
||||
vae: AutoencoderKL,
|
||||
transformer: Transformer2DModel,
|
||||
scheduler: DPMSolverMultistepScheduler,
|
||||
scheduler: KarrasDiffusionSchedulers,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
@@ -214,7 +214,7 @@ class PixArtSigmaPipeline(DiffusionPipeline):
|
||||
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
|
||||
self.image_processor = PixArtImageProcessor(vae_scale_factor=self.vae_scale_factor)
|
||||
|
||||
# Copied from diffusers.pipelines.pixart_alpha.pipeline_pixart_alpha.PixArtAlphaPipeline.encode_prompt
|
||||
# Copied from diffusers.pipelines.pixart_alpha.pipeline_pixart_alpha.PixArtAlphaPipeline.encode_prompt with 120->300
|
||||
def encode_prompt(
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
@@ -227,7 +227,7 @@ class PixArtSigmaPipeline(DiffusionPipeline):
|
||||
prompt_attention_mask: Optional[torch.Tensor] = None,
|
||||
negative_prompt_attention_mask: Optional[torch.Tensor] = None,
|
||||
clean_caption: bool = False,
|
||||
max_sequence_length: int = 120,
|
||||
max_sequence_length: int = 300,
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
@@ -254,7 +254,7 @@ class PixArtSigmaPipeline(DiffusionPipeline):
|
||||
string.
|
||||
clean_caption (`bool`, defaults to `False`):
|
||||
If `True`, the function will preprocess and clean the provided caption before encoding.
|
||||
max_sequence_length (`int`, defaults to 120): Maximum sequence length to use for the prompt.
|
||||
max_sequence_length (`int`, defaults to 300): Maximum sequence length to use for the prompt.
|
||||
"""
|
||||
|
||||
if "mask_feature" in kwargs:
|
||||
@@ -292,7 +292,7 @@ class PixArtSigmaPipeline(DiffusionPipeline):
|
||||
):
|
||||
removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1])
|
||||
logger.warning(
|
||||
"The following part of your input was truncated because CLIP can only handle sequences up to"
|
||||
"The following part of your input was truncated because T5 can only handle sequences up to"
|
||||
f" {max_length} tokens: {removed_text}"
|
||||
)
|
||||
|
||||
@@ -707,7 +707,7 @@ class PixArtSigmaPipeline(DiffusionPipeline):
|
||||
If set to `True`, the requested height and width are first mapped to the closest resolutions using
|
||||
`ASPECT_RATIO_1024_BIN`. After the produced latents are decoded into images, they are resized back to
|
||||
the requested resolution. Useful for generating non-square images.
|
||||
max_sequence_length (`int` defaults to 120): Maximum sequence length to use with the `prompt`.
|
||||
max_sequence_length (`int` defaults to 300): Maximum sequence length to use with the `prompt`.
|
||||
|
||||
Examples:
|
||||
|
||||
|
||||
@@ -844,7 +844,7 @@ class ShapERenderer(ModelMixin, ConfigMixin):
|
||||
transmittance(t[i + 1]) := transmittance(t[i]). 4) The last term is integration to infinity (e.g. [t[-1],
|
||||
math.inf]) that is evaluated by the void_model (i.e. we consider this space to be empty).
|
||||
|
||||
args:
|
||||
Args:
|
||||
rays: [batch_size x ... x 2 x 3] origin and direction. sampler: disjoint volume integrals. n_samples:
|
||||
number of ts to sample. prev_model_outputs: model outputs from the previous rendering step, including
|
||||
|
||||
|
||||
@@ -197,7 +197,7 @@ class OnnxStableDiffusionUpscalePipeline(DiffusionPipeline):
|
||||
)
|
||||
|
||||
# verify batch size of prompt and image are same if image is a list or tensor or numpy array
|
||||
if isinstance(image, list) or isinstance(image, np.ndarray):
|
||||
if isinstance(image, (list, np.ndarray)):
|
||||
if prompt is not None and isinstance(prompt, str):
|
||||
batch_size = 1
|
||||
elif prompt is not None and isinstance(prompt, list):
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import inspect
|
||||
from typing import Any, Callable, Dict, List, Optional, Union
|
||||
|
||||
@@ -19,6 +18,7 @@ import torch
|
||||
from packaging import version
|
||||
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
|
||||
|
||||
from ...callbacks import MultiPipelineCallbacks, PipelineCallback
|
||||
from ...configuration_utils import FrozenDict
|
||||
from ...image_processor import PipelineImageInput, VaeImageProcessor
|
||||
from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
|
||||
@@ -775,7 +775,9 @@ class StableDiffusionPipeline(
|
||||
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
|
||||
guidance_rescale: float = 0.0,
|
||||
clip_skip: Optional[int] = None,
|
||||
callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
|
||||
callback_on_step_end: Optional[
|
||||
Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
|
||||
] = None,
|
||||
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
|
||||
**kwargs,
|
||||
):
|
||||
@@ -845,11 +847,11 @@ class StableDiffusionPipeline(
|
||||
clip_skip (`int`, *optional*):
|
||||
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
|
||||
the output of the pre-final layer will be used for computing the prompt embeddings.
|
||||
callback_on_step_end (`Callable`, *optional*):
|
||||
A function that calls at the end of each denoising steps during the inference. The function is called
|
||||
with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
|
||||
callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
|
||||
`callback_on_step_end_tensor_inputs`.
|
||||
callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
|
||||
A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
|
||||
each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
|
||||
DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
|
||||
list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
@@ -881,6 +883,9 @@ class StableDiffusionPipeline(
|
||||
"Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
|
||||
)
|
||||
|
||||
if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
|
||||
callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
|
||||
|
||||
# 0. Default height and width to unet
|
||||
height = height or self.unet.config.sample_size * self.vae_scale_factor
|
||||
width = width or self.unet.config.sample_size * self.vae_scale_factor
|
||||
|
||||
@@ -21,6 +21,7 @@ import torch
|
||||
from packaging import version
|
||||
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
|
||||
|
||||
from ...callbacks import MultiPipelineCallbacks, PipelineCallback
|
||||
from ...configuration_utils import FrozenDict
|
||||
from ...image_processor import PipelineImageInput, VaeImageProcessor
|
||||
from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
|
||||
@@ -862,7 +863,9 @@ class StableDiffusionImg2ImgPipeline(
|
||||
return_dict: bool = True,
|
||||
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
|
||||
clip_skip: int = None,
|
||||
callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
|
||||
callback_on_step_end: Optional[
|
||||
Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
|
||||
] = None,
|
||||
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
|
||||
**kwargs,
|
||||
):
|
||||
@@ -932,11 +935,11 @@ class StableDiffusionImg2ImgPipeline(
|
||||
clip_skip (`int`, *optional*):
|
||||
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
|
||||
the output of the pre-final layer will be used for computing the prompt embeddings.
|
||||
callback_on_step_end (`Callable`, *optional*):
|
||||
A function that calls at the end of each denoising steps during the inference. The function is called
|
||||
with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
|
||||
callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
|
||||
`callback_on_step_end_tensor_inputs`.
|
||||
callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
|
||||
A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
|
||||
each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
|
||||
DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
|
||||
list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
@@ -967,6 +970,9 @@ class StableDiffusionImg2ImgPipeline(
|
||||
"Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
|
||||
)
|
||||
|
||||
if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
|
||||
callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
|
||||
|
||||
# 1. Check inputs. Raise error if not correct
|
||||
self.check_inputs(
|
||||
prompt,
|
||||
|
||||
@@ -21,6 +21,7 @@ import torch
|
||||
from packaging import version
|
||||
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
|
||||
|
||||
from ...callbacks import MultiPipelineCallbacks, PipelineCallback
|
||||
from ...configuration_utils import FrozenDict
|
||||
from ...image_processor import PipelineImageInput, VaeImageProcessor
|
||||
from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
|
||||
@@ -1014,7 +1015,9 @@ class StableDiffusionInpaintPipeline(
|
||||
return_dict: bool = True,
|
||||
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
|
||||
clip_skip: int = None,
|
||||
callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
|
||||
callback_on_step_end: Optional[
|
||||
Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
|
||||
] = None,
|
||||
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
|
||||
**kwargs,
|
||||
):
|
||||
@@ -1107,11 +1110,11 @@ class StableDiffusionInpaintPipeline(
|
||||
clip_skip (`int`, *optional*):
|
||||
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
|
||||
the output of the pre-final layer will be used for computing the prompt embeddings.
|
||||
callback_on_step_end (`Callable`, *optional*):
|
||||
A function that calls at the end of each denoising steps during the inference. The function is called
|
||||
with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
|
||||
callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
|
||||
`callback_on_step_end_tensor_inputs`.
|
||||
callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
|
||||
A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
|
||||
each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
|
||||
DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
|
||||
list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
@@ -1171,6 +1174,9 @@ class StableDiffusionInpaintPipeline(
|
||||
"Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
|
||||
)
|
||||
|
||||
if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
|
||||
callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
|
||||
|
||||
# 0. Default height and width to unet
|
||||
height = height or self.unet.config.sample_size * self.vae_scale_factor
|
||||
width = width or self.unet.config.sample_size * self.vae_scale_factor
|
||||
|
||||
+18
-7
@@ -13,13 +13,14 @@
|
||||
# limitations under the License.
|
||||
|
||||
import inspect
|
||||
from typing import Callable, Dict, List, Optional, Union
|
||||
from typing import Any, Callable, Dict, List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import PIL.Image
|
||||
import torch
|
||||
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
|
||||
|
||||
from ...callbacks import MultiPipelineCallbacks, PipelineCallback
|
||||
from ...image_processor import PipelineImageInput, VaeImageProcessor
|
||||
from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
|
||||
from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
|
||||
@@ -175,8 +176,11 @@ class StableDiffusionInstructPix2PixPipeline(
|
||||
ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
|
||||
output_type: Optional[str] = "pil",
|
||||
return_dict: bool = True,
|
||||
callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
|
||||
callback_on_step_end: Optional[
|
||||
Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
|
||||
] = None,
|
||||
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
|
||||
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
@@ -227,15 +231,18 @@ class StableDiffusionInstructPix2PixPipeline(
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
|
||||
plain tuple.
|
||||
callback_on_step_end (`Callable`, *optional*):
|
||||
A function that calls at the end of each denoising steps during the inference. The function is called
|
||||
with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
|
||||
callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
|
||||
`callback_on_step_end_tensor_inputs`.
|
||||
callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
|
||||
A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
|
||||
each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
|
||||
DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
|
||||
list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
`._callback_tensor_inputs` attribute of your pipeline class.
|
||||
cross_attention_kwargs (`dict`, *optional*):
|
||||
A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
|
||||
[`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
|
||||
|
||||
Examples:
|
||||
|
||||
@@ -290,6 +297,9 @@ class StableDiffusionInstructPix2PixPipeline(
|
||||
"Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
|
||||
)
|
||||
|
||||
if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
|
||||
callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
|
||||
|
||||
# 0. Check inputs
|
||||
self.check_inputs(
|
||||
prompt,
|
||||
@@ -409,6 +419,7 @@ class StableDiffusionInstructPix2PixPipeline(
|
||||
t,
|
||||
encoder_hidden_states=prompt_embeds,
|
||||
added_cond_kwargs=added_cond_kwargs,
|
||||
cross_attention_kwargs=cross_attention_kwargs,
|
||||
return_dict=False,
|
||||
)[0]
|
||||
|
||||
|
||||
+1
-1
@@ -221,7 +221,7 @@ class StableDiffusionLatentUpscalePipeline(DiffusionPipeline, StableDiffusionMix
|
||||
)
|
||||
|
||||
# verify batch size of prompt and image are same if image is a list or tensor
|
||||
if isinstance(image, list) or isinstance(image, torch.Tensor):
|
||||
if isinstance(image, (list, torch.Tensor)):
|
||||
if isinstance(prompt, str):
|
||||
batch_size = 1
|
||||
else:
|
||||
|
||||
@@ -468,7 +468,7 @@ class StableDiffusionUpscalePipeline(
|
||||
)
|
||||
|
||||
# verify batch size of prompt and image are same if image is a list or tensor or numpy array
|
||||
if isinstance(image, list) or isinstance(image, torch.Tensor) or isinstance(image, np.ndarray):
|
||||
if isinstance(image, (list, np.ndarray, torch.Tensor)):
|
||||
if prompt is not None and isinstance(prompt, str):
|
||||
batch_size = 1
|
||||
elif prompt is not None and isinstance(prompt, list):
|
||||
|
||||
+1
-1
@@ -185,7 +185,7 @@ def preprocess(image):
|
||||
def preprocess_mask(mask, batch_size: int = 1):
|
||||
if not isinstance(mask, torch.Tensor):
|
||||
# preprocess mask
|
||||
if isinstance(mask, PIL.Image.Image) or isinstance(mask, np.ndarray):
|
||||
if isinstance(mask, (PIL.Image.Image, np.ndarray)):
|
||||
mask = [mask]
|
||||
|
||||
if isinstance(mask, list):
|
||||
|
||||
@@ -24,6 +24,7 @@ from transformers import (
|
||||
CLIPVisionModelWithProjection,
|
||||
)
|
||||
|
||||
from ...callbacks import MultiPipelineCallbacks, PipelineCallback
|
||||
from ...image_processor import PipelineImageInput, VaeImageProcessor
|
||||
from ...loaders import (
|
||||
FromSingleFileMixin,
|
||||
@@ -861,7 +862,9 @@ class StableDiffusionXLPipeline(
|
||||
negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
|
||||
negative_target_size: Optional[Tuple[int, int]] = None,
|
||||
clip_skip: Optional[int] = None,
|
||||
callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
|
||||
callback_on_step_end: Optional[
|
||||
Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
|
||||
] = None,
|
||||
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
|
||||
**kwargs,
|
||||
):
|
||||
@@ -992,11 +995,11 @@ class StableDiffusionXLPipeline(
|
||||
as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
|
||||
[https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
|
||||
information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
|
||||
callback_on_step_end (`Callable`, *optional*):
|
||||
A function that calls at the end of each denoising steps during the inference. The function is called
|
||||
with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
|
||||
callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
|
||||
`callback_on_step_end_tensor_inputs`.
|
||||
callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
|
||||
A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
|
||||
each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
|
||||
DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
|
||||
list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
@@ -1026,6 +1029,9 @@ class StableDiffusionXLPipeline(
|
||||
"Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
|
||||
)
|
||||
|
||||
if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
|
||||
callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
|
||||
|
||||
# 0. Default height and width to unet
|
||||
height = height or self.default_sample_size * self.vae_scale_factor
|
||||
width = width or self.default_sample_size * self.vae_scale_factor
|
||||
|
||||
+12
-6
@@ -25,6 +25,7 @@ from transformers import (
|
||||
CLIPVisionModelWithProjection,
|
||||
)
|
||||
|
||||
from ...callbacks import MultiPipelineCallbacks, PipelineCallback
|
||||
from ...image_processor import PipelineImageInput, VaeImageProcessor
|
||||
from ...loaders import (
|
||||
FromSingleFileMixin,
|
||||
@@ -1008,7 +1009,9 @@ class StableDiffusionXLImg2ImgPipeline(
|
||||
aesthetic_score: float = 6.0,
|
||||
negative_aesthetic_score: float = 2.5,
|
||||
clip_skip: Optional[int] = None,
|
||||
callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
|
||||
callback_on_step_end: Optional[
|
||||
Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
|
||||
] = None,
|
||||
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
|
||||
**kwargs,
|
||||
):
|
||||
@@ -1157,11 +1160,11 @@ class StableDiffusionXLImg2ImgPipeline(
|
||||
clip_skip (`int`, *optional*):
|
||||
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
|
||||
the output of the pre-final layer will be used for computing the prompt embeddings.
|
||||
callback_on_step_end (`Callable`, *optional*):
|
||||
A function that calls at the end of each denoising steps during the inference. The function is called
|
||||
with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
|
||||
callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
|
||||
`callback_on_step_end_tensor_inputs`.
|
||||
callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
|
||||
A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
|
||||
each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
|
||||
DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
|
||||
list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
@@ -1191,6 +1194,9 @@ class StableDiffusionXLImg2ImgPipeline(
|
||||
"Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
|
||||
)
|
||||
|
||||
if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
|
||||
callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
|
||||
|
||||
# 1. Check inputs. Raise error if not correct
|
||||
self.check_inputs(
|
||||
prompt,
|
||||
|
||||
+12
-6
@@ -26,6 +26,7 @@ from transformers import (
|
||||
CLIPVisionModelWithProjection,
|
||||
)
|
||||
|
||||
from ...callbacks import MultiPipelineCallbacks, PipelineCallback
|
||||
from ...image_processor import PipelineImageInput, VaeImageProcessor
|
||||
from ...loaders import (
|
||||
FromSingleFileMixin,
|
||||
@@ -1243,7 +1244,9 @@ class StableDiffusionXLInpaintPipeline(
|
||||
aesthetic_score: float = 6.0,
|
||||
negative_aesthetic_score: float = 2.5,
|
||||
clip_skip: Optional[int] = None,
|
||||
callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
|
||||
callback_on_step_end: Optional[
|
||||
Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
|
||||
] = None,
|
||||
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
|
||||
**kwargs,
|
||||
):
|
||||
@@ -1411,11 +1414,11 @@ class StableDiffusionXLInpaintPipeline(
|
||||
clip_skip (`int`, *optional*):
|
||||
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
|
||||
the output of the pre-final layer will be used for computing the prompt embeddings.
|
||||
callback_on_step_end (`Callable`, *optional*):
|
||||
A function that calls at the end of each denoising steps during the inference. The function is called
|
||||
with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
|
||||
callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
|
||||
`callback_on_step_end_tensor_inputs`.
|
||||
callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
|
||||
A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
|
||||
each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
|
||||
DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
|
||||
list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
@@ -1445,6 +1448,9 @@ class StableDiffusionXLInpaintPipeline(
|
||||
"Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
|
||||
)
|
||||
|
||||
if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
|
||||
callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
|
||||
|
||||
# 0. Default height and width to unet
|
||||
height = height or self.unet.config.sample_size * self.vae_scale_factor
|
||||
width = width or self.unet.config.sample_size * self.vae_scale_factor
|
||||
|
||||
@@ -15,7 +15,7 @@ class TextToVideoSDPipelineOutput(BaseOutput):
|
||||
"""
|
||||
Output class for text-to-video pipelines.
|
||||
|
||||
Args:
|
||||
Args:
|
||||
frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
|
||||
List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
|
||||
denoised
|
||||
|
||||
@@ -347,11 +347,7 @@ class CMStochasticIterativeScheduler(SchedulerMixin, ConfigMixin):
|
||||
otherwise a tuple is returned where the first element is the sample tensor.
|
||||
"""
|
||||
|
||||
if (
|
||||
isinstance(timestep, int)
|
||||
or isinstance(timestep, torch.IntTensor)
|
||||
or isinstance(timestep, torch.LongTensor)
|
||||
):
|
||||
if isinstance(timestep, (int, torch.IntTensor, torch.LongTensor)):
|
||||
raise ValueError(
|
||||
(
|
||||
"Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
|
||||
|
||||
@@ -310,11 +310,7 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin):
|
||||
returned, otherwise a tuple is returned where the first element is the sample tensor.
|
||||
"""
|
||||
|
||||
if (
|
||||
isinstance(timestep, int)
|
||||
or isinstance(timestep, torch.IntTensor)
|
||||
or isinstance(timestep, torch.LongTensor)
|
||||
):
|
||||
if isinstance(timestep, (int, torch.IntTensor, torch.LongTensor)):
|
||||
raise ValueError(
|
||||
(
|
||||
"Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
|
||||
|
||||
@@ -375,11 +375,7 @@ class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
"""
|
||||
|
||||
if (
|
||||
isinstance(timestep, int)
|
||||
or isinstance(timestep, torch.IntTensor)
|
||||
or isinstance(timestep, torch.LongTensor)
|
||||
):
|
||||
if isinstance(timestep, (int, torch.IntTensor, torch.LongTensor)):
|
||||
raise ValueError(
|
||||
(
|
||||
"Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
|
||||
|
||||
@@ -530,11 +530,7 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
returned, otherwise a tuple is returned where the first element is the sample tensor.
|
||||
"""
|
||||
|
||||
if (
|
||||
isinstance(timestep, int)
|
||||
or isinstance(timestep, torch.IntTensor)
|
||||
or isinstance(timestep, torch.LongTensor)
|
||||
):
|
||||
if isinstance(timestep, (int, torch.IntTensor, torch.LongTensor)):
|
||||
raise ValueError(
|
||||
(
|
||||
"Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
|
||||
|
||||
@@ -58,20 +58,25 @@ from .import_utils import (
|
||||
get_objects_from_module,
|
||||
is_accelerate_available,
|
||||
is_accelerate_version,
|
||||
is_bitsandbytes_available,
|
||||
is_bs4_available,
|
||||
is_flax_available,
|
||||
is_ftfy_available,
|
||||
is_google_colab,
|
||||
is_inflect_available,
|
||||
is_invisible_watermark_available,
|
||||
is_k_diffusion_available,
|
||||
is_k_diffusion_version,
|
||||
is_librosa_available,
|
||||
is_note_seq_available,
|
||||
is_notebook,
|
||||
is_onnx_available,
|
||||
is_peft_available,
|
||||
is_peft_version,
|
||||
is_safetensors_available,
|
||||
is_scipy_available,
|
||||
is_tensorboard_available,
|
||||
is_timm_available,
|
||||
is_torch_available,
|
||||
is_torch_npu_available,
|
||||
is_torch_version,
|
||||
|
||||
@@ -295,6 +295,39 @@ try:
|
||||
except importlib_metadata.PackageNotFoundError:
|
||||
_torchvision_available = False
|
||||
|
||||
_timm_available = importlib.util.find_spec("timm") is not None
|
||||
if _timm_available:
|
||||
try:
|
||||
_timm_version = importlib_metadata.version("timm")
|
||||
logger.info(f"Timm version {_timm_version} available.")
|
||||
except importlib_metadata.PackageNotFoundError:
|
||||
_timm_available = False
|
||||
|
||||
|
||||
def is_timm_available():
|
||||
return _timm_available
|
||||
|
||||
|
||||
_bitsandbytes_available = importlib.util.find_spec("bitsandbytes") is not None
|
||||
try:
|
||||
_bitsandbytes_version = importlib_metadata.version("bitsandbytes")
|
||||
logger.debug(f"Successfully imported bitsandbytes version {_bitsandbytes_version}")
|
||||
except importlib_metadata.PackageNotFoundError:
|
||||
_bitsandbytes_available = False
|
||||
|
||||
# Taken from `huggingface_hub`.
|
||||
_is_notebook = False
|
||||
try:
|
||||
shell_class = get_ipython().__class__ # type: ignore # noqa: F821
|
||||
for parent_class in shell_class.__mro__: # e.g. "is subclass of"
|
||||
if parent_class.__name__ == "ZMQInteractiveShell":
|
||||
_is_notebook = True # Jupyter notebook, Google colab or qtconsole
|
||||
break
|
||||
except NameError:
|
||||
pass # Probably standard Python interpreter
|
||||
|
||||
_is_google_colab = "google.colab" in sys.modules
|
||||
|
||||
|
||||
def is_torch_available():
|
||||
return _torch_available
|
||||
@@ -392,6 +425,22 @@ def is_torchvision_available():
|
||||
return _torchvision_available
|
||||
|
||||
|
||||
def is_safetensors_available():
|
||||
return _safetensors_available
|
||||
|
||||
|
||||
def is_bitsandbytes_available():
|
||||
return _bitsandbytes_available
|
||||
|
||||
|
||||
def is_notebook():
|
||||
return _is_notebook
|
||||
|
||||
|
||||
def is_google_colab():
|
||||
return _is_google_colab
|
||||
|
||||
|
||||
# docstyle-ignore
|
||||
FLAX_IMPORT_ERROR = """
|
||||
{0} requires the FLAX library but it was not found in your environment. Checkout the instructions on the
|
||||
@@ -499,6 +548,20 @@ INVISIBLE_WATERMARK_IMPORT_ERROR = """
|
||||
{0} requires the invisible-watermark library but it was not found in your environment. You can install it with pip: `pip install invisible-watermark>=0.2.0`
|
||||
"""
|
||||
|
||||
# docstyle-ignore
|
||||
PEFT_IMPORT_ERROR = """
|
||||
{0} requires the peft library but it was not found in your environment. You can install it with pip: `pip install peft`
|
||||
"""
|
||||
|
||||
# docstyle-ignore
|
||||
SAFETENSORS_IMPORT_ERROR = """
|
||||
{0} requires the safetensors library but it was not found in your environment. You can install it with pip: `pip install safetensors`
|
||||
"""
|
||||
|
||||
# docstyle-ignore
|
||||
BITSANDBYTES_IMPORT_ERROR = """
|
||||
{0} requires the bitsandbytes library but it was not found in your environment. You can install it with pip: `pip install bitsandbytes`
|
||||
"""
|
||||
|
||||
BACKENDS_MAPPING = OrderedDict(
|
||||
[
|
||||
@@ -520,6 +583,9 @@ BACKENDS_MAPPING = OrderedDict(
|
||||
("ftfy", (is_ftfy_available, FTFY_IMPORT_ERROR)),
|
||||
("torchsde", (is_torchsde_available, TORCHSDE_IMPORT_ERROR)),
|
||||
("invisible_watermark", (is_invisible_watermark_available, INVISIBLE_WATERMARK_IMPORT_ERROR)),
|
||||
("peft", (is_peft_available, PEFT_IMPORT_ERROR)),
|
||||
("safetensors", (is_safetensors_available, SAFETENSORS_IMPORT_ERROR)),
|
||||
("bitsandbytes", (is_bitsandbytes_available, BITSANDBYTES_IMPORT_ERROR)),
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
@@ -33,6 +33,7 @@ from .import_utils import (
|
||||
is_onnx_available,
|
||||
is_opencv_available,
|
||||
is_peft_available,
|
||||
is_timm_available,
|
||||
is_torch_available,
|
||||
is_torch_version,
|
||||
is_torchsde_available,
|
||||
@@ -340,6 +341,13 @@ def require_peft_backend(test_case):
|
||||
return unittest.skipUnless(USE_PEFT_BACKEND, "test requires PEFT backend")(test_case)
|
||||
|
||||
|
||||
def require_timm(test_case):
|
||||
"""
|
||||
Decorator marking a test that requires timm. These tests are skipped when timm isn't installed.
|
||||
"""
|
||||
return unittest.skipUnless(is_timm_available(), "test requires timm")(test_case)
|
||||
|
||||
|
||||
def require_peft_version_greater(peft_version):
|
||||
"""
|
||||
Decorator marking a test that requires PEFT backend with a specific version, this would require some specific
|
||||
|
||||
@@ -30,17 +30,19 @@ class VideoProcessor(VaeImageProcessor):
|
||||
Preprocesses input video(s).
|
||||
|
||||
Args:
|
||||
video: The input video. It can be one of the following:
|
||||
video (`List[PIL.Image]`, `List[List[PIL.Image]]`, `torch.Tensor`, `np.array`, `List[torch.Tensor]`, `List[np.array]`):
|
||||
The input video. It can be one of the following:
|
||||
* List of the PIL images.
|
||||
* List of list of PIL images.
|
||||
* 4D Torch tensors (expected shape for each tensor: (num_frames, num_channels, height, width)).
|
||||
* 4D NumPy arrays (expected shape for each array: (num_frames, height, width, num_channels)).
|
||||
* List of 4D Torch tensors (expected shape for each tensor: (num_frames, num_channels, height, width)).
|
||||
* List of 4D NumPy arrays (expected shape for each array: (num_frames, height, width, num_channels)).
|
||||
* 5D NumPy arrays: expected shape for each array: (batch_size, num_frames, height, width,
|
||||
num_channels).
|
||||
* 5D Torch tensors: expected shape for each array: (batch_size, num_frames, num_channels, height,
|
||||
width).
|
||||
* 4D Torch tensors (expected shape for each tensor `(num_frames, num_channels, height, width)`).
|
||||
* 4D NumPy arrays (expected shape for each array `(num_frames, height, width, num_channels)`).
|
||||
* List of 4D Torch tensors (expected shape for each tensor `(num_frames, num_channels, height,
|
||||
width)`).
|
||||
* List of 4D NumPy arrays (expected shape for each array `(num_frames, height, width, num_channels)`).
|
||||
* 5D NumPy arrays: expected shape for each array `(batch_size, num_frames, height, width,
|
||||
num_channels)`.
|
||||
* 5D Torch tensors: expected shape for each array `(batch_size, num_frames, num_channels, height,
|
||||
width)`.
|
||||
height (`int`, *optional*, defaults to `None`):
|
||||
The height in preprocessed frames of the video. If `None`, will use the `get_default_height_width()` to
|
||||
get default height.
|
||||
|
||||
@@ -224,7 +224,7 @@ class LoraSDXLIntegrationTests(unittest.TestCase):
|
||||
).images
|
||||
|
||||
images = images[0, -3:, -3:, -1].flatten()
|
||||
expected = np.array([0.4468, 0.4087, 0.4134, 0.366, 0.3202, 0.3505, 0.3786, 0.387, 0.3535])
|
||||
expected = np.array([00.4468, 0.4061, 0.4134, 0.3637, 0.3202, 0.365, 0.3786, 0.3725, 0.3535])
|
||||
|
||||
max_diff = numpy_cosine_similarity_distance(expected, images)
|
||||
assert max_diff < 1e-4
|
||||
@@ -507,13 +507,12 @@ class LoraSDXLIntegrationTests(unittest.TestCase):
|
||||
image = load_image(
|
||||
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
|
||||
)
|
||||
|
||||
images = pipe(prompt, image=image, generator=generator, output_type="np", num_inference_steps=3).images
|
||||
|
||||
assert images[0].shape == (768, 512, 3)
|
||||
|
||||
original_image = images[0, -3:, -3:, -1].flatten()
|
||||
expected_image = np.array([0.4574, 0.4461, 0.4435, 0.4462, 0.4396, 0.439, 0.4474, 0.4486, 0.4333])
|
||||
expected_image = np.array([0.4574, 0.4487, 0.4435, 0.5163, 0.4396, 0.4411, 0.518, 0.4465, 0.4333])
|
||||
|
||||
max_diff = numpy_cosine_similarity_distance(expected_image, original_image)
|
||||
assert max_diff < 1e-4
|
||||
|
||||
@@ -98,3 +98,19 @@ class VQModelTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase):
|
||||
expected_output_slice = torch.tensor([-0.0153, -0.4044, -0.1880, -0.5161, -0.2418, -0.4072, -0.1612, -0.0633, -0.0143])
|
||||
# fmt: on
|
||||
self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
|
||||
|
||||
def test_loss_pretrained(self):
|
||||
model = VQModel.from_pretrained("fusing/vqgan-dummy")
|
||||
model.to(torch_device).eval()
|
||||
|
||||
torch.manual_seed(0)
|
||||
backend_manual_seed(torch_device, 0)
|
||||
|
||||
image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
|
||||
image = image.to(torch_device)
|
||||
with torch.no_grad():
|
||||
output = model(image).commit_loss.cpu()
|
||||
# fmt: off
|
||||
expected_output = torch.tensor([0.1936])
|
||||
# fmt: on
|
||||
self.assertTrue(torch.allclose(output, expected_output, atol=1e-3))
|
||||
|
||||
@@ -1,191 +0,0 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2024 HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import gc
|
||||
import unittest
|
||||
|
||||
import torch
|
||||
|
||||
from diffusers import StableCascadeUNet
|
||||
from diffusers.utils import logging
|
||||
from diffusers.utils.testing_utils import (
|
||||
enable_full_determinism,
|
||||
numpy_cosine_similarity_distance,
|
||||
require_torch_gpu,
|
||||
slow,
|
||||
)
|
||||
from diffusers.utils.torch_utils import randn_tensor
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
enable_full_determinism()
|
||||
|
||||
|
||||
@slow
|
||||
class StableCascadeUNetModelSlowTests(unittest.TestCase):
|
||||
def tearDown(self) -> None:
|
||||
super().tearDown()
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
def test_stable_cascade_unet_prior_single_file_components(self):
|
||||
single_file_url = "https://huggingface.co/stabilityai/stable-cascade/blob/main/stage_c_bf16.safetensors"
|
||||
single_file_unet = StableCascadeUNet.from_single_file(single_file_url)
|
||||
|
||||
single_file_unet_config = single_file_unet.config
|
||||
del single_file_unet
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
unet = StableCascadeUNet.from_pretrained("stabilityai/stable-cascade-prior", subfolder="prior", variant="bf16")
|
||||
unet_config = unet.config
|
||||
del unet
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"]
|
||||
for param_name, param_value in single_file_unet_config.items():
|
||||
if param_name in PARAMS_TO_IGNORE:
|
||||
continue
|
||||
|
||||
assert unet_config[param_name] == param_value
|
||||
|
||||
def test_stable_cascade_unet_decoder_single_file_components(self):
|
||||
single_file_url = "https://huggingface.co/stabilityai/stable-cascade/blob/main/stage_b_bf16.safetensors"
|
||||
single_file_unet = StableCascadeUNet.from_single_file(single_file_url)
|
||||
|
||||
single_file_unet_config = single_file_unet.config
|
||||
del single_file_unet
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
unet = StableCascadeUNet.from_pretrained("stabilityai/stable-cascade", subfolder="decoder", variant="bf16")
|
||||
unet_config = unet.config
|
||||
del unet
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"]
|
||||
for param_name, param_value in single_file_unet_config.items():
|
||||
if param_name in PARAMS_TO_IGNORE:
|
||||
continue
|
||||
|
||||
assert unet_config[param_name] == param_value
|
||||
|
||||
def test_stable_cascade_unet_config_loading(self):
|
||||
config = StableCascadeUNet.load_config(
|
||||
pretrained_model_name_or_path="diffusers/stable-cascade-configs", subfolder="prior"
|
||||
)
|
||||
single_file_url = "https://huggingface.co/stabilityai/stable-cascade/blob/main/stage_c_bf16.safetensors"
|
||||
|
||||
single_file_unet = StableCascadeUNet.from_single_file(single_file_url, config=config)
|
||||
single_file_unet_config = single_file_unet.config
|
||||
del single_file_unet
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"]
|
||||
for param_name, param_value in config.items():
|
||||
if param_name in PARAMS_TO_IGNORE:
|
||||
continue
|
||||
|
||||
assert single_file_unet_config[param_name] == param_value
|
||||
|
||||
@require_torch_gpu
|
||||
def test_stable_cascade_unet_single_file_prior_forward_pass(self):
|
||||
dtype = torch.bfloat16
|
||||
generator = torch.Generator("cpu")
|
||||
|
||||
model_inputs = {
|
||||
"sample": randn_tensor((1, 16, 24, 24), generator=generator.manual_seed(0)).to("cuda", dtype),
|
||||
"timestep_ratio": torch.tensor([1]).to("cuda", dtype),
|
||||
"clip_text_pooled": randn_tensor((1, 1, 1280), generator=generator.manual_seed(0)).to("cuda", dtype),
|
||||
"clip_text": randn_tensor((1, 77, 1280), generator=generator.manual_seed(0)).to("cuda", dtype),
|
||||
"clip_img": randn_tensor((1, 1, 768), generator=generator.manual_seed(0)).to("cuda", dtype),
|
||||
"pixels": randn_tensor((1, 3, 8, 8), generator=generator.manual_seed(0)).to("cuda", dtype),
|
||||
}
|
||||
|
||||
unet = StableCascadeUNet.from_pretrained(
|
||||
"stabilityai/stable-cascade-prior",
|
||||
subfolder="prior",
|
||||
revision="refs/pr/2",
|
||||
variant="bf16",
|
||||
torch_dtype=dtype,
|
||||
)
|
||||
unet.to("cuda")
|
||||
with torch.no_grad():
|
||||
prior_output = unet(**model_inputs).sample.float().cpu().numpy()
|
||||
|
||||
# Remove UNet from GPU memory before loading the single file UNet model
|
||||
del unet
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
single_file_url = "https://huggingface.co/stabilityai/stable-cascade/blob/main/stage_c_bf16.safetensors"
|
||||
single_file_unet = StableCascadeUNet.from_single_file(single_file_url, torch_dtype=dtype)
|
||||
single_file_unet.to("cuda")
|
||||
with torch.no_grad():
|
||||
prior_single_file_output = single_file_unet(**model_inputs).sample.float().cpu().numpy()
|
||||
|
||||
# Remove UNet from GPU memory before loading the single file UNet model
|
||||
del single_file_unet
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
max_diff = numpy_cosine_similarity_distance(prior_output.flatten(), prior_single_file_output.flatten())
|
||||
assert max_diff < 8e-3
|
||||
|
||||
@require_torch_gpu
|
||||
def test_stable_cascade_unet_single_file_decoder_forward_pass(self):
|
||||
dtype = torch.float32
|
||||
generator = torch.Generator("cpu")
|
||||
|
||||
model_inputs = {
|
||||
"sample": randn_tensor((1, 4, 256, 256), generator=generator.manual_seed(0)).to("cuda", dtype),
|
||||
"timestep_ratio": torch.tensor([1]).to("cuda", dtype),
|
||||
"clip_text": randn_tensor((1, 77, 1280), generator=generator.manual_seed(0)).to("cuda", dtype),
|
||||
"clip_text_pooled": randn_tensor((1, 1, 1280), generator=generator.manual_seed(0)).to("cuda", dtype),
|
||||
"pixels": randn_tensor((1, 3, 8, 8), generator=generator.manual_seed(0)).to("cuda", dtype),
|
||||
}
|
||||
|
||||
unet = StableCascadeUNet.from_pretrained(
|
||||
"stabilityai/stable-cascade",
|
||||
subfolder="decoder",
|
||||
revision="refs/pr/44",
|
||||
torch_dtype=dtype,
|
||||
)
|
||||
unet.to("cuda")
|
||||
with torch.no_grad():
|
||||
prior_output = unet(**model_inputs).sample.float().cpu().numpy()
|
||||
|
||||
# Remove UNet from GPU memory before loading the single file UNet model
|
||||
del unet
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
single_file_url = "https://huggingface.co/stabilityai/stable-cascade/blob/main/stage_b.safetensors"
|
||||
single_file_unet = StableCascadeUNet.from_single_file(single_file_url, torch_dtype=dtype)
|
||||
single_file_unet.to("cuda")
|
||||
with torch.no_grad():
|
||||
prior_single_file_output = single_file_unet(**model_inputs).sample.float().cpu().numpy()
|
||||
|
||||
# Remove UNet from GPU memory before loading the single file UNet model
|
||||
del single_file_unet
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
max_diff = numpy_cosine_similarity_distance(prior_output.flatten(), prior_single_file_output.flatten())
|
||||
assert max_diff < 1e-4
|
||||
@@ -19,7 +19,15 @@ import unittest
|
||||
import numpy as np
|
||||
import torch
|
||||
from PIL import Image
|
||||
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
|
||||
from transformers import (
|
||||
CLIPImageProcessor,
|
||||
CLIPTextConfig,
|
||||
CLIPTextModel,
|
||||
CLIPTextModelWithProjection,
|
||||
CLIPTokenizer,
|
||||
CLIPVisionConfig,
|
||||
CLIPVisionModelWithProjection,
|
||||
)
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKL,
|
||||
@@ -34,6 +42,7 @@ from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor
|
||||
from ..pipeline_params import (
|
||||
IMAGE_TO_IMAGE_IMAGE_PARAMS,
|
||||
TEXT_TO_IMAGE_BATCH_PARAMS,
|
||||
TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
|
||||
TEXT_TO_IMAGE_IMAGE_PARAMS,
|
||||
TEXT_TO_IMAGE_PARAMS,
|
||||
)
|
||||
@@ -55,6 +64,14 @@ class ControlNetPipelineSDXLFastTests(
|
||||
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
|
||||
image_params = frozenset(IMAGE_TO_IMAGE_IMAGE_PARAMS.union({"mask_image", "control_image"}))
|
||||
image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
|
||||
callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS.union(
|
||||
{
|
||||
"add_text_embeds",
|
||||
"add_time_ids",
|
||||
"mask",
|
||||
"masked_image_latents",
|
||||
}
|
||||
)
|
||||
|
||||
def get_dummy_components(self):
|
||||
torch.manual_seed(0)
|
||||
@@ -129,6 +146,30 @@ class ControlNetPipelineSDXLFastTests(
|
||||
text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
|
||||
tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
|
||||
image_encoder_config = CLIPVisionConfig(
|
||||
hidden_size=32,
|
||||
image_size=224,
|
||||
projection_dim=32,
|
||||
intermediate_size=37,
|
||||
num_attention_heads=4,
|
||||
num_channels=3,
|
||||
num_hidden_layers=5,
|
||||
patch_size=14,
|
||||
)
|
||||
|
||||
image_encoder = CLIPVisionModelWithProjection(image_encoder_config)
|
||||
|
||||
feature_extractor = CLIPImageProcessor(
|
||||
crop_size=224,
|
||||
do_center_crop=True,
|
||||
do_normalize=True,
|
||||
do_resize=True,
|
||||
image_mean=[0.48145466, 0.4578275, 0.40821073],
|
||||
image_std=[0.26862954, 0.26130258, 0.27577711],
|
||||
resample=3,
|
||||
size=224,
|
||||
)
|
||||
|
||||
components = {
|
||||
"unet": unet,
|
||||
"controlnet": controlnet,
|
||||
@@ -138,6 +179,8 @@ class ControlNetPipelineSDXLFastTests(
|
||||
"tokenizer": tokenizer,
|
||||
"text_encoder_2": text_encoder_2,
|
||||
"tokenizer_2": tokenizer_2,
|
||||
"image_encoder": image_encoder,
|
||||
"feature_extractor": feature_extractor,
|
||||
}
|
||||
return components
|
||||
|
||||
|
||||
@@ -34,6 +34,7 @@ from ..pipeline_params import (
|
||||
IMAGE_TO_IMAGE_IMAGE_PARAMS,
|
||||
TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
|
||||
TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
|
||||
TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
|
||||
)
|
||||
from ..test_pipelines_common import (
|
||||
IPAdapterTesterMixin,
|
||||
@@ -55,9 +56,13 @@ class ControlNetPipelineSDXLImg2ImgFastTests(
|
||||
):
|
||||
pipeline_class = StableDiffusionXLControlNetImg2ImgPipeline
|
||||
params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS
|
||||
required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
|
||||
batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
|
||||
image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
|
||||
image_latents_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
|
||||
callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS.union(
|
||||
{"add_text_embeds", "add_time_ids", "add_neg_time_ids"}
|
||||
)
|
||||
|
||||
def get_dummy_components(self, skip_first_text_encoder=False):
|
||||
torch.manual_seed(0)
|
||||
|
||||
@@ -336,7 +336,7 @@ class PixArtSigmaPipelineIntegrationTests(unittest.TestCase):
|
||||
image = pipe(prompt, generator=generator, num_inference_steps=2, output_type="np").images
|
||||
|
||||
image_slice = image[0, -3:, -3:, -1]
|
||||
expected_slice = np.array([0.0742, 0.0835, 0.2114, 0.0295, 0.0784, 0.2361, 0.1738, 0.2251, 0.3589])
|
||||
expected_slice = np.array([0.4517, 0.4446, 0.4375, 0.449, 0.4399, 0.4365, 0.4583, 0.4629, 0.4473])
|
||||
|
||||
max_diff = numpy_cosine_similarity_distance(image_slice.flatten(), expected_slice)
|
||||
self.assertLessEqual(max_diff, 1e-4)
|
||||
@@ -344,7 +344,12 @@ class PixArtSigmaPipelineIntegrationTests(unittest.TestCase):
|
||||
def test_pixart_512(self):
|
||||
generator = torch.Generator("cpu").manual_seed(0)
|
||||
|
||||
pipe = PixArtSigmaPipeline.from_pretrained(self.ckpt_id_512, torch_dtype=torch.float16)
|
||||
transformer = Transformer2DModel.from_pretrained(
|
||||
self.ckpt_id_512, subfolder="transformer", torch_dtype=torch.float16
|
||||
)
|
||||
pipe = PixArtSigmaPipeline.from_pretrained(
|
||||
self.ckpt_id_1024, transformer=transformer, torch_dtype=torch.float16
|
||||
)
|
||||
pipe.enable_model_cpu_offload()
|
||||
|
||||
prompt = self.prompt
|
||||
@@ -352,7 +357,7 @@ class PixArtSigmaPipelineIntegrationTests(unittest.TestCase):
|
||||
image = pipe(prompt, generator=generator, num_inference_steps=2, output_type="np").images
|
||||
|
||||
image_slice = image[0, -3:, -3:, -1]
|
||||
expected_slice = np.array([0.3477, 0.3882, 0.4541, 0.3413, 0.3821, 0.4463, 0.4001, 0.4409, 0.4958])
|
||||
expected_slice = np.array([0.0479, 0.0378, 0.0217, 0.0942, 0.064, 0.0791, 0.2073, 0.1975, 0.2017])
|
||||
|
||||
max_diff = numpy_cosine_similarity_distance(image_slice.flatten(), expected_slice)
|
||||
self.assertLessEqual(max_diff, 1e-4)
|
||||
@@ -394,7 +399,12 @@ class PixArtSigmaPipelineIntegrationTests(unittest.TestCase):
|
||||
def test_pixart_512_without_resolution_binning(self):
|
||||
generator = torch.manual_seed(0)
|
||||
|
||||
pipe = PixArtSigmaPipeline.from_pretrained(self.ckpt_id_512, torch_dtype=torch.float16)
|
||||
transformer = Transformer2DModel.from_pretrained(
|
||||
self.ckpt_id_512, subfolder="transformer", torch_dtype=torch.float16
|
||||
)
|
||||
pipe = PixArtSigmaPipeline.from_pretrained(
|
||||
self.ckpt_id_1024, transformer=transformer, torch_dtype=torch.float16
|
||||
)
|
||||
pipe.enable_model_cpu_offload()
|
||||
|
||||
prompt = self.prompt
|
||||
|
||||
@@ -82,6 +82,7 @@ class StableDiffusionPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFile
|
||||
assert pipe.vae.config.scaling_factor == new_scaling_factor
|
||||
|
||||
|
||||
@slow
|
||||
class StableDiffusion21PipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin):
|
||||
pipeline_class = StableDiffusionPipeline
|
||||
ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-2-1/blob/main/v2-1_768-ema-pruned.safetensors"
|
||||
|
||||
@@ -73,7 +73,7 @@ diffusers_module = spec.loader.load_module()
|
||||
|
||||
# Thanks to https://stackoverflow.com/questions/29916065/how-to-do-camelcase-split-in-python
|
||||
def camel_case_split(identifier):
|
||||
"Split a camelcased `identifier` into words."
|
||||
"""Split a camelcased `identifier` into words."""
|
||||
matches = re.finditer(".+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)", identifier)
|
||||
return [m.group(0) for m in matches]
|
||||
|
||||
|
||||
Reference in New Issue
Block a user