add: utility to format our docs too 📜 (#7314)
* add: utility to format our docs too 📜
* debugging saga
* fix: message
* checking
* should be fixed.
* revert pipeline_fixture
* remove empty line
* make style
* fix: setup.py
* style.
This commit is contained in:
@@ -32,9 +32,7 @@ jobs:
|
||||
python -m pip install --upgrade pip
|
||||
pip install .[quality]
|
||||
- name: Check quality
|
||||
run: |
|
||||
ruff check examples tests src utils scripts
|
||||
ruff format examples tests src utils scripts --check
|
||||
run: make quality
|
||||
- name: Check if failure
|
||||
if: ${{ failure() }}
|
||||
run: |
|
||||
@@ -53,7 +51,7 @@ jobs:
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install .[quality]
|
||||
- name: Check quality
|
||||
- name: Check repo consistency
|
||||
run: |
|
||||
python utils/check_copies.py
|
||||
python utils/check_dummies.py
|
||||
|
||||
@@ -40,9 +40,7 @@ jobs:
|
||||
python -m pip install --upgrade pip
|
||||
pip install .[quality]
|
||||
- name: Check quality
|
||||
run: |
|
||||
ruff check examples tests src utils scripts
|
||||
ruff format examples tests src utils scripts --check
|
||||
run: make quality
|
||||
- name: Check if failure
|
||||
if: ${{ failure() }}
|
||||
run: |
|
||||
@@ -61,7 +59,7 @@ jobs:
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install .[quality]
|
||||
- name: Check quality
|
||||
- name: Check repo consistency
|
||||
run: |
|
||||
python utils/check_copies.py
|
||||
python utils/check_dummies.py
|
||||
|
||||
@@ -42,6 +42,7 @@ repo-consistency:
|
||||
quality:
|
||||
ruff check $(check_dirs) setup.py
|
||||
ruff format --check $(check_dirs) setup.py
|
||||
doc-builder style src/diffusers docs/source --max_len 119 --check_only
|
||||
python utils/check_doc_toc.py
|
||||
|
||||
# Format source code automatically and check if there are any problems left that need manual fixing
|
||||
@@ -55,6 +56,7 @@ extra_style_checks:
|
||||
style:
|
||||
ruff check $(check_dirs) setup.py --fix
|
||||
ruff format $(check_dirs) setup.py
|
||||
doc-builder style src/diffusers docs/source --max_len 119
|
||||
${MAKE} autogenerate_code
|
||||
${MAKE} extra_style_checks
|
||||
|
||||
|
||||
@@ -134,6 +134,7 @@ _deps = [
|
||||
"torchvision",
|
||||
"transformers>=4.25.1",
|
||||
"urllib3<=2.0.0",
|
||||
"black",
|
||||
]
|
||||
|
||||
# this is a lookup table with items like:
|
||||
|
||||
@@ -42,4 +42,5 @@ deps = {
|
||||
"torchvision": "torchvision",
|
||||
"transformers": "transformers>=4.25.1",
|
||||
"urllib3": "urllib3<=2.0.0",
|
||||
"black": "black",
|
||||
}
|
||||
|
||||
@@ -173,8 +173,9 @@ class VaeImageProcessor(ConfigMixin):
|
||||
@staticmethod
|
||||
def get_crop_region(mask_image: PIL.Image.Image, width: int, height: int, pad=0):
|
||||
"""
|
||||
Finds a rectangular region that contains all masked ares in an image, and expands region to match the aspect ratio of the original image;
|
||||
for example, if user drew mask in a 128x32 region, and the dimensions for processing are 512x512, the region will be expanded to 128x128.
|
||||
Finds a rectangular region that contains all masked areas in an image, and expands region to match the aspect
|
||||
ratio of the original image; for example, if user drew mask in a 128x32 region, and the dimensions for
|
||||
processing are 512x512, the region will be expanded to 128x128.
|
||||
|
||||
Args:
|
||||
mask_image (PIL.Image.Image): Mask image.
|
||||
@@ -183,7 +184,8 @@ class VaeImageProcessor(ConfigMixin):
|
||||
pad (int, optional): Padding to be added to the crop region. Defaults to 0.
|
||||
|
||||
Returns:
|
||||
tuple: (x1, y1, x2, y2) represent a rectangular region that contains all masked ares in an image and matches the original aspect ratio.
|
||||
tuple: (x1, y1, x2, y2) represent a rectangular region that contains all masked areas in an image and
|
||||
matches the original aspect ratio.
|
||||
"""
|
||||
|
||||
mask_image = mask_image.convert("L")
|
||||
@@ -265,7 +267,8 @@ class VaeImageProcessor(ConfigMixin):
|
||||
height: int,
|
||||
) -> PIL.Image.Image:
|
||||
"""
|
||||
Resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image within the dimensions, filling empty with data from image.
|
||||
Resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center
|
||||
the image within the dimensions, filling empty with data from image.
|
||||
|
||||
Args:
|
||||
image: The image to resize.
|
||||
@@ -309,7 +312,8 @@ class VaeImageProcessor(ConfigMixin):
|
||||
height: int,
|
||||
) -> PIL.Image.Image:
|
||||
"""
|
||||
Resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image within the dimensions, cropping the excess.
|
||||
Resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center
|
||||
the image within the dimensions, cropping the excess.
|
||||
|
||||
Args:
|
||||
image: The image to resize.
|
||||
@@ -346,12 +350,12 @@ class VaeImageProcessor(ConfigMixin):
|
||||
The width to resize to.
|
||||
resize_mode (`str`, *optional*, defaults to `default`):
|
||||
The resize mode to use, can be one of `default` or `fill`. If `default`, will resize the image to fit
|
||||
within the specified width and height, and it may not maintaining the original aspect ratio.
|
||||
If `fill`, will resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image
|
||||
within the dimensions, filling empty with data from image.
|
||||
If `crop`, will resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image
|
||||
within the dimensions, cropping the excess.
|
||||
Note that resize_mode `fill` and `crop` are only supported for PIL image input.
|
||||
within the specified width and height, and it may not maintain the original aspect ratio. If `fill`,
|
||||
will resize the image to fit within the specified width and height, maintaining the aspect ratio, and
|
||||
then center the image within the dimensions, filling empty with data from image. If `crop`, will resize
|
||||
the image to fit within the specified width and height, maintaining the aspect ratio, and then center
|
||||
the image within the dimensions, cropping the excess. Note that resize_mode `fill` and `crop` are only
|
||||
supported for PIL image input.
|
||||
|
||||
Returns:
|
||||
`PIL.Image.Image`, `np.ndarray` or `torch.Tensor`:
|
||||
@@ -456,19 +460,21 @@ class VaeImageProcessor(ConfigMixin):
|
||||
|
||||
Args:
|
||||
image (`pipeline_image_input`):
|
||||
The image input, accepted formats are PIL images, NumPy arrays, PyTorch tensors; Also accept list of supported formats.
|
||||
The image input, accepted formats are PIL images, NumPy arrays, PyTorch tensors; Also accept list of
|
||||
supported formats.
|
||||
height (`int`, *optional*, defaults to `None`):
|
||||
The height in preprocessed image. If `None`, will use the `get_default_height_width()` to get default height.
|
||||
The height in preprocessed image. If `None`, will use the `get_default_height_width()` to get default
|
||||
height.
|
||||
width (`int`, *optional*, defaults to `None`):
|
||||
The width in preprocessed. If `None`, will use get_default_height_width()` to get the default width.
|
||||
The width in preprocessed image. If `None`, will use `get_default_height_width()` to get the default width.
|
||||
resize_mode (`str`, *optional*, defaults to `default`):
|
||||
The resize mode, can be one of `default` or `fill`. If `default`, will resize the image to fit
|
||||
within the specified width and height, and it may not maintaining the original aspect ratio.
|
||||
If `fill`, will resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image
|
||||
within the dimensions, filling empty with data from image.
|
||||
If `crop`, will resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image
|
||||
within the dimensions, cropping the excess.
|
||||
Note that resize_mode `fill` and `crop` are only supported for PIL image input.
|
||||
The resize mode, can be one of `default` or `fill`. If `default`, will resize the image to fit within
|
||||
the specified width and height, and it may not maintain the original aspect ratio. If `fill`, will
|
||||
resize the image to fit within the specified width and height, maintaining the aspect ratio, and then
|
||||
center the image within the dimensions, filling empty with data from image. If `crop`, will resize the
|
||||
image to fit within the specified width and height, maintaining the aspect ratio, and then center the
|
||||
image within the dimensions, cropping the excess. Note that resize_mode `fill` and `crop` are only
|
||||
supported for PIL image input.
|
||||
crops_coords (`List[Tuple[int, int, int, int]]`, *optional*, defaults to `None`):
|
||||
The crop coordinates for each image in the batch. If `None`, will not crop the image.
|
||||
"""
|
||||
@@ -930,8 +936,8 @@ class IPAdapterMaskProcessor(VaeImageProcessor):
|
||||
@staticmethod
|
||||
def downsample(mask: torch.FloatTensor, batch_size: int, num_queries: int, value_embed_dim: int):
|
||||
"""
|
||||
Downsamples the provided mask tensor to match the expected dimensions for scaled dot-product attention.
|
||||
If the aspect ratio of the mask does not match the aspect ratio of the output image, a warning is issued.
|
||||
Downsamples the provided mask tensor to match the expected dimensions for scaled dot-product attention. If the
|
||||
aspect ratio of the mask does not match the aspect ratio of the output image, a warning is issued.
|
||||
|
||||
Args:
|
||||
mask (`torch.FloatTensor`):
|
||||
|
||||
@@ -67,17 +67,18 @@ class IPAdapterMixin:
|
||||
- A [torch state
|
||||
dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict).
|
||||
subfolder (`str` or `List[str]`):
|
||||
The subfolder location of a model file within a larger model repository on the Hub or locally.
|
||||
If a list is passed, it should have the same length as `weight_name`.
|
||||
The subfolder location of a model file within a larger model repository on the Hub or locally. If a
|
||||
list is passed, it should have the same length as `weight_name`.
|
||||
weight_name (`str` or `List[str]`):
|
||||
The name of the weight file to load. If a list is passed, it should have the same length as
|
||||
`weight_name`.
|
||||
image_encoder_folder (`str`, *optional*, defaults to `image_encoder`):
|
||||
The subfolder location of the image encoder within a larger model repository on the Hub or locally.
|
||||
Pass `None` to not load the image encoder. If the image encoder is located in a folder inside `subfolder`,
|
||||
you only need to pass the name of the folder that contains image encoder weights, e.g. `image_encoder_folder="image_encoder"`.
|
||||
If the image encoder is located in a folder other than `subfolder`, you should pass the path to the folder that contains image encoder weights,
|
||||
for example, `image_encoder_folder="different_subfolder/image_encoder"`.
|
||||
Pass `None` to not load the image encoder. If the image encoder is located in a folder inside
|
||||
`subfolder`, you only need to pass the name of the folder that contains image encoder weights, e.g.
|
||||
`image_encoder_folder="image_encoder"`. If the image encoder is located in a folder other than
|
||||
`subfolder`, you should pass the path to the folder that contains image encoder weights, for example,
|
||||
`image_encoder_folder="different_subfolder/image_encoder"`.
|
||||
cache_dir (`Union[str, os.PathLike]`, *optional*):
|
||||
Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
|
||||
is not used.
|
||||
|
||||
@@ -20,7 +20,8 @@ from ..utils import MIN_PEFT_VERSION, check_peft_version, is_peft_available
|
||||
class PeftAdapterMixin:
|
||||
"""
|
||||
A class containing all functions for loading and using adapters weights that are supported in PEFT library. For
|
||||
more details about adapters and injecting them in a transformer-based model, check out the PEFT [documentation](https://huggingface.co/docs/peft/index).
|
||||
more details about adapters and injecting them in a transformer-based model, check out the PEFT
|
||||
[documentation](https://huggingface.co/docs/peft/index).
|
||||
|
||||
Install the latest version of PEFT, and use this mixin to:
|
||||
|
||||
@@ -143,8 +144,8 @@ class PeftAdapterMixin:
|
||||
|
||||
def enable_adapters(self) -> None:
|
||||
"""
|
||||
Enable adapters that are attached to the model. The model uses `self.active_adapters()` to retrieve the
|
||||
list of adapters to enable.
|
||||
Enable adapters that are attached to the model. The model uses `self.active_adapters()` to retrieve the list of
|
||||
adapters to enable.
|
||||
|
||||
If you are not familiar with adapters and PEFT methods, we invite you to read more about them on the PEFT
|
||||
[documentation](https://huggingface.co/docs/peft).
|
||||
|
||||
@@ -198,19 +198,24 @@ class FromSingleFileMixin:
|
||||
model_type (`str`, *optional*):
|
||||
The type of model to load. If not provided, the model type will be inferred from the checkpoint file.
|
||||
image_size (`int`, *optional*):
|
||||
The size of the image output. It's used to configure the `sample_size` parameter of the UNet and VAE model.
|
||||
The size of the image output. It's used to configure the `sample_size` parameter of the UNet and VAE
|
||||
model.
|
||||
load_safety_checker (`bool`, *optional*, defaults to `False`):
|
||||
Whether to load the safety checker model or not. By default, the safety checker is not loaded unless a `safety_checker` component is passed to the `kwargs`.
|
||||
Whether to load the safety checker model or not. By default, the safety checker is not loaded unless a
|
||||
`safety_checker` component is passed to the `kwargs`.
|
||||
num_in_channels (`int`, *optional*):
|
||||
Specify the number of input channels for the UNet model. Read more about how to configure UNet model with this parameter
|
||||
Specify the number of input channels for the UNet model. Read more about how to configure UNet model
|
||||
with this parameter
|
||||
[here](https://huggingface.co/docs/diffusers/training/adapt_a_model#configure-unet2dconditionmodel-parameters).
|
||||
scaling_factor (`float`, *optional*):
|
||||
The scaling factor to use for the VAE model. If not provided, it is inferred from the config file first.
|
||||
If the scaling factor is not found in the config file, the default value 0.18215 is used.
|
||||
The scaling factor to use for the VAE model. If not provided, it is inferred from the config file
|
||||
first. If the scaling factor is not found in the config file, the default value 0.18215 is used.
|
||||
scheduler_type (`str`, *optional*):
|
||||
The type of scheduler to load. If not provided, the scheduler type will be inferred from the checkpoint file.
|
||||
The type of scheduler to load. If not provided, the scheduler type will be inferred from the checkpoint
|
||||
file.
|
||||
prediction_type (`str`, *optional*):
|
||||
The type of prediction to load. If not provided, the prediction type will be inferred from the checkpoint file.
|
||||
The type of prediction to load. If not provided, the prediction type will be inferred from the
|
||||
checkpoint file.
|
||||
kwargs (remaining dictionary of keyword arguments, *optional*):
|
||||
Can be used to overwrite load and saveable variables (the pipeline components of the specific pipeline
|
||||
class). The overwritten components are passed directly to the pipelines `__init__` method. See example
|
||||
|
||||
@@ -487,20 +487,35 @@ class TextualInversionLoaderMixin:
|
||||
|
||||
# Example 3: unload from SDXL
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
|
||||
embedding_path = hf_hub_download(repo_id="linoyts/web_y2k", filename="web_y2k_emb.safetensors", repo_type="model")
|
||||
embedding_path = hf_hub_download(
|
||||
repo_id="linoyts/web_y2k", filename="web_y2k_emb.safetensors", repo_type="model"
|
||||
)
|
||||
|
||||
# load embeddings to the text encoders
|
||||
state_dict = load_file(embedding_path)
|
||||
|
||||
# load embeddings of text_encoder 1 (CLIP ViT-L/14)
|
||||
pipeline.load_textual_inversion(state_dict["clip_l"], token=["<s0>", "<s1>"], text_encoder=pipeline.text_encoder, tokenizer=pipeline.tokenizer)
|
||||
pipeline.load_textual_inversion(
|
||||
state_dict["clip_l"],
|
||||
token=["<s0>", "<s1>"],
|
||||
text_encoder=pipeline.text_encoder,
|
||||
tokenizer=pipeline.tokenizer,
|
||||
)
|
||||
# load embeddings of text_encoder 2 (CLIP ViT-G/14)
|
||||
pipeline.load_textual_inversion(state_dict["clip_g"], token=["<s0>", "<s1>"], text_encoder=pipeline.text_encoder_2, tokenizer=pipeline.tokenizer_2)
|
||||
pipeline.load_textual_inversion(
|
||||
state_dict["clip_g"],
|
||||
token=["<s0>", "<s1>"],
|
||||
text_encoder=pipeline.text_encoder_2,
|
||||
tokenizer=pipeline.tokenizer_2,
|
||||
)
|
||||
|
||||
# Unload explicitly from both text encoders and tokenizers
|
||||
pipeline.unload_textual_inversion(tokens=["<s0>", "<s1>"], text_encoder=pipeline.text_encoder, tokenizer=pipeline.tokenizer)
|
||||
pipeline.unload_textual_inversion(tokens=["<s0>", "<s1>"], text_encoder=pipeline.text_encoder_2, tokenizer=pipeline.tokenizer_2)
|
||||
|
||||
pipeline.unload_textual_inversion(
|
||||
tokens=["<s0>", "<s1>"], text_encoder=pipeline.text_encoder, tokenizer=pipeline.tokenizer
|
||||
)
|
||||
pipeline.unload_textual_inversion(
|
||||
tokens=["<s0>", "<s1>"], text_encoder=pipeline.text_encoder_2, tokenizer=pipeline.tokenizer_2
|
||||
)
|
||||
```
|
||||
"""
|
||||
|
||||
|
||||
@@ -74,37 +74,24 @@ def _maybe_expand_lora_scales_for_one_adapter(
|
||||
|
||||
E.g. turns
|
||||
```python
|
||||
scales = {
|
||||
'down': 2,
|
||||
'mid': 3,
|
||||
'up': {
|
||||
'block_0': 4,
|
||||
'block_1': [5, 6, 7]
|
||||
}
|
||||
}
|
||||
blocks_with_transformer = {
|
||||
'down': [1,2],
|
||||
'up': [0,1]
|
||||
}
|
||||
transformer_per_block = {
|
||||
'down': 2,
|
||||
'up': 3
|
||||
}
|
||||
scales = {"down": 2, "mid": 3, "up": {"block_0": 4, "block_1": [5, 6, 7]}}
|
||||
blocks_with_transformer = {"down": [1, 2], "up": [0, 1]}
|
||||
transformer_per_block = {"down": 2, "up": 3}
|
||||
```
|
||||
into
|
||||
```python
|
||||
{
|
||||
'down.block_1.0': 2,
|
||||
'down.block_1.1': 2,
|
||||
'down.block_2.0': 2,
|
||||
'down.block_2.1': 2,
|
||||
'mid': 3,
|
||||
'up.block_0.0': 4,
|
||||
'up.block_0.1': 4,
|
||||
'up.block_0.2': 4,
|
||||
'up.block_1.0': 5,
|
||||
'up.block_1.1': 6,
|
||||
'up.block_1.2': 7,
|
||||
"down.block_1.0": 2,
|
||||
"down.block_1.1": 2,
|
||||
"down.block_2.0": 2,
|
||||
"down.block_2.1": 2,
|
||||
"mid": 3,
|
||||
"up.block_0.0": 4,
|
||||
"up.block_0.1": 4,
|
||||
"up.block_0.2": 4,
|
||||
"up.block_1.0": 5,
|
||||
"up.block_1.1": 6,
|
||||
"up.block_1.2": 7,
|
||||
}
|
||||
```
|
||||
"""
|
||||
|
||||
@@ -1298,9 +1298,9 @@ class AttnProcessor2_0:
|
||||
|
||||
class FusedAttnProcessor2_0:
|
||||
r"""
|
||||
Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
|
||||
It uses fused projection layers. For self-attention modules, all projection matrices (i.e., query,
|
||||
key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
|
||||
Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). It uses
|
||||
fused projection layers. For self-attention modules, all projection matrices (i.e., query, key, value) are fused.
|
||||
For cross-attention modules, key and value projection matrices are fused.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
|
||||
@@ -453,8 +453,8 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalVAEMixin):
|
||||
# Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
|
||||
def fuse_qkv_projections(self):
|
||||
"""
|
||||
Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
|
||||
key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
|
||||
Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
|
||||
are fused. For cross-attention modules, key and value projection matrices are fused.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
|
||||
@@ -329,15 +329,15 @@ class FlaxControlNetModel(nn.Module, FlaxModelMixin, ConfigMixin):
|
||||
controlnet_cond (`jnp.ndarray`): (batch, channel, height, width) the conditional input tensor
|
||||
conditioning_scale (`float`, *optional*, defaults to `1.0`): the scale factor for controlnet outputs
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to return a [`models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] instead of a
|
||||
plain tuple.
|
||||
Whether or not to return a [`models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] instead of
|
||||
a plain tuple.
|
||||
train (`bool`, *optional*, defaults to `False`):
|
||||
Use deterministic functions and disable dropout when not training.
|
||||
|
||||
Returns:
|
||||
[`~models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] or `tuple`:
|
||||
[`~models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] if `return_dict` is True, otherwise a
|
||||
`tuple`. When returning a tuple, the first element is the sample tensor.
|
||||
[`~models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] if `return_dict` is True, otherwise
|
||||
a `tuple`. When returning a tuple, the first element is the sample tensor.
|
||||
"""
|
||||
channel_order = self.controlnet_conditioning_channel_order
|
||||
if channel_order == "bgr":
|
||||
|
||||
@@ -795,16 +795,13 @@ class IPAdapterPlusImageProjection(nn.Module):
|
||||
|
||||
Args:
|
||||
----
|
||||
embed_dims (int): The feature dimension. Defaults to 768.
|
||||
output_dims (int): The number of output channels, that is the same
|
||||
number of the channels in the
|
||||
`unet.config.cross_attention_dim`. Defaults to 1024.
|
||||
hidden_dims (int): The number of hidden channels. Defaults to 1280.
|
||||
depth (int): The number of blocks. Defaults to 8.
|
||||
dim_head (int): The number of head channels. Defaults to 64.
|
||||
heads (int): Parallel attention heads. Defaults to 16.
|
||||
num_queries (int): The number of queries. Defaults to 8.
|
||||
ffn_ratio (float): The expansion ratio of feedforward network hidden
|
||||
embed_dims (int): The feature dimension. Defaults to 768. output_dims (int): The number of output channels,
|
||||
that is the same
|
||||
number of the channels in the `unet.config.cross_attention_dim`. Defaults to 1024.
|
||||
hidden_dims (int): The number of hidden channels. Defaults to 1280. depth (int): The number of blocks. Defaults
|
||||
to 8. dim_head (int): The number of head channels. Defaults to 64. heads (int): Parallel attention heads.
|
||||
Defaults to 16. num_queries (int): The number of queries. Defaults to 8. ffn_ratio (float): The expansion ratio
|
||||
of feedforward network hidden
|
||||
layer channels. Defaults to 4.
|
||||
"""
|
||||
|
||||
|
||||
@@ -202,8 +202,8 @@ class ResnetBlock2D(nn.Module):
|
||||
eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
|
||||
non_linearity (`str`, *optional*, default to `"swish"`): the activation function to use.
|
||||
time_embedding_norm (`str`, *optional*, default to `"default"` ): Time scale shift config.
|
||||
By default, apply timestep embedding conditioning with a simple shift mechanism. Choose "scale_shift"
|
||||
for a stronger conditioning with scale and shift.
|
||||
By default, apply timestep embedding conditioning with a simple shift mechanism. Choose "scale_shift" for a
|
||||
stronger conditioning with scale and shift.
|
||||
kernel (`torch.FloatTensor`, optional, default to None): FIR filter, see
|
||||
[`~models.resnet.FirUpsample2D`] and [`~models.resnet.FirDownsample2D`].
|
||||
output_scale_factor (`float`, *optional*, default to be `1.0`): the scale factor to use for the output.
|
||||
|
||||
@@ -120,7 +120,8 @@ class DualTransformer2DModel(nn.Module):
|
||||
`self.processor` in
|
||||
[diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to return a [`models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
|
||||
Whether or not to return a [`models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
|
||||
tuple.
|
||||
|
||||
Returns:
|
||||
[`~models.transformer_2d.Transformer2DModelOutput`] or `tuple`:
|
||||
|
||||
@@ -294,8 +294,8 @@ class TransformerSpatioTemporalModel(nn.Module):
|
||||
A tensor indicating whether the input contains only images. 1 indicates that the input contains only
|
||||
images, 0 indicates that the input contains video frames.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to return a [`~models.transformer_temporal.TransformerTemporalModelOutput`] instead of a plain
|
||||
tuple.
|
||||
Whether or not to return a [`~models.transformer_temporal.TransformerTemporalModelOutput`] instead of a
|
||||
plain tuple.
|
||||
|
||||
Returns:
|
||||
[`~models.transformer_temporal.TransformerTemporalModelOutput`] or `tuple`:
|
||||
|
||||
@@ -865,8 +865,8 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin,
|
||||
|
||||
def fuse_qkv_projections(self):
|
||||
"""
|
||||
Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
|
||||
key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
|
||||
Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
|
||||
are fused. For cross-attention modules, key and value projection matrices are fused.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
@@ -1093,8 +1093,8 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin,
|
||||
|
||||
Returns:
|
||||
[`~models.unets.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
|
||||
If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise
|
||||
a `tuple` is returned where the first element is the sample tensor.
|
||||
If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned,
|
||||
otherwise a `tuple` is returned where the first element is the sample tensor.
|
||||
"""
|
||||
# By default samples have to be AT least a multiple of the overall upsampling factor.
|
||||
# The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
|
||||
|
||||
@@ -76,7 +76,8 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
|
||||
up_block_types (`Tuple[str]`, *optional*, defaults to `("FlaxUpBlock2D", "FlaxCrossAttnUpBlock2D", "FlaxCrossAttnUpBlock2D", "FlaxCrossAttnUpBlock2D")`):
|
||||
The tuple of upsample blocks to use.
|
||||
mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`):
|
||||
Block type for middle of UNet, it can be one of `UNetMidBlock2DCrossAttn`. If `None`, the mid block layer is skipped.
|
||||
Block type for middle of UNet, it can be one of `UNetMidBlock2DCrossAttn`. If `None`, the mid block layer
|
||||
is skipped.
|
||||
block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
|
||||
The tuple of output channels for each block.
|
||||
layers_per_block (`int`, *optional*, defaults to 2):
|
||||
@@ -350,15 +351,15 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
|
||||
mid_block_additional_residual: (`torch.Tensor`, *optional*):
|
||||
A tensor that if specified is added to the residual of the middle unet block.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to return a [`models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] instead of a
|
||||
plain tuple.
|
||||
Whether or not to return a [`models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] instead of
|
||||
a plain tuple.
|
||||
train (`bool`, *optional*, defaults to `False`):
|
||||
Use deterministic functions and disable dropout when not training.
|
||||
|
||||
Returns:
|
||||
[`~models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] or `tuple`:
|
||||
[`~models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`.
|
||||
When returning a tuple, the first element is the sample tensor.
|
||||
[`~models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] if `return_dict` is True, otherwise a
|
||||
`tuple`. When returning a tuple, the first element is the sample tensor.
|
||||
"""
|
||||
# 1. time
|
||||
if not isinstance(timesteps, jnp.ndarray):
|
||||
|
||||
@@ -511,8 +511,8 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
|
||||
# Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
|
||||
def fuse_qkv_projections(self):
|
||||
"""
|
||||
Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
|
||||
key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
|
||||
Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
|
||||
are fused. For cross-attention modules, key and value projection matrices are fused.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
|
||||
@@ -99,8 +99,8 @@ class I2VGenXLTransformerTemporalEncoder(nn.Module):
|
||||
|
||||
class I2VGenXLUNet(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
|
||||
r"""
|
||||
I2VGenXL UNet. It is a conditional 3D UNet model that takes a noisy sample, conditional state, and a timestep
|
||||
and returns a sample-shaped output.
|
||||
I2VGenXL UNet. It is a conditional 3D UNet model that takes a noisy sample, conditional state, and a timestep and
|
||||
returns a sample-shaped output.
|
||||
|
||||
This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
|
||||
for all models (such as downloading or saving).
|
||||
@@ -477,8 +477,8 @@ class I2VGenXLUNet(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
|
||||
# Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
|
||||
def fuse_qkv_projections(self):
|
||||
"""
|
||||
Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
|
||||
key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
|
||||
Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
|
||||
are fused. For cross-attention modules, key and value projection matrices are fused.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
@@ -533,7 +533,8 @@ class I2VGenXLUNet(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
|
||||
timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
|
||||
fps (`torch.Tensor`): Frames per second for the video being generated. Used as a "micro-condition".
|
||||
image_latents (`torch.FloatTensor`): Image encodings from the VAE.
|
||||
image_embeddings (`torch.FloatTensor`): Projection embeddings of the conditioning image computed with a vision encoder.
|
||||
image_embeddings (`torch.FloatTensor`):
|
||||
Projection embeddings of the conditioning image computed with a vision encoder.
|
||||
encoder_hidden_states (`torch.FloatTensor`):
|
||||
The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
|
||||
cross_attention_kwargs (`dict`, *optional*):
|
||||
|
||||
@@ -709,8 +709,8 @@ class UNetMotionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
|
||||
# Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
|
||||
def fuse_qkv_projections(self):
|
||||
"""
|
||||
Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
|
||||
key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
|
||||
Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
|
||||
are fused. For cross-attention modules, key and value projection matrices are fused.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
|
||||
@@ -31,8 +31,8 @@ class UNetSpatioTemporalConditionOutput(BaseOutput):
|
||||
|
||||
class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
|
||||
r"""
|
||||
A conditional Spatio-Temporal UNet model that takes a noisy video frames, conditional state, and a timestep and returns a sample
|
||||
shaped output.
|
||||
A conditional Spatio-Temporal UNet model that takes noisy video frames, conditional state, and a timestep and
|
||||
returns a sample shaped output.
|
||||
|
||||
This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
|
||||
for all models (such as downloading or saving).
|
||||
@@ -57,7 +57,8 @@ class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionL
|
||||
The dimension of the cross attention features.
|
||||
transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1):
|
||||
The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
|
||||
[`~models.unet_3d_blocks.CrossAttnDownBlockSpatioTemporal`], [`~models.unet_3d_blocks.CrossAttnUpBlockSpatioTemporal`],
|
||||
[`~models.unet_3d_blocks.CrossAttnDownBlockSpatioTemporal`],
|
||||
[`~models.unet_3d_blocks.CrossAttnUpBlockSpatioTemporal`],
|
||||
[`~models.unet_3d_blocks.UNetMidBlockSpatioTemporal`].
|
||||
num_attention_heads (`int`, `Tuple[int]`, defaults to `(5, 10, 10, 20)`):
|
||||
The number of attention heads.
|
||||
@@ -374,12 +375,12 @@ class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionL
|
||||
The additional time ids with shape `(batch, num_additional_ids)`. These are encoded with sinusoidal
|
||||
embeddings and added to the time embeddings.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to return a [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] instead of a plain
|
||||
tuple.
|
||||
Whether or not to return a [`~models.unet_spatio_temporal.UNetSpatioTemporalConditionOutput`] instead
|
||||
of a plain tuple.
|
||||
Returns:
|
||||
[`~models.unet_spatio_temporal.UNetSpatioTemporalConditionOutput`] or `tuple`:
|
||||
If `return_dict` is True, an [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] is returned, otherwise
|
||||
a `tuple` is returned where the first element is the sample tensor.
|
||||
If `return_dict` is True, an [`~models.unet_spatio_temporal.UNetSpatioTemporalConditionOutput`] is
|
||||
returned, otherwise a `tuple` is returned where the first element is the sample tensor.
|
||||
"""
|
||||
# 1. time
|
||||
timesteps = timestep
|
||||
|
||||
@@ -186,7 +186,8 @@ class StableCascadeUNet(ModelMixin, ConfigMixin, FromOriginalUNetMixin):
|
||||
block_out_channels (Tuple[int], defaults to (2048, 2048)):
|
||||
Tuple of output channels for each block.
|
||||
num_attention_heads (Tuple[int], defaults to (32, 32)):
|
||||
Number of attention heads in each attention block. Set to -1 to if block types in a layer do not have attention.
|
||||
Number of attention heads in each attention block. Set to -1 if block types in a layer do not have
|
||||
attention.
|
||||
down_num_layers_per_block (Tuple[int], defaults to [8, 24]):
|
||||
Number of layers in each down block.
|
||||
up_num_layers_per_block (Tuple[int], defaults to [24, 8]):
|
||||
@@ -197,10 +198,9 @@ class StableCascadeUNet(ModelMixin, ConfigMixin, FromOriginalUNetMixin):
|
||||
Number of 1x1 Convolutional layers to repeat in each up block.
|
||||
block_types_per_layer (Tuple[Tuple[str]], optional,
|
||||
defaults to (
|
||||
("SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"),
|
||||
("SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock")
|
||||
):
|
||||
Block types used in each layer of the up/down blocks.
|
||||
("SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"), ("SDCascadeResBlock",
|
||||
"SDCascadeTimestepBlock", "SDCascadeAttnBlock")
|
||||
): Block types used in each layer of the up/down blocks.
|
||||
clip_text_in_channels (`int`, *optional*, defaults to `None`):
|
||||
Number of input channels for CLIP based text conditioning.
|
||||
clip_text_pooled_in_channels (`int`, *optional*, defaults to 1280):
|
||||
|
||||
@@ -30,9 +30,7 @@ EXAMPLE_DOC_STRING = """
|
||||
>>> import torch
|
||||
>>> from diffusers import AmusedPipeline
|
||||
|
||||
>>> pipe = AmusedPipeline.from_pretrained(
|
||||
... "amused/amused-512", variant="fp16", torch_dtype=torch.float16
|
||||
... )
|
||||
>>> pipe = AmusedPipeline.from_pretrained("amused/amused-512", variant="fp16", torch_dtype=torch.float16)
|
||||
>>> pipe = pipe.to("cuda")
|
||||
|
||||
>>> prompt = "a photo of an astronaut riding a horse on mars"
|
||||
@@ -150,10 +148,12 @@ class AmusedPipeline(DiffusionPipeline):
|
||||
A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
|
||||
[`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
|
||||
micro_conditioning_aesthetic_score (`int`, *optional*, defaults to 6):
|
||||
The targeted aesthetic score according to the laion aesthetic classifier. See https://laion.ai/blog/laion-aesthetics/
|
||||
and the micro-conditioning section of https://arxiv.org/abs/2307.01952.
|
||||
The targeted aesthetic score according to the laion aesthetic classifier. See
|
||||
https://laion.ai/blog/laion-aesthetics/ and the micro-conditioning section of
|
||||
https://arxiv.org/abs/2307.01952.
|
||||
micro_conditioning_crop_coord (`Tuple[int]`, *optional*, defaults to (0, 0)):
|
||||
The targeted height, width crop coordinates. See the micro-conditioning section of https://arxiv.org/abs/2307.01952.
|
||||
The targeted height, width crop coordinates. See the micro-conditioning section of
|
||||
https://arxiv.org/abs/2307.01952.
|
||||
temperature (`Union[int, Tuple[int, int], List[int]]`, *optional*, defaults to (2, 0)):
|
||||
Configures the temperature scheduler on `self.scheduler` see `AmusedScheduler#set_timesteps`.
|
||||
|
||||
|
||||
@@ -167,10 +167,12 @@ class AmusedImg2ImgPipeline(DiffusionPipeline):
|
||||
A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
|
||||
[`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
|
||||
micro_conditioning_aesthetic_score (`int`, *optional*, defaults to 6):
|
||||
The targeted aesthetic score according to the laion aesthetic classifier. See https://laion.ai/blog/laion-aesthetics/
|
||||
and the micro-conditioning section of https://arxiv.org/abs/2307.01952.
|
||||
The targeted aesthetic score according to the laion aesthetic classifier. See
|
||||
https://laion.ai/blog/laion-aesthetics/ and the micro-conditioning section of
|
||||
https://arxiv.org/abs/2307.01952.
|
||||
micro_conditioning_crop_coord (`Tuple[int]`, *optional*, defaults to (0, 0)):
|
||||
The targeted height, width crop coordinates. See the micro-conditioning section of https://arxiv.org/abs/2307.01952.
|
||||
The targeted height, width crop coordinates. See the micro-conditioning section of
|
||||
https://arxiv.org/abs/2307.01952.
|
||||
temperature (`Union[int, Tuple[int, int], List[int]]`, *optional*, defaults to (2, 0)):
|
||||
Configures the temperature scheduler on `self.scheduler` see `AmusedScheduler#set_timesteps`.
|
||||
|
||||
|
||||
@@ -191,10 +191,12 @@ class AmusedInpaintPipeline(DiffusionPipeline):
|
||||
A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
|
||||
[`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
|
||||
micro_conditioning_aesthetic_score (`int`, *optional*, defaults to 6):
|
||||
The targeted aesthetic score according to the laion aesthetic classifier. See https://laion.ai/blog/laion-aesthetics/
|
||||
and the micro-conditioning section of https://arxiv.org/abs/2307.01952.
|
||||
The targeted aesthetic score according to the laion aesthetic classifier. See
|
||||
https://laion.ai/blog/laion-aesthetics/ and the micro-conditioning section of
|
||||
https://arxiv.org/abs/2307.01952.
|
||||
micro_conditioning_crop_coord (`Tuple[int]`, *optional*, defaults to (0, 0)):
|
||||
The targeted height, width crop coordinates. See the micro-conditioning section of https://arxiv.org/abs/2307.01952.
|
||||
The targeted height, width crop coordinates. See the micro-conditioning section of
|
||||
https://arxiv.org/abs/2307.01952.
|
||||
temperature (`Union[int, Tuple[int, int], List[int]]`, *optional*, defaults to (2, 0)):
|
||||
Configures the temperature scheduler on `self.scheduler` see `AmusedScheduler#set_timesteps`.
|
||||
|
||||
|
||||
@@ -639,10 +639,10 @@ class AnimateDiffPipeline(
|
||||
ip_adapter_image (`PipelineImageInput`, *optional*):
|
||||
Optional image input to work with IP Adapters.
|
||||
ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
|
||||
Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
|
||||
if `do_classifier_free_guidance` is set to `True`.
|
||||
If not provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
|
||||
IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
|
||||
contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
|
||||
provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or
|
||||
`np.array`.
|
||||
|
||||
@@ -52,14 +52,21 @@ EXAMPLE_DOC_STRING = """
|
||||
>>> from io import BytesIO
|
||||
>>> from PIL import Image
|
||||
|
||||
>>> adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16)
|
||||
>>> pipe = AnimateDiffVideoToVideoPipeline.from_pretrained("SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter).to("cuda")
|
||||
>>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False, timespace_spacing="linspace")
|
||||
>>> adapter = MotionAdapter.from_pretrained(
|
||||
... "guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16
|
||||
... )
|
||||
>>> pipe = AnimateDiffVideoToVideoPipeline.from_pretrained(
|
||||
... "SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter
|
||||
... ).to("cuda")
|
||||
>>> pipe.scheduler = DDIMScheduler(
|
||||
... beta_schedule="linear", steps_offset=1, clip_sample=False, timestep_spacing="linspace"
|
||||
... )
|
||||
|
||||
|
||||
>>> def load_video(file_path: str):
|
||||
... images = []
|
||||
...
|
||||
... if file_path.startswith(('http://', 'https://')):
|
||||
|
||||
... if file_path.startswith(("http://", "https://")):
|
||||
... # If the file_path is a URL
|
||||
... response = requests.get(file_path)
|
||||
... response.raise_for_status()
|
||||
@@ -68,15 +75,20 @@ EXAMPLE_DOC_STRING = """
|
||||
... else:
|
||||
... # Assuming it's a local file path
|
||||
... vid = imageio.get_reader(file_path)
|
||||
...
|
||||
|
||||
... for frame in vid:
|
||||
... pil_image = Image.fromarray(frame)
|
||||
... images.append(pil_image)
|
||||
...
|
||||
|
||||
... return images
|
||||
|
||||
>>> video = load_video("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-vid2vid-input-1.gif")
|
||||
>>> output = pipe(video=video, prompt="panda playing a guitar, on a boat, in the ocean, high quality", strength=0.5)
|
||||
|
||||
>>> video = load_video(
|
||||
... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-vid2vid-input-1.gif"
|
||||
... )
|
||||
>>> output = pipe(
|
||||
... video=video, prompt="panda playing a guitar, on a boat, in the ocean, high quality", strength=0.5
|
||||
... )
|
||||
>>> frames = output.frames[0]
|
||||
>>> export_to_gif(frames, "animation.gif")
|
||||
```
|
||||
@@ -135,8 +147,8 @@ def retrieve_timesteps(
|
||||
scheduler (`SchedulerMixin`):
|
||||
The scheduler to get timesteps from.
|
||||
num_inference_steps (`int`):
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used,
|
||||
`timesteps` must be `None`.
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
|
||||
must be `None`.
|
||||
device (`str` or `torch.device`, *optional*):
|
||||
The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
|
||||
timesteps (`List[int]`, *optional*):
|
||||
@@ -799,16 +811,15 @@ class AnimateDiffVideoToVideoPipeline(
|
||||
ip_adapter_image (`PipelineImageInput`, *optional*):
|
||||
Optional image input to work with IP Adapters.
|
||||
ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
|
||||
Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
|
||||
if `do_classifier_free_guidance` is set to `True`.
|
||||
If not provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
|
||||
IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
|
||||
contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
|
||||
provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or
|
||||
`np.array`.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to return a [`AnimateDiffPipelineOutput`] instead
|
||||
of a plain tuple.
|
||||
Whether or not to return a [`AnimateDiffPipelineOutput`] instead of a plain tuple.
|
||||
cross_attention_kwargs (`dict`, *optional*):
|
||||
A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
|
||||
[`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
|
||||
|
||||
@@ -15,7 +15,8 @@ class AnimateDiffPipelineOutput(BaseOutput):
|
||||
|
||||
Args:
|
||||
frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
|
||||
List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing denoised
|
||||
List of video outputs - It can be a nested list of length `batch_size`, with each sub-list containing
|
||||
denoised
|
||||
PIL image sequences of length `num_frames`. It can also be a NumPy array or Torch tensor of shape
|
||||
`(batch_size, num_frames, channels, height, width)`
|
||||
"""
|
||||
|
||||
@@ -701,8 +701,8 @@ class AudioLDM2UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoad
|
||||
|
||||
Returns:
|
||||
[`~models.unets.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
|
||||
If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise
|
||||
a `tuple` is returned where the first element is the sample tensor.
|
||||
If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned,
|
||||
otherwise a `tuple` is returned where the first element is the sample tensor.
|
||||
"""
|
||||
# By default samples have to be AT least a multiple of the overall upsampling factor.
|
||||
# The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
|
||||
|
||||
@@ -107,8 +107,8 @@ def retrieve_timesteps(
|
||||
scheduler (`SchedulerMixin`):
|
||||
The scheduler to get timesteps from.
|
||||
num_inference_steps (`int`):
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used,
|
||||
`timesteps` must be `None`.
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
|
||||
must be `None`.
|
||||
device (`str` or `torch.device`, *optional*):
|
||||
The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
|
||||
timesteps (`List[int]`, *optional*):
|
||||
@@ -922,9 +922,9 @@ class StableDiffusionControlNetPipeline(
|
||||
accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height
|
||||
and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in
|
||||
`init`, images must be passed as a list such that each element of the list can be correctly batched for
|
||||
input to a single ControlNet. When `prompt` is a list, and if a list of images is passed for a single ControlNet,
|
||||
each will be paired with each prompt in the `prompt` list. This also applies to multiple ControlNets,
|
||||
where a list of image lists can be passed to batch for each prompt and each ControlNet.
|
||||
input to a single ControlNet. When `prompt` is a list, and if a list of images is passed for a single
|
||||
ControlNet, each will be paired with each prompt in the `prompt` list. This also applies to multiple
|
||||
ControlNets, where a list of image lists can be passed to batch for each prompt and each ControlNet.
|
||||
height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
|
||||
The height in pixels of the generated image.
|
||||
width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
|
||||
@@ -962,10 +962,10 @@ class StableDiffusionControlNetPipeline(
|
||||
not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
|
||||
ip_adapter_image (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
|
||||
ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
|
||||
Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
|
||||
if `do_classifier_free_guidance` is set to `True`.
|
||||
If not provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
|
||||
IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
|
||||
contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
|
||||
provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
|
||||
@@ -978,10 +978,10 @@ class StableDiffusionControlNetImg2ImgPipeline(
|
||||
not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
|
||||
ip_adapter_image (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
|
||||
ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
|
||||
Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
|
||||
if `do_classifier_free_guidance` is set to `True`.
|
||||
If not provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
|
||||
IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
|
||||
contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
|
||||
provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
|
||||
@@ -1167,11 +1167,12 @@ class StableDiffusionControlNetInpaintPipeline(
|
||||
width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
|
||||
The width in pixels of the generated image.
|
||||
padding_mask_crop (`int`, *optional*, defaults to `None`):
|
||||
The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to image and mask_image. If
|
||||
`padding_mask_crop` is not `None`, it will first find a rectangular region with the same aspect ration of the image and
|
||||
contains all masked area, and then expand that area based on `padding_mask_crop`. The image and mask_image will then be cropped based on
|
||||
the expanded area before resizing to the original image size for inpainting. This is useful when the masked area is small while the image is large
|
||||
and contain information irrelevant for inpainting, such as background.
|
||||
The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to
|
||||
image and mask_image. If `padding_mask_crop` is not `None`, it will first find a rectangular region
|
||||
with the same aspect ratio of the image and contains all masked area, and then expand that area based
|
||||
on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area before
|
||||
resizing to the original image size for inpainting. This is useful when the masked area is small while
|
||||
the image is large and contains information irrelevant for inpainting, such as background.
|
||||
strength (`float`, *optional*, defaults to 1.0):
|
||||
Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
|
||||
starting point and more noise is added the higher the `strength`. The number of denoising steps depends
|
||||
@@ -1207,10 +1208,10 @@ class StableDiffusionControlNetInpaintPipeline(
|
||||
not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
|
||||
ip_adapter_image (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
|
||||
ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
|
||||
Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
|
||||
if `do_classifier_free_guidance` is set to `True`.
|
||||
If not provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
|
||||
IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
|
||||
contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
|
||||
provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
|
||||
@@ -1194,11 +1194,12 @@ class StableDiffusionXLControlNetInpaintPipeline(
|
||||
width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
|
||||
The width in pixels of the generated image.
|
||||
padding_mask_crop (`int`, *optional*, defaults to `None`):
|
||||
The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to image and mask_image. If
|
||||
`padding_mask_crop` is not `None`, it will first find a rectangular region with the same aspect ration of the image and
|
||||
contains all masked area, and then expand that area based on `padding_mask_crop`. The image and mask_image will then be cropped based on
|
||||
the expanded area before resizing to the original image size for inpainting. This is useful when the masked area is small while the image is large
|
||||
and contain information irrelevant for inpainting, such as background.
|
||||
The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to
|
||||
image and mask_image. If `padding_mask_crop` is not `None`, it will first find a rectangular region
|
||||
with the same aspect ratio of the image and contains all masked area, and then expand that area based
|
||||
on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area before
|
||||
resizing to the original image size for inpainting. This is useful when the masked area is small while
|
||||
the image is large and contains information irrelevant for inpainting, such as background.
|
||||
strength (`float`, *optional*, defaults to 0.9999):
|
||||
Conceptually, indicates how much to transform the masked portion of the reference `image`. Must be
|
||||
between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the
|
||||
@@ -1247,10 +1248,10 @@ class StableDiffusionXLControlNetInpaintPipeline(
|
||||
argument.
|
||||
ip_adapter_image (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
|
||||
ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
|
||||
Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
|
||||
if `do_classifier_free_guidance` is set to `True`.
|
||||
If not provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
|
||||
IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
|
||||
contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
|
||||
provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
|
||||
If not provided, pooled text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -1039,10 +1039,10 @@ class StableDiffusionXLControlNetPipeline(
|
||||
argument.
|
||||
ip_adapter_image (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
|
||||
ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
|
||||
Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
|
||||
if `do_classifier_free_guidance` is set to `True`.
|
||||
If not provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
|
||||
IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
|
||||
contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
|
||||
provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
|
||||
@@ -1178,10 +1178,10 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
|
||||
input argument.
|
||||
ip_adapter_image (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
|
||||
ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
|
||||
Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
|
||||
if `do_classifier_free_guidance` is set to `True`.
|
||||
If not provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
|
||||
IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
|
||||
contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
|
||||
provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generated image. Choose between
|
||||
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
|
||||
|
||||
@@ -89,8 +89,8 @@ def retrieve_timesteps(
|
||||
scheduler (`SchedulerMixin`):
|
||||
The scheduler to get timesteps from.
|
||||
num_inference_steps (`int`):
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used,
|
||||
`timesteps` must be `None`.
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
|
||||
must be `None`.
|
||||
device (`str` or `torch.device`, *optional*):
|
||||
The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
|
||||
timesteps (`List[int]`, *optional*):
|
||||
|
||||
@@ -129,8 +129,8 @@ def retrieve_timesteps(
|
||||
scheduler (`SchedulerMixin`):
|
||||
The scheduler to get timesteps from.
|
||||
num_inference_steps (`int`):
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used,
|
||||
`timesteps` must be `None`.
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
|
||||
must be `None`.
|
||||
device (`str` or `torch.device`, *optional*):
|
||||
The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
|
||||
timesteps (`List[int]`, *optional*):
|
||||
|
||||
@@ -1000,8 +1000,8 @@ class UNetFlatConditionModel(ModelMixin, ConfigMixin):
|
||||
|
||||
def fuse_qkv_projections(self):
|
||||
"""
|
||||
Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
|
||||
key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
|
||||
Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
|
||||
are fused. For cross-attention modules, key and value projection matrices are fused.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
@@ -1112,8 +1112,8 @@ class UNetFlatConditionModel(ModelMixin, ConfigMixin):
|
||||
|
||||
Returns:
|
||||
[`~models.unets.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
|
||||
If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise
|
||||
a `tuple` is returned where the first element is the sample tensor.
|
||||
If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned,
|
||||
otherwise a `tuple` is returned where the first element is the sample tensor.
|
||||
"""
|
||||
# By default samples have to be AT least a multiple of the overall upsampling factor.
|
||||
# The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
|
||||
|
||||
@@ -41,20 +41,20 @@ class FreeInitMixin:
|
||||
num_iters (`int`, *optional*, defaults to `3`):
|
||||
Number of FreeInit noise re-initialization iterations.
|
||||
use_fast_sampling (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not to speedup sampling procedure at the cost of probably lower quality results. Enables
|
||||
the "Coarse-to-Fine Sampling" strategy, as mentioned in the paper, if set to `True`.
|
||||
Whether or not to speedup sampling procedure at the cost of probably lower quality results. Enables the
|
||||
"Coarse-to-Fine Sampling" strategy, as mentioned in the paper, if set to `True`.
|
||||
method (`str`, *optional*, defaults to `butterworth`):
|
||||
Must be one of `butterworth`, `ideal` or `gaussian` to use as the filtering method for the
|
||||
FreeInit low pass filter.
|
||||
Must be one of `butterworth`, `ideal` or `gaussian` to use as the filtering method for the FreeInit low
|
||||
pass filter.
|
||||
order (`int`, *optional*, defaults to `4`):
|
||||
Order of the filter used in `butterworth` method. Larger values lead to `ideal` method behaviour
|
||||
whereas lower values lead to `gaussian` method behaviour.
|
||||
spatial_stop_frequency (`float`, *optional*, defaults to `0.25`):
|
||||
Normalized stop frequency for spatial dimensions. Must be between 0 to 1. Referred to as `d_s` in
|
||||
the original implementation.
|
||||
Normalized stop frequency for spatial dimensions. Must be between 0 to 1. Referred to as `d_s` in the
|
||||
original implementation.
|
||||
temporal_stop_frequency (`float`, *optional*, defaults to `0.25`):
|
||||
Normalized stop frequency for temporal dimensions. Must be between 0 to 1. Referred to as `d_t` in
|
||||
the original implementation.
|
||||
Normalized stop frequency for temporal dimensions. Must be between 0 to 1. Referred to as `d_t` in the
|
||||
original implementation.
|
||||
"""
|
||||
self._free_init_num_iters = num_iters
|
||||
self._free_init_use_fast_sampling = use_fast_sampling
|
||||
|
||||
@@ -43,10 +43,14 @@ EXAMPLE_DOC_STRING = """
|
||||
>>> from diffusers import I2VGenXLPipeline
|
||||
>>> from diffusers.utils import export_to_gif, load_image
|
||||
|
||||
>>> pipeline = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16")
|
||||
>>> pipeline = I2VGenXLPipeline.from_pretrained(
|
||||
... "ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16"
|
||||
... )
|
||||
>>> pipeline.enable_model_cpu_offload()
|
||||
|
||||
>>> image_url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png"
|
||||
>>> image_url = (
|
||||
... "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png"
|
||||
... )
|
||||
>>> image = load_image(image_url).convert("RGB")
|
||||
|
||||
>>> prompt = "Papers were floating in the air on a table in the library"
|
||||
@@ -59,7 +63,7 @@ EXAMPLE_DOC_STRING = """
|
||||
... num_inference_steps=50,
|
||||
... negative_prompt=negative_prompt,
|
||||
... guidance_scale=9.0,
|
||||
... generator=generator
|
||||
... generator=generator,
|
||||
... ).frames[0]
|
||||
>>> video_path = export_to_gif(frames, "i2v.gif")
|
||||
```
|
||||
@@ -95,7 +99,8 @@ class I2VGenXLPipelineOutput(BaseOutput):
|
||||
|
||||
Args:
|
||||
frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
|
||||
List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing denoised
|
||||
List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
|
||||
denoised
|
||||
PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
|
||||
`(batch_size, num_frames, channels, height, width)`
|
||||
"""
|
||||
@@ -551,7 +556,8 @@ class I2VGenXLPipeline(
|
||||
width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
|
||||
The width in pixels of the generated image.
|
||||
target_fps (`int`, *optional*):
|
||||
Frames per second. The rate at which the generated images shall be exported to a video after generation. This is also used as a "micro-condition" while generation.
|
||||
Frames per second. The rate at which the generated images shall be exported to a video after
|
||||
generation. This is also used as a "micro-condition" while generation.
|
||||
num_frames (`int`, *optional*):
|
||||
The number of video frames to generate.
|
||||
num_inference_steps (`int`, *optional*):
|
||||
@@ -568,9 +574,9 @@ class I2VGenXLPipeline(
|
||||
num_videos_per_prompt (`int`, *optional*):
|
||||
The number of images to generate per prompt.
|
||||
decode_chunk_size (`int`, *optional*):
|
||||
The number of frames to decode at a time. The higher the chunk size, the higher the temporal consistency
|
||||
between frames, but also the higher the memory consumption. By default, the decoder will decode all frames at once
|
||||
for maximal quality. Reduce `decode_chunk_size` to reduce memory usage.
|
||||
The number of frames to decode at a time. The higher the chunk size, the higher the temporal
|
||||
consistency between frames, but also the higher the memory consumption. By default, the decoder will
|
||||
decode all frames at once for maximal quality. Reduce `decode_chunk_size` to reduce memory usage.
|
||||
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
||||
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
|
||||
generation deterministic.
|
||||
|
||||
@@ -35,10 +35,10 @@ DYNAMIC_MAP = {
|
||||
|
||||
def convert_state_dict(unet_state_dict):
|
||||
"""
|
||||
Convert the state dict of a U-Net model to match the key format expected by Kandinsky3UNet model.
|
||||
Args:
|
||||
unet_model (torch.nn.Module): The original U-Net model.
|
||||
unet_kandi3_model (torch.nn.Module): The Kandinsky3UNet model to match keys with.
|
||||
Convert the state dict of a U-Net model to match the key format expected by Kandinsky3UNet model.
|
||||
unet_model (torch.nn.Module): The original U-Net model. unet_kandi3_model (torch.nn.Module): The Kandinsky3UNet
|
||||
model to match keys with.
|
||||
|
||||
Returns:
|
||||
OrderedDict: The converted state dictionary.
|
||||
|
||||
@@ -24,7 +24,9 @@ EXAMPLE_DOC_STRING = """
|
||||
>>> from diffusers import AutoPipelineForText2Image
|
||||
>>> import torch
|
||||
|
||||
>>> pipe = AutoPipelineForText2Image.from_pretrained("kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16)
|
||||
>>> pipe = AutoPipelineForText2Image.from_pretrained(
|
||||
... "kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16
|
||||
... )
|
||||
>>> pipe.enable_model_cpu_offload()
|
||||
|
||||
>>> prompt = "A photograph of the inside of a subway train. There are raccoons sitting on the seats. One of them is reading a newspaper. The window shows the city in the background."
|
||||
|
||||
@@ -29,11 +29,15 @@ EXAMPLE_DOC_STRING = """
|
||||
>>> from diffusers.utils import load_image
|
||||
>>> import torch
|
||||
|
||||
>>> pipe = AutoPipelineForImage2Image.from_pretrained("kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16)
|
||||
>>> pipe = AutoPipelineForImage2Image.from_pretrained(
|
||||
... "kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16
|
||||
... )
|
||||
>>> pipe.enable_model_cpu_offload()
|
||||
|
||||
>>> prompt = "A painting of the inside of a subway train with tiny raccoons."
|
||||
>>> image = load_image("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky3/t2i.png")
|
||||
>>> image = load_image(
|
||||
... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky3/t2i.png"
|
||||
... )
|
||||
|
||||
>>> generator = torch.Generator(device="cpu").manual_seed(0)
|
||||
>>> image = pipe(prompt, image=image, strength=0.75, num_inference_steps=25, generator=generator).images[0]
|
||||
|
||||
+6
-6
@@ -73,8 +73,8 @@ def retrieve_timesteps(
|
||||
scheduler (`SchedulerMixin`):
|
||||
The scheduler to get timesteps from.
|
||||
num_inference_steps (`int`):
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used,
|
||||
`timesteps` must be `None`.
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
|
||||
must be `None`.
|
||||
device (`str` or `torch.device`, *optional*):
|
||||
The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
|
||||
timesteps (`List[int]`, *optional*):
|
||||
@@ -749,10 +749,10 @@ class LatentConsistencyModelImg2ImgPipeline(
|
||||
ip_adapter_image: (`PipelineImageInput`, *optional*):
|
||||
Optional image input to work with IP Adapters.
|
||||
ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
|
||||
Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
|
||||
if `do_classifier_free_guidance` is set to `True`.
|
||||
If not provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
|
||||
IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
|
||||
contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
|
||||
provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
|
||||
+6
-6
@@ -77,8 +77,8 @@ def retrieve_timesteps(
|
||||
scheduler (`SchedulerMixin`):
|
||||
The scheduler to get timesteps from.
|
||||
num_inference_steps (`int`):
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used,
|
||||
`timesteps` must be `None`.
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
|
||||
must be `None`.
|
||||
device (`str` or `torch.device`, *optional*):
|
||||
The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
|
||||
timesteps (`List[int]`, *optional*):
|
||||
@@ -681,10 +681,10 @@ class LatentConsistencyModelPipeline(
|
||||
ip_adapter_image: (`PipelineImageInput`, *optional*):
|
||||
Optional image input to work with IP Adapters.
|
||||
ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
|
||||
Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
|
||||
if `do_classifier_free_guidance` is set to `True`.
|
||||
If not provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
|
||||
IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
|
||||
contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
|
||||
provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
|
||||
@@ -40,30 +40,21 @@ EXAMPLE_DOC_STRING = """
|
||||
>>> from io import BytesIO
|
||||
|
||||
>>> from diffusers import LEditsPPPipelineStableDiffusion
|
||||
>>> from diffusers.utils import load_image
|
||||
|
||||
>>> pipe = LEditsPPPipelineStableDiffusion.from_pretrained(
|
||||
... "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
|
||||
... )
|
||||
>>> pipe = pipe.to("cuda")
|
||||
|
||||
>>> def download_image(url):
|
||||
... response = requests.get(url)
|
||||
... return PIL.Image.open(BytesIO(response.content)).convert("RGB")
|
||||
|
||||
>>> img_url = "https://www.aiml.informatik.tu-darmstadt.de/people/mbrack/cherry_blossom.png"
|
||||
>>> image = download_image(img_url)
|
||||
>>> image = load_image(img_url).convert("RGB")
|
||||
|
||||
>>> _ = pipe.invert(
|
||||
... image = image,
|
||||
... num_inversion_steps=50,
|
||||
... skip=0.1
|
||||
... )
|
||||
>>> _ = pipe.invert(image=image, num_inversion_steps=50, skip=0.1)
|
||||
|
||||
>>> edited_image = pipe(
|
||||
... editing_prompt=["cherry blossom"],
|
||||
... edit_guidance_scale=10.0,
|
||||
... edit_threshold=0.75,
|
||||
).images[0]
|
||||
... editing_prompt=["cherry blossom"], edit_guidance_scale=10.0, edit_threshold=0.75
|
||||
... ).images[0]
|
||||
```
|
||||
"""
|
||||
|
||||
@@ -279,8 +270,8 @@ class LEditsPPPipelineStableDiffusion(
|
||||
unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
|
||||
scheduler ([`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]):
|
||||
A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
|
||||
[`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]. If any other scheduler is passed it will automatically
|
||||
be set to [`DPMSolverMultistepScheduler`].
|
||||
[`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]. If any other scheduler is passed it will
|
||||
automatically be set to [`DPMSolverMultistepScheduler`].
|
||||
safety_checker ([`StableDiffusionSafetyChecker`]):
|
||||
Classification module that estimates whether generated images could be considered offensive or harmful.
|
||||
Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
|
||||
@@ -531,8 +522,7 @@ class LEditsPPPipelineStableDiffusion(
|
||||
`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
|
||||
less than `1`).
|
||||
editing_prompt (`str` or `List[str]`, *optional*):
|
||||
Editing prompt(s) to be encoded. If not defined, one has to pass
|
||||
`editing_prompt_embeds` instead.
|
||||
Editing prompt(s) to be encoded. If not defined, one has to pass `editing_prompt_embeds` instead.
|
||||
editing_prompt_embeds (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
@@ -734,8 +724,9 @@ class LEditsPPPipelineStableDiffusion(
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
The call function to the pipeline for editing. The [`~pipelines.ledits_pp.LEditsPPPipelineStableDiffusion.invert`]
|
||||
method has to be called beforehand. Edits will always be performed for the last inverted image(s).
|
||||
The call function to the pipeline for editing. The
|
||||
[`~pipelines.ledits_pp.LEditsPPPipelineStableDiffusion.invert`] method has to be called beforehand. Edits will
|
||||
always be performed for the last inverted image(s).
|
||||
|
||||
Args:
|
||||
negative_prompt (`str` or `List[str]`, *optional*):
|
||||
@@ -748,49 +739,51 @@ class LEditsPPPipelineStableDiffusion(
|
||||
The output format of the generated image. Choose between
|
||||
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to return a [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] instead of a
|
||||
plain tuple.
|
||||
Whether or not to return a [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] instead of a plain
|
||||
tuple.
|
||||
editing_prompt (`str` or `List[str]`, *optional*):
|
||||
The prompt or prompts to guide the image generation. The image is reconstructed by setting
|
||||
`editing_prompt = None`. Guidance direction of prompt should be specified via `reverse_editing_direction`.
|
||||
`editing_prompt = None`. Guidance direction of prompt should be specified via
|
||||
`reverse_editing_direction`.
|
||||
editing_prompt_embeds (`torch.Tensor`, *optional*):
|
||||
Pre-computed embeddings to use for guiding the image generation. Guidance direction of embedding should be
|
||||
specified via `reverse_editing_direction`.
|
||||
Pre-computed embeddings to use for guiding the image generation. Guidance direction of embedding should
|
||||
be specified via `reverse_editing_direction`.
|
||||
negative_prompt_embeds (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
|
||||
not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
|
||||
reverse_editing_direction (`bool` or `List[bool]`, *optional*, defaults to `False`):
|
||||
Whether the corresponding prompt in `editing_prompt` should be increased or decreased.
|
||||
edit_guidance_scale (`float` or `List[float]`, *optional*, defaults to 5):
|
||||
Guidance scale for guiding the image generation. If provided as list values should correspond to `editing_prompt`.
|
||||
`edit_guidance_scale` is defined as `s_e` of equation 12 of
|
||||
[LEDITS++ Paper](https://arxiv.org/abs/2301.12247).
|
||||
Guidance scale for guiding the image generation. If provided as list values should correspond to
|
||||
`editing_prompt`. `edit_guidance_scale` is defined as `s_e` of equation 12 of [LEDITS++
|
||||
Paper](https://arxiv.org/abs/2301.12247).
|
||||
edit_warmup_steps (`float` or `List[float]`, *optional*, defaults to 10):
|
||||
Number of diffusion steps (for each prompt) for which guidance will not be applied.
|
||||
edit_cooldown_steps (`float` or `List[float]`, *optional*, defaults to `None`):
|
||||
Number of diffusion steps (for each prompt) after which guidance will no longer be applied.
|
||||
edit_threshold (`float` or `List[float]`, *optional*, defaults to 0.9):
|
||||
Masking threshold of guidance. Threshold should be proportional to the image region that is modified.
|
||||
'edit_threshold' is defined as 'λ' of equation 12 of [LEDITS++ Paper](https://arxiv.org/abs/2301.12247).
|
||||
'edit_threshold' is defined as 'λ' of equation 12 of [LEDITS++
|
||||
Paper](https://arxiv.org/abs/2301.12247).
|
||||
user_mask (`torch.FloatTensor`, *optional*):
|
||||
User-provided mask for even better control over the editing process. This is helpful when LEDITS++'s implicit
|
||||
masks do not meet user preferences.
|
||||
User-provided mask for even better control over the editing process. This is helpful when LEDITS++'s
|
||||
implicit masks do not meet user preferences.
|
||||
sem_guidance (`List[torch.Tensor]`, *optional*):
|
||||
List of pre-generated guidance vectors to be applied at generation. Length of the list has to
|
||||
correspond to `num_inference_steps`.
|
||||
use_cross_attn_mask (`bool`, defaults to `False`):
|
||||
Whether cross-attention masks are used. Cross-attention masks are always used when use_intersect_mask
|
||||
is set to true. Cross-attention masks are defined as 'M^1' of equation 12 of
|
||||
[LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf).
|
||||
is set to true. Cross-attention masks are defined as 'M^1' of equation 12 of [LEDITS++
|
||||
paper](https://arxiv.org/pdf/2311.16711.pdf).
|
||||
use_intersect_mask (`bool`, defaults to `True`):
|
||||
Whether the masking term is calculated as intersection of cross-attention masks and masks derived
|
||||
from the noise estimate. Cross-attention mask are defined as 'M^1' and masks derived from the noise
|
||||
estimate are defined as 'M^2' of equation 12 of [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf).
|
||||
Whether the masking term is calculated as intersection of cross-attention masks and masks derived from
|
||||
the noise estimate. Cross-attention mask are defined as 'M^1' and masks derived from the noise estimate
|
||||
are defined as 'M^2' of equation 12 of [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf).
|
||||
attn_store_steps (`List[int]`, *optional*):
|
||||
Steps for which the attention maps are stored in the AttentionStore. Just for visualization purposes.
|
||||
store_averaged_over_steps (`bool`, defaults to `True`):
|
||||
Whether the attention maps for the 'attn_store_steps' are stored averaged over the diffusion steps.
|
||||
If False, attention maps for each step are stores separately. Just for visualization purposes.
|
||||
Whether the attention maps for the 'attn_store_steps' are stored averaged over the diffusion steps. If
|
||||
False, attention maps for each step are stored separately. Just for visualization purposes.
|
||||
cross_attention_kwargs (`dict`, *optional*):
|
||||
A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
|
||||
[`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
|
||||
@@ -815,10 +808,10 @@ class LEditsPPPipelineStableDiffusion(
|
||||
|
||||
Returns:
|
||||
[`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] or `tuple`:
|
||||
[`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] if `return_dict` is True,
|
||||
otherwise a `tuple. When returning a tuple, the first element is a list with the generated images, and the
|
||||
second element is a list of `bool`s denoting whether the corresponding generated image likely represents
|
||||
"not-safe-for-work" (nsfw) content, according to the `safety_checker`.
|
||||
[`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When
|
||||
returning a tuple, the first element is a list with the generated images, and the second element is a list
|
||||
of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw)
|
||||
content, according to the `safety_checker`.
|
||||
"""
|
||||
|
||||
if self.inversion_steps is None:
|
||||
@@ -1219,9 +1212,9 @@ class LEditsPPPipelineStableDiffusion(
|
||||
crops_coords: Optional[Tuple[int, int, int, int]] = None,
|
||||
):
|
||||
r"""
|
||||
The function to the pipeline for image inversion as described by the [LEDITS++ Paper](https://arxiv.org/abs/2301.12247).
|
||||
If the scheduler is set to [`~schedulers.DDIMScheduler`] the inversion proposed by [edit-friendly DPDM](https://arxiv.org/abs/2304.06140)
|
||||
will be performed instead.
|
||||
The function to the pipeline for image inversion as described by the [LEDITS++
|
||||
Paper](https://arxiv.org/abs/2301.12247). If the scheduler is set to [`~schedulers.DDIMScheduler`] the
|
||||
inversion proposed by [edit-friendly DDPM](https://arxiv.org/abs/2304.06140) will be performed instead.
|
||||
|
||||
Args:
|
||||
image (`PipelineImageInput`):
|
||||
@@ -1238,8 +1231,8 @@ class LEditsPPPipelineStableDiffusion(
|
||||
Portion of initial steps that will be ignored for inversion and subsequent generation. Lower values
|
||||
will lead to stronger changes to the input image. `skip` has to be between `0` and `1`.
|
||||
generator (`torch.Generator`, *optional*):
|
||||
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
|
||||
inversion deterministic.
|
||||
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make inversion
|
||||
deterministic.
|
||||
cross_attention_kwargs (`dict`, *optional*):
|
||||
A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
|
||||
[`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
|
||||
@@ -1247,23 +1240,24 @@ class LEditsPPPipelineStableDiffusion(
|
||||
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
|
||||
the output of the pre-final layer will be used for computing the prompt embeddings.
|
||||
height (`int`, *optional*, defaults to `None`):
|
||||
The height in preprocessed image. If `None`, will use the `get_default_height_width()` to get default height.
|
||||
The height in preprocessed image. If `None`, will use the `get_default_height_width()` to get default
|
||||
height.
|
||||
width (`int`, *optional*, defaults to `None`):
|
||||
The width in preprocessed. If `None`, will use get_default_height_width()` to get the default width.
|
||||
The width in preprocessed image. If `None`, will use `get_default_height_width()` to get the default width.
|
||||
resize_mode (`str`, *optional*, defaults to `default`):
|
||||
The resize mode, can be one of `default` or `fill`. If `default`, will resize the image to fit
|
||||
within the specified width and height, and it may not maintaining the original aspect ratio.
|
||||
If `fill`, will resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image
|
||||
within the dimensions, filling empty with data from image.
|
||||
If `crop`, will resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image
|
||||
within the dimensions, cropping the excess.
|
||||
Note that resize_mode `fill` and `crop` are only supported for PIL image input.
|
||||
The resize mode, can be one of `default` or `fill`. If `default`, will resize the image to fit within
|
||||
the specified width and height, and it may not maintain the original aspect ratio. If `fill`, will
|
||||
resize the image to fit within the specified width and height, maintaining the aspect ratio, and then
|
||||
center the image within the dimensions, filling empty with data from image. If `crop`, will resize the
|
||||
image to fit within the specified width and height, maintaining the aspect ratio, and then center the
|
||||
image within the dimensions, cropping the excess. Note that resize_mode `fill` and `crop` are only
|
||||
supported for PIL image input.
|
||||
crops_coords (`List[Tuple[int, int, int, int]]`, *optional*, defaults to `None`):
|
||||
The crop coordinates for each image in the batch. If `None`, will not crop the image.
|
||||
|
||||
Returns:
|
||||
[`~pipelines.ledits_pp.LEditsPPInversionPipelineOutput`]:
|
||||
Output will contain the resized input image(s) and respective VAE reconstruction(s).
|
||||
[`~pipelines.ledits_pp.LEditsPPInversionPipelineOutput`]: Output will contain the resized input image(s)
|
||||
and respective VAE reconstruction(s).
|
||||
"""
|
||||
# Reset attn processor, we do not want to store attn maps during inversion
|
||||
self.unet.set_attn_processor(AttnProcessor())
|
||||
|
||||
@@ -85,25 +85,23 @@ EXAMPLE_DOC_STRING = """
|
||||
... )
|
||||
>>> pipe = pipe.to("cuda")
|
||||
|
||||
|
||||
>>> def download_image(url):
|
||||
... response = requests.get(url)
|
||||
... return PIL.Image.open(BytesIO(response.content)).convert("RGB")
|
||||
|
||||
|
||||
>>> img_url = "https://www.aiml.informatik.tu-darmstadt.de/people/mbrack/tennis.jpg"
|
||||
>>> image = download_image(img_url)
|
||||
|
||||
>>> _ = pipe.invert(
|
||||
... image = image,
|
||||
... num_inversion_steps=50,
|
||||
... skip=0.2
|
||||
... )
|
||||
>>> _ = pipe.invert(image=image, num_inversion_steps=50, skip=0.2)
|
||||
|
||||
>>> edited_image = pipe(
|
||||
... editing_prompt=["tennis ball","tomato"],
|
||||
... reverse_editing_direction=[True,False],
|
||||
... edit_guidance_scale=[5.0,10.0],
|
||||
... edit_threshold=[0.9,0.85],
|
||||
).images[0]
|
||||
... editing_prompt=["tennis ball", "tomato"],
|
||||
... reverse_editing_direction=[True, False],
|
||||
... edit_guidance_scale=[5.0, 10.0],
|
||||
... edit_threshold=[0.9, 0.85],
|
||||
... ).images[0]
|
||||
```
|
||||
"""
|
||||
|
||||
@@ -292,9 +290,9 @@ class LEditsPPPipelineStableDiffusionXL(
|
||||
"""
|
||||
Pipeline for textual image editing using LEDits++ with Stable Diffusion XL.
|
||||
|
||||
This model inherits from [`DiffusionPipeline`] and builds on the [`StableDiffusionXLPipeline`]. Check the superclass
|
||||
documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular
|
||||
device, etc.).
|
||||
This model inherits from [`DiffusionPipeline`] and builds on the [`StableDiffusionXLPipeline`]. Check the
|
||||
superclass documentation for the generic methods implemented for all pipelines (downloading, saving, running on a
|
||||
particular device, etc.).
|
||||
|
||||
In addition the pipeline inherits the following loading methods:
|
||||
- *LoRA*: [`LEditsPPPipelineStableDiffusionXL.load_lora_weights`]
|
||||
@@ -325,8 +323,8 @@ class LEditsPPPipelineStableDiffusionXL(
|
||||
unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
|
||||
scheduler ([`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]):
|
||||
A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
|
||||
[`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]. If any other scheduler is passed it will automatically
|
||||
be set to [`DPMSolverMultistepScheduler`].
|
||||
[`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]. If any other scheduler is passed it will
|
||||
automatically be set to [`DPMSolverMultistepScheduler`].
|
||||
force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
|
||||
Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
|
||||
`stabilityai/stable-diffusion-xl-base-1-0`.
|
||||
@@ -453,9 +451,9 @@ class LEditsPPPipelineStableDiffusionXL(
|
||||
Editing prompt(s) to be encoded. If not defined and 'enable_edit_guidance' is True, one has to pass
|
||||
`editing_prompt_embeds` instead.
|
||||
editing_prompt_embeds (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
|
||||
weighting. If not provided and 'enable_edit_guidance' is True, editing_prompt_embeds will be generated from `editing_prompt` input
|
||||
argument.
|
||||
Pre-generated edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
|
||||
If not provided and 'enable_edit_guidance' is True, editing_prompt_embeds will be generated from
|
||||
`editing_prompt` input argument.
|
||||
editing_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated edit pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
|
||||
weighting. If not provided, pooled editing_pooled_prompt_embeds will be generated from `editing_prompt`
|
||||
@@ -835,8 +833,9 @@ class LEditsPPPipelineStableDiffusionXL(
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
The call function to the pipeline for editing. The [`~pipelines.ledits_pp.LEditsPPPipelineStableDiffusionXL.invert`]
|
||||
method has to be called beforehand. Edits will always be performed for the last inverted image(s).
|
||||
The call function to the pipeline for editing. The
|
||||
[`~pipelines.ledits_pp.LEditsPPPipelineStableDiffusionXL.invert`] method has to be called beforehand. Edits
|
||||
will always be performed for the last inverted image(s).
|
||||
|
||||
Args:
|
||||
denoising_end (`float`, *optional*):
|
||||
@@ -894,11 +893,11 @@ class LEditsPPPipelineStableDiffusionXL(
|
||||
section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
|
||||
editing_prompt (`str` or `List[str]`, *optional*):
|
||||
The prompt or prompts to guide the image generation. The image is reconstructed by setting
|
||||
`editing_prompt = None`. Guidance direction of prompt should be specified via `reverse_editing_direction`.
|
||||
`editing_prompt = None`. Guidance direction of prompt should be specified via
|
||||
`reverse_editing_direction`.
|
||||
editing_prompt_embeddings (`torch.Tensor`, *optional*):
|
||||
Pre-generated edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
|
||||
weighting. If not provided, editing_prompt_embeddings will be generated from `editing_prompt` input
|
||||
argument.
|
||||
Pre-generated edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
|
||||
If not provided, editing_prompt_embeddings will be generated from `editing_prompt` input argument.
|
||||
editing_pooled_prompt_embeddings (`torch.Tensor`, *optional*):
|
||||
Pre-generated pooled edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
|
||||
weighting. If not provided, editing_prompt_embeddings will be generated from `editing_prompt` input
|
||||
@@ -906,35 +905,36 @@ class LEditsPPPipelineStableDiffusionXL(
|
||||
reverse_editing_direction (`bool` or `List[bool]`, *optional*, defaults to `False`):
|
||||
Whether the corresponding prompt in `editing_prompt` should be increased or decreased.
|
||||
edit_guidance_scale (`float` or `List[float]`, *optional*, defaults to 5):
|
||||
Guidance scale for guiding the image generation. If provided as list values should correspond to `editing_prompt`.
|
||||
`edit_guidance_scale` is defined as `s_e` of equation 12 of
|
||||
[LEDITS++ Paper](https://arxiv.org/abs/2301.12247).
|
||||
Guidance scale for guiding the image generation. If provided as list values should correspond to
|
||||
`editing_prompt`. `edit_guidance_scale` is defined as `s_e` of equation 12 of [LEDITS++
|
||||
Paper](https://arxiv.org/abs/2301.12247).
|
||||
edit_warmup_steps (`float` or `List[float]`, *optional*, defaults to 10):
|
||||
Number of diffusion steps (for each prompt) for which guidance is not applied.
|
||||
edit_cooldown_steps (`float` or `List[float]`, *optional*, defaults to `None`):
|
||||
Number of diffusion steps (for each prompt) after which guidance is no longer applied.
|
||||
edit_threshold (`float` or `List[float]`, *optional*, defaults to 0.9):
|
||||
Masking threshold of guidance. Threshold should be proportional to the image region that is modified.
|
||||
'edit_threshold' is defined as 'λ' of equation 12 of [LEDITS++ Paper](https://arxiv.org/abs/2301.12247).
|
||||
'edit_threshold' is defined as 'λ' of equation 12 of [LEDITS++
|
||||
Paper](https://arxiv.org/abs/2301.12247).
|
||||
sem_guidance (`List[torch.Tensor]`, *optional*):
|
||||
List of pre-generated guidance vectors to be applied at generation. Length of the list has to
|
||||
correspond to `num_inference_steps`.
|
||||
use_cross_attn_mask:
|
||||
Whether cross-attention masks are used. Cross-attention masks are always used when use_intersect_mask
|
||||
is set to true. Cross-attention masks are defined as 'M^1' of equation 12 of
|
||||
[LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf).
|
||||
is set to true. Cross-attention masks are defined as 'M^1' of equation 12 of [LEDITS++
|
||||
paper](https://arxiv.org/pdf/2311.16711.pdf).
|
||||
use_intersect_mask:
|
||||
Whether the masking term is calculated as intersection of cross-attention masks and masks derived
|
||||
from the noise estimate. Cross-attention mask are defined as 'M^1' and masks derived from the noise
|
||||
estimate are defined as 'M^2' of equation 12 of [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf).
|
||||
Whether the masking term is calculated as intersection of cross-attention masks and masks derived from
|
||||
the noise estimate. Cross-attention mask are defined as 'M^1' and masks derived from the noise estimate
|
||||
are defined as 'M^2' of equation 12 of [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf).
|
||||
user_mask:
|
||||
User-provided mask for even better control over the editing process. This is helpful when LEDITS++'s implicit
|
||||
masks do not meet user preferences.
|
||||
User-provided mask for even better control over the editing process. This is helpful when LEDITS++'s
|
||||
implicit masks do not meet user preferences.
|
||||
attn_store_steps:
|
||||
Steps for which the attention maps are stored in the AttentionStore. Just for visualization purposes.
|
||||
store_averaged_over_steps:
|
||||
Whether the attention maps for the 'attn_store_steps' are stored averaged over the diffusion steps.
|
||||
If False, attention maps for each step are stored separately. Just for visualization purposes.
|
||||
Whether the attention maps for the 'attn_store_steps' are stored averaged over the diffusion steps. If
|
||||
False, attention maps for each step are stored separately. Just for visualization purposes.
|
||||
clip_skip (`int`, *optional*):
|
||||
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
|
||||
the output of the pre-final layer will be used for computing the prompt embeddings.
|
||||
@@ -952,8 +952,8 @@ class LEditsPPPipelineStableDiffusionXL(
|
||||
|
||||
Returns:
|
||||
[`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] or `tuple`:
|
||||
[`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] if `return_dict` is True,
|
||||
otherwise a `tuple`. When returning a tuple, the first element is a list with the generated images.
|
||||
[`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When
|
||||
returning a tuple, the first element is a list with the generated images.
|
||||
"""
|
||||
if self.inversion_steps is None:
|
||||
raise ValueError(
|
||||
@@ -1446,9 +1446,9 @@ class LEditsPPPipelineStableDiffusionXL(
|
||||
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
|
||||
):
|
||||
r"""
|
||||
The function to the pipeline for image inversion as described by the [LEDITS++ Paper](https://arxiv.org/abs/2301.12247).
|
||||
If the scheduler is set to [`~schedulers.DDIMScheduler`] the inversion proposed by [edit-friendly DDPM](https://arxiv.org/abs/2304.06140)
|
||||
will be performed instead.
|
||||
The function to the pipeline for image inversion as described by the [LEDITS++
|
||||
Paper](https://arxiv.org/abs/2301.12247). If the scheduler is set to [`~schedulers.DDIMScheduler`] the
|
||||
inversion proposed by [edit-friendly DDPM](https://arxiv.org/abs/2304.06140) will be performed instead.
|
||||
|
||||
Args:
|
||||
image (`PipelineImageInput`):
|
||||
@@ -1472,8 +1472,8 @@ class LEditsPPPipelineStableDiffusionXL(
|
||||
Portion of initial steps that will be ignored for inversion and subsequent generation. Lower values
|
||||
will lead to stronger changes to the input image. `skip` has to be between `0` and `1`.
|
||||
generator (`torch.Generator`, *optional*):
|
||||
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
|
||||
inversion deterministic.
|
||||
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make inversion
|
||||
deterministic.
|
||||
crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
|
||||
`crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
|
||||
`crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
|
||||
@@ -1488,8 +1488,8 @@ class LEditsPPPipelineStableDiffusionXL(
|
||||
[diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
|
||||
|
||||
Returns:
|
||||
[`~pipelines.ledits_pp.LEditsPPInversionPipelineOutput`]:
|
||||
Output will contain the resized input image(s) and respective VAE reconstruction(s).
|
||||
[`~pipelines.ledits_pp.LEditsPPInversionPipelineOutput`]: Output will contain the resized input image(s)
|
||||
and respective VAE reconstruction(s).
|
||||
"""
|
||||
|
||||
# Reset attn processor, we do not want to store attn maps during inversion
|
||||
|
||||
@@ -35,8 +35,8 @@ class LEditsPPInversionPipelineOutput(BaseOutput):
|
||||
List of the cropped and resized input images as PIL images of length `batch_size` or NumPy array of shape `
|
||||
(batch_size, height, width, num_channels)`.
|
||||
vae_reconstruction_images (`List[PIL.Image.Image]` or `np.ndarray`)
|
||||
List of VAE reconstruction of all input images as PIL images of length `batch_size` or NumPy array of shape `
|
||||
(batch_size, height, width, num_channels)`.
|
||||
List of VAE reconstruction of all input images as PIL images of length `batch_size` or NumPy array of shape
|
||||
`(batch_size, height, width, num_channels)`.
|
||||
"""
|
||||
|
||||
images: Union[List[PIL.Image.Image], np.ndarray]
|
||||
|
||||
@@ -59,6 +59,7 @@ EXAMPLE_DOC_STRING = """
|
||||
... PIAPipeline,
|
||||
... )
|
||||
>>> from diffusers.utils import export_to_gif, load_image
|
||||
|
||||
>>> adapter = MotionAdapter.from_pretrained("../checkpoints/pia-diffusers")
|
||||
>>> pipe = PIAPipeline.from_pretrained("SG161222/Realistic_Vision_V6.0_B1_noVAE", motion_adapter=adapter)
|
||||
>>> pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
|
||||
@@ -135,9 +136,9 @@ class PIAPipelineOutput(BaseOutput):
|
||||
|
||||
Args:
|
||||
frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
|
||||
Nested list of length `batch_size` with denoised PIL image sequences of length `num_frames`,
|
||||
NumPy array of shape `(batch_size, num_frames, channels, height, width)`, or
|
||||
Torch tensor of shape `(batch_size, num_frames, channels, height, width)`.
|
||||
Nested list of length `batch_size` with denoised PIL image sequences of length `num_frames`, NumPy array of
|
||||
shape `(batch_size, num_frames, channels, height, width)`, or Torch tensor of shape `(batch_size, num_frames,
|
||||
channels, height, width)`.
|
||||
"""
|
||||
|
||||
frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]]
|
||||
@@ -759,16 +760,15 @@ class PIAPipeline(
|
||||
ip_adapter_image: (`PipelineImageInput`, *optional*):
|
||||
Optional image input to work with IP Adapters.
|
||||
ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
|
||||
Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
|
||||
if `do_classifier_free_guidance` is set to `True`.
|
||||
If not provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
|
||||
IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
|
||||
contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
|
||||
provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
motion_scale: (`int`, *optional*, defaults to 0):
|
||||
Parameter that controls the amount and type of motion that is added to the image. Increasing the value increases the amount of motion, while specific
|
||||
ranges of values control the type of motion that is added. Must be between 0 and 8.
|
||||
Set between 0-2 to only increase the amount of motion.
|
||||
Set between 3-5 to create looping motion.
|
||||
Set between 6-8 to perform motion with image style transfer.
|
||||
Parameter that controls the amount and type of motion that is added to the image. Increasing the value
|
||||
increases the amount of motion, while specific ranges of values control the type of motion that is
|
||||
added. Must be between 0 and 8. Set between 0-2 to only increase the amount of motion. Set between 3-5
|
||||
to create looping motion. Set between 6-8 to perform motion with image style transfer.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or
|
||||
`np.array`.
|
||||
@@ -795,8 +795,8 @@ class PIAPipeline(
|
||||
|
||||
Returns:
|
||||
[`~pipelines.pia.pipeline_pia.PIAPipelineOutput`] or `tuple`:
|
||||
If `return_dict` is `True`, [`~pipelines.pia.pipeline_pia.PIAPipelineOutput`] is
|
||||
returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
|
||||
If `return_dict` is `True`, [`~pipelines.pia.pipeline_pia.PIAPipelineOutput`] is returned, otherwise a
|
||||
`tuple` is returned where the first element is a list with the generated frames.
|
||||
"""
|
||||
# 0. Default height and width to unet
|
||||
height = height or self.unet.config.sample_size * self.vae_scale_factor
|
||||
|
||||
@@ -538,7 +538,8 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
|
||||
allowed by Git.
|
||||
custom_revision (`str`, *optional*):
|
||||
The specific model version to use. It can be a branch name, a tag name, or a commit id similar to
|
||||
`revision` when loading a custom pipeline from the Hub. Defaults to the latest stable 🤗 Diffusers version.
|
||||
`revision` when loading a custom pipeline from the Hub. Defaults to the latest stable 🤗 Diffusers
|
||||
version.
|
||||
mirror (`str`, *optional*):
|
||||
Mirror source to resolve accessibility issues if you’re downloading a model in China. We do not
|
||||
guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
|
||||
@@ -1669,7 +1670,8 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
|
||||
@classmethod
|
||||
def from_pipe(cls, pipeline, **kwargs):
|
||||
r"""
|
||||
Create a new pipeline from a given pipeline. This method is useful to create a new pipeline from the existing pipeline components without reallocating additional memory.
|
||||
Create a new pipeline from a given pipeline. This method is useful to create a new pipeline from the existing
|
||||
pipeline components without reallocating additional memory.
|
||||
|
||||
Arguments:
|
||||
pipeline (`DiffusionPipeline`):
|
||||
@@ -1851,8 +1853,8 @@ class StableDiffusionMixin:
|
||||
|
||||
def fuse_qkv_projections(self, unet: bool = True, vae: bool = True):
|
||||
"""
|
||||
Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
|
||||
key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
|
||||
Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
|
||||
are fused. For cross-attention modules, key and value projection matrices are fused.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
|
||||
@@ -186,8 +186,8 @@ def retrieve_timesteps(
|
||||
scheduler (`SchedulerMixin`):
|
||||
The scheduler to get timesteps from.
|
||||
num_inference_steps (`int`):
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used,
|
||||
`timesteps` must be `None`.
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
|
||||
must be `None`.
|
||||
device (`str` or `torch.device`, *optional*):
|
||||
The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
|
||||
timesteps (`List[int]`, *optional*):
|
||||
|
||||
@@ -334,8 +334,8 @@ class StableCascadeDecoderPipeline(DiffusionPipeline):
|
||||
argument.
|
||||
negative_prompt_embeds_pooled (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
|
||||
weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt` input
|
||||
argument.
|
||||
weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt`
|
||||
input argument.
|
||||
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of images to generate per prompt.
|
||||
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
||||
|
||||
@@ -31,7 +31,10 @@ TEXT2IMAGE_EXAMPLE_DOC_STRING = """
|
||||
```py
|
||||
>>> import torch
|
||||
>>> from diffusers import StableCascadeCombinedPipeline
|
||||
>>> pipe = StableCascadeCombinedPipeline.from_pretrained("stabilityai/stable-cascade", variant="bf16", torch_dtype=torch.bfloat16)
|
||||
|
||||
>>> pipe = StableCascadeCombinedPipeline.from_pretrained(
|
||||
... "stabilityai/stable-cascade", variant="bf16", torch_dtype=torch.bfloat16
|
||||
... )
|
||||
>>> pipe.enable_model_cpu_offload()
|
||||
>>> prompt = "an image of a shiba inu, donning a spacesuit and helmet"
|
||||
>>> images = pipe(prompt=prompt)
|
||||
|
||||
@@ -80,7 +80,8 @@ class StableCascadePriorPipeline(DiffusionPipeline):
|
||||
prior ([`StableCascadeUNet`]):
|
||||
The Stable Cascade prior to approximate the image embedding from the text and/or image embedding.
|
||||
text_encoder ([`CLIPTextModelWithProjection`]):
|
||||
Frozen text-encoder ([laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)).
|
||||
Frozen text-encoder
|
||||
([laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)).
|
||||
feature_extractor ([`~transformers.CLIPImageProcessor`]):
|
||||
Model that extracts features from generated images to be used as inputs for the `image_encoder`.
|
||||
image_encoder ([`CLIPVisionModelWithProjection`]):
|
||||
@@ -420,11 +421,11 @@ class StableCascadePriorPipeline(DiffusionPipeline):
|
||||
argument.
|
||||
negative_prompt_embeds_pooled (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
|
||||
weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt` input
|
||||
argument.
|
||||
weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt`
|
||||
input argument.
|
||||
image_embeds (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated image embeddings. Can be used to easily tweak image inputs, *e.g.* prompt weighting.
|
||||
If not provided, image embeddings will be generated from `image` input argument if existing.
|
||||
Pre-generated image embeddings. Can be used to easily tweak image inputs, *e.g.* prompt weighting. If
|
||||
not provided, image embeddings will be generated from `image` input argument if existing.
|
||||
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of images to generate per prompt.
|
||||
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
||||
@@ -452,9 +453,9 @@ class StableCascadePriorPipeline(DiffusionPipeline):
|
||||
Examples:
|
||||
|
||||
Returns:
|
||||
[`StableCascadePriorPipelineOutput`] or `tuple` [`StableCascadePriorPipelineOutput`] if
|
||||
`return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the
|
||||
generated image embeddings.
|
||||
[`StableCascadePriorPipelineOutput`] or `tuple` [`StableCascadePriorPipelineOutput`] if `return_dict` is
|
||||
True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated image
|
||||
embeddings.
|
||||
"""
|
||||
|
||||
# 0. Define commonly used variables
|
||||
|
||||
@@ -85,8 +85,8 @@ def retrieve_timesteps(
|
||||
scheduler (`SchedulerMixin`):
|
||||
The scheduler to get timesteps from.
|
||||
num_inference_steps (`int`):
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used,
|
||||
`timesteps` must be `None`.
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
|
||||
must be `None`.
|
||||
device (`str` or `torch.device`, *optional*):
|
||||
The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
|
||||
timesteps (`List[int]`, *optional*):
|
||||
@@ -801,10 +801,10 @@ class StableDiffusionPipeline(
|
||||
not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
|
||||
ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
|
||||
ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
|
||||
Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
|
||||
if `do_classifier_free_guidance` is set to `True`.
|
||||
If not provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
|
||||
IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
|
||||
contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
|
||||
provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
|
||||
@@ -125,8 +125,8 @@ def retrieve_timesteps(
|
||||
scheduler (`SchedulerMixin`):
|
||||
The scheduler to get timesteps from.
|
||||
num_inference_steps (`int`):
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used,
|
||||
`timesteps` must be `None`.
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
|
||||
must be `None`.
|
||||
device (`str` or `torch.device`, *optional*):
|
||||
The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
|
||||
timesteps (`List[int]`, *optional*):
|
||||
@@ -897,10 +897,10 @@ class StableDiffusionImg2ImgPipeline(
|
||||
not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
|
||||
ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
|
||||
ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
|
||||
Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
|
||||
if `do_classifier_free_guidance` is set to `True`.
|
||||
If not provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
|
||||
IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
|
||||
contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
|
||||
provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
|
||||
@@ -189,8 +189,8 @@ def retrieve_timesteps(
|
||||
scheduler (`SchedulerMixin`):
|
||||
The scheduler to get timesteps from.
|
||||
num_inference_steps (`int`):
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used,
|
||||
`timesteps` must be `None`.
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
|
||||
must be `None`.
|
||||
device (`str` or `torch.device`, *optional*):
|
||||
The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
|
||||
timesteps (`List[int]`, *optional*):
|
||||
@@ -1022,11 +1022,12 @@ class StableDiffusionInpaintPipeline(
|
||||
width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
|
||||
The width in pixels of the generated image.
|
||||
padding_mask_crop (`int`, *optional*, defaults to `None`):
|
||||
The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to image and mask_image. If
|
||||
`padding_mask_crop` is not `None`, it will first find a rectangular region with the same aspect ratio of the image and
|
||||
contains all masked area, and then expand that area based on `padding_mask_crop`. The image and mask_image will then be cropped based on
|
||||
the expanded area before resizing to the original image size for inpainting. This is useful when the masked area is small while the image is large
|
||||
and contain information irrelevant for inpainting, such as background.
|
||||
The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to
|
||||
image and mask_image. If `padding_mask_crop` is not `None`, it will first find a rectangular region
|
||||
with the same aspect ratio of the image and contains all masked area, and then expand that area based
|
||||
on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area before
|
||||
resizing to the original image size for inpainting. This is useful when the masked area is small while
|
||||
the image is large and contains information irrelevant for inpainting, such as background.
|
||||
strength (`float`, *optional*, defaults to 1.0):
|
||||
Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
|
||||
starting point and more noise is added the higher the `strength`. The number of denoising steps depends
|
||||
@@ -1066,10 +1067,10 @@ class StableDiffusionInpaintPipeline(
|
||||
not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
|
||||
ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
|
||||
ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
|
||||
Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
|
||||
if `do_classifier_free_guidance` is set to `True`.
|
||||
If not provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
|
||||
IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
|
||||
contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
|
||||
provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
|
||||
@@ -90,8 +90,8 @@ def retrieve_timesteps(
|
||||
scheduler (`SchedulerMixin`):
|
||||
The scheduler to get timesteps from.
|
||||
num_inference_steps (`int`):
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used,
|
||||
`timesteps` must be `None`.
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
|
||||
must be `None`.
|
||||
device (`str` or `torch.device`, *optional*):
|
||||
The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
|
||||
timesteps (`List[int]`, *optional*):
|
||||
@@ -773,10 +773,10 @@ class StableDiffusionLDM3DPipeline(
|
||||
ip_adapter_image: (`PipelineImageInput`, *optional*):
|
||||
Optional image input to work with IP Adapters.
|
||||
ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
|
||||
Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
|
||||
if `do_classifier_free_guidance` is set to `True`.
|
||||
If not provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
|
||||
IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
|
||||
contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
|
||||
provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
|
||||
+13
-13
@@ -90,8 +90,8 @@ def retrieve_timesteps(
|
||||
scheduler (`SchedulerMixin`):
|
||||
The scheduler to get timesteps from.
|
||||
num_inference_steps (`int`):
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used,
|
||||
`timesteps` must be `None`.
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
|
||||
must be `None`.
|
||||
device (`str` or `torch.device`, *optional*):
|
||||
The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
|
||||
timesteps (`List[int]`, *optional*):
|
||||
@@ -694,9 +694,9 @@ class StableDiffusionPanoramaPipeline(
|
||||
circular_padding: bool = False,
|
||||
) -> List[Tuple[int, int, int, int]]:
|
||||
"""
|
||||
Generates a list of views based on the given parameters.
|
||||
Here, we define the mappings F_i (see Eq. 7 in the MultiDiffusion paper https://arxiv.org/abs/2302.08113).
|
||||
If panorama's height/width < window_size, num_blocks of height/width should return 1.
|
||||
Generates a list of views based on the given parameters. Here, we define the mappings F_i (see Eq. 7 in the
|
||||
MultiDiffusion paper https://arxiv.org/abs/2302.08113). If panorama's height/width < window_size, num_blocks of
|
||||
height/width should return 1.
|
||||
|
||||
Args:
|
||||
panorama_height (int): The height of the panorama.
|
||||
@@ -706,8 +706,8 @@ class StableDiffusionPanoramaPipeline(
|
||||
circular_padding (bool, optional): Whether to apply circular padding. Defaults to False.
|
||||
|
||||
Returns:
|
||||
List[Tuple[int, int, int, int]]: A list of tuples representing the views. Each tuple contains
|
||||
four integers representing the start and end coordinates of the window in the panorama.
|
||||
List[Tuple[int, int, int, int]]: A list of tuples representing the views. Each tuple contains four integers
|
||||
representing the start and end coordinates of the window in the panorama.
|
||||
|
||||
"""
|
||||
panorama_height /= 8
|
||||
@@ -800,8 +800,8 @@ class StableDiffusionPanoramaPipeline(
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
expense of slower inference.
|
||||
timesteps (`List[int]`, *optional*):
|
||||
The timesteps at which to generate the images. If not specified, then the default
|
||||
timestep spacing strategy of the scheduler is used.
|
||||
The timesteps at which to generate the images. If not specified, then the default timestep spacing
|
||||
strategy of the scheduler is used.
|
||||
guidance_scale (`float`, *optional*, defaults to 7.5):
|
||||
A higher guidance scale value encourages the model to generate images closely linked to the text
|
||||
`prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
|
||||
@@ -832,10 +832,10 @@ class StableDiffusionPanoramaPipeline(
|
||||
ip_adapter_image (`PipelineImageInput`, *optional*):
|
||||
Optional image input to work with IP Adapters.
|
||||
ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
|
||||
Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
|
||||
if `do_classifier_free_guidance` is set to `True`.
|
||||
If not provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
|
||||
IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
|
||||
contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
|
||||
provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
|
||||
@@ -619,8 +619,8 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, StableDiffusionMixin, Textua
|
||||
ip_adapter_image (`PipelineImageInput`, *optional*):
|
||||
Optional image input to work with IP Adapters.
|
||||
ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
|
||||
Pre-generated image embeddings for IP-Adapter. If not
|
||||
provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
Pre-generated image embeddings for IP-Adapter. If not provided, embeddings are computed from the
|
||||
`ip_adapter_image` input argument.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
|
||||
@@ -117,8 +117,8 @@ def retrieve_timesteps(
|
||||
scheduler (`SchedulerMixin`):
|
||||
The scheduler to get timesteps from.
|
||||
num_inference_steps (`int`):
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used,
|
||||
`timesteps` must be `None`.
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
|
||||
must be `None`.
|
||||
device (`str` or `torch.device`, *optional*):
|
||||
The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
|
||||
timesteps (`List[int]`, *optional*):
|
||||
@@ -919,10 +919,10 @@ class StableDiffusionXLPipeline(
|
||||
input argument.
|
||||
ip_adapter_image (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
|
||||
ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
|
||||
Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
|
||||
if `do_classifier_free_guidance` is set to `True`.
|
||||
If not provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
|
||||
IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
|
||||
contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
|
||||
provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generated image. Choose between
|
||||
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
|
||||
|
||||
@@ -134,8 +134,8 @@ def retrieve_timesteps(
|
||||
scheduler (`SchedulerMixin`):
|
||||
The scheduler to get timesteps from.
|
||||
num_inference_steps (`int`):
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used,
|
||||
`timesteps` must be `None`.
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
|
||||
must be `None`.
|
||||
device (`str` or `torch.device`, *optional*):
|
||||
The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
|
||||
timesteps (`List[int]`, *optional*):
|
||||
@@ -1067,10 +1067,10 @@ class StableDiffusionXLImg2ImgPipeline(
|
||||
input argument.
|
||||
ip_adapter_image (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
|
||||
ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
|
||||
Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
|
||||
if `do_classifier_free_guidance` is set to `True`.
|
||||
If not provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
|
||||
IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
|
||||
contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
|
||||
provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generated image. Choose between
|
||||
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
|
||||
|
||||
+12
-11
@@ -279,8 +279,8 @@ def retrieve_timesteps(
|
||||
scheduler (`SchedulerMixin`):
|
||||
The scheduler to get timesteps from.
|
||||
num_inference_steps (`int`):
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used,
|
||||
`timesteps` must be `None`.
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
|
||||
must be `None`.
|
||||
device (`str` or `torch.device`, *optional*):
|
||||
The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
|
||||
timesteps (`List[int]`, *optional*):
|
||||
@@ -1255,11 +1255,12 @@ class StableDiffusionXLInpaintPipeline(
|
||||
[stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
|
||||
and checkpoints that are not specifically fine-tuned on low resolutions.
|
||||
padding_mask_crop (`int`, *optional*, defaults to `None`):
|
||||
The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to image and mask_image. If
|
||||
`padding_mask_crop` is not `None`, it will first find a rectangular region with the same aspect ration of the image and
|
||||
contains all masked area, and then expand that area based on `padding_mask_crop`. The image and mask_image will then be cropped based on
|
||||
the expanded area before resizing to the original image size for inpainting. This is useful when the masked area is small while the image is large
|
||||
and contain information irrelevant for inpainting, such as background.
|
||||
The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to
|
||||
image and mask_image. If `padding_mask_crop` is not `None`, it will first find a rectangular region
|
||||
with the same aspect ratio as the image that contains all masked areas, and then expand that area based
|
||||
on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area before
|
||||
resizing to the original image size for inpainting. This is useful when the masked area is small while
|
||||
the image is large and contains information irrelevant for inpainting, such as background.
|
||||
strength (`float`, *optional*, defaults to 0.9999):
|
||||
Conceptually, indicates how much to transform the masked portion of the reference `image`. Must be
|
||||
between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the
|
||||
@@ -1319,10 +1320,10 @@ class StableDiffusionXLInpaintPipeline(
|
||||
input argument.
|
||||
ip_adapter_image (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
|
||||
ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
|
||||
Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
|
||||
if `do_classifier_free_guidance` is set to `True`.
|
||||
If not provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
|
||||
IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
|
||||
contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
|
||||
provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of images to generate per prompt.
|
||||
eta (`float`, *optional*, defaults to 0.0):
|
||||
|
||||
@@ -37,10 +37,14 @@ EXAMPLE_DOC_STRING = """
|
||||
>>> from diffusers import StableVideoDiffusionPipeline
|
||||
>>> from diffusers.utils import load_image, export_to_video
|
||||
|
||||
>>> pipe = StableVideoDiffusionPipeline.from_pretrained("stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16")
|
||||
>>> pipe = StableVideoDiffusionPipeline.from_pretrained(
|
||||
... "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
|
||||
... )
|
||||
>>> pipe.to("cuda")
|
||||
|
||||
>>> image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd-docstring-example.jpeg")
|
||||
>>> image = load_image(
|
||||
... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd-docstring-example.jpeg"
|
||||
... )
|
||||
>>> image = image.resize((1024, 576))
|
||||
|
||||
>>> frames = pipe(image, num_frames=25, decode_chunk_size=8).frames[0]
|
||||
@@ -86,8 +90,8 @@ class StableVideoDiffusionPipelineOutput(BaseOutput):
|
||||
|
||||
Args:
|
||||
frames (`List[List[PIL.Image.Image]]`, `np.ndarray`, or `torch.FloatTensor`):
|
||||
List of denoised PIL images of length `batch_size` or numpy array or torch tensor
|
||||
of shape `(batch_size, num_frames, height, width, num_channels)`.
|
||||
List of denoised PIL images of length `batch_size` or numpy array or torch tensor of shape `(batch_size,
|
||||
num_frames, height, width, num_channels)`.
|
||||
"""
|
||||
|
||||
frames: Union[List[List[PIL.Image.Image]], np.ndarray, torch.FloatTensor]
|
||||
@@ -104,7 +108,8 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
|
||||
vae ([`AutoencoderKLTemporalDecoder`]):
|
||||
Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
|
||||
image_encoder ([`~transformers.CLIPVisionModelWithProjection`]):
|
||||
Frozen CLIP image-encoder ([laion/CLIP-ViT-H-14-laion2B-s32B-b79K](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K)).
|
||||
Frozen CLIP image-encoder
|
||||
([laion/CLIP-ViT-H-14-laion2B-s32B-b79K](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K)).
|
||||
unet ([`UNetSpatioTemporalConditionModel`]):
|
||||
A `UNetSpatioTemporalConditionModel` to denoise the encoded image latents.
|
||||
scheduler ([`EulerDiscreteScheduler`]):
|
||||
@@ -357,14 +362,15 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
|
||||
|
||||
Args:
|
||||
image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
|
||||
Image(s) to guide image generation. If you provide a tensor, the expected value range is between `[0, 1]`.
|
||||
Image(s) to guide image generation. If you provide a tensor, the expected value range is between `[0,
|
||||
1]`.
|
||||
height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
|
||||
The height in pixels of the generated image.
|
||||
width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
|
||||
The width in pixels of the generated image.
|
||||
num_frames (`int`, *optional*):
|
||||
The number of video frames to generate. Defaults to `self.unet.config.num_frames`
|
||||
(14 for `stable-video-diffusion-img2vid` and to 25 for `stable-video-diffusion-img2vid-xt`).
|
||||
The number of video frames to generate. Defaults to `self.unet.config.num_frames` (14 for
|
||||
`stable-video-diffusion-img2vid` and to 25 for `stable-video-diffusion-img2vid-xt`).
|
||||
num_inference_steps (`int`, *optional*, defaults to 25):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality video at the
|
||||
expense of slower inference. This parameter is modulated by `strength`.
|
||||
@@ -373,16 +379,18 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
|
||||
max_guidance_scale (`float`, *optional*, defaults to 3.0):
|
||||
The maximum guidance scale. Used for the classifier free guidance with last frame.
|
||||
fps (`int`, *optional*, defaults to 7):
|
||||
Frames per second. The rate at which the generated images shall be exported to a video after generation.
|
||||
Note that Stable Diffusion Video's UNet was micro-conditioned on fps-1 during training.
|
||||
Frames per second. The rate at which the generated images shall be exported to a video after
|
||||
generation. Note that Stable Video Diffusion's UNet was micro-conditioned on fps-1 during training.
|
||||
motion_bucket_id (`int`, *optional*, defaults to 127):
|
||||
Used for conditioning the amount of motion for the generation. The higher the number the more motion
|
||||
will be in the video.
|
||||
noise_aug_strength (`float`, *optional*, defaults to 0.02):
|
||||
The amount of noise added to the init image, the higher it is the less the video will look like the init image. Increase it for more motion.
|
||||
The amount of noise added to the init image, the higher it is the less the video will look like the
|
||||
init image. Increase it for more motion.
|
||||
decode_chunk_size (`int`, *optional*):
|
||||
The number of frames to decode at a time. Higher chunk size leads to better temporal consistency at the expense of more memory usage. By default, the decoder decodes all frames at once for maximal
|
||||
quality. For lower memory usage, reduce `decode_chunk_size`.
|
||||
The number of frames to decode at a time. Higher chunk size leads to better temporal consistency at the
|
||||
expense of more memory usage. By default, the decoder decodes all frames at once for maximal quality.
|
||||
For lower memory usage, reduce `decode_chunk_size`.
|
||||
num_videos_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of videos to generate per prompt.
|
||||
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
||||
@@ -398,7 +406,8 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
|
||||
A function that is called at the end of each denoising step during inference. The function is called
|
||||
with the following arguments:
|
||||
`callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`.
|
||||
`callback_kwargs` will include a list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
|
||||
`callback_kwargs` will include a list of all tensors as specified by
|
||||
`callback_on_step_end_tensor_inputs`.
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
@@ -411,8 +420,9 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
|
||||
|
||||
Returns:
|
||||
[`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] or `tuple`:
|
||||
If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is returned,
|
||||
otherwise a `tuple` of (`List[List[PIL.Image.Image]]` or `np.ndarray` or `torch.FloatTensor`) is returned.
|
||||
If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is
|
||||
returned, otherwise a `tuple` of (`List[List[PIL.Image.Image]]` or `np.ndarray` or `torch.FloatTensor`)
|
||||
is returned.
|
||||
"""
|
||||
# 0. Default height and width to unet
|
||||
height = height or self.unet.config.sample_size * self.vae_scale_factor
|
||||
|
||||
@@ -134,8 +134,8 @@ def retrieve_timesteps(
|
||||
scheduler (`SchedulerMixin`):
|
||||
The scheduler to get timesteps from.
|
||||
num_inference_steps (`int`):
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used,
|
||||
`timesteps` must be `None`.
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
|
||||
must be `None`.
|
||||
device (`str` or `torch.device`, *optional*):
|
||||
The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
|
||||
timesteps (`List[int]`, *optional*):
|
||||
|
||||
@@ -150,8 +150,8 @@ def retrieve_timesteps(
|
||||
scheduler (`SchedulerMixin`):
|
||||
The scheduler to get timesteps from.
|
||||
num_inference_steps (`int`):
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used,
|
||||
`timesteps` must be `None`.
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
|
||||
must be `None`.
|
||||
device (`str` or `torch.device`, *optional*):
|
||||
The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
|
||||
timesteps (`List[int]`, *optional*):
|
||||
@@ -943,10 +943,10 @@ class StableDiffusionXLAdapterPipeline(
|
||||
input argument.
|
||||
ip_adapter_image (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
|
||||
ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
|
||||
Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
|
||||
if `do_classifier_free_guidance` is set to `True`.
|
||||
If not provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
|
||||
IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
|
||||
contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
|
||||
provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generated image. Choose between
|
||||
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
|
||||
|
||||
@@ -17,7 +17,8 @@ class TextToVideoSDPipelineOutput(BaseOutput):
|
||||
|
||||
Args:
|
||||
frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
|
||||
List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing denoised
|
||||
List of video outputs - It can be a nested list of length `batch_size`, with each sub-list containing
|
||||
denoised
|
||||
PIL image sequences of length `num_frames`. It can also be a NumPy array or Torch tensor of shape
|
||||
`(batch_size, num_frames, channels, height, width)`
|
||||
"""
|
||||
|
||||
@@ -752,7 +752,8 @@ class UTransformer2DModel(ModelMixin, ConfigMixin):
|
||||
cross_attention_kwargs (*optional*):
|
||||
Keyword arguments to supply to the cross attention layers, if used.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to return a [`models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
|
||||
Whether or not to return a [`models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
|
||||
tuple.
|
||||
hidden_states_is_embedding (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not hidden_states is an embedding directly usable by the transformer. In this case we will
|
||||
ignore input handling (e.g. continuous, vectorized, etc.) and directly feed hidden_states into the
|
||||
|
||||
@@ -85,7 +85,8 @@ class FlaxDDIMScheduler(FlaxSchedulerMixin, ConfigMixin):
|
||||
trained_betas (`jnp.ndarray`, optional):
|
||||
option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
|
||||
clip_sample (`bool`, default `True`):
|
||||
option to clip predicted sample between for numerical stability. The clip range is determined by `clip_sample_range`.
|
||||
option to clip predicted sample for numerical stability. The clip range is determined by
|
||||
`clip_sample_range`.
|
||||
clip_sample_range (`float`, default `1.0`):
|
||||
the maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
|
||||
set_alpha_to_one (`bool`, default `True`):
|
||||
|
||||
@@ -166,8 +166,8 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
the sampling process. If `True`, the sigmas and time steps are determined according to a sequence of
|
||||
`lambda(t)`.
|
||||
final_sigmas_type (`str`, defaults to `"zero"`):
|
||||
The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final sigma
|
||||
is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
|
||||
The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final
|
||||
sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
|
||||
lambda_min_clipped (`float`, defaults to `-inf`):
|
||||
Clipping threshold for the minimum value of `lambda(t)` for numerical stability. This is critical for the
|
||||
cosine (`squaredcos_cap_v2`) noise schedule.
|
||||
|
||||
@@ -108,11 +108,11 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
|
||||
The threshold value for dynamic thresholding. Valid only when `thresholding=True` and
|
||||
`algorithm_type="dpmsolver++"`.
|
||||
algorithm_type (`str`, defaults to `dpmsolver++`):
|
||||
Algorithm type for the solver; can be `dpmsolver` or `dpmsolver++`. The
|
||||
`dpmsolver` type implements the algorithms in the [DPMSolver](https://huggingface.co/papers/2206.00927)
|
||||
paper, and the `dpmsolver++` type implements the algorithms in the
|
||||
[DPMSolver++](https://huggingface.co/papers/2211.01095) paper. It is recommended to use `dpmsolver++` or
|
||||
`sde-dpmsolver++` with `solver_order=2` for guided sampling like in Stable Diffusion.
|
||||
Algorithm type for the solver; can be `dpmsolver` or `dpmsolver++`. The `dpmsolver` type implements the
|
||||
algorithms in the [DPMSolver](https://huggingface.co/papers/2206.00927) paper, and the `dpmsolver++` type
|
||||
implements the algorithms in the [DPMSolver++](https://huggingface.co/papers/2211.01095) paper. It is
|
||||
recommended to use `dpmsolver++` or `sde-dpmsolver++` with `solver_order=2` for guided sampling like in
|
||||
Stable Diffusion.
|
||||
solver_type (`str`, defaults to `midpoint`):
|
||||
Solver type for the second-order solver; can be `midpoint` or `heun`. The solver type slightly affects the
|
||||
sample quality, especially for a small number of steps. It is recommended to use `midpoint` solvers.
|
||||
@@ -123,8 +123,8 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
|
||||
Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`,
|
||||
the sigmas are determined according to a sequence of noise levels {σi}.
|
||||
final_sigmas_type (`str`, *optional*, defaults to `"zero"`):
|
||||
The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final sigma
|
||||
is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
|
||||
The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final
|
||||
sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
|
||||
lambda_min_clipped (`float`, defaults to `-inf`):
|
||||
Clipping threshold for the minimum value of `lambda(t)` for numerical stability. This is critical for the
|
||||
cosine (`squaredcos_cap_v2`) noise schedule.
|
||||
|
||||
@@ -62,10 +62,9 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
The threshold value for dynamic thresholding. Valid only when `thresholding=True` and
|
||||
`algorithm_type="dpmsolver++"`.
|
||||
algorithm_type (`str`, defaults to `dpmsolver++`):
|
||||
Algorithm type for the solver; can be `dpmsolver++` or `sde-dpmsolver++`. The
|
||||
`dpmsolver++` type implements the algorithms in the
|
||||
[DPMSolver++](https://huggingface.co/papers/2211.01095) paper. It is recommended to use `dpmsolver++` or
|
||||
`sde-dpmsolver++` with `solver_order=2` for guided sampling like in Stable Diffusion.
|
||||
Algorithm type for the solver; can be `dpmsolver++` or `sde-dpmsolver++`. The `dpmsolver++` type implements
|
||||
the algorithms in the [DPMSolver++](https://huggingface.co/papers/2211.01095) paper. It is recommended to
|
||||
use `dpmsolver++` or `sde-dpmsolver++` with `solver_order=2` for guided sampling like in Stable Diffusion.
|
||||
solver_type (`str`, defaults to `midpoint`):
|
||||
Solver type for the second-order solver; can be `midpoint` or `heun`. The solver type slightly affects the
|
||||
sample quality, especially for a small number of steps. It is recommended to use `midpoint` solvers.
|
||||
@@ -77,8 +76,8 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
richness. This can stabilize the sampling of the SDE variant of DPMSolver for small number of inference
|
||||
steps, but sometimes may result in blurring.
|
||||
final_sigmas_type (`str`, defaults to `"zero"`):
|
||||
The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final sigma
|
||||
is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
|
||||
The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final
|
||||
sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
|
||||
"""
|
||||
|
||||
_compatibles = []
|
||||
|
||||
@@ -278,8 +278,7 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin):
|
||||
generator (`torch.Generator`, *optional*):
|
||||
A random number generator.
|
||||
return_dict (`bool`):
|
||||
Whether or not to return a [`~schedulers.scheduling_euler_discrete.EDMEulerSchedulerOutput`] or
|
||||
tuple.
|
||||
Whether or not to return a [`~schedulers.scheduling_euler_discrete.EDMEulerSchedulerOutput`] or tuple.
|
||||
|
||||
Returns:
|
||||
[`~schedulers.scheduling_euler_discrete.EDMEulerSchedulerOutput`] or `tuple`:
|
||||
|
||||
@@ -92,19 +92,20 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin):
|
||||
trained_betas (`np.ndarray`, *optional*):
|
||||
Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
|
||||
predictor_order (`int`, defaults to 2):
|
||||
The predictor order which can be `1` or `2` or `3` or '4'. It is recommended to use `predictor_order=2` for guided
|
||||
sampling, and `predictor_order=3` for unconditional sampling.
|
||||
The predictor order which can be `1` or `2` or `3` or `4`. It is recommended to use `predictor_order=2` for
|
||||
guided sampling, and `predictor_order=3` for unconditional sampling.
|
||||
corrector_order (`int`, defaults to 2):
|
||||
The corrector order which can be `1` or `2` or `3` or '4'. It is recommended to use `corrector_order=2` for guided
|
||||
sampling, and `corrector_order=3` for unconditional sampling.
|
||||
The corrector order which can be `1` or `2` or `3` or `4`. It is recommended to use `corrector_order=2` for
|
||||
guided sampling, and `corrector_order=3` for unconditional sampling.
|
||||
prediction_type (`str`, defaults to `epsilon`, *optional*):
|
||||
Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
|
||||
`sample` (directly predicts the noisy sample) or `v_prediction` (see section 2.4 of [Imagen
|
||||
Video](https://imagen.research.google/video/paper.pdf) paper).
|
||||
tau_func (`Callable`, *optional*):
|
||||
Stochasticity during the sampling. Default in init is `lambda t: 1 if t >= 200 and t <= 800 else 0`. SA-Solver
|
||||
will sample from vanilla diffusion ODE if tau_func is set to `lambda t: 0`. SA-Solver will sample from vanilla
|
||||
diffusion SDE if tau_func is set to `lambda t: 1`. For more details, please check https://arxiv.org/abs/2309.05019
|
||||
Stochasticity during the sampling. Default in init is `lambda t: 1 if t >= 200 and t <= 800 else 0`.
|
||||
SA-Solver will sample from vanilla diffusion ODE if tau_func is set to `lambda t: 0`. SA-Solver will sample
|
||||
from vanilla diffusion SDE if tau_func is set to `lambda t: 1`. For more details, please check
|
||||
https://arxiv.org/abs/2309.05019
|
||||
thresholding (`bool`, defaults to `False`):
|
||||
Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
|
||||
as Stable Diffusion.
|
||||
@@ -114,8 +115,8 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin):
|
||||
The threshold value for dynamic thresholding. Valid only when `thresholding=True` and
|
||||
`algorithm_type="dpmsolver++"`.
|
||||
algorithm_type (`str`, defaults to `data_prediction`):
|
||||
Algorithm type for the solver; can be `data_prediction` or `noise_prediction`. It is recommended to use `data_prediction`
|
||||
with `solver_order=2` for guided sampling like in Stable Diffusion.
|
||||
Algorithm type for the solver; can be `data_prediction` or `noise_prediction`. It is recommended to use
|
||||
`data_prediction` with `solver_order=2` for guided sampling like in Stable Diffusion.
|
||||
lower_order_final (`bool`, defaults to `True`):
|
||||
Whether to use lower-order solvers in the final steps. Default = True.
|
||||
use_karras_sigmas (`bool`, *optional*, defaults to `False`):
|
||||
@@ -402,14 +403,14 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin):
|
||||
**kwargs,
|
||||
) -> torch.FloatTensor:
|
||||
"""
|
||||
Convert the model output to the corresponding type the data_prediction/noise_prediction algorithm needs. Noise_prediction is
|
||||
designed to discretize an integral of the noise prediction model, and data_prediction is designed to discretize an
|
||||
integral of the data prediction model.
|
||||
Convert the model output to the corresponding type the data_prediction/noise_prediction algorithm needs.
|
||||
Noise_prediction is designed to discretize an integral of the noise prediction model, and data_prediction is
|
||||
designed to discretize an integral of the data prediction model.
|
||||
|
||||
<Tip>
|
||||
|
||||
The algorithm and model type are decoupled. You can use either data_prediction or noise_prediction for both noise
|
||||
prediction and data prediction models.
|
||||
The algorithm and model type are decoupled. You can use either data_prediction or noise_prediction for both
|
||||
noise prediction and data prediction models.
|
||||
|
||||
</Tip>
|
||||
|
||||
|
||||
@@ -132,8 +132,8 @@ def rescale_zero_terminal_snr(betas: torch.FloatTensor) -> torch.FloatTensor:
|
||||
|
||||
class TCDScheduler(SchedulerMixin, ConfigMixin):
|
||||
"""
|
||||
`TCDScheduler` incorporates the `Strategic Stochastic Sampling` introduced by the paper `Trajectory Consistency Distillation`,
|
||||
extending the original Multistep Consistency Sampling to enable unrestricted trajectory traversal.
|
||||
`TCDScheduler` incorporates the `Strategic Stochastic Sampling` introduced by the paper `Trajectory Consistency
|
||||
Distillation`, extending the original Multistep Consistency Sampling to enable unrestricted trajectory traversal.
|
||||
|
||||
This code is based on the official repo of TCD (https://github.com/jabir-zheng/TCD).
|
||||
|
||||
@@ -543,8 +543,9 @@ class TCDScheduler(SchedulerMixin, ConfigMixin):
|
||||
sample (`torch.FloatTensor`):
|
||||
A current instance of a sample created by the diffusion process.
|
||||
eta (`float`):
|
||||
A stochastic parameter (referred to as `gamma` in the paper) used to control the stochasticity in every step.
|
||||
When eta = 0, it represents deterministic sampling, whereas eta = 1 indicates full stochastic sampling.
|
||||
A stochastic parameter (referred to as `gamma` in the paper) used to control the stochasticity in every
|
||||
step. When eta = 0, it represents deterministic sampling, whereas eta = 1 indicates full stochastic
|
||||
sampling.
|
||||
generator (`torch.Generator`, *optional*):
|
||||
A random number generator.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
|
||||
@@ -128,8 +128,8 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
steps_offset (`int`, defaults to 0):
|
||||
An offset added to the inference steps, as required by some model families.
|
||||
final_sigmas_type (`str`, defaults to `"zero"`):
|
||||
The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final sigma
|
||||
is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
|
||||
The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final
|
||||
sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
|
||||
"""
|
||||
|
||||
_compatibles = [e.name for e in KarrasDiffusionSchedulers]
|
||||
|
||||
@@ -246,8 +246,8 @@ def get_cached_module_file(
|
||||
|
||||
<Tip>
|
||||
|
||||
You may pass a token in `token` if you are not logged in (`huggingface-cli login`) and want to use private
|
||||
or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models).
|
||||
You may pass a token in `token` if you are not logged in (`huggingface-cli login`) and want to use private or
|
||||
[gated models](https://huggingface.co/docs/hub/models-gated#gated-models).
|
||||
|
||||
</Tip>
|
||||
|
||||
@@ -434,8 +434,8 @@ def get_class_from_dynamic_module(
|
||||
|
||||
<Tip>
|
||||
|
||||
You may pass a token in `token` if you are not logged in (`huggingface-cli login`) and want to use private
|
||||
or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models).
|
||||
You may pass a token in `token` if you are not logged in (`huggingface-cli login`) and want to use private or
|
||||
[gated models](https://huggingface.co/docs/hub/models-gated#gated-models).
|
||||
|
||||
</Tip>
|
||||
|
||||
|
||||
@@ -112,7 +112,8 @@ def load_or_create_model_card(
|
||||
repo_id_or_path (`str`):
|
||||
The repo id (e.g., "runwayml/stable-diffusion-v1-5") or local path where to look for the model card.
|
||||
token (`str`, *optional*):
|
||||
Authentication token. Will default to the stored token. See https://huggingface.co/settings/token for more details.
|
||||
Authentication token. Will default to the stored token. See https://huggingface.co/settings/token for more
|
||||
details.
|
||||
is_pipeline (`bool`):
|
||||
Boolean to indicate if we're adding tag to a [`DiffusionPipeline`].
|
||||
from_training (`bool`): Boolean flag to denote if the model card is being created from a training script.
|
||||
|
||||
@@ -16,8 +16,8 @@ def load_image(
|
||||
image (`str` or `PIL.Image.Image`):
|
||||
The image to convert to the PIL Image format.
|
||||
convert_method (Callable[[PIL.Image.Image], PIL.Image.Image], optional):
|
||||
A conversion method to apply to the image after loading it.
|
||||
When set to `None` the image will be converted "RGB".
|
||||
A conversion method to apply to the image after loading it. When set to `None` the image will be converted
|
||||
to "RGB".
|
||||
|
||||
Returns:
|
||||
`PIL.Image.Image`:
|
||||
|
||||
@@ -253,8 +253,8 @@ def convert_unet_state_dict_to_peft(state_dict):
|
||||
|
||||
def convert_all_state_dict_to_peft(state_dict):
|
||||
r"""
|
||||
Attempts to first `convert_state_dict_to_peft`, and if it doesn't detect `lora_linear_layer`
|
||||
for a valid `DIFFUSERS` LoRA for example, attempts to exclusively convert the Unet `convert_unet_state_dict_to_peft`
|
||||
Attempts to first `convert_state_dict_to_peft`, and if it doesn't detect `lora_linear_layer` for a valid
|
||||
`DIFFUSERS` LoRA for example, attempts to exclusively convert the Unet with `convert_unet_state_dict_to_peft`.
|
||||
"""
|
||||
try:
|
||||
peft_dict = convert_state_dict_to_peft(state_dict)
|
||||
|
||||
@@ -156,8 +156,8 @@ def get_tests_dir(append_path=None):
|
||||
# https://github.com/huggingface/accelerate/pull/1964
|
||||
def str_to_bool(value) -> int:
|
||||
"""
|
||||
Converts a string representation of truth to `True` (1) or `False` (0).
|
||||
True values are `y`, `yes`, `t`, `true`, `on`, and `1`; False value are `n`, `no`, `f`, `false`, `off`, and `0`;
|
||||
Converts a string representation of truth to `True` (1) or `False` (0). True values are `y`, `yes`, `t`, `true`,
|
||||
`on`, and `1`; False values are `n`, `no`, `f`, `false`, `off`, and `0`.
|
||||
"""
|
||||
value = value.lower()
|
||||
if value in ("y", "yes", "t", "true", "on", "1"):
|
||||
|
||||
Reference in New Issue
Block a user