Fix typos in docs and comments (#11416)

* Fix typos in docs and comments

* Apply style fixes

---------

Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
co63oc 2025-05-01 14:30:53 +08:00 committed by GitHub
parent d70f8ee18b
commit 86294d3c7f
115 changed files with 165 additions and 164 deletions

View File

@@ -966,7 +966,7 @@ pipe.to("cuda")
prompt = {
0: "A caterpillar on a leaf, high quality, photorealistic",
40: "A caterpillar transforming into a cocoon, on a leaf, near flowers, photorealistic",
-80: "A cocoon on a leaf, flowers in the backgrond, photorealistic",
+80: "A cocoon on a leaf, flowers in the background, photorealistic",
120: "A cocoon maturing and a butterfly being born, flowers and leaves visible in the background, photorealistic",
160: "A beautiful butterfly, vibrant colors, sitting on a leaf, flowers in the background, photorealistic",
200: "A beautiful butterfly, flying away in a forest, photorealistic",

View File

@@ -29,7 +29,7 @@ You can find additional information about LEDITS++ on the [project page](https:/
</Tip>
<Tip warning={true}>
-Due to some backward compatability issues with the current diffusers implementation of [`~schedulers.DPMSolverMultistepScheduler`] this implementation of LEdits++ can no longer guarantee perfect inversion.
+Due to some backward compatibility issues with the current diffusers implementation of [`~schedulers.DPMSolverMultistepScheduler`] this implementation of LEdits++ can no longer guarantee perfect inversion.
This issue is unlikely to have any noticeable effects on applied use-cases. However, we provide an alternative implementation that guarantees perfect inversion in a dedicated [GitHub repo](https://github.com/ml-research/ledits_pp).
</Tip>

View File

@@ -285,7 +285,7 @@ pipe = WanImageToVideoPipeline.from_pretrained(
image_encoder=image_encoder,
torch_dtype=torch.bfloat16
)
-# Since we've offloaded the larger models alrady, we can move the rest of the model components to GPU
+# Since we've offloaded the larger models already, we can move the rest of the model components to GPU
pipe.to("cuda")
image = load_image(
@@ -368,7 +368,7 @@ pipe = WanImageToVideoPipeline.from_pretrained(
image_encoder=image_encoder,
torch_dtype=torch.bfloat16
)
-# Since we've offloaded the larger models alrady, we can move the rest of the model components to GPU
+# Since we've offloaded the larger models already, we can move the rest of the model components to GPU
pipe.to("cuda")
image = load_image(

View File

@@ -485,7 +485,7 @@ image = image[:, :, None]
image = np.concatenate([image, image, image], axis=2)
canny_image = Image.fromarray(image).resize((1024, 1216))
-adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16, varient="fp16").to("cuda")
+adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16, variant="fp16").to("cuda")
unet = UNet2DConditionModel.from_pretrained(
"latent-consistency/lcm-sdxl",
@@ -551,7 +551,7 @@ image = image[:, :, None]
image = np.concatenate([image, image, image], axis=2)
canny_image = Image.fromarray(image).resize((1024, 1024))
-adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16, varient="fp16").to("cuda")
+adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16, variant="fp16").to("cuda")
pipe = StableDiffusionXLAdapterPipeline.from_pretrained(
"stabilityai/stable-diffusion-xl-base-1.0",

View File

@@ -154,11 +154,11 @@ pipeline = AutoPipelineForInpainting.from_pretrained(
pipeline.enable_model_cpu_offload()
```
-You can enable PAG on an exisiting inpainting pipeline like this
+You can enable PAG on an existing inpainting pipeline like this
```py
-pipeline_inpaint = AutoPipelineForInpaiting.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16)
+pipeline_inpaint = AutoPipelineForInpainting.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16)
-pipeline = AutoPipelineForInpaiting.from_pipe(pipeline_inpaint, enable_pag=True)
+pipeline = AutoPipelineForInpainting.from_pipe(pipeline_inpaint, enable_pag=True)
```
This still works when your pipeline has a different task:
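The hunk ends just before the cross-task example it announces; as a rough sketch (model ID and dtype carried over from the snippet above, not taken from this commit), switching tasks with `from_pipe` looks like:

```python
import torch
from diffusers import AutoPipelineForInpainting, AutoPipelineForText2Image

# Reuse the components of a text-to-image pipeline for PAG-enabled inpainting
pipeline_t2i = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
)
pipeline = AutoPipelineForInpainting.from_pipe(pipeline_t2i, enable_pag=True)
```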

View File

@@ -125,7 +125,7 @@ Now we'll simply specify the name of the dataset and caption column (in this cas
```
You can also load a dataset straight from by specifying it's name in `dataset_name`.
-Look [here](https://huggingface.co/blog/sdxl_lora_advanced_script#custom-captioning) for more info on creating/loadin your own caption dataset.
+Look [here](https://huggingface.co/blog/sdxl_lora_advanced_script#custom-captioning) for more info on creating/loading your own caption dataset.
- **optimizer**: for this example, we'll use [prodigy](https://huggingface.co/blog/sdxl_lora_advanced_script#adaptive-optimizers) - an adaptive optimizer
- **pivotal tuning**
@@ -404,7 +404,7 @@ The advanced script now supports custom choice of U-net blocks to train during D
> In light of this, we're introducing a new feature to the advanced script to allow for configurable U-net learned blocks.
**Usage**
-Configure LoRA learned U-net blocks adding a `lora_unet_blocks` flag, with a comma seperated string specifying the targeted blocks.
+Configure LoRA learned U-net blocks adding a `lora_unet_blocks` flag, with a comma separated string specifying the targeted blocks.
e.g:
```bash
--lora_unet_blocks="unet.up_blocks.0.attentions.0,unet.up_blocks.0.attentions.1"

View File

@@ -141,7 +141,7 @@ Now we'll simply specify the name of the dataset and caption column (in this cas
```
You can also load a dataset straight from by specifying it's name in `dataset_name`.
-Look [here](https://huggingface.co/blog/sdxl_lora_advanced_script#custom-captioning) for more info on creating/loadin your own caption dataset.
+Look [here](https://huggingface.co/blog/sdxl_lora_advanced_script#custom-captioning) for more info on creating/loading your own caption dataset.
- **optimizer**: for this example, we'll use [prodigy](https://huggingface.co/blog/sdxl_lora_advanced_script#adaptive-optimizers) - an adaptive optimizer
- **pivotal tuning**

View File

@@ -1,6 +1,6 @@
## Amused training
-Amused can be finetuned on simple datasets relatively cheaply and quickly. Using 8bit optimizers, lora, and gradient accumulation, amused can be finetuned with as little as 5.5 GB. Here are a set of examples for finetuning amused on some relatively simple datasets. These training recipies are aggressively oriented towards minimal resources and fast verification -- i.e. the batch sizes are quite low and the learning rates are quite high. For optimal quality, you will probably want to increase the batch sizes and decrease learning rates.
+Amused can be finetuned on simple datasets relatively cheaply and quickly. Using 8bit optimizers, lora, and gradient accumulation, amused can be finetuned with as little as 5.5 GB. Here are a set of examples for finetuning amused on some relatively simple datasets. These training recipes are aggressively oriented towards minimal resources and fast verification -- i.e. the batch sizes are quite low and the learning rates are quite high. For optimal quality, you will probably want to increase the batch sizes and decrease learning rates.
All training examples use fp16 mixed precision and gradient checkpointing. We don't show 8 bit adam + lora as its about the same memory use as just using lora (bitsandbytes uses full precision optimizer states for weights below a minimum size).

View File

@@ -201,7 +201,7 @@ Note that setting the `<ID_TOKEN>` is not necessary. From some limited experimen
> - The original repository uses a `lora_alpha` of `1`. We found this not suitable in many runs, possibly due to difference in modeling backends and training settings. Our recommendation is to set to the `lora_alpha` to either `rank` or `rank // 2`.
> - If you're training on data whose captions generate bad results with the original model, a `rank` of 64 and above is good and also the recommendation by the team behind CogVideoX. If the generations are already moderately good on your training captions, a `rank` of 16/32 should work. We found that setting the rank too low, say `4`, is not ideal and doesn't produce promising results.
> - The authors of CogVideoX recommend 4000 training steps and 100 training videos overall to achieve the best result. While that might yield the best results, we found from our limited experimentation that 2000 steps and 25 videos could also be sufficient.
-> - When using the Prodigy opitimizer for training, one can follow the recommendations from [this](https://huggingface.co/blog/sdxl_lora_advanced_script) blog. Prodigy tends to overfit quickly. From my very limited testing, I found a learning rate of `0.5` to be suitable in addition to `--prodigy_use_bias_correction`, `prodigy_safeguard_warmup` and `--prodigy_decouple`.
+> - When using the Prodigy optimizer for training, one can follow the recommendations from [this](https://huggingface.co/blog/sdxl_lora_advanced_script) blog. Prodigy tends to overfit quickly. From my very limited testing, I found a learning rate of `0.5` to be suitable in addition to `--prodigy_use_bias_correction`, `prodigy_safeguard_warmup` and `--prodigy_decouple`.
> - The recommended learning rate by the CogVideoX authors and from our experimentation with Adam/AdamW is between `1e-3` and `1e-4` for a dataset of 25+ videos.
>
> Note that our testing is not exhaustive due to limited time for exploration. Our recommendation would be to play around with the different knobs and dials to find the best settings for your data.
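To make the rank/alpha advice above concrete, here is a hedged sketch of a matching PEFT config (the `target_modules` list is an assumption for typical attention projections, not something this commit specifies):

```python
from peft import LoraConfig

rank = 64  # 64+ is recommended when the base model handles your captions poorly
lora_config = LoraConfig(
    r=rank,
    lora_alpha=rank,  # per the note above: rank or rank // 2, rather than the original repo's 1
    target_modules=["to_k", "to_q", "to_v", "to_out.0"],  # assumed attention projections
)
```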

View File

@@ -879,7 +879,7 @@ def prepare_rotary_positional_embeddings(
def get_optimizer(args, params_to_optimize, use_deepspeed: bool = False):
-# Use DeepSpeed optimzer
+# Use DeepSpeed optimizer
if use_deepspeed:
from accelerate.utils import DummyOptim

View File

@@ -901,7 +901,7 @@ def prepare_rotary_positional_embeddings(
def get_optimizer(args, params_to_optimize, use_deepspeed: bool = False):
-# Use DeepSpeed optimzer
+# Use DeepSpeed optimizer
if use_deepspeed:
from accelerate.utils import DummyOptim
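For context, the branch this comment introduces follows the usual Accelerate/DeepSpeed pattern; a simplified, self-contained sketch (not the script's exact signature) might read:

```python
import torch
from accelerate.utils import DummyOptim


def get_optimizer(params_to_optimize, learning_rate=1e-4, use_deepspeed=False):
    # With DeepSpeed, the real optimizer comes from the DeepSpeed config;
    # Accelerate only needs this placeholder to wire up the training loop.
    if use_deepspeed:
        return DummyOptim(params_to_optimize, lr=learning_rate)
    # Otherwise fall back to a regular AdamW optimizer.
    return torch.optim.AdamW(params_to_optimize, lr=learning_rate)
```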

View File

@@ -4865,7 +4865,7 @@ python -m pip install intel_extension_for_pytorch
```
python -m pip install intel_extension_for_pytorch==<version_name> -f https://developer.intel.com/ipex-whl-stable-cpu
```
-2. After pipeline initialization, `prepare_for_ipex()` should be called to enable IPEX accelaration. Supported inference datatypes are Float32 and BFloat16.
+2. After pipeline initialization, `prepare_for_ipex()` should be called to enable IPEX acceleration. Supported inference datatypes are Float32 and BFloat16.
```python
pipe = AnimateDiffPipelineIpex.from_pretrained(base, motion_adapter=adapter, torch_dtype=dtype).to(device)

View File

@@ -336,13 +336,13 @@ if __name__ == "__main__":
expanded_kernel_width = np.ceil(kernel_width) + 2
# Determine a set of field_of_view for each each output position, these are the pixels in the input image
-# that the pixel in the output image 'sees'. We get a matrix whos horizontal dim is the output pixels (big) and the
+# that the pixel in the output image 'sees'. We get a matrix whose horizontal dim is the output pixels (big) and the
# vertical dim is the pixels it 'sees' (kernel_size + 2)
field_of_view = np.squeeze(
np.int16(np.expand_dims(left_boundary, axis=1) + np.arange(expanded_kernel_width) - 1)
)
-# Assign weight to each pixel in the field of view. A matrix whos horizontal dim is the output pixels and the
+# Assign weight to each pixel in the field of view. A matrix whose horizontal dim is the output pixels and the
# vertical dim is a list of weights matching to the pixel in the field of view (that are specified in
# 'field_of_view')
weights = fixed_kernel(1.0 * np.expand_dims(match_coordinates, axis=1) - field_of_view - 1)

View File

@@ -201,16 +201,16 @@ class PAIntAAttnProcessor:
# ================================================== #
# We use a hack by running the code from the BasicTransformerBlock that is between Self and Cross attentions here
# The other option would've been modifying the BasicTransformerBlock and adding this functionality here.
-# I assumed that changing the BasicTransformerBlock would have been a bigger deal and decided to use this hack isntead.
+# I assumed that changing the BasicTransformerBlock would have been a bigger deal and decided to use this hack instead.
-# The SelfAttention block recieves the normalized latents from the BasicTransformerBlock,
+# The SelfAttention block receives the normalized latents from the BasicTransformerBlock,
# But the residual of the output is the non-normalized version.
# Therefore we unnormalize the input hidden state here
unnormalized_input_hidden_states = (
input_hidden_states + self.transformer_block.norm1.bias
) * self.transformer_block.norm1.weight
-# TODO: return if neccessary
+# TODO: return if necessary
# if self.use_ada_layer_norm_zero:
# attn_output = gate_msa.unsqueeze(1) * attn_output
# elif self.use_ada_layer_norm_single:
@@ -220,7 +220,7 @@ class PAIntAAttnProcessor:
if transformer_hidden_states.ndim == 4:
transformer_hidden_states = transformer_hidden_states.squeeze(1)
-# TODO: return if neccessary
+# TODO: return if necessary
# 2.5 GLIGEN Control
# if gligen_kwargs is not None:
# transformer_hidden_states = self.fuser(transformer_hidden_states, gligen_kwargs["objs"])
@@ -266,7 +266,7 @@ class PAIntAAttnProcessor:
) = cross_attention_input_hidden_states.chunk(2)
# Same split for the encoder_hidden_states i.e. the tokens
-# Since the SelfAttention processors don't get the encoder states as input, we inject them into the processor in the begining.
+# Since the SelfAttention processors don't get the encoder states as input, we inject them into the processor in the beginning.
_encoder_hidden_states_unconditional, encoder_hidden_states_conditional = self.encoder_hidden_states.chunk(
2
)
@@ -896,7 +896,7 @@ class StableDiffusionHDPainterPipeline(StableDiffusionInpaintPipeline):
class GaussianSmoothing(nn.Module):
"""
Apply gaussian smoothing on a
-1d, 2d or 3d tensor. Filtering is performed seperately for each channel
+1d, 2d or 3d tensor. Filtering is performed separately for each channel
in the input using a depthwise convolution.
Args:

View File

@@ -161,7 +161,7 @@ class ImageToImageInpaintingPipeline(DiffusionPipeline):
`Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
be masked out with `mask_image` and repainted according to `prompt`.
inner_image (`torch.Tensor` or `PIL.Image.Image`):
-`Image`, or tensor representing an image batch which will be overlayed onto `image`. Non-transparent
+`Image`, or tensor representing an image batch which will be overlaid onto `image`. Non-transparent
regions of `inner_image` must fit inside white pixels in `mask_image`. Expects four channels, with
the last channel representing the alpha channel, which will be used to blend `inner_image` with
`image`. If not provided, it will be forcibly cast to RGBA.

View File

@@ -647,7 +647,7 @@ class LCMSchedulerWithTimestamp(SchedulerMixin, ConfigMixin):
return sample
def set_timesteps(
-self, stength, num_inference_steps: int, lcm_origin_steps: int, device: Union[str, torch.device] = None
+self, strength, num_inference_steps: int, lcm_origin_steps: int, device: Union[str, torch.device] = None
):
"""
Sets the discrete timesteps used for the diffusion chain (to be run before inference).
@@ -668,7 +668,7 @@ class LCMSchedulerWithTimestamp(SchedulerMixin, ConfigMixin):
# LCM Timesteps Setting: # Linear Spacing
c = self.config.num_train_timesteps // lcm_origin_steps
lcm_origin_timesteps = (
-np.asarray(list(range(1, int(lcm_origin_steps * stength) + 1))) * c - 1
+np.asarray(list(range(1, int(lcm_origin_steps * strength) + 1))) * c - 1
) # LCM Training Steps Schedule
skipping_step = len(lcm_origin_timesteps) // num_inference_steps
timesteps = lcm_origin_timesteps[::-skipping_step][:num_inference_steps] # LCM Inference Steps Schedule

View File

@@ -129,7 +129,7 @@ class MagicMixPipeline(DiffusionPipeline):
input = (
(mix_factor * latents) + (1 - mix_factor) * orig_latents
-) # interpolating between layout noise and conditionally generated noise to preserve layout sematics
+) # interpolating between layout noise and conditionally generated noise to preserve layout semantics
input = torch.cat([input] * 2)
else: # content generation phase

View File

@@ -196,9 +196,9 @@ class StableDiffusionTilingPipeline(DiffusionPipeline, StableDiffusionExtrasMixi
guidance_scale_tiles: specific weights for classifier-free guidance in each tile.
guidance_scale_tiles: specific weights for classifier-free guidance in each tile. If None, the value provided in guidance_scale will be used.
seed_tiles: specific seeds for the initialization latents in each tile. These will override the latents generated for the whole canvas using the standard seed parameter.
-seed_tiles_mode: either "full" "exclusive". If "full", all the latents affected by the tile be overriden. If "exclusive", only the latents that are affected exclusively by this tile (and no other tiles) will be overriden.
+seed_tiles_mode: either "full" "exclusive". If "full", all the latents affected by the tile be overridden. If "exclusive", only the latents that are affected exclusively by this tile (and no other tiles) will be overridden.
-seed_reroll_regions: a list of tuples in the form (start row, end row, start column, end column, seed) defining regions in pixel space for which the latents will be overriden using the given seed. Takes priority over seed_tiles.
+seed_reroll_regions: a list of tuples in the form (start row, end row, start column, end column, seed) defining regions in pixel space for which the latents will be overridden using the given seed. Takes priority over seed_tiles.
-cpu_vae: the decoder from latent space to pixel space can require too mucho GPU RAM for large images. If you find out of memory errors at the end of the generation process, try setting this parameter to True to run the decoder in CPU. Slower, but should run without memory issues.
+cpu_vae: the decoder from latent space to pixel space can require too much GPU RAM for large images. If you find out of memory errors at the end of the generation process, try setting this parameter to True to run the decoder in CPU. Slower, but should run without memory issues.
Examples:

View File

@@ -1258,7 +1258,7 @@ class KolorsControlNetPipeline(
)
if guess_mode and self.do_classifier_free_guidance:
-# Infered ControlNet only for the conditional batch.
+# Inferred ControlNet only for the conditional batch.
# To apply the output of ControlNet to both the unconditional and conditional batches,
# add 0 to the unconditional batch to keep it unchanged.
down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]

View File

@@ -1462,7 +1462,7 @@ class KolorsControlNetImg2ImgPipeline(
)
if guess_mode and self.do_classifier_free_guidance:
-# Infered ControlNet only for the conditional batch.
+# Inferred ControlNet only for the conditional batch.
# To apply the output of ControlNet to both the unconditional and conditional batches,
# add 0 to the unconditional batch to keep it unchanged.
down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]

View File

@@ -1782,7 +1782,7 @@ class KolorsControlNetInpaintPipeline(
)
if guess_mode and self.do_classifier_free_guidance:
-# Infered ControlNet only for the conditional batch.
+# Inferred ControlNet only for the conditional batch.
# To apply the output of ControlNet to both the unconditional and conditional batches,
# add 0 to the unconditional batch to keep it unchanged.
down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]

View File

@@ -559,7 +559,7 @@ class FabricPipeline(DiffusionPipeline):
End point for providing feedback (between 0 and 1).
min_weight (`float`, *optional*, defaults to `.05`):
Minimum weight for feedback.
-max_weight (`float`, *optional*, defults tp `1.0`):
+max_weight (`float`, *optional*, defaults tp `1.0`):
Maximum weight for feedback.
neg_scale (`float`, *optional*, defaults to `.5`):
Scale factor for negative feedback.

View File

@@ -118,7 +118,7 @@ EXAMPLE_DOC_STRING = """
>>> # Here we need use pipeline internal unet model
>>> pipe.unet = pipe.unet_model.from_pretrained(model_id, subfolder="unet", variant="fp16", use_safetensors=True)
>>>
->>> # Load aditional layers to the model
+>>> # Load additional layers to the model
>>> pipe.unet.load_additional_layers(weight_path="proc_data/faithdiff/FaithDiff.bin", dtype=dtype)
>>>
>>> # Enable vae tiling

View File

@@ -72,7 +72,7 @@ class GaussianSmoothing(nn.Module):
"""
Copied from official repo: https://github.com/showlab/BoxDiff/blob/master/utils/gaussian_smoothing.py
Apply gaussian smoothing on a
-1d, 2d or 3d tensor. Filtering is performed seperately for each channel
+1d, 2d or 3d tensor. Filtering is performed separately for each channel
in the input using a depthwise convolution.
Arguments:
channels (int, sequence): Number of channels of the input tensors. Output will

View File

@@ -1509,7 +1509,7 @@ class StableDiffusionXL_AE_Pipeline(
add_time_ids = add_time_ids.repeat(batch_size, 1).to(DEVICE)
-# interative sampling
+# interactive sampling
self.scheduler.set_timesteps(num_inference_steps)
latents_list = [latents]
pred_x0_list = []
@@ -1548,7 +1548,7 @@ class StableDiffusionXL_AE_Pipeline(
x: torch.FloatTensor,
):
"""
-predict the sampe the next step in the denoise process.
+predict the sample the next step in the denoise process.
"""
ref_noise = model_output[:1, :, :, :].expand(model_output.shape)
alpha_prod_t = self.scheduler.alphas_cumprod[timestep]

View File

@@ -132,7 +132,7 @@ def _preprocess_adapter_image(image, height, width):
image = torch.cat(image, dim=0)
else:
raise ValueError(
-f"Invalid image tensor! Expecting image tensor with 3 or 4 dimension, but recive: {image[0].ndim}"
+f"Invalid image tensor! Expecting image tensor with 3 or 4 dimension, but receive: {image[0].ndim}"
)
return image

View File

@@ -150,7 +150,7 @@ def _preprocess_adapter_image(image, height, width):
image = torch.cat(image, dim=0)
else:
raise ValueError(
-f"Invalid image tensor! Expecting image tensor with 3 or 4 dimension, but recive: {image[0].ndim}"
+f"Invalid image tensor! Expecting image tensor with 3 or 4 dimension, but receive: {image[0].ndim}"
)
return image

View File

@@ -220,7 +220,7 @@ class RegionalPromptingStableDiffusionPipeline(StableDiffusionPipeline):
revers = True
def pcallback(s_self, step: int, timestep: int, latents: torch.Tensor, selfs=None):
-if "PRO" in mode: # in Prompt mode, make masks from sum of attension maps
+if "PRO" in mode: # in Prompt mode, make masks from sum of attention maps
self.step = step
if len(self.attnmaps_sizes) > 3:
@@ -552,9 +552,9 @@ def get_attn_maps(self, attn):
def reset_attnmaps(self): # init parameters in every batch
self.step = 0
-self.attnmaps = {} # maked from attention maps
+self.attnmaps = {} # made from attention maps
self.attnmaps_sizes = [] # height,width set of u-net blocks
-self.attnmasks = {} # maked from attnmaps for regions
+self.attnmasks = {} # made from attnmaps for regions
self.maskready = False
self.history = {}

View File

@@ -97,7 +97,7 @@ class SdeDragPipeline(DiffusionPipeline):
steps (`int`, *optional*, defaults to 200):
The number of sampling iterations.
step_size (`int`, *optional*, defaults to 2):
-The drag diatance of each drag step.
+The drag distance of each drag step.
image_scale (`float`, *optional*, defaults to 0.3):
To avoid duplicating the content, use image_scale to perturbs the source.
adapt_radius (`int`, *optional*, defaults to 5):

View File

@@ -284,7 +284,7 @@ class UnCLIPImageInterpolationPipeline(DiffusionPipeline):
)
else:
raise AssertionError(
-f"Expected 'image' or 'image_embeddings' to be not None with types List[PIL.Image] or torch.Tensor respectively. Received {type(image)} and {type(image_embeddings)} repsectively"
+f"Expected 'image' or 'image_embeddings' to be not None with types List[PIL.Image] or torch.Tensor respectively. Received {type(image)} and {type(image_embeddings)} respectively"
)
original_image_embeddings = self._encode_image(

View File

@@ -1012,7 +1012,7 @@ def main(args):
unet = get_peft_model(unet, lora_config)
# 9. Handle mixed precision and device placement
-# For mixed precision training we cast all non-trainable weigths to half-precision
+# For mixed precision training we cast all non-trainable weights to half-precision
# as these weights are only used for inference, keeping weights in full precision is not required.
weight_dtype = torch.float32
if accelerator.mixed_precision == "fp16":
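This comment recurs in several scripts below; its continuation follows the standard pattern sketched here (the SD 2.1 checkpoint and the module names are illustrative assumptions, each script uses its own):

```python
import torch
from accelerate import Accelerator
from diffusers import AutoencoderKL
from transformers import CLIPTextModel

accelerator = Accelerator(mixed_precision="fp16")

# Frozen components follow the accelerator's mixed-precision dtype;
# trainable (e.g. LoRA) parameters stay in float32.
weight_dtype = torch.float32
if accelerator.mixed_precision == "fp16":
    weight_dtype = torch.float16
elif accelerator.mixed_precision == "bf16":
    weight_dtype = torch.bfloat16

# Example frozen components (hypothetical checkpoint choice)
vae = AutoencoderKL.from_pretrained("stabilityai/stable-diffusion-2-1", subfolder="vae")
text_encoder = CLIPTextModel.from_pretrained("stabilityai/stable-diffusion-2-1", subfolder="text_encoder")
vae.requires_grad_(False)
text_encoder.requires_grad_(False)
vae.to(accelerator.device, dtype=weight_dtype)
text_encoder.to(accelerator.device, dtype=weight_dtype)
```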

View File

@@ -829,7 +829,7 @@ def main(args):
)
# 8. Handle mixed precision and device placement
-# For mixed precision training we cast all non-trainable weigths to half-precision
+# For mixed precision training we cast all non-trainable weights to half-precision
# as these weights are only used for inference, keeping weights in full precision is not required.
weight_dtype = torch.float32
if accelerator.mixed_precision == "fp16":

View File

@@ -1026,7 +1026,7 @@ def main(args):
unet = get_peft_model(unet, lora_config)
# 9. Handle mixed precision and device placement
-# For mixed precision training we cast all non-trainable weigths to half-precision
+# For mixed precision training we cast all non-trainable weights to half-precision
# as these weights are only used for inference, keeping weights in full precision is not required.
weight_dtype = torch.float32
if accelerator.mixed_precision == "fp16":

View File

@@ -962,7 +962,7 @@ def main(args):
)
# 9. Handle mixed precision and device placement
-# For mixed precision training we cast all non-trainable weigths to half-precision
+# For mixed precision training we cast all non-trainable weights to half-precision
# as these weights are only used for inference, keeping weights in full precision is not required.
weight_dtype = torch.float32
if accelerator.mixed_precision == "fp16":

View File

@@ -1021,7 +1021,7 @@ def main(args):
)
# 9. Handle mixed precision and device placement
-# For mixed precision training we cast all non-trainable weigths to half-precision
+# For mixed precision training we cast all non-trainable weights to half-precision
# as these weights are only used for inference, keeping weights in full precision is not required.
weight_dtype = torch.float32
if accelerator.mixed_precision == "fp16":

View File

@@ -411,7 +411,7 @@ export CAPTION_COLUMN='caption_column'
export CACHE_DIR="/data/train_csr/.cache/huggingface/"
export OUTPUT_DIR='/data/train_csr/FLUX/MODEL_OUT/'$MODEL_TYPE
-# The first step is to use Python to precompute all caches.Replace the first line below with this line. (I am not sure why using acclerate would cause problems.)
+# The first step is to use Python to precompute all caches.Replace the first line below with this line. (I am not sure why using accelerate would cause problems.)
CUDA_VISIBLE_DEVICES=0 python3 train_controlnet_flux.py \

View File

@@ -173,13 +173,13 @@ accelerate launch train_dreambooth_lora_flux.py \
### Target Modules
When LoRA was first adapted from language models to diffusion models, it was applied to the cross-attention layers in the Unet that relate the image representations with the prompts that describe them.
More recently, SOTA text-to-image diffusion models replaced the Unet with a diffusion Transformer(DiT). With this change, we may also want to explore
-applying LoRA training onto different types of layers and blocks. To allow more flexibility and control over the targeted modules we added `--lora_layers`- in which you can specify in a comma seperated string
+applying LoRA training onto different types of layers and blocks. To allow more flexibility and control over the targeted modules we added `--lora_layers`- in which you can specify in a comma separated string
the exact modules for LoRA training. Here are some examples of target modules you can provide:
- for attention only layers: `--lora_layers="attn.to_k,attn.to_q,attn.to_v,attn.to_out.0"`
- to train the same modules as in the fal trainer: `--lora_layers="attn.to_k,attn.to_q,attn.to_v,attn.to_out.0,attn.add_k_proj,attn.add_q_proj,attn.add_v_proj,attn.to_add_out,ff.net.0.proj,ff.net.2,ff_context.net.0.proj,ff_context.net.2"`
- to train the same modules as in ostris ai-toolkit / replicate trainer: `--lora_blocks="attn.to_k,attn.to_q,attn.to_v,attn.to_out.0,attn.add_k_proj,attn.add_q_proj,attn.add_v_proj,attn.to_add_out,ff.net.0.proj,ff.net.2,ff_context.net.0.proj,ff_context.net.2,norm1_context.linear, norm1.linear,norm.linear,proj_mlp,proj_out"`
> [!NOTE]
-> `--lora_layers` can also be used to specify which **blocks** to apply LoRA training to. To do so, simply add a block prefix to each layer in the comma seperated string:
+> `--lora_layers` can also be used to specify which **blocks** to apply LoRA training to. To do so, simply add a block prefix to each layer in the comma separated string:
> **single DiT blocks**: to target the ith single transformer block, add the prefix `single_transformer_blocks.i`, e.g. - `single_transformer_blocks.i.attn.to_k`
> **MMDiT blocks**: to target the ith MMDiT block, add the prefix `transformer_blocks.i`, e.g. - `transformer_blocks.i.attn.to_k`
> [!NOTE]

View File

@@ -107,7 +107,7 @@ To better track our training experiments, we're using the following flags in the
Additionally, we welcome you to explore the following CLI arguments:
-* `--lora_layers`: The transformer modules to apply LoRA training on. Please specify the layers in a comma seperated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only.
+* `--lora_layers`: The transformer modules to apply LoRA training on. Please specify the layers in a comma separated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only.
* `--rank`: The rank of the LoRA layers. The higher the rank, the more parameters are trained. The default is 16.
We provide several options for optimizing memory optimization:

View File

@@ -113,7 +113,7 @@ To better track our training experiments, we're using the following flags in the
Additionally, we welcome you to explore the following CLI arguments:
-* `--lora_layers`: The transformer modules to apply LoRA training on. Please specify the layers in a comma seperated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only.
+* `--lora_layers`: The transformer modules to apply LoRA training on. Please specify the layers in a comma separated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only.
* `--system_prompt`: A custom system prompt to provide additional personality to the model.
* `--max_sequence_length`: Maximum sequence length to use for text embeddings.

View File

@@ -113,7 +113,7 @@ To better track our training experiments, we're using the following flags in the
Additionally, we welcome you to explore the following CLI arguments:
-* `--lora_layers`: The transformer modules to apply LoRA training on. Please specify the layers in a comma seperated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only.
+* `--lora_layers`: The transformer modules to apply LoRA training on. Please specify the layers in a comma separated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only.
* `--complex_human_instruction`: Instructions for complex human attention as shown in [here](https://github.com/NVlabs/Sana/blob/main/configs/sana_app_config/Sana_1600M_app.yaml#L55).
* `--max_sequence_length`: Maximum sequence length to use for text embeddings.

View File

@@ -567,7 +567,7 @@ def parse_args(input_args=None):
type=str,
default=None,
help=(
-'The transformer modules to apply LoRA training on. Please specify the layers in a comma seperated. E.g. - "to_k,to_q,to_v,to_out.0" will result in lora training of attention layers only'
+'The transformer modules to apply LoRA training on. Please specify the layers in a comma separated. E.g. - "to_k,to_q,to_v,to_out.0" will result in lora training of attention layers only'
),
)

View File

@@ -596,7 +596,7 @@ def parse_args(input_args=None):
type=str,
default=None,
help=(
-'The transformer modules to apply LoRA training on. Please specify the layers in a comma seperated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only'
+'The transformer modules to apply LoRA training on. Please specify the layers in a comma separated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only'
),
)

View File

@@ -514,7 +514,7 @@ def parse_args(input_args=None):
type=str,
default=None,
help=(
-'The transformer modules to apply LoRA training on. Please specify the layers in a comma seperated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only'
+'The transformer modules to apply LoRA training on. Please specify the layers in a comma separated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only'
),
)

View File

@@ -513,7 +513,7 @@ def parse_args(input_args=None):
type=str,
default=None,
help=(
-'The transformer modules to apply LoRA training on. Please specify the layers in a comma seperated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only'
+'The transformer modules to apply LoRA training on. Please specify the layers in a comma separated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only'
),
)

View File

@@ -576,7 +576,7 @@ def parse_args(input_args=None):
type=str,
default=None,
help=(
-"The transformer block layers to apply LoRA training on. Please specify the layers in a comma seperated string."
+"The transformer block layers to apply LoRA training on. Please specify the layers in a comma separated string."
"For examples refer to https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/README_SD3.md"
),
)
@@ -585,7 +585,7 @@ def parse_args(input_args=None):
type=str,
default=None,
help=(
-"The transformer blocks to apply LoRA training on. Please specify the block numbers in a comma seperated manner."
+"The transformer blocks to apply LoRA training on. Please specify the block numbers in a comma separated manner."
'E.g. - "--lora_blocks 12,30" will result in lora training of transformer blocks 12 and 30. For more examples refer to https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/README_SD3.md'
),
)

View File

@@ -664,7 +664,7 @@ def parse_args(input_args=None):
action="store_true",
default=False,
help=(
-"Wether to train a DoRA as proposed in- DoRA: Weight-Decomposed Low-Rank Adaptation https://arxiv.org/abs/2402.09353. "
+"Whether to train a DoRA as proposed in- DoRA: Weight-Decomposed Low-Rank Adaptation https://arxiv.org/abs/2402.09353. "
"Note: to use DoRA you need to install peft from main, `pip install git+https://github.com/huggingface/peft.git`"
),
)

View File

@@ -329,7 +329,7 @@ def parse_args(input_args=None):
type=str,
default=None,
help=(
-'The transformer modules to apply LoRA training on. Please specify the layers in a comma seperated. E.g. - "to_k,to_q,to_v,to_out.0" will result in lora training of attention layers only'
+'The transformer modules to apply LoRA training on. Please specify the layers in a comma separated. E.g. - "to_k,to_q,to_v,to_out.0" will result in lora training of attention layers only'
),
)
parser.add_argument(

View File

@@ -400,7 +400,7 @@ def main():
image_encoder.requires_grad_(False)
-# For mixed precision training we cast all non-trainable weigths (vae, non-lora text_encoder and non-lora unet) to half-precision
+# For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
# as these weights are only used for inference, keeping weights in full precision is not required.
weight_dtype = torch.float32
if accelerator.mixed_precision == "fp16":

View File

@@ -1147,7 +1147,7 @@ def main(args):
tracker_config = dict(vars(args))
accelerator.init_trackers(args.tracker_project_name, config=tracker_config)
-# Function for unwraping if torch.compile() was used in accelerate.
+# Function for unwrapping if torch.compile() was used in accelerate.
def unwrap_model(model):
model = accelerator.unwrap_model(model)
model = model._orig_mod if is_compiled_module(model) else model

View File

@@ -69,7 +69,7 @@ accelerate launch --config_file=accelerate.yaml \
--seed="0"
```
-We can direcly pass a quantized checkpoint path, too:
+We can directly pass a quantized checkpoint path, too:
```diff
+ --quantized_model_path="hf-internal-testing/flux.1-dev-nf4-pkg"

View File

@@ -13,7 +13,7 @@ args = parser.parse_args()
device = "cpu"
-prompt = "a lovely <dicoo> in red dress and hat, in the snowly and brightly night, with many brighly buildings"
+prompt = "a lovely <dicoo> in red dress and hat, in the snowly and brightly night, with many brightly buildings"
model_id = "path-to-your-trained-model"
pipe = StableDiffusionPipeline.from_pretrained(model_id)

View File

@@ -80,7 +80,7 @@ export INT8_MODEL_NAME="./int8_model"
python text2images.py \
--pretrained_model_name_or_path=$INT8_MODEL_NAME \
---caption "a lovely <dicoo> in red dress and hat, in the snowly and brightly night, with many brighly buildings." \
+--caption "a lovely <dicoo> in red dress and hat, in the snowly and brightly night, with many brightly buildings." \
--images_num 4
```

View File

@@ -664,7 +664,7 @@ class PixArtAlphaControlnetPipeline(DiffusionPipeline):
# &amp
caption = re.sub(r"&amp", "", caption)
-# ip adresses:
+# ip addresses:
caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)
# article ids:

View File

@@ -612,7 +612,7 @@ def main():
# See Section 3.1. of the paper.
max_length = 120
-# For mixed precision training we cast all non-trainable weigths (vae, text_encoder) to half-precision
+# For mixed precision training we cast all non-trainable weights (vae, text_encoder) to half-precision
# as these weights are only used for inference, keeping weights in full precision is not required.
weight_dtype = torch.float32
if accelerator.mixed_precision == "fp16":

View File

@@ -120,11 +120,11 @@ if __name__ == "__main__":
parser.add_argument("--schnell", action="store_true", help="run flux schnell instead of dev")
parser.add_argument("--width", type=int, default=1024, help="width of the image to generate")
parser.add_argument("--height", type=int, default=1024, help="height of the image to generate")
-parser.add_argument("--guidance", type=float, default=3.5, help="gauidance strentgh for dev")
+parser.add_argument("--guidance", type=float, default=3.5, help="guidance strength for dev")
parser.add_argument("--seed", type=int, default=None, help="seed for inference")
parser.add_argument("--profile", action="store_true", help="enable profiling")
parser.add_argument("--profile-duration", type=int, default=10000, help="duration for profiling in msec.")
-parser.add_argument("--itters", type=int, default=15, help="tiems to run inference and get avg time in sec.")
+parser.add_argument("--itters", type=int, default=15, help="items to run inference and get avg time in sec.")
args = parser.parse_args()
if args.schnell:
ckpt_id = "black-forest-labs/FLUX.1-schnell"

View File

@@ -759,7 +759,7 @@ def main(args):
unet, text_encoder, optimizer, train_dataloader
)
-# For mixed precision training we cast all non-trainable weigths (vae, non-lora text_encoder and non-lora unet) to half-precision
+# For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
# as these weights are only used for inference, keeping weights in full precision is not required.
weight_dtype = torch.float32
if accelerator.mixed_precision == "fp16":

View File

@@ -661,7 +661,7 @@ def parse_args(input_args=None):
action="store_true",
default=False,
help=(
-"Wether to train a DoRA as proposed in- DoRA: Weight-Decomposed Low-Rank Adaptation https://arxiv.org/abs/2402.09353. "
+"Whether to train a DoRA as proposed in- DoRA: Weight-Decomposed Low-Rank Adaptation https://arxiv.org/abs/2402.09353. "
"Note: to use DoRA you need to install peft from main, `pip install git+https://github.com/huggingface/peft.git`"
),
)

View File

@ -789,7 +789,7 @@ def main():
text_encoder, optimizer, train_dataloader, lr_scheduler text_encoder, optimizer, train_dataloader, lr_scheduler
) )
# For mixed precision training we cast all non-trainable weigths (vae, non-lora text_encoder and non-lora unet) to half-precision # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
# as these weights are only used for inference, keeping weights in full precision is not required. # as these weights are only used for inference, keeping weights in full precision is not required.
weight_dtype = torch.float32 weight_dtype = torch.float32
if accelerator.mixed_precision == "fp16": if accelerator.mixed_precision == "fp16":

View File

@ -814,7 +814,7 @@ def main():
text_encoder_1, text_encoder_2, optimizer, train_dataloader, lr_scheduler text_encoder_1, text_encoder_2, optimizer, train_dataloader, lr_scheduler
) )
# For mixed precision training we cast all non-trainable weigths (vae, non-lora text_encoder and non-lora unet) to half-precision # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
# as these weights are only used for inference, keeping weights in full precision is not required. # as these weights are only used for inference, keeping weights in full precision is not required.
weight_dtype = torch.float32 weight_dtype = torch.float32
if accelerator.mixed_precision == "fp16": if accelerator.mixed_precision == "fp16":

View File

@ -220,7 +220,7 @@ def convert_flux_transformer_checkpoint_to_diffusers(
f"double_blocks.{i}.txt_attn.proj.bias" f"double_blocks.{i}.txt_attn.proj.bias"
) )
# single transfomer blocks # single transformer blocks
for i in range(num_single_layers): for i in range(num_single_layers):
block_prefix = f"single_transformer_blocks.{i}." block_prefix = f"single_transformer_blocks.{i}."
# norm.linear <- single_blocks.0.modulation.lin # norm.linear <- single_blocks.0.modulation.lin

View File

@ -394,7 +394,7 @@ if __name__ == "__main__":
help="Scheduler type to use. Use 'scm' for Sana Sprint models.", help="Scheduler type to use. Use 'scm' for Sana Sprint models.",
) )
parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output pipeline.") parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output pipeline.")
parser.add_argument("--save_full_pipeline", action="store_true", help="save all the pipelien elemets in one.") parser.add_argument("--save_full_pipeline", action="store_true", help="save all the pipeline elements in one.")
parser.add_argument("--dtype", default="fp32", type=str, choices=["fp32", "fp16", "bf16"], help="Weight dtype.") parser.add_argument("--dtype", default="fp32", type=str, choices=["fp32", "fp16", "bf16"], help="Weight dtype.")
args = parser.parse_args() args = parser.parse_args()

View File

@ -984,7 +984,7 @@ def renderer(*, args, checkpoint_map_location):
return renderer_model return renderer_model
# prior model will expect clip_mean and clip_std, whic are missing from the state_dict # prior model will expect clip_mean and clip_std, which are missing from the state_dict
PRIOR_EXPECTED_MISSING_KEYS = ["clip_mean", "clip_std"] PRIOR_EXPECTED_MISSING_KEYS = ["clip_mean", "clip_std"]

View File

@ -55,8 +55,8 @@ for key in orig_state_dict.keys():
state_dict[key.replace("attn.out_proj.bias", "to_out.0.bias")] = weights state_dict[key.replace("attn.out_proj.bias", "to_out.0.bias")] = weights
else: else:
state_dict[key] = orig_state_dict[key] state_dict[key] = orig_state_dict[key]
deocder = WuerstchenDiffNeXt() decoder = WuerstchenDiffNeXt()
deocder.load_state_dict(state_dict) decoder.load_state_dict(state_dict)
# Prior # Prior
orig_state_dict = torch.load(os.path.join(model_path, "model_v3_stage_c.pt"), map_location=device)["ema_state_dict"] orig_state_dict = torch.load(os.path.join(model_path, "model_v3_stage_c.pt"), map_location=device)["ema_state_dict"]
@ -94,7 +94,7 @@ prior_pipeline = WuerstchenPriorPipeline(
prior_pipeline.save_pretrained("warp-ai/wuerstchen-prior") prior_pipeline.save_pretrained("warp-ai/wuerstchen-prior")
decoder_pipeline = WuerstchenDecoderPipeline( decoder_pipeline = WuerstchenDecoderPipeline(
text_encoder=gen_text_encoder, tokenizer=gen_tokenizer, vqgan=vqmodel, decoder=deocder, scheduler=scheduler text_encoder=gen_text_encoder, tokenizer=gen_tokenizer, vqgan=vqmodel, decoder=decoder, scheduler=scheduler
) )
decoder_pipeline.save_pretrained("warp-ai/wuerstchen") decoder_pipeline.save_pretrained("warp-ai/wuerstchen")
@ -103,7 +103,7 @@ wuerstchen_pipeline = WuerstchenCombinedPipeline(
# Decoder # Decoder
text_encoder=gen_text_encoder, text_encoder=gen_text_encoder,
tokenizer=gen_tokenizer, tokenizer=gen_tokenizer,
decoder=deocder, decoder=decoder,
scheduler=scheduler, scheduler=scheduler,
vqgan=vqmodel, vqgan=vqmodel,
# Prior # Prior

View File

@ -243,7 +243,7 @@ class GroupOffloadingHook(ModelHook):
class LazyPrefetchGroupOffloadingHook(ModelHook): class LazyPrefetchGroupOffloadingHook(ModelHook):
r""" r"""
A hook, used in conjuction with GroupOffloadingHook, that applies lazy prefetching to groups of torch.nn.Module. A hook, used in conjunction with GroupOffloadingHook, that applies lazy prefetching to groups of torch.nn.Module.
This hook is used to determine the order in which the layers are executed during the forward pass. Once the layer This hook is used to determine the order in which the layers are executed during the forward pass. Once the layer
invocation order is known, assignments of the next_group attribute for prefetching can be made, which allows invocation order is known, assignments of the next_group attribute for prefetching can be made, which allows
prefetching groups in the correct order. prefetching groups in the correct order.

View File

@ -90,7 +90,7 @@ class PeftInputAutocastDisableHook(ModelHook):
that the inputs are casted to the computation dtype correctly always. However, there are two goals we are that the inputs are casted to the computation dtype correctly always. However, there are two goals we are
hoping to achieve: hoping to achieve:
1. Making forward implementations independent of device/dtype casting operations as much as possible. 1. Making forward implementations independent of device/dtype casting operations as much as possible.
2. Peforming inference without losing information from casting to different precisions. With the current 2. Performing inference without losing information from casting to different precisions. With the current
PEFT implementation (as linked in the reference above), and assuming running layerwise casting inference PEFT implementation (as linked in the reference above), and assuming running layerwise casting inference
with storage_dtype=torch.float8_e4m3fn and compute_dtype=torch.bfloat16, inputs are cast to with storage_dtype=torch.float8_e4m3fn and compute_dtype=torch.bfloat16, inputs are cast to
torch.float8_e4m3fn in the lora layer. We will then upcast back to torch.bfloat16 when we continue the torch.float8_e4m3fn in the lora layer. We will then upcast back to torch.bfloat16 when we continue the

View File

@ -819,7 +819,7 @@ def _convert_kohya_flux_lora_to_diffusers(state_dict):
if zero_status_pe: if zero_status_pe:
logger.info( logger.info(
"The `position_embedding` LoRA params are all zeros which make them ineffective. " "The `position_embedding` LoRA params are all zeros which make them ineffective. "
"So, we will purge them out of the curret state dict to make loading possible." "So, we will purge them out of the current state dict to make loading possible."
) )
else: else:
@ -835,7 +835,7 @@ def _convert_kohya_flux_lora_to_diffusers(state_dict):
if zero_status_t5: if zero_status_t5:
logger.info( logger.info(
"The `t5xxl` LoRA params are all zeros which make them ineffective. " "The `t5xxl` LoRA params are all zeros which make them ineffective. "
"So, we will purge them out of the curret state dict to make loading possible." "So, we will purge them out of the current state dict to make loading possible."
) )
else: else:
logger.info( logger.info(
@ -850,7 +850,7 @@ def _convert_kohya_flux_lora_to_diffusers(state_dict):
if zero_status_diff_b: if zero_status_diff_b:
logger.info( logger.info(
"The `diff_b` LoRA params are all zeros which make them ineffective. " "The `diff_b` LoRA params are all zeros which make them ineffective. "
"So, we will purge them out of the curret state dict to make loading possible." "So, we will purge them out of the current state dict to make loading possible."
) )
else: else:
logger.info( logger.info(
@ -866,7 +866,7 @@ def _convert_kohya_flux_lora_to_diffusers(state_dict):
if zero_status_diff: if zero_status_diff:
logger.info( logger.info(
"The `diff` LoRA params are all zeros which make them ineffective. " "The `diff` LoRA params are all zeros which make them ineffective. "
"So, we will purge them out of the curret state dict to make loading possible." "So, we will purge them out of the current state dict to make loading possible."
) )
else: else:
logger.info( logger.info(
@ -1237,7 +1237,7 @@ def _convert_bfl_flux_control_lora_to_diffusers(original_state_dict):
f"double_blocks.{i}.txt_attn.norm.key_norm.scale" f"double_blocks.{i}.txt_attn.norm.key_norm.scale"
) )
# single transfomer blocks # single transformer blocks
for i in range(num_single_layers): for i in range(num_single_layers):
block_prefix = f"single_transformer_blocks.{i}." block_prefix = f"single_transformer_blocks.{i}."

View File

@ -2413,7 +2413,7 @@ class FluxLoraLoaderMixin(LoraBaseMixin):
) -> bool: ) -> bool:
""" """
Control LoRA expands the shape of the input layer from (3072, 64) to (3072, 128). This method handles that and Control LoRA expands the shape of the input layer from (3072, 64) to (3072, 128). This method handles that and
generalizes things a bit so that any parameter that needs expansion receives appropriate treatement. generalizes things a bit so that any parameter that needs expansion receives appropriate treatment.
""" """
state_dict = {} state_dict = {}
if lora_state_dict is not None: if lora_state_dict is not None:

View File

@ -330,7 +330,7 @@ class PeftAdapterMixin:
new_sd[k] = v new_sd[k] = v
return new_sd return new_sd
# To handle scenarios where we cannot successfully set state dict. If it's unsucessful, # To handle scenarios where we cannot successfully set state dict. If it's unsuccessful,
# we should also delete the `peft_config` associated to the `adapter_name`. # we should also delete the `peft_config` associated to the `adapter_name`.
try: try:
if hotswap: if hotswap:
@ -344,7 +344,7 @@ class PeftAdapterMixin:
config=lora_config, config=lora_config,
) )
except Exception as e: except Exception as e:
logger.error(f"Hotswapping {adapter_name} was unsucessful with the following error: \n{e}") logger.error(f"Hotswapping {adapter_name} was unsuccessful with the following error: \n{e}")
raise raise
# the hotswap function raises if there are incompatible keys, so if we reach this point we can set # the hotswap function raises if there are incompatible keys, so if we reach this point we can set
# it to None # it to None
@ -379,7 +379,7 @@ class PeftAdapterMixin:
module.delete_adapter(adapter_name) module.delete_adapter(adapter_name)
self.peft_config.pop(adapter_name) self.peft_config.pop(adapter_name)
logger.error(f"Loading {adapter_name} was unsucessful with the following error: \n{e}") logger.error(f"Loading {adapter_name} was unsuccessful with the following error: \n{e}")
raise raise
warn_msg = "" warn_msg = ""
@ -712,7 +712,7 @@ class PeftAdapterMixin:
if self.lora_scale != 1.0: if self.lora_scale != 1.0:
module.scale_layer(self.lora_scale) module.scale_layer(self.lora_scale)
# For BC with prevous PEFT versions, we need to check the signature # For BC with previous PEFT versions, we need to check the signature
# of the `merge` method to see if it supports the `adapter_names` argument. # of the `merge` method to see if it supports the `adapter_names` argument.
supported_merge_kwargs = list(inspect.signature(module.merge).parameters) supported_merge_kwargs = list(inspect.signature(module.merge).parameters)
if "adapter_names" in supported_merge_kwargs: if "adapter_names" in supported_merge_kwargs:

View File

@ -453,7 +453,7 @@ class FromSingleFileMixin:
logger.warning( logger.warning(
"Detected legacy `from_single_file` loading behavior. Attempting to create the pipeline based on inferred components.\n" "Detected legacy `from_single_file` loading behavior. Attempting to create the pipeline based on inferred components.\n"
"This may lead to errors if the model components are not correctly inferred. \n" "This may lead to errors if the model components are not correctly inferred. \n"
"To avoid this warning, please explicity pass the `config` argument to `from_single_file` with a path to a local diffusers model repo \n" "To avoid this warning, please explicitly pass the `config` argument to `from_single_file` with a path to a local diffusers model repo \n"
"e.g. `from_single_file(<my model checkpoint path>, config=<path to local diffusers model repo>) \n" "e.g. `from_single_file(<my model checkpoint path>, config=<path to local diffusers model repo>) \n"
"or run `from_single_file` with `local_files_only=False` first to update the local cache directory with " "or run `from_single_file` with `local_files_only=False` first to update the local cache directory with "
"the necessary config files.\n" "the necessary config files.\n"

View File

@ -2278,7 +2278,7 @@ def convert_flux_transformer_checkpoint_to_diffusers(checkpoint, **kwargs):
f"double_blocks.{i}.txt_attn.proj.bias" f"double_blocks.{i}.txt_attn.proj.bias"
) )
# single transfomer blocks # single transformer blocks
for i in range(num_single_layers): for i in range(num_single_layers):
block_prefix = f"single_transformer_blocks.{i}." block_prefix = f"single_transformer_blocks.{i}."
# norm.linear <- single_blocks.0.modulation.lin # norm.linear <- single_blocks.0.modulation.lin
@ -2872,7 +2872,7 @@ def convert_auraflow_transformer_checkpoint_to_diffusers(checkpoint, **kwargs):
def convert_lumina2_to_diffusers(checkpoint, **kwargs): def convert_lumina2_to_diffusers(checkpoint, **kwargs):
converted_state_dict = {} converted_state_dict = {}
# Original Lumina-Image-2 has an extra norm paramter that is unused # Original Lumina-Image-2 has an extra norm parameter that is unused
# We just remove it here # We just remove it here
checkpoint.pop("norm_final.weight", None) checkpoint.pop("norm_final.weight", None)

View File

@ -123,7 +123,7 @@ class SD3Transformer2DLoadersMixin:
key = key.replace(f"layers.{idx}.2.1", f"layers.{idx}.adaln_proj") key = key.replace(f"layers.{idx}.2.1", f"layers.{idx}.adaln_proj")
updated_state_dict[key] = value updated_state_dict[key] = value
# Image projetion parameters # Image projection parameters
embed_dim = updated_state_dict["proj_in.weight"].shape[1] embed_dim = updated_state_dict["proj_in.weight"].shape[1]
output_dim = updated_state_dict["proj_out.weight"].shape[0] output_dim = updated_state_dict["proj_out.weight"].shape[0]
hidden_dim = updated_state_dict["proj_in.weight"].shape[0] hidden_dim = updated_state_dict["proj_in.weight"].shape[0]

View File

@ -734,17 +734,17 @@ class UNetControlNetXSModel(ModelMixin, ConfigMixin):
unet (`UNet2DConditionModel`): unet (`UNet2DConditionModel`):
The UNet model we want to control. The UNet model we want to control.
controlnet (`ControlNetXSAdapter`): controlnet (`ControlNetXSAdapter`):
The ConntrolNet-XS adapter with which the UNet will be fused. If none is given, a new ConntrolNet-XS The ControlNet-XS adapter with which the UNet will be fused. If none is given, a new ControlNet-XS
adapter will be created. adapter will be created.
size_ratio (float, *optional*, defaults to `None`): size_ratio (float, *optional*, defaults to `None`):
Used to contruct the controlnet if none is given. See [`ControlNetXSAdapter.from_unet`] for details. Used to construct the controlnet if none is given. See [`ControlNetXSAdapter.from_unet`] for details.
ctrl_block_out_channels (`List[int]`, *optional*, defaults to `None`): ctrl_block_out_channels (`List[int]`, *optional*, defaults to `None`):
Used to contruct the controlnet if none is given. See [`ControlNetXSAdapter.from_unet`] for details, Used to construct the controlnet if none is given. See [`ControlNetXSAdapter.from_unet`] for details,
where this parameter is called `block_out_channels`. where this parameter is called `block_out_channels`.
time_embedding_mix (`float`, *optional*, defaults to None): time_embedding_mix (`float`, *optional*, defaults to None):
Used to contruct the controlnet if none is given. See [`ControlNetXSAdapter.from_unet`] for details. Used to construct the controlnet if none is given. See [`ControlNetXSAdapter.from_unet`] for details.
ctrl_optional_kwargs (`Dict`, *optional*, defaults to `None`): ctrl_optional_kwargs (`Dict`, *optional*, defaults to `None`):
Passed to the `init` of the new controlent if no controlent was given. Passed to the `init` of the new controlnet if no controlnet was given.
""" """
if controlnet is None: if controlnet is None:
controlnet = ControlNetXSAdapter.from_unet( controlnet = ControlNetXSAdapter.from_unet(

View File

@ -97,7 +97,7 @@ def get_3d_sincos_pos_embed(
The spatial dimension of positional embeddings. If an integer is provided, the same size is applied to both The spatial dimension of positional embeddings. If an integer is provided, the same size is applied to both
spatial dimensions (height and width). spatial dimensions (height and width).
temporal_size (`int`): temporal_size (`int`):
The temporal dimension of postional embeddings (number of frames). The temporal dimension of positional embeddings (number of frames).
spatial_interpolation_scale (`float`, defaults to 1.0): spatial_interpolation_scale (`float`, defaults to 1.0):
Scale factor for spatial grid interpolation. Scale factor for spatial grid interpolation.
temporal_interpolation_scale (`float`, defaults to 1.0): temporal_interpolation_scale (`float`, defaults to 1.0):
@ -169,7 +169,7 @@ def _get_3d_sincos_pos_embed_np(
The spatial dimension of positional embeddings. If an integer is provided, the same size is applied to both The spatial dimension of positional embeddings. If an integer is provided, the same size is applied to both
spatial dimensions (height and width). spatial dimensions (height and width).
temporal_size (`int`): temporal_size (`int`):
The temporal dimension of postional embeddings (number of frames). The temporal dimension of positional embeddings (number of frames).
spatial_interpolation_scale (`float`, defaults to 1.0): spatial_interpolation_scale (`float`, defaults to 1.0):
Scale factor for spatial grid interpolation. Scale factor for spatial grid interpolation.
temporal_interpolation_scale (`float`, defaults to 1.0): temporal_interpolation_scale (`float`, defaults to 1.0):

View File

@ -30,7 +30,7 @@ class LatteTransformer3DModel(ModelMixin, ConfigMixin, CacheMixin):
_supports_gradient_checkpointing = True _supports_gradient_checkpointing = True
""" """
A 3D Transformer model for video-like data, paper: https://arxiv.org/abs/2401.03048, offical code: A 3D Transformer model for video-like data, paper: https://arxiv.org/abs/2401.03048, official code:
https://github.com/Vchitect/Latte https://github.com/Vchitect/Latte
Parameters: Parameters:
@ -216,7 +216,7 @@ class LatteTransformer3DModel(ModelMixin, ConfigMixin, CacheMixin):
) )
num_patches = height * width num_patches = height * width
hidden_states = self.pos_embed(hidden_states) # alrady add positional embeddings hidden_states = self.pos_embed(hidden_states) # already add positional embeddings
added_cond_kwargs = {"resolution": None, "aspect_ratio": None} added_cond_kwargs = {"resolution": None, "aspect_ratio": None}
timestep, embedded_timestep = self.adaln_single( timestep, embedded_timestep = self.adaln_single(

View File

@ -43,7 +43,7 @@ class LuminaNextDiTBlock(nn.Module):
num_kv_heads (`int`): num_kv_heads (`int`):
Number of attention heads in key and value features (if using GQA), or set to None for the same as query. Number of attention heads in key and value features (if using GQA), or set to None for the same as query.
multiple_of (`int`): The number of multiple of ffn layer. multiple_of (`int`): The number of multiple of ffn layer.
ffn_dim_multiplier (`float`): The multipier factor of ffn layer dimension. ffn_dim_multiplier (`float`): The multiplier factor of ffn layer dimension.
norm_eps (`float`): The eps for norm layer. norm_eps (`float`): The eps for norm layer.
qk_norm (`bool`): normalization for query and key. qk_norm (`bool`): normalization for query and key.
cross_attention_dim (`int`): Cross attention embedding dimension of the input text prompt hidden_states. cross_attention_dim (`int`): Cross attention embedding dimension of the input text prompt hidden_states.

View File

@ -154,7 +154,7 @@ class I2VGenXLUNet(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
# of that, we used `num_attention_heads` for arguments that actually denote attention head dimension. This # of that, we used `num_attention_heads` for arguments that actually denote attention head dimension. This
# is why we ignore `num_attention_heads` and calculate it from `attention_head_dims` below. # is why we ignore `num_attention_heads` and calculate it from `attention_head_dims` below.
# This is still an incorrect way of calculating `num_attention_heads` but we need to stick to it # This is still an incorrect way of calculating `num_attention_heads` but we need to stick to it
# without running proper depcrecation cycles for the {down,mid,up} blocks which are a # without running proper deprecation cycles for the {down,mid,up} blocks which are a
# part of the public API. # part of the public API.
num_attention_heads = attention_head_dim num_attention_heads = attention_head_dim

View File

@ -131,7 +131,7 @@ class AmusedPipeline(DiffusionPipeline):
generation deterministic. generation deterministic.
latents (`torch.IntTensor`, *optional*): latents (`torch.IntTensor`, *optional*):
Pre-generated tokens representing latent vectors in `self.vqvae`, to be used as inputs for image Pre-generated tokens representing latent vectors in `self.vqvae`, to be used as inputs for image
gneration. If not provided, the starting latents will be completely masked. generation. If not provided, the starting latents will be completely masked.
prompt_embeds (`torch.Tensor`, *optional*): prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
provided, text embeddings are generated from the `prompt` input argument. A single vector from the provided, text embeddings are generated from the `prompt` input argument. A single vector from the

View File

@ -373,7 +373,7 @@ class AudioLDM2Pipeline(DiffusionPipeline):
*e.g.* prompt weighting. If not provided, negative_prompt_embeds will be computed from *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be computed from
`negative_prompt` input argument. `negative_prompt` input argument.
generated_prompt_embeds (`torch.Tensor`, *optional*): generated_prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated text embeddings from the GPT2 langauge model. Can be used to easily tweak text inputs, Pre-generated text embeddings from the GPT2 language model. Can be used to easily tweak text inputs,
*e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input
argument. argument.
negative_generated_prompt_embeds (`torch.Tensor`, *optional*): negative_generated_prompt_embeds (`torch.Tensor`, *optional*):
@ -394,7 +394,7 @@ class AudioLDM2Pipeline(DiffusionPipeline):
attention_mask (`torch.LongTensor`): attention_mask (`torch.LongTensor`):
Attention mask to be applied to the `prompt_embeds`. Attention mask to be applied to the `prompt_embeds`.
generated_prompt_embeds (`torch.Tensor`): generated_prompt_embeds (`torch.Tensor`):
Text embeddings generated from the GPT2 langauge model. Text embeddings generated from the GPT2 language model.
Example: Example:
@ -904,7 +904,7 @@ class AudioLDM2Pipeline(DiffusionPipeline):
Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
generated_prompt_embeds (`torch.Tensor`, *optional*): generated_prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated text embeddings from the GPT2 langauge model. Can be used to easily tweak text inputs, Pre-generated text embeddings from the GPT2 language model. Can be used to easily tweak text inputs,
*e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input
argument. argument.
negative_generated_prompt_embeds (`torch.Tensor`, *optional*): negative_generated_prompt_embeds (`torch.Tensor`, *optional*):

View File

@ -138,7 +138,7 @@ class BlipDiffusionPipeline(DiffusionPipeline):
def get_query_embeddings(self, input_image, src_subject): def get_query_embeddings(self, input_image, src_subject):
return self.qformer(image_input=input_image, text_input=src_subject, return_dict=False) return self.qformer(image_input=input_image, text_input=src_subject, return_dict=False)
# from the original Blip Diffusion code, speciefies the target subject and augments the prompt by repeating it # from the original Blip Diffusion code, specifies the target subject and augments the prompt by repeating it
def _build_prompt(self, prompts, tgt_subjects, prompt_strength=1.0, prompt_reps=20): def _build_prompt(self, prompts, tgt_subjects, prompt_strength=1.0, prompt_reps=20):
rv = [] rv = []
for prompt, tgt_subject in zip(prompts, tgt_subjects): for prompt, tgt_subject in zip(prompts, tgt_subjects):

View File

@ -149,7 +149,7 @@ class BlipDiffusionControlNetPipeline(DiffusionPipeline):
def get_query_embeddings(self, input_image, src_subject): def get_query_embeddings(self, input_image, src_subject):
return self.qformer(image_input=input_image, text_input=src_subject, return_dict=False) return self.qformer(image_input=input_image, text_input=src_subject, return_dict=False)
# from the original Blip Diffusion code, speciefies the target subject and augments the prompt by repeating it # from the original Blip Diffusion code, specifies the target subject and augments the prompt by repeating it
def _build_prompt(self, prompts, tgt_subjects, prompt_strength=1.0, prompt_reps=20): def _build_prompt(self, prompts, tgt_subjects, prompt_strength=1.0, prompt_reps=20):
rv = [] rv = []
for prompt, tgt_subject in zip(prompts, tgt_subjects): for prompt, tgt_subject in zip(prompts, tgt_subjects):

View File

@ -739,7 +739,7 @@ class StableDiffusionControlNetXSPipeline(
callback_on_step_end_tensor_inputs (`List`, *optional*): callback_on_step_end_tensor_inputs (`List`, *optional*):
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
`._callback_tensor_inputs` attribute of your pipeine class. `._callback_tensor_inputs` attribute of your pipeline class.
Examples: Examples:
Returns: Returns:

View File

@ -880,7 +880,7 @@ class StableDiffusionXLControlNetXSPipeline(
callback_on_step_end_tensor_inputs (`List`, *optional*): callback_on_step_end_tensor_inputs (`List`, *optional*):
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
`._callback_tensor_inputs` attribute of your pipeine class. `._callback_tensor_inputs` attribute of your pipeline class.
Examples: Examples:

View File

@ -97,7 +97,7 @@ class DanceDiffusionPipeline(DiffusionPipeline):
for i, audio in enumerate(audios): for i, audio in enumerate(audios):
write(f"maestro_test_{i}.wav", pipe.unet.sample_rate, audio.transpose()) write(f"maestro_test_{i}.wav", pipe.unet.sample_rate, audio.transpose())
# To dislay in google colab # To display in google colab
import IPython.display as ipd import IPython.display as ipd
for audio in audios: for audio in audios:

View File

@ -509,7 +509,8 @@ class StableDiffusionModelEditingPipeline(
The destination prompt. Must contain all words from `source_prompt` with additional ones to specify the The destination prompt. Must contain all words from `source_prompt` with additional ones to specify the
target edit. target edit.
lamb (`float`, *optional*, defaults to 0.1): lamb (`float`, *optional*, defaults to 0.1):
The lambda parameter specifying the regularization intesity. Smaller values increase the editing power. The lambda parameter specifying the regularization intensity. Smaller values increase the editing
power.
restart_params (`bool`, *optional*, defaults to True): restart_params (`bool`, *optional*, defaults to True):
Restart the model parameters to their pre-trained version before editing. This is done to avoid edit Restart the model parameters to their pre-trained version before editing. This is done to avoid edit
compounding. When it is `False`, edits accumulate. compounding. When it is `False`, edits accumulate.

View File

@ -1097,7 +1097,7 @@ class UNetFlatConditionModel(ModelMixin, ConfigMixin):
cross_attention_kwargs (`dict`, *optional*): cross_attention_kwargs (`dict`, *optional*):
A kwargs dictionary that if specified is passed along to the [`AttnProcessor`]. A kwargs dictionary that if specified is passed along to the [`AttnProcessor`].
added_cond_kwargs: (`dict`, *optional*): added_cond_kwargs: (`dict`, *optional*):
A kwargs dictionary containin additional embeddings that if specified are added to the embeddings that A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
are passed along to the UNet blocks. are passed along to the UNet blocks.
down_block_additional_residuals (`tuple` of `torch.Tensor`, *optional*): down_block_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
additional residuals to be added to UNet long skip connections from down blocks to up blocks for additional residuals to be added to UNet long skip connections from down blocks to up blocks for

View File

@ -478,7 +478,7 @@ class AnimateDiffFreeNoiseMixin:
Must be one of ["shuffle_context", "repeat_context", "random"]. Must be one of ["shuffle_context", "repeat_context", "random"].
- "shuffle_context" - "shuffle_context"
Shuffles a fixed batch of `context_length` latents to create a final latent of size Shuffles a fixed batch of `context_length` latents to create a final latent of size
`num_frames`. This is usually the best setting for most generation scenarious. However, there `num_frames`. This is usually the best setting for most generation scenarios. However, there
might be visible repetition noticeable in the kinds of motion/animation generated. might be visible repetition noticeable in the kinds of motion/animation generated.
- "repeated_context" - "repeated_context"
Repeats a fixed batch of `context_length` latents to create a final latent of size Repeats a fixed batch of `context_length` latents to create a final latent of size

View File

@ -462,7 +462,7 @@ class I2VGenXLPipeline(
image_latents = image_latents.unsqueeze(2) image_latents = image_latents.unsqueeze(2)
# Append a position mask for each subsequent frame # Append a position mask for each subsequent frame
# after the intial image latent frame # after the initial image latent frame
frame_position_mask = [] frame_position_mask = []
for frame_idx in range(num_frames - 1): for frame_idx in range(num_frames - 1):
scale = (frame_idx + 1) / (num_frames - 1) scale = (frame_idx + 1) / (num_frames - 1)

View File

@ -496,7 +496,7 @@ class KandinskyInpaintPipeline(DiffusionPipeline):
"As of diffusers==0.19.0 this behavior has been inverted. Now white pixels are repainted and black pixels are preserved. " "As of diffusers==0.19.0 this behavior has been inverted. Now white pixels are repainted and black pixels are preserved. "
"This way, Kandinsky's masking behavior is aligned with Stable Diffusion. " "This way, Kandinsky's masking behavior is aligned with Stable Diffusion. "
"THIS means that you HAVE to invert the input mask to have the same behavior as before as explained in https://github.com/huggingface/diffusers/pull/4207. " "THIS means that you HAVE to invert the input mask to have the same behavior as before as explained in https://github.com/huggingface/diffusers/pull/4207. "
"This warning will be surpressed after the first inference call and will be removed in diffusers>0.23.0" "This warning will be suppressed after the first inference call and will be removed in diffusers>0.23.0"
) )
self._warn_has_been_called = True self._warn_has_been_called = True

View File

@ -386,7 +386,7 @@ class KandinskyV22InpaintPipeline(DiffusionPipeline):
"As of diffusers==0.19.0 this behavior has been inverted. Now white pixels are repainted and black pixels are preserved. " "As of diffusers==0.19.0 this behavior has been inverted. Now white pixels are repainted and black pixels are preserved. "
"This way, Kandinsky's masking behavior is aligned with Stable Diffusion. " "This way, Kandinsky's masking behavior is aligned with Stable Diffusion. "
"THIS means that you HAVE to invert the input mask to have the same behavior as before as explained in https://github.com/huggingface/diffusers/pull/4207. " "THIS means that you HAVE to invert the input mask to have the same behavior as before as explained in https://github.com/huggingface/diffusers/pull/4207. "
"This warning will be surpressed after the first inference call and will be removed in diffusers>0.23.0" "This warning will be suppressed after the first inference call and will be removed in diffusers>0.23.0"
) )
self._warn_has_been_called = True self._warn_has_been_called = True

View File

@ -668,7 +668,7 @@ class Embedding(torch.nn.Module):
# Embeddings. # Embeddings.
words_embeddings = self.word_embeddings(input_ids) words_embeddings = self.word_embeddings(input_ids)
embeddings = words_embeddings embeddings = words_embeddings
# Data format change to avoid explicit tranposes : [b s h] --> [s b h]. # Data format change to avoid explicit transposes : [b s h] --> [s b h].
embeddings = embeddings.transpose(0, 1).contiguous() embeddings = embeddings.transpose(0, 1).contiguous()
# If the input flag for fp32 residual connection is set, convert for float. # If the input flag for fp32 residual connection is set, convert for float.
if self.fp32_residual_connection: if self.fp32_residual_connection:

View File

@ -1458,7 +1458,7 @@ def compute_noise_ddim(scheduler, prev_latents, latents, timestep, noise_pred, e
# 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * noise_pred pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * noise_pred
# modifed so that updated xtm1 is returned as well (to avoid error accumulation) # modified so that updated xtm1 is returned as well (to avoid error accumulation)
mu_xt = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction mu_xt = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction
if variance > 0.0: if variance > 0.0:
noise = (prev_latents - mu_xt) / (variance ** (0.5) * eta) noise = (prev_latents - mu_xt) / (variance ** (0.5) * eta)

View File

@ -1742,7 +1742,7 @@ def compute_noise_ddim(scheduler, prev_latents, latents, timestep, noise_pred, e
# 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * noise_pred pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * noise_pred
# modifed so that updated xtm1 is returned as well (to avoid error accumulation) # modified so that updated xtm1 is returned as well (to avoid error accumulation)
mu_xt = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction mu_xt = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction
if variance > 0.0: if variance > 0.0:
noise = (prev_latents - mu_xt) / (variance ** (0.5) * eta) noise = (prev_latents - mu_xt) / (variance ** (0.5) * eta)

View File

@ -426,7 +426,7 @@ class MarigoldImageProcessor(ConfigMixin):
if isinstance(img, np.ndarray): if isinstance(img, np.ndarray):
img = torch.from_numpy(img) img = torch.from_numpy(img)
if not torch.is_floating_point(img): if not torch.is_floating_point(img):
raise ValueError(f"{prefix}: unexected dtype={img.dtype}.") raise ValueError(f"{prefix}: unexpected dtype={img.dtype}.")
else: else:
raise ValueError(f"{prefix}: unexpected type={type(img)}.") raise ValueError(f"{prefix}: unexpected type={type(img)}.")
if val_min != 0.0 or val_max != 1.0: if val_min != 0.0 or val_max != 1.0:
@ -464,7 +464,7 @@ class MarigoldImageProcessor(ConfigMixin):
if torch.is_tensor(img): if torch.is_tensor(img):
img = img.cpu().numpy() img = img.cpu().numpy()
if not np.issubdtype(img.dtype, np.floating): if not np.issubdtype(img.dtype, np.floating):
raise ValueError(f"{prefix}: unexected dtype={img.dtype}.") raise ValueError(f"{prefix}: unexpected dtype={img.dtype}.")
if val_min != 0.0 or val_max != 1.0: if val_min != 0.0 or val_max != 1.0:
img = (img - val_min) / (val_max - val_min) img = (img - val_min) / (val_max - val_min)
img = (img * (2**16 - 1)).astype(np.uint16) img = (img * (2**16 - 1)).astype(np.uint16)

View File

@ -176,7 +176,7 @@ class OmniGenPipeline(
get the continue embedding of input images by VAE get the continue embedding of input images by VAE
Args: Args:
input_pixel_values: normlized pixel of input images input_pixel_values: normalized pixel of input images
device: device:
Returns: torch.Tensor Returns: torch.Tensor
""" """

View File

@ -115,7 +115,7 @@ EXAMPLE_DOC_STRING = """
... with torch.no_grad(), torch.autocast("cuda"): ... with torch.no_grad(), torch.autocast("cuda"):
... depth_map = depth_estimator(image).predicted_depth ... depth_map = depth_estimator(image).predicted_depth
... depth_map = torch.nn.fuctional.interpolate( ... depth_map = torch.nn.functional.interpolate(
... depth_map.unsqueeze(1), ... depth_map.unsqueeze(1),
... size=(1024, 1024), ... size=(1024, 1024),
... mode="bicubic", ... mode="bicubic",

View File

@ -1038,7 +1038,7 @@ class ShapERenderer(ModelMixin, ConfigMixin):
textures = _convert_srgb_to_linear(textures) textures = _convert_srgb_to_linear(textures)
textures = textures.float() textures = textures.float()
# 3.3 augument the mesh with texture data # 3.3 augment the mesh with texture data
assert len(textures.shape) == 3 and textures.shape[-1] == len(texture_channels), ( assert len(textures.shape) == 3 and textures.shape[-1] == len(texture_channels), (
f"expected [meta_batch x inner_batch x texture_channels] field results, but got {textures.shape}" f"expected [meta_batch x inner_batch x texture_channels] field results, but got {textures.shape}"
) )

View File

@ -524,9 +524,9 @@ class StableCascadeDecoderPipeline(DiffusionPipeline):
latents = self.vqgan.config.scale_factor * latents latents = self.vqgan.config.scale_factor * latents
images = self.vqgan.decode(latents).sample.clamp(0, 1) images = self.vqgan.decode(latents).sample.clamp(0, 1)
if output_type == "np": if output_type == "np":
images = images.permute(0, 2, 3, 1).cpu().float().numpy() # float() as bfloat16-> numpy doesnt work images = images.permute(0, 2, 3, 1).cpu().float().numpy() # float() as bfloat16-> numpy doesn't work
elif output_type == "pil": elif output_type == "pil":
images = images.permute(0, 2, 3, 1).cpu().float().numpy() # float() as bfloat16-> numpy doesnt work images = images.permute(0, 2, 3, 1).cpu().float().numpy() # float() as bfloat16-> numpy doesn't work
images = self.numpy_to_pil(images) images = self.numpy_to_pil(images)
else: else:
images = latents images = latents

View File

@ -626,11 +626,11 @@ class StableCascadePriorPipeline(DiffusionPipeline):
self.maybe_free_model_hooks() self.maybe_free_model_hooks()
if output_type == "np": if output_type == "np":
latents = latents.cpu().float().numpy() # float() as bfloat16-> numpy doesnt work latents = latents.cpu().float().numpy() # float() as bfloat16-> numpy doesn't work
prompt_embeds = prompt_embeds.cpu().float().numpy() # float() as bfloat16-> numpy doesnt work prompt_embeds = prompt_embeds.cpu().float().numpy() # float() as bfloat16-> numpy doesn't work
negative_prompt_embeds = ( negative_prompt_embeds = (
negative_prompt_embeds.cpu().float().numpy() if negative_prompt_embeds is not None else None negative_prompt_embeds.cpu().float().numpy() if negative_prompt_embeds is not None else None
) # float() as bfloat16-> numpy doesnt work ) # float() as bfloat16-> numpy doesn't work
if not return_dict: if not return_dict:
return ( return (

View File

@ -1047,7 +1047,7 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, StableDiffusionM
class GaussianSmoothing(torch.nn.Module): class GaussianSmoothing(torch.nn.Module):
""" """
Arguments: Arguments:
Apply gaussian smoothing on a 1d, 2d or 3d tensor. Filtering is performed seperately for each channel in the input Apply gaussian smoothing on a 1d, 2d or 3d tensor. Filtering is performed separately for each channel in the input
using a depthwise convolution. using a depthwise convolution.
channels (int, sequence): Number of channels of the input tensors. Output will channels (int, sequence): Number of channels of the input tensors. Output will
have this number of channels as well. have this number of channels as well.

View File

@ -123,7 +123,7 @@ class StableDiffusionKDiffusionPipeline(
super().__init__() super().__init__()
logger.info( logger.info(
f"{self.__class__} is an experimntal pipeline and is likely to change in the future. We recommend to use" f"{self.__class__} is an experimental pipeline and is likely to change in the future. We recommend to use"
" this pipeline for fast experimentation / iteration if needed, but advice to rely on existing pipelines" " this pipeline for fast experimentation / iteration if needed, but advice to rely on existing pipelines"
" as defined in https://huggingface.co/docs/diffusers/api/schedulers#implemented-schedulers for" " as defined in https://huggingface.co/docs/diffusers/api/schedulers#implemented-schedulers for"
" production settings." " production settings."

Some files were not shown because too many files have changed in this diff.