Fix typos in docs and comments (#11416)
* Fix typos in docs and comments * Apply style fixes --------- Co-authored-by: Sayak Paul <spsayakpaul@gmail.com> Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
This commit is contained in:
parent
d70f8ee18b
commit
86294d3c7f
@ -966,7 +966,7 @@ pipe.to("cuda")
|
||||
prompt = {
|
||||
0: "A caterpillar on a leaf, high quality, photorealistic",
|
||||
40: "A caterpillar transforming into a cocoon, on a leaf, near flowers, photorealistic",
|
||||
80: "A cocoon on a leaf, flowers in the backgrond, photorealistic",
|
||||
80: "A cocoon on a leaf, flowers in the background, photorealistic",
|
||||
120: "A cocoon maturing and a butterfly being born, flowers and leaves visible in the background, photorealistic",
|
||||
160: "A beautiful butterfly, vibrant colors, sitting on a leaf, flowers in the background, photorealistic",
|
||||
200: "A beautiful butterfly, flying away in a forest, photorealistic",
|
||||
|
||||
@ -29,7 +29,7 @@ You can find additional information about LEDITS++ on the [project page](https:/
|
||||
</Tip>
|
||||
|
||||
<Tip warning={true}>
|
||||
Due to some backward compatability issues with the current diffusers implementation of [`~schedulers.DPMSolverMultistepScheduler`] this implementation of LEdits++ can no longer guarantee perfect inversion.
|
||||
Due to some backward compatibility issues with the current diffusers implementation of [`~schedulers.DPMSolverMultistepScheduler`] this implementation of LEdits++ can no longer guarantee perfect inversion.
|
||||
This issue is unlikely to have any noticeable effects on applied use-cases. However, we provide an alternative implementation that guarantees perfect inversion in a dedicated [GitHub repo](https://github.com/ml-research/ledits_pp).
|
||||
</Tip>
|
||||
|
||||
|
||||
@ -285,7 +285,7 @@ pipe = WanImageToVideoPipeline.from_pretrained(
|
||||
image_encoder=image_encoder,
|
||||
torch_dtype=torch.bfloat16
|
||||
)
|
||||
# Since we've offloaded the larger models alrady, we can move the rest of the model components to GPU
|
||||
# Since we've offloaded the larger models already, we can move the rest of the model components to GPU
|
||||
pipe.to("cuda")
|
||||
|
||||
image = load_image(
|
||||
@ -368,7 +368,7 @@ pipe = WanImageToVideoPipeline.from_pretrained(
|
||||
image_encoder=image_encoder,
|
||||
torch_dtype=torch.bfloat16
|
||||
)
|
||||
# Since we've offloaded the larger models alrady, we can move the rest of the model components to GPU
|
||||
# Since we've offloaded the larger models already, we can move the rest of the model components to GPU
|
||||
pipe.to("cuda")
|
||||
|
||||
image = load_image(
|
||||
|
||||
@ -485,7 +485,7 @@ image = image[:, :, None]
|
||||
image = np.concatenate([image, image, image], axis=2)
|
||||
canny_image = Image.fromarray(image).resize((1024, 1216))
|
||||
|
||||
adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16, varient="fp16").to("cuda")
|
||||
adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16, variant="fp16").to("cuda")
|
||||
|
||||
unet = UNet2DConditionModel.from_pretrained(
|
||||
"latent-consistency/lcm-sdxl",
|
||||
@ -551,7 +551,7 @@ image = image[:, :, None]
|
||||
image = np.concatenate([image, image, image], axis=2)
|
||||
canny_image = Image.fromarray(image).resize((1024, 1024))
|
||||
|
||||
adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16, varient="fp16").to("cuda")
|
||||
adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16, variant="fp16").to("cuda")
|
||||
|
||||
pipe = StableDiffusionXLAdapterPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
|
||||
@ -154,11 +154,11 @@ pipeline = AutoPipelineForInpainting.from_pretrained(
|
||||
pipeline.enable_model_cpu_offload()
|
||||
```
|
||||
|
||||
You can enable PAG on an exisiting inpainting pipeline like this
|
||||
You can enable PAG on an existing inpainting pipeline like this
|
||||
|
||||
```py
|
||||
pipeline_inpaint = AutoPipelineForInpaiting.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16)
|
||||
pipeline = AutoPipelineForInpaiting.from_pipe(pipeline_inpaint, enable_pag=True)
|
||||
pipeline_inpaint = AutoPipelineForInpainting.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16)
|
||||
pipeline = AutoPipelineForInpainting.from_pipe(pipeline_inpaint, enable_pag=True)
|
||||
```
|
||||
|
||||
This still works when your pipeline has a different task:
|
||||
|
||||
@ -125,7 +125,7 @@ Now we'll simply specify the name of the dataset and caption column (in this cas
|
||||
```
|
||||
|
||||
You can also load a dataset straight from by specifying it's name in `dataset_name`.
|
||||
Look [here](https://huggingface.co/blog/sdxl_lora_advanced_script#custom-captioning) for more info on creating/loadin your own caption dataset.
|
||||
Look [here](https://huggingface.co/blog/sdxl_lora_advanced_script#custom-captioning) for more info on creating/loading your own caption dataset.
|
||||
|
||||
- **optimizer**: for this example, we'll use [prodigy](https://huggingface.co/blog/sdxl_lora_advanced_script#adaptive-optimizers) - an adaptive optimizer
|
||||
- **pivotal tuning**
|
||||
@ -404,7 +404,7 @@ The advanced script now supports custom choice of U-net blocks to train during D
|
||||
> In light of this, we're introducing a new feature to the advanced script to allow for configurable U-net learned blocks.
|
||||
|
||||
**Usage**
|
||||
Configure LoRA learned U-net blocks adding a `lora_unet_blocks` flag, with a comma seperated string specifying the targeted blocks.
|
||||
Configure LoRA learned U-net blocks adding a `lora_unet_blocks` flag, with a comma separated string specifying the targeted blocks.
|
||||
e.g:
|
||||
```bash
|
||||
--lora_unet_blocks="unet.up_blocks.0.attentions.0,unet.up_blocks.0.attentions.1"
|
||||
|
||||
@ -141,7 +141,7 @@ Now we'll simply specify the name of the dataset and caption column (in this cas
|
||||
```
|
||||
|
||||
You can also load a dataset straight from by specifying it's name in `dataset_name`.
|
||||
Look [here](https://huggingface.co/blog/sdxl_lora_advanced_script#custom-captioning) for more info on creating/loadin your own caption dataset.
|
||||
Look [here](https://huggingface.co/blog/sdxl_lora_advanced_script#custom-captioning) for more info on creating/loading your own caption dataset.
|
||||
|
||||
- **optimizer**: for this example, we'll use [prodigy](https://huggingface.co/blog/sdxl_lora_advanced_script#adaptive-optimizers) - an adaptive optimizer
|
||||
- **pivotal tuning**
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
## Amused training
|
||||
|
||||
Amused can be finetuned on simple datasets relatively cheaply and quickly. Using 8bit optimizers, lora, and gradient accumulation, amused can be finetuned with as little as 5.5 GB. Here are a set of examples for finetuning amused on some relatively simple datasets. These training recipies are aggressively oriented towards minimal resources and fast verification -- i.e. the batch sizes are quite low and the learning rates are quite high. For optimal quality, you will probably want to increase the batch sizes and decrease learning rates.
|
||||
Amused can be finetuned on simple datasets relatively cheaply and quickly. Using 8bit optimizers, lora, and gradient accumulation, amused can be finetuned with as little as 5.5 GB. Here are a set of examples for finetuning amused on some relatively simple datasets. These training recipes are aggressively oriented towards minimal resources and fast verification -- i.e. the batch sizes are quite low and the learning rates are quite high. For optimal quality, you will probably want to increase the batch sizes and decrease learning rates.
|
||||
|
||||
All training examples use fp16 mixed precision and gradient checkpointing. We don't show 8 bit adam + lora as its about the same memory use as just using lora (bitsandbytes uses full precision optimizer states for weights below a minimum size).
|
||||
|
||||
|
||||
@ -201,7 +201,7 @@ Note that setting the `<ID_TOKEN>` is not necessary. From some limited experimen
|
||||
> - The original repository uses a `lora_alpha` of `1`. We found this not suitable in many runs, possibly due to difference in modeling backends and training settings. Our recommendation is to set to the `lora_alpha` to either `rank` or `rank // 2`.
|
||||
> - If you're training on data whose captions generate bad results with the original model, a `rank` of 64 and above is good and also the recommendation by the team behind CogVideoX. If the generations are already moderately good on your training captions, a `rank` of 16/32 should work. We found that setting the rank too low, say `4`, is not ideal and doesn't produce promising results.
|
||||
> - The authors of CogVideoX recommend 4000 training steps and 100 training videos overall to achieve the best result. While that might yield the best results, we found from our limited experimentation that 2000 steps and 25 videos could also be sufficient.
|
||||
> - When using the Prodigy opitimizer for training, one can follow the recommendations from [this](https://huggingface.co/blog/sdxl_lora_advanced_script) blog. Prodigy tends to overfit quickly. From my very limited testing, I found a learning rate of `0.5` to be suitable in addition to `--prodigy_use_bias_correction`, `prodigy_safeguard_warmup` and `--prodigy_decouple`.
|
||||
> - When using the Prodigy optimizer for training, one can follow the recommendations from [this](https://huggingface.co/blog/sdxl_lora_advanced_script) blog. Prodigy tends to overfit quickly. From my very limited testing, I found a learning rate of `0.5` to be suitable in addition to `--prodigy_use_bias_correction`, `prodigy_safeguard_warmup` and `--prodigy_decouple`.
|
||||
> - The recommended learning rate by the CogVideoX authors and from our experimentation with Adam/AdamW is between `1e-3` and `1e-4` for a dataset of 25+ videos.
|
||||
>
|
||||
> Note that our testing is not exhaustive due to limited time for exploration. Our recommendation would be to play around with the different knobs and dials to find the best settings for your data.
|
||||
|
||||
@ -879,7 +879,7 @@ def prepare_rotary_positional_embeddings(
|
||||
|
||||
|
||||
def get_optimizer(args, params_to_optimize, use_deepspeed: bool = False):
|
||||
# Use DeepSpeed optimzer
|
||||
# Use DeepSpeed optimizer
|
||||
if use_deepspeed:
|
||||
from accelerate.utils import DummyOptim
|
||||
|
||||
|
||||
@ -901,7 +901,7 @@ def prepare_rotary_positional_embeddings(
|
||||
|
||||
|
||||
def get_optimizer(args, params_to_optimize, use_deepspeed: bool = False):
|
||||
# Use DeepSpeed optimzer
|
||||
# Use DeepSpeed optimizer
|
||||
if use_deepspeed:
|
||||
from accelerate.utils import DummyOptim
|
||||
|
||||
|
||||
@ -4865,7 +4865,7 @@ python -m pip install intel_extension_for_pytorch
|
||||
```
|
||||
python -m pip install intel_extension_for_pytorch==<version_name> -f https://developer.intel.com/ipex-whl-stable-cpu
|
||||
```
|
||||
2. After pipeline initialization, `prepare_for_ipex()` should be called to enable IPEX accelaration. Supported inference datatypes are Float32 and BFloat16.
|
||||
2. After pipeline initialization, `prepare_for_ipex()` should be called to enable IPEX acceleration. Supported inference datatypes are Float32 and BFloat16.
|
||||
|
||||
```python
|
||||
pipe = AnimateDiffPipelineIpex.from_pretrained(base, motion_adapter=adapter, torch_dtype=dtype).to(device)
|
||||
|
||||
@ -336,13 +336,13 @@ if __name__ == "__main__":
|
||||
expanded_kernel_width = np.ceil(kernel_width) + 2
|
||||
|
||||
# Determine a set of field_of_view for each each output position, these are the pixels in the input image
|
||||
# that the pixel in the output image 'sees'. We get a matrix whos horizontal dim is the output pixels (big) and the
|
||||
# that the pixel in the output image 'sees'. We get a matrix whose horizontal dim is the output pixels (big) and the
|
||||
# vertical dim is the pixels it 'sees' (kernel_size + 2)
|
||||
field_of_view = np.squeeze(
|
||||
np.int16(np.expand_dims(left_boundary, axis=1) + np.arange(expanded_kernel_width) - 1)
|
||||
)
|
||||
|
||||
# Assign weight to each pixel in the field of view. A matrix whos horizontal dim is the output pixels and the
|
||||
# Assign weight to each pixel in the field of view. A matrix whose horizontal dim is the output pixels and the
|
||||
# vertical dim is a list of weights matching to the pixel in the field of view (that are specified in
|
||||
# 'field_of_view')
|
||||
weights = fixed_kernel(1.0 * np.expand_dims(match_coordinates, axis=1) - field_of_view - 1)
|
||||
|
||||
@ -201,16 +201,16 @@ class PAIntAAttnProcessor:
|
||||
# ================================================== #
|
||||
# We use a hack by running the code from the BasicTransformerBlock that is between Self and Cross attentions here
|
||||
# The other option would've been modifying the BasicTransformerBlock and adding this functionality here.
|
||||
# I assumed that changing the BasicTransformerBlock would have been a bigger deal and decided to use this hack isntead.
|
||||
# I assumed that changing the BasicTransformerBlock would have been a bigger deal and decided to use this hack instead.
|
||||
|
||||
# The SelfAttention block recieves the normalized latents from the BasicTransformerBlock,
|
||||
# The SelfAttention block receives the normalized latents from the BasicTransformerBlock,
|
||||
# But the residual of the output is the non-normalized version.
|
||||
# Therefore we unnormalize the input hidden state here
|
||||
unnormalized_input_hidden_states = (
|
||||
input_hidden_states + self.transformer_block.norm1.bias
|
||||
) * self.transformer_block.norm1.weight
|
||||
|
||||
# TODO: return if neccessary
|
||||
# TODO: return if necessary
|
||||
# if self.use_ada_layer_norm_zero:
|
||||
# attn_output = gate_msa.unsqueeze(1) * attn_output
|
||||
# elif self.use_ada_layer_norm_single:
|
||||
@ -220,7 +220,7 @@ class PAIntAAttnProcessor:
|
||||
if transformer_hidden_states.ndim == 4:
|
||||
transformer_hidden_states = transformer_hidden_states.squeeze(1)
|
||||
|
||||
# TODO: return if neccessary
|
||||
# TODO: return if necessary
|
||||
# 2.5 GLIGEN Control
|
||||
# if gligen_kwargs is not None:
|
||||
# transformer_hidden_states = self.fuser(transformer_hidden_states, gligen_kwargs["objs"])
|
||||
@ -266,7 +266,7 @@ class PAIntAAttnProcessor:
|
||||
) = cross_attention_input_hidden_states.chunk(2)
|
||||
|
||||
# Same split for the encoder_hidden_states i.e. the tokens
|
||||
# Since the SelfAttention processors don't get the encoder states as input, we inject them into the processor in the begining.
|
||||
# Since the SelfAttention processors don't get the encoder states as input, we inject them into the processor in the beginning.
|
||||
_encoder_hidden_states_unconditional, encoder_hidden_states_conditional = self.encoder_hidden_states.chunk(
|
||||
2
|
||||
)
|
||||
@ -896,7 +896,7 @@ class StableDiffusionHDPainterPipeline(StableDiffusionInpaintPipeline):
|
||||
class GaussianSmoothing(nn.Module):
|
||||
"""
|
||||
Apply gaussian smoothing on a
|
||||
1d, 2d or 3d tensor. Filtering is performed seperately for each channel
|
||||
1d, 2d or 3d tensor. Filtering is performed separately for each channel
|
||||
in the input using a depthwise convolution.
|
||||
|
||||
Args:
|
||||
|
||||
@ -161,7 +161,7 @@ class ImageToImageInpaintingPipeline(DiffusionPipeline):
|
||||
`Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
|
||||
be masked out with `mask_image` and repainted according to `prompt`.
|
||||
inner_image (`torch.Tensor` or `PIL.Image.Image`):
|
||||
`Image`, or tensor representing an image batch which will be overlayed onto `image`. Non-transparent
|
||||
`Image`, or tensor representing an image batch which will be overlaid onto `image`. Non-transparent
|
||||
regions of `inner_image` must fit inside white pixels in `mask_image`. Expects four channels, with
|
||||
the last channel representing the alpha channel, which will be used to blend `inner_image` with
|
||||
`image`. If not provided, it will be forcibly cast to RGBA.
|
||||
|
||||
@ -647,7 +647,7 @@ class LCMSchedulerWithTimestamp(SchedulerMixin, ConfigMixin):
|
||||
return sample
|
||||
|
||||
def set_timesteps(
|
||||
self, stength, num_inference_steps: int, lcm_origin_steps: int, device: Union[str, torch.device] = None
|
||||
self, strength, num_inference_steps: int, lcm_origin_steps: int, device: Union[str, torch.device] = None
|
||||
):
|
||||
"""
|
||||
Sets the discrete timesteps used for the diffusion chain (to be run before inference).
|
||||
@ -668,7 +668,7 @@ class LCMSchedulerWithTimestamp(SchedulerMixin, ConfigMixin):
|
||||
# LCM Timesteps Setting: # Linear Spacing
|
||||
c = self.config.num_train_timesteps // lcm_origin_steps
|
||||
lcm_origin_timesteps = (
|
||||
np.asarray(list(range(1, int(lcm_origin_steps * stength) + 1))) * c - 1
|
||||
np.asarray(list(range(1, int(lcm_origin_steps * strength) + 1))) * c - 1
|
||||
) # LCM Training Steps Schedule
|
||||
skipping_step = len(lcm_origin_timesteps) // num_inference_steps
|
||||
timesteps = lcm_origin_timesteps[::-skipping_step][:num_inference_steps] # LCM Inference Steps Schedule
|
||||
|
||||
@ -129,7 +129,7 @@ class MagicMixPipeline(DiffusionPipeline):
|
||||
|
||||
input = (
|
||||
(mix_factor * latents) + (1 - mix_factor) * orig_latents
|
||||
) # interpolating between layout noise and conditionally generated noise to preserve layout sematics
|
||||
) # interpolating between layout noise and conditionally generated noise to preserve layout semantics
|
||||
input = torch.cat([input] * 2)
|
||||
|
||||
else: # content generation phase
|
||||
|
||||
@ -196,9 +196,9 @@ class StableDiffusionTilingPipeline(DiffusionPipeline, StableDiffusionExtrasMixi
|
||||
guidance_scale_tiles: specific weights for classifier-free guidance in each tile.
|
||||
guidance_scale_tiles: specific weights for classifier-free guidance in each tile. If None, the value provided in guidance_scale will be used.
|
||||
seed_tiles: specific seeds for the initialization latents in each tile. These will override the latents generated for the whole canvas using the standard seed parameter.
|
||||
seed_tiles_mode: either "full" "exclusive". If "full", all the latents affected by the tile be overriden. If "exclusive", only the latents that are affected exclusively by this tile (and no other tiles) will be overriden.
|
||||
seed_reroll_regions: a list of tuples in the form (start row, end row, start column, end column, seed) defining regions in pixel space for which the latents will be overriden using the given seed. Takes priority over seed_tiles.
|
||||
cpu_vae: the decoder from latent space to pixel space can require too mucho GPU RAM for large images. If you find out of memory errors at the end of the generation process, try setting this parameter to True to run the decoder in CPU. Slower, but should run without memory issues.
|
||||
seed_tiles_mode: either "full" "exclusive". If "full", all the latents affected by the tile be overridden. If "exclusive", only the latents that are affected exclusively by this tile (and no other tiles) will be overridden.
|
||||
seed_reroll_regions: a list of tuples in the form (start row, end row, start column, end column, seed) defining regions in pixel space for which the latents will be overridden using the given seed. Takes priority over seed_tiles.
|
||||
cpu_vae: the decoder from latent space to pixel space can require too much GPU RAM for large images. If you find out of memory errors at the end of the generation process, try setting this parameter to True to run the decoder in CPU. Slower, but should run without memory issues.
|
||||
|
||||
Examples:
|
||||
|
||||
|
||||
@ -1258,7 +1258,7 @@ class KolorsControlNetPipeline(
|
||||
)
|
||||
|
||||
if guess_mode and self.do_classifier_free_guidance:
|
||||
# Infered ControlNet only for the conditional batch.
|
||||
# Inferred ControlNet only for the conditional batch.
|
||||
# To apply the output of ControlNet to both the unconditional and conditional batches,
|
||||
# add 0 to the unconditional batch to keep it unchanged.
|
||||
down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
|
||||
|
||||
@ -1462,7 +1462,7 @@ class KolorsControlNetImg2ImgPipeline(
|
||||
)
|
||||
|
||||
if guess_mode and self.do_classifier_free_guidance:
|
||||
# Infered ControlNet only for the conditional batch.
|
||||
# Inferred ControlNet only for the conditional batch.
|
||||
# To apply the output of ControlNet to both the unconditional and conditional batches,
|
||||
# add 0 to the unconditional batch to keep it unchanged.
|
||||
down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
|
||||
|
||||
@ -1782,7 +1782,7 @@ class KolorsControlNetInpaintPipeline(
|
||||
)
|
||||
|
||||
if guess_mode and self.do_classifier_free_guidance:
|
||||
# Infered ControlNet only for the conditional batch.
|
||||
# Inferred ControlNet only for the conditional batch.
|
||||
# To apply the output of ControlNet to both the unconditional and conditional batches,
|
||||
# add 0 to the unconditional batch to keep it unchanged.
|
||||
down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
|
||||
|
||||
@ -559,7 +559,7 @@ class FabricPipeline(DiffusionPipeline):
|
||||
End point for providing feedback (between 0 and 1).
|
||||
min_weight (`float`, *optional*, defaults to `.05`):
|
||||
Minimum weight for feedback.
|
||||
max_weight (`float`, *optional*, defults tp `1.0`):
|
||||
max_weight (`float`, *optional*, defaults tp `1.0`):
|
||||
Maximum weight for feedback.
|
||||
neg_scale (`float`, *optional*, defaults to `.5`):
|
||||
Scale factor for negative feedback.
|
||||
|
||||
@ -118,7 +118,7 @@ EXAMPLE_DOC_STRING = """
|
||||
>>> # Here we need use pipeline internal unet model
|
||||
>>> pipe.unet = pipe.unet_model.from_pretrained(model_id, subfolder="unet", variant="fp16", use_safetensors=True)
|
||||
>>>
|
||||
>>> # Load aditional layers to the model
|
||||
>>> # Load additional layers to the model
|
||||
>>> pipe.unet.load_additional_layers(weight_path="proc_data/faithdiff/FaithDiff.bin", dtype=dtype)
|
||||
>>>
|
||||
>>> # Enable vae tiling
|
||||
|
||||
@ -72,7 +72,7 @@ class GaussianSmoothing(nn.Module):
|
||||
"""
|
||||
Copied from official repo: https://github.com/showlab/BoxDiff/blob/master/utils/gaussian_smoothing.py
|
||||
Apply gaussian smoothing on a
|
||||
1d, 2d or 3d tensor. Filtering is performed seperately for each channel
|
||||
1d, 2d or 3d tensor. Filtering is performed separately for each channel
|
||||
in the input using a depthwise convolution.
|
||||
Arguments:
|
||||
channels (int, sequence): Number of channels of the input tensors. Output will
|
||||
|
||||
@ -1509,7 +1509,7 @@ class StableDiffusionXL_AE_Pipeline(
|
||||
|
||||
add_time_ids = add_time_ids.repeat(batch_size, 1).to(DEVICE)
|
||||
|
||||
# interative sampling
|
||||
# interactive sampling
|
||||
self.scheduler.set_timesteps(num_inference_steps)
|
||||
latents_list = [latents]
|
||||
pred_x0_list = []
|
||||
@ -1548,7 +1548,7 @@ class StableDiffusionXL_AE_Pipeline(
|
||||
x: torch.FloatTensor,
|
||||
):
|
||||
"""
|
||||
predict the sampe the next step in the denoise process.
|
||||
predict the sample the next step in the denoise process.
|
||||
"""
|
||||
ref_noise = model_output[:1, :, :, :].expand(model_output.shape)
|
||||
alpha_prod_t = self.scheduler.alphas_cumprod[timestep]
|
||||
|
||||
@ -132,7 +132,7 @@ def _preprocess_adapter_image(image, height, width):
|
||||
image = torch.cat(image, dim=0)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Invalid image tensor! Expecting image tensor with 3 or 4 dimension, but recive: {image[0].ndim}"
|
||||
f"Invalid image tensor! Expecting image tensor with 3 or 4 dimension, but receive: {image[0].ndim}"
|
||||
)
|
||||
return image
|
||||
|
||||
|
||||
@ -150,7 +150,7 @@ def _preprocess_adapter_image(image, height, width):
|
||||
image = torch.cat(image, dim=0)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Invalid image tensor! Expecting image tensor with 3 or 4 dimension, but recive: {image[0].ndim}"
|
||||
f"Invalid image tensor! Expecting image tensor with 3 or 4 dimension, but receive: {image[0].ndim}"
|
||||
)
|
||||
return image
|
||||
|
||||
|
||||
@ -220,7 +220,7 @@ class RegionalPromptingStableDiffusionPipeline(StableDiffusionPipeline):
|
||||
revers = True
|
||||
|
||||
def pcallback(s_self, step: int, timestep: int, latents: torch.Tensor, selfs=None):
|
||||
if "PRO" in mode: # in Prompt mode, make masks from sum of attension maps
|
||||
if "PRO" in mode: # in Prompt mode, make masks from sum of attention maps
|
||||
self.step = step
|
||||
|
||||
if len(self.attnmaps_sizes) > 3:
|
||||
@ -552,9 +552,9 @@ def get_attn_maps(self, attn):
|
||||
|
||||
def reset_attnmaps(self): # init parameters in every batch
|
||||
self.step = 0
|
||||
self.attnmaps = {} # maked from attention maps
|
||||
self.attnmaps = {} # made from attention maps
|
||||
self.attnmaps_sizes = [] # height,width set of u-net blocks
|
||||
self.attnmasks = {} # maked from attnmaps for regions
|
||||
self.attnmasks = {} # made from attnmaps for regions
|
||||
self.maskready = False
|
||||
self.history = {}
|
||||
|
||||
|
||||
@ -97,7 +97,7 @@ class SdeDragPipeline(DiffusionPipeline):
|
||||
steps (`int`, *optional*, defaults to 200):
|
||||
The number of sampling iterations.
|
||||
step_size (`int`, *optional*, defaults to 2):
|
||||
The drag diatance of each drag step.
|
||||
The drag distance of each drag step.
|
||||
image_scale (`float`, *optional*, defaults to 0.3):
|
||||
To avoid duplicating the content, use image_scale to perturbs the source.
|
||||
adapt_radius (`int`, *optional*, defaults to 5):
|
||||
|
||||
@ -284,7 +284,7 @@ class UnCLIPImageInterpolationPipeline(DiffusionPipeline):
|
||||
)
|
||||
else:
|
||||
raise AssertionError(
|
||||
f"Expected 'image' or 'image_embeddings' to be not None with types List[PIL.Image] or torch.Tensor respectively. Received {type(image)} and {type(image_embeddings)} repsectively"
|
||||
f"Expected 'image' or 'image_embeddings' to be not None with types List[PIL.Image] or torch.Tensor respectively. Received {type(image)} and {type(image_embeddings)} respectively"
|
||||
)
|
||||
|
||||
original_image_embeddings = self._encode_image(
|
||||
|
||||
@ -1012,7 +1012,7 @@ def main(args):
|
||||
unet = get_peft_model(unet, lora_config)
|
||||
|
||||
# 9. Handle mixed precision and device placement
|
||||
# For mixed precision training we cast all non-trainable weigths to half-precision
|
||||
# For mixed precision training we cast all non-trainable weights to half-precision
|
||||
# as these weights are only used for inference, keeping weights in full precision is not required.
|
||||
weight_dtype = torch.float32
|
||||
if accelerator.mixed_precision == "fp16":
|
||||
|
||||
@ -829,7 +829,7 @@ def main(args):
|
||||
)
|
||||
|
||||
# 8. Handle mixed precision and device placement
|
||||
# For mixed precision training we cast all non-trainable weigths to half-precision
|
||||
# For mixed precision training we cast all non-trainable weights to half-precision
|
||||
# as these weights are only used for inference, keeping weights in full precision is not required.
|
||||
weight_dtype = torch.float32
|
||||
if accelerator.mixed_precision == "fp16":
|
||||
|
||||
@ -1026,7 +1026,7 @@ def main(args):
|
||||
unet = get_peft_model(unet, lora_config)
|
||||
|
||||
# 9. Handle mixed precision and device placement
|
||||
# For mixed precision training we cast all non-trainable weigths to half-precision
|
||||
# For mixed precision training we cast all non-trainable weights to half-precision
|
||||
# as these weights are only used for inference, keeping weights in full precision is not required.
|
||||
weight_dtype = torch.float32
|
||||
if accelerator.mixed_precision == "fp16":
|
||||
|
||||
@ -962,7 +962,7 @@ def main(args):
|
||||
)
|
||||
|
||||
# 9. Handle mixed precision and device placement
|
||||
# For mixed precision training we cast all non-trainable weigths to half-precision
|
||||
# For mixed precision training we cast all non-trainable weights to half-precision
|
||||
# as these weights are only used for inference, keeping weights in full precision is not required.
|
||||
weight_dtype = torch.float32
|
||||
if accelerator.mixed_precision == "fp16":
|
||||
|
||||
@ -1021,7 +1021,7 @@ def main(args):
|
||||
)
|
||||
|
||||
# 9. Handle mixed precision and device placement
|
||||
# For mixed precision training we cast all non-trainable weigths to half-precision
|
||||
# For mixed precision training we cast all non-trainable weights to half-precision
|
||||
# as these weights are only used for inference, keeping weights in full precision is not required.
|
||||
weight_dtype = torch.float32
|
||||
if accelerator.mixed_precision == "fp16":
|
||||
|
||||
@ -411,7 +411,7 @@ export CAPTION_COLUMN='caption_column'
|
||||
|
||||
export CACHE_DIR="/data/train_csr/.cache/huggingface/"
|
||||
export OUTPUT_DIR='/data/train_csr/FLUX/MODEL_OUT/'$MODEL_TYPE
|
||||
# The first step is to use Python to precompute all caches.Replace the first line below with this line. (I am not sure why using acclerate would cause problems.)
|
||||
# The first step is to use Python to precompute all caches.Replace the first line below with this line. (I am not sure why using accelerate would cause problems.)
|
||||
|
||||
CUDA_VISIBLE_DEVICES=0 python3 train_controlnet_flux.py \
|
||||
|
||||
|
||||
@ -173,13 +173,13 @@ accelerate launch train_dreambooth_lora_flux.py \
|
||||
### Target Modules
|
||||
When LoRA was first adapted from language models to diffusion models, it was applied to the cross-attention layers in the Unet that relate the image representations with the prompts that describe them.
|
||||
More recently, SOTA text-to-image diffusion models replaced the Unet with a diffusion Transformer(DiT). With this change, we may also want to explore
|
||||
applying LoRA training onto different types of layers and blocks. To allow more flexibility and control over the targeted modules we added `--lora_layers`- in which you can specify in a comma seperated string
|
||||
applying LoRA training onto different types of layers and blocks. To allow more flexibility and control over the targeted modules we added `--lora_layers`- in which you can specify in a comma separated string
|
||||
the exact modules for LoRA training. Here are some examples of target modules you can provide:
|
||||
- for attention only layers: `--lora_layers="attn.to_k,attn.to_q,attn.to_v,attn.to_out.0"`
|
||||
- to train the same modules as in the fal trainer: `--lora_layers="attn.to_k,attn.to_q,attn.to_v,attn.to_out.0,attn.add_k_proj,attn.add_q_proj,attn.add_v_proj,attn.to_add_out,ff.net.0.proj,ff.net.2,ff_context.net.0.proj,ff_context.net.2"`
|
||||
- to train the same modules as in ostris ai-toolkit / replicate trainer: `--lora_blocks="attn.to_k,attn.to_q,attn.to_v,attn.to_out.0,attn.add_k_proj,attn.add_q_proj,attn.add_v_proj,attn.to_add_out,ff.net.0.proj,ff.net.2,ff_context.net.0.proj,ff_context.net.2,norm1_context.linear, norm1.linear,norm.linear,proj_mlp,proj_out"`
|
||||
> [!NOTE]
|
||||
> `--lora_layers` can also be used to specify which **blocks** to apply LoRA training to. To do so, simply add a block prefix to each layer in the comma seperated string:
|
||||
> `--lora_layers` can also be used to specify which **blocks** to apply LoRA training to. To do so, simply add a block prefix to each layer in the comma separated string:
|
||||
> **single DiT blocks**: to target the ith single transformer block, add the prefix `single_transformer_blocks.i`, e.g. - `single_transformer_blocks.i.attn.to_k`
|
||||
> **MMDiT blocks**: to target the ith MMDiT block, add the prefix `transformer_blocks.i`, e.g. - `transformer_blocks.i.attn.to_k`
|
||||
> [!NOTE]
|
||||
|
||||
@ -107,7 +107,7 @@ To better track our training experiments, we're using the following flags in the
|
||||
|
||||
Additionally, we welcome you to explore the following CLI arguments:
|
||||
|
||||
* `--lora_layers`: The transformer modules to apply LoRA training on. Please specify the layers in a comma seperated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only.
|
||||
* `--lora_layers`: The transformer modules to apply LoRA training on. Please specify the layers in a comma separated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only.
|
||||
* `--rank`: The rank of the LoRA layers. The higher the rank, the more parameters are trained. The default is 16.
|
||||
|
||||
We provide several options for optimizing memory optimization:
|
||||
|
||||
@ -113,7 +113,7 @@ To better track our training experiments, we're using the following flags in the
|
||||
|
||||
Additionally, we welcome you to explore the following CLI arguments:
|
||||
|
||||
* `--lora_layers`: The transformer modules to apply LoRA training on. Please specify the layers in a comma seperated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only.
|
||||
* `--lora_layers`: The transformer modules to apply LoRA training on. Please specify the layers in a comma separated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only.
|
||||
* `--system_prompt`: A custom system prompt to provide additional personality to the model.
|
||||
* `--max_sequence_length`: Maximum sequence length to use for text embeddings.
|
||||
|
||||
|
||||
@ -113,7 +113,7 @@ To better track our training experiments, we're using the following flags in the
|
||||
|
||||
Additionally, we welcome you to explore the following CLI arguments:
|
||||
|
||||
* `--lora_layers`: The transformer modules to apply LoRA training on. Please specify the layers in a comma seperated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only.
|
||||
* `--lora_layers`: The transformer modules to apply LoRA training on. Please specify the layers in a comma separated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only.
|
||||
* `--complex_human_instruction`: Instructions for complex human attention as shown in [here](https://github.com/NVlabs/Sana/blob/main/configs/sana_app_config/Sana_1600M_app.yaml#L55).
|
||||
* `--max_sequence_length`: Maximum sequence length to use for text embeddings.
|
||||
|
||||
|
||||
@ -567,7 +567,7 @@ def parse_args(input_args=None):
|
||||
type=str,
|
||||
default=None,
|
||||
help=(
|
||||
'The transformer modules to apply LoRA training on. Please specify the layers in a comma seperated. E.g. - "to_k,to_q,to_v,to_out.0" will result in lora training of attention layers only'
|
||||
'The transformer modules to apply LoRA training on. Please specify the layers in a comma separated. E.g. - "to_k,to_q,to_v,to_out.0" will result in lora training of attention layers only'
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@ -596,7 +596,7 @@ def parse_args(input_args=None):
|
||||
type=str,
|
||||
default=None,
|
||||
help=(
|
||||
'The transformer modules to apply LoRA training on. Please specify the layers in a comma seperated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only'
|
||||
'The transformer modules to apply LoRA training on. Please specify the layers in a comma separated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only'
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@ -514,7 +514,7 @@ def parse_args(input_args=None):
|
||||
type=str,
|
||||
default=None,
|
||||
help=(
|
||||
'The transformer modules to apply LoRA training on. Please specify the layers in a comma seperated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only'
|
||||
'The transformer modules to apply LoRA training on. Please specify the layers in a comma separated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only'
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@ -513,7 +513,7 @@ def parse_args(input_args=None):
|
||||
type=str,
|
||||
default=None,
|
||||
help=(
|
||||
'The transformer modules to apply LoRA training on. Please specify the layers in a comma seperated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only'
|
||||
'The transformer modules to apply LoRA training on. Please specify the layers in a comma separated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only'
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@ -576,7 +576,7 @@ def parse_args(input_args=None):
|
||||
type=str,
|
||||
default=None,
|
||||
help=(
|
||||
"The transformer block layers to apply LoRA training on. Please specify the layers in a comma seperated string."
|
||||
"The transformer block layers to apply LoRA training on. Please specify the layers in a comma separated string."
|
||||
"For examples refer to https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/README_SD3.md"
|
||||
),
|
||||
)
|
||||
@ -585,7 +585,7 @@ def parse_args(input_args=None):
|
||||
type=str,
|
||||
default=None,
|
||||
help=(
|
||||
"The transformer blocks to apply LoRA training on. Please specify the block numbers in a comma seperated manner."
|
||||
"The transformer blocks to apply LoRA training on. Please specify the block numbers in a comma separated manner."
|
||||
'E.g. - "--lora_blocks 12,30" will result in lora training of transformer blocks 12 and 30. For more examples refer to https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/README_SD3.md'
|
||||
),
|
||||
)
|
||||
|
||||
@ -664,7 +664,7 @@ def parse_args(input_args=None):
|
||||
action="store_true",
|
||||
default=False,
|
||||
help=(
|
||||
"Wether to train a DoRA as proposed in- DoRA: Weight-Decomposed Low-Rank Adaptation https://arxiv.org/abs/2402.09353. "
|
||||
"Whether to train a DoRA as proposed in- DoRA: Weight-Decomposed Low-Rank Adaptation https://arxiv.org/abs/2402.09353. "
|
||||
"Note: to use DoRA you need to install peft from main, `pip install git+https://github.com/huggingface/peft.git`"
|
||||
),
|
||||
)
|
||||
|
||||
@ -329,7 +329,7 @@ def parse_args(input_args=None):
|
||||
type=str,
|
||||
default=None,
|
||||
help=(
|
||||
'The transformer modules to apply LoRA training on. Please specify the layers in a comma seperated. E.g. - "to_k,to_q,to_v,to_out.0" will result in lora training of attention layers only'
|
||||
'The transformer modules to apply LoRA training on. Please specify the layers in a comma separated. E.g. - "to_k,to_q,to_v,to_out.0" will result in lora training of attention layers only'
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
|
||||
@ -400,7 +400,7 @@ def main():
|
||||
|
||||
image_encoder.requires_grad_(False)
|
||||
|
||||
# For mixed precision training we cast all non-trainable weigths (vae, non-lora text_encoder and non-lora unet) to half-precision
|
||||
# For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
|
||||
# as these weights are only used for inference, keeping weights in full precision is not required.
|
||||
weight_dtype = torch.float32
|
||||
if accelerator.mixed_precision == "fp16":
|
||||
|
||||
@ -1147,7 +1147,7 @@ def main(args):
|
||||
tracker_config = dict(vars(args))
|
||||
accelerator.init_trackers(args.tracker_project_name, config=tracker_config)
|
||||
|
||||
# Function for unwraping if torch.compile() was used in accelerate.
|
||||
# Function for unwrapping if torch.compile() was used in accelerate.
|
||||
def unwrap_model(model):
|
||||
model = accelerator.unwrap_model(model)
|
||||
model = model._orig_mod if is_compiled_module(model) else model
|
||||
|
||||
@ -69,7 +69,7 @@ accelerate launch --config_file=accelerate.yaml \
|
||||
--seed="0"
|
||||
```
|
||||
|
||||
We can direcly pass a quantized checkpoint path, too:
|
||||
We can directly pass a quantized checkpoint path, too:
|
||||
|
||||
```diff
|
||||
+ --quantized_model_path="hf-internal-testing/flux.1-dev-nf4-pkg"
|
||||
|
||||
@ -13,7 +13,7 @@ args = parser.parse_args()
|
||||
|
||||
|
||||
device = "cpu"
|
||||
prompt = "a lovely <dicoo> in red dress and hat, in the snowly and brightly night, with many brighly buildings"
|
||||
prompt = "a lovely <dicoo> in red dress and hat, in the snowly and brightly night, with many brightly buildings"
|
||||
|
||||
model_id = "path-to-your-trained-model"
|
||||
pipe = StableDiffusionPipeline.from_pretrained(model_id)
|
||||
|
||||
@ -80,7 +80,7 @@ export INT8_MODEL_NAME="./int8_model"
|
||||
|
||||
python text2images.py \
|
||||
--pretrained_model_name_or_path=$INT8_MODEL_NAME \
|
||||
--caption "a lovely <dicoo> in red dress and hat, in the snowly and brightly night, with many brighly buildings." \
|
||||
--caption "a lovely <dicoo> in red dress and hat, in the snowly and brightly night, with many brightly buildings." \
|
||||
--images_num 4
|
||||
```
|
||||
|
||||
|
||||
@ -664,7 +664,7 @@ class PixArtAlphaControlnetPipeline(DiffusionPipeline):
|
||||
# &
|
||||
caption = re.sub(r"&", "", caption)
|
||||
|
||||
# ip adresses:
|
||||
# ip addresses:
|
||||
caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)
|
||||
|
||||
# article ids:
|
||||
|
||||
@ -612,7 +612,7 @@ def main():
|
||||
# See Section 3.1. of the paper.
|
||||
max_length = 120
|
||||
|
||||
# For mixed precision training we cast all non-trainable weigths (vae, text_encoder) to half-precision
|
||||
# For mixed precision training we cast all non-trainable weights (vae, text_encoder) to half-precision
|
||||
# as these weights are only used for inference, keeping weights in full precision is not required.
|
||||
weight_dtype = torch.float32
|
||||
if accelerator.mixed_precision == "fp16":
|
||||
|
||||
@ -120,11 +120,11 @@ if __name__ == "__main__":
|
||||
parser.add_argument("--schnell", action="store_true", help="run flux schnell instead of dev")
|
||||
parser.add_argument("--width", type=int, default=1024, help="width of the image to generate")
|
||||
parser.add_argument("--height", type=int, default=1024, help="height of the image to generate")
|
||||
parser.add_argument("--guidance", type=float, default=3.5, help="gauidance strentgh for dev")
|
||||
parser.add_argument("--guidance", type=float, default=3.5, help="guidance strength for dev")
|
||||
parser.add_argument("--seed", type=int, default=None, help="seed for inference")
|
||||
parser.add_argument("--profile", action="store_true", help="enable profiling")
|
||||
parser.add_argument("--profile-duration", type=int, default=10000, help="duration for profiling in msec.")
|
||||
parser.add_argument("--itters", type=int, default=15, help="tiems to run inference and get avg time in sec.")
|
||||
parser.add_argument("--itters", type=int, default=15, help="items to run inference and get avg time in sec.")
|
||||
args = parser.parse_args()
|
||||
if args.schnell:
|
||||
ckpt_id = "black-forest-labs/FLUX.1-schnell"
|
||||
|
||||
@ -759,7 +759,7 @@ def main(args):
|
||||
unet, text_encoder, optimizer, train_dataloader
|
||||
)
|
||||
|
||||
# For mixed precision training we cast all non-trainable weigths (vae, non-lora text_encoder and non-lora unet) to half-precision
|
||||
# For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
|
||||
# as these weights are only used for inference, keeping weights in full precision is not required.
|
||||
weight_dtype = torch.float32
|
||||
if accelerator.mixed_precision == "fp16":
|
||||
|
||||
@ -661,7 +661,7 @@ def parse_args(input_args=None):
|
||||
action="store_true",
|
||||
default=False,
|
||||
help=(
|
||||
"Wether to train a DoRA as proposed in- DoRA: Weight-Decomposed Low-Rank Adaptation https://arxiv.org/abs/2402.09353. "
|
||||
"Whether to train a DoRA as proposed in- DoRA: Weight-Decomposed Low-Rank Adaptation https://arxiv.org/abs/2402.09353. "
|
||||
"Note: to use DoRA you need to install peft from main, `pip install git+https://github.com/huggingface/peft.git`"
|
||||
),
|
||||
)
|
||||
|
||||
@ -789,7 +789,7 @@ def main():
|
||||
text_encoder, optimizer, train_dataloader, lr_scheduler
|
||||
)
|
||||
|
||||
# For mixed precision training we cast all non-trainable weigths (vae, non-lora text_encoder and non-lora unet) to half-precision
|
||||
# For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
|
||||
# as these weights are only used for inference, keeping weights in full precision is not required.
|
||||
weight_dtype = torch.float32
|
||||
if accelerator.mixed_precision == "fp16":
|
||||
|
||||
@ -814,7 +814,7 @@ def main():
|
||||
text_encoder_1, text_encoder_2, optimizer, train_dataloader, lr_scheduler
|
||||
)
|
||||
|
||||
# For mixed precision training we cast all non-trainable weigths (vae, non-lora text_encoder and non-lora unet) to half-precision
|
||||
# For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
|
||||
# as these weights are only used for inference, keeping weights in full precision is not required.
|
||||
weight_dtype = torch.float32
|
||||
if accelerator.mixed_precision == "fp16":
|
||||
|
||||
@ -220,7 +220,7 @@ def convert_flux_transformer_checkpoint_to_diffusers(
|
||||
f"double_blocks.{i}.txt_attn.proj.bias"
|
||||
)
|
||||
|
||||
# single transfomer blocks
|
||||
# single transformer blocks
|
||||
for i in range(num_single_layers):
|
||||
block_prefix = f"single_transformer_blocks.{i}."
|
||||
# norm.linear <- single_blocks.0.modulation.lin
|
||||
|
||||
@ -394,7 +394,7 @@ if __name__ == "__main__":
|
||||
help="Scheduler type to use. Use 'scm' for Sana Sprint models.",
|
||||
)
|
||||
parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output pipeline.")
|
||||
parser.add_argument("--save_full_pipeline", action="store_true", help="save all the pipelien elemets in one.")
|
||||
parser.add_argument("--save_full_pipeline", action="store_true", help="save all the pipeline elements in one.")
|
||||
parser.add_argument("--dtype", default="fp32", type=str, choices=["fp32", "fp16", "bf16"], help="Weight dtype.")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@ -984,7 +984,7 @@ def renderer(*, args, checkpoint_map_location):
|
||||
return renderer_model
|
||||
|
||||
|
||||
# prior model will expect clip_mean and clip_std, whic are missing from the state_dict
|
||||
# prior model will expect clip_mean and clip_std, which are missing from the state_dict
|
||||
PRIOR_EXPECTED_MISSING_KEYS = ["clip_mean", "clip_std"]
|
||||
|
||||
|
||||
|
||||
@ -55,8 +55,8 @@ for key in orig_state_dict.keys():
|
||||
state_dict[key.replace("attn.out_proj.bias", "to_out.0.bias")] = weights
|
||||
else:
|
||||
state_dict[key] = orig_state_dict[key]
|
||||
deocder = WuerstchenDiffNeXt()
|
||||
deocder.load_state_dict(state_dict)
|
||||
decoder = WuerstchenDiffNeXt()
|
||||
decoder.load_state_dict(state_dict)
|
||||
|
||||
# Prior
|
||||
orig_state_dict = torch.load(os.path.join(model_path, "model_v3_stage_c.pt"), map_location=device)["ema_state_dict"]
|
||||
@ -94,7 +94,7 @@ prior_pipeline = WuerstchenPriorPipeline(
|
||||
prior_pipeline.save_pretrained("warp-ai/wuerstchen-prior")
|
||||
|
||||
decoder_pipeline = WuerstchenDecoderPipeline(
|
||||
text_encoder=gen_text_encoder, tokenizer=gen_tokenizer, vqgan=vqmodel, decoder=deocder, scheduler=scheduler
|
||||
text_encoder=gen_text_encoder, tokenizer=gen_tokenizer, vqgan=vqmodel, decoder=decoder, scheduler=scheduler
|
||||
)
|
||||
decoder_pipeline.save_pretrained("warp-ai/wuerstchen")
|
||||
|
||||
@ -103,7 +103,7 @@ wuerstchen_pipeline = WuerstchenCombinedPipeline(
|
||||
# Decoder
|
||||
text_encoder=gen_text_encoder,
|
||||
tokenizer=gen_tokenizer,
|
||||
decoder=deocder,
|
||||
decoder=decoder,
|
||||
scheduler=scheduler,
|
||||
vqgan=vqmodel,
|
||||
# Prior
|
||||
|
||||
@ -243,7 +243,7 @@ class GroupOffloadingHook(ModelHook):
|
||||
|
||||
class LazyPrefetchGroupOffloadingHook(ModelHook):
|
||||
r"""
|
||||
A hook, used in conjuction with GroupOffloadingHook, that applies lazy prefetching to groups of torch.nn.Module.
|
||||
A hook, used in conjunction with GroupOffloadingHook, that applies lazy prefetching to groups of torch.nn.Module.
|
||||
This hook is used to determine the order in which the layers are executed during the forward pass. Once the layer
|
||||
invocation order is known, assignments of the next_group attribute for prefetching can be made, which allows
|
||||
prefetching groups in the correct order.
|
||||
|
||||
@ -90,7 +90,7 @@ class PeftInputAutocastDisableHook(ModelHook):
|
||||
that the inputs are casted to the computation dtype correctly always. However, there are two goals we are
|
||||
hoping to achieve:
|
||||
1. Making forward implementations independent of device/dtype casting operations as much as possible.
|
||||
2. Peforming inference without losing information from casting to different precisions. With the current
|
||||
2. Performing inference without losing information from casting to different precisions. With the current
|
||||
PEFT implementation (as linked in the reference above), and assuming running layerwise casting inference
|
||||
with storage_dtype=torch.float8_e4m3fn and compute_dtype=torch.bfloat16, inputs are cast to
|
||||
torch.float8_e4m3fn in the lora layer. We will then upcast back to torch.bfloat16 when we continue the
|
||||
|
||||
@ -819,7 +819,7 @@ def _convert_kohya_flux_lora_to_diffusers(state_dict):
|
||||
if zero_status_pe:
|
||||
logger.info(
|
||||
"The `position_embedding` LoRA params are all zeros which make them ineffective. "
|
||||
"So, we will purge them out of the curret state dict to make loading possible."
|
||||
"So, we will purge them out of the current state dict to make loading possible."
|
||||
)
|
||||
|
||||
else:
|
||||
@ -835,7 +835,7 @@ def _convert_kohya_flux_lora_to_diffusers(state_dict):
|
||||
if zero_status_t5:
|
||||
logger.info(
|
||||
"The `t5xxl` LoRA params are all zeros which make them ineffective. "
|
||||
"So, we will purge them out of the curret state dict to make loading possible."
|
||||
"So, we will purge them out of the current state dict to make loading possible."
|
||||
)
|
||||
else:
|
||||
logger.info(
|
||||
@ -850,7 +850,7 @@ def _convert_kohya_flux_lora_to_diffusers(state_dict):
|
||||
if zero_status_diff_b:
|
||||
logger.info(
|
||||
"The `diff_b` LoRA params are all zeros which make them ineffective. "
|
||||
"So, we will purge them out of the curret state dict to make loading possible."
|
||||
"So, we will purge them out of the current state dict to make loading possible."
|
||||
)
|
||||
else:
|
||||
logger.info(
|
||||
@ -866,7 +866,7 @@ def _convert_kohya_flux_lora_to_diffusers(state_dict):
|
||||
if zero_status_diff:
|
||||
logger.info(
|
||||
"The `diff` LoRA params are all zeros which make them ineffective. "
|
||||
"So, we will purge them out of the curret state dict to make loading possible."
|
||||
"So, we will purge them out of the current state dict to make loading possible."
|
||||
)
|
||||
else:
|
||||
logger.info(
|
||||
@ -1237,7 +1237,7 @@ def _convert_bfl_flux_control_lora_to_diffusers(original_state_dict):
|
||||
f"double_blocks.{i}.txt_attn.norm.key_norm.scale"
|
||||
)
|
||||
|
||||
# single transfomer blocks
|
||||
# single transformer blocks
|
||||
for i in range(num_single_layers):
|
||||
block_prefix = f"single_transformer_blocks.{i}."
|
||||
|
||||
|
||||
@ -2413,7 +2413,7 @@ class FluxLoraLoaderMixin(LoraBaseMixin):
|
||||
) -> bool:
|
||||
"""
|
||||
Control LoRA expands the shape of the input layer from (3072, 64) to (3072, 128). This method handles that and
|
||||
generalizes things a bit so that any parameter that needs expansion receives appropriate treatement.
|
||||
generalizes things a bit so that any parameter that needs expansion receives appropriate treatment.
|
||||
"""
|
||||
state_dict = {}
|
||||
if lora_state_dict is not None:
|
||||
|
||||
@ -330,7 +330,7 @@ class PeftAdapterMixin:
|
||||
new_sd[k] = v
|
||||
return new_sd
|
||||
|
||||
# To handle scenarios where we cannot successfully set state dict. If it's unsucessful,
|
||||
# To handle scenarios where we cannot successfully set state dict. If it's unsuccessful,
|
||||
# we should also delete the `peft_config` associated to the `adapter_name`.
|
||||
try:
|
||||
if hotswap:
|
||||
@ -344,7 +344,7 @@ class PeftAdapterMixin:
|
||||
config=lora_config,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Hotswapping {adapter_name} was unsucessful with the following error: \n{e}")
|
||||
logger.error(f"Hotswapping {adapter_name} was unsuccessful with the following error: \n{e}")
|
||||
raise
|
||||
# the hotswap function raises if there are incompatible keys, so if we reach this point we can set
|
||||
# it to None
|
||||
@ -379,7 +379,7 @@ class PeftAdapterMixin:
|
||||
module.delete_adapter(adapter_name)
|
||||
|
||||
self.peft_config.pop(adapter_name)
|
||||
logger.error(f"Loading {adapter_name} was unsucessful with the following error: \n{e}")
|
||||
logger.error(f"Loading {adapter_name} was unsuccessful with the following error: \n{e}")
|
||||
raise
|
||||
|
||||
warn_msg = ""
|
||||
@ -712,7 +712,7 @@ class PeftAdapterMixin:
|
||||
if self.lora_scale != 1.0:
|
||||
module.scale_layer(self.lora_scale)
|
||||
|
||||
# For BC with prevous PEFT versions, we need to check the signature
|
||||
# For BC with previous PEFT versions, we need to check the signature
|
||||
# of the `merge` method to see if it supports the `adapter_names` argument.
|
||||
supported_merge_kwargs = list(inspect.signature(module.merge).parameters)
|
||||
if "adapter_names" in supported_merge_kwargs:
|
||||
|
||||
@ -453,7 +453,7 @@ class FromSingleFileMixin:
|
||||
logger.warning(
|
||||
"Detected legacy `from_single_file` loading behavior. Attempting to create the pipeline based on inferred components.\n"
|
||||
"This may lead to errors if the model components are not correctly inferred. \n"
|
||||
"To avoid this warning, please explicity pass the `config` argument to `from_single_file` with a path to a local diffusers model repo \n"
|
||||
"To avoid this warning, please explicitly pass the `config` argument to `from_single_file` with a path to a local diffusers model repo \n"
|
||||
"e.g. `from_single_file(<my model checkpoint path>, config=<path to local diffusers model repo>) \n"
|
||||
"or run `from_single_file` with `local_files_only=False` first to update the local cache directory with "
|
||||
"the necessary config files.\n"
|
||||
|
||||
@ -2278,7 +2278,7 @@ def convert_flux_transformer_checkpoint_to_diffusers(checkpoint, **kwargs):
|
||||
f"double_blocks.{i}.txt_attn.proj.bias"
|
||||
)
|
||||
|
||||
# single transfomer blocks
|
||||
# single transformer blocks
|
||||
for i in range(num_single_layers):
|
||||
block_prefix = f"single_transformer_blocks.{i}."
|
||||
# norm.linear <- single_blocks.0.modulation.lin
|
||||
@ -2872,7 +2872,7 @@ def convert_auraflow_transformer_checkpoint_to_diffusers(checkpoint, **kwargs):
|
||||
def convert_lumina2_to_diffusers(checkpoint, **kwargs):
|
||||
converted_state_dict = {}
|
||||
|
||||
# Original Lumina-Image-2 has an extra norm paramter that is unused
|
||||
# Original Lumina-Image-2 has an extra norm parameter that is unused
|
||||
# We just remove it here
|
||||
checkpoint.pop("norm_final.weight", None)
|
||||
|
||||
|
||||
@ -123,7 +123,7 @@ class SD3Transformer2DLoadersMixin:
|
||||
key = key.replace(f"layers.{idx}.2.1", f"layers.{idx}.adaln_proj")
|
||||
updated_state_dict[key] = value
|
||||
|
||||
# Image projetion parameters
|
||||
# Image projection parameters
|
||||
embed_dim = updated_state_dict["proj_in.weight"].shape[1]
|
||||
output_dim = updated_state_dict["proj_out.weight"].shape[0]
|
||||
hidden_dim = updated_state_dict["proj_in.weight"].shape[0]
|
||||
|
||||
@ -734,17 +734,17 @@ class UNetControlNetXSModel(ModelMixin, ConfigMixin):
unet (`UNet2DConditionModel`):
The UNet model we want to control.
controlnet (`ControlNetXSAdapter`):
The ConntrolNet-XS adapter with which the UNet will be fused. If none is given, a new ConntrolNet-XS
The ControlNet-XS adapter with which the UNet will be fused. If none is given, a new ControlNet-XS
adapter will be created.
size_ratio (float, *optional*, defaults to `None`):
Used to contruct the controlnet if none is given. See [`ControlNetXSAdapter.from_unet`] for details.
Used to construct the controlnet if none is given. See [`ControlNetXSAdapter.from_unet`] for details.
ctrl_block_out_channels (`List[int]`, *optional*, defaults to `None`):
Used to contruct the controlnet if none is given. See [`ControlNetXSAdapter.from_unet`] for details,
Used to construct the controlnet if none is given. See [`ControlNetXSAdapter.from_unet`] for details,
where this parameter is called `block_out_channels`.
time_embedding_mix (`float`, *optional*, defaults to None):
Used to contruct the controlnet if none is given. See [`ControlNetXSAdapter.from_unet`] for details.
Used to construct the controlnet if none is given. See [`ControlNetXSAdapter.from_unet`] for details.
ctrl_optional_kwargs (`Dict`, *optional*, defaults to `None`):
Passed to the `init` of the new controlent if no controlent was given.
Passed to the `init` of the new controlnet if no controlnet was given.
"""
if controlnet is None:
controlnet = ControlNetXSAdapter.from_unet(
@ -97,7 +97,7 @@ def get_3d_sincos_pos_embed(
The spatial dimension of positional embeddings. If an integer is provided, the same size is applied to both
spatial dimensions (height and width).
temporal_size (`int`):
The temporal dimension of postional embeddings (number of frames).
The temporal dimension of positional embeddings (number of frames).
spatial_interpolation_scale (`float`, defaults to 1.0):
Scale factor for spatial grid interpolation.
temporal_interpolation_scale (`float`, defaults to 1.0):
@ -169,7 +169,7 @@ def _get_3d_sincos_pos_embed_np(
The spatial dimension of positional embeddings. If an integer is provided, the same size is applied to both
spatial dimensions (height and width).
temporal_size (`int`):
The temporal dimension of postional embeddings (number of frames).
The temporal dimension of positional embeddings (number of frames).
spatial_interpolation_scale (`float`, defaults to 1.0):
Scale factor for spatial grid interpolation.
temporal_interpolation_scale (`float`, defaults to 1.0):
@ -30,7 +30,7 @@ class LatteTransformer3DModel(ModelMixin, ConfigMixin, CacheMixin):
_supports_gradient_checkpointing = True

"""
A 3D Transformer model for video-like data, paper: https://arxiv.org/abs/2401.03048, offical code:
A 3D Transformer model for video-like data, paper: https://arxiv.org/abs/2401.03048, official code:
https://github.com/Vchitect/Latte

Parameters:
@ -216,7 +216,7 @@ class LatteTransformer3DModel(ModelMixin, ConfigMixin, CacheMixin):
)
num_patches = height * width

hidden_states = self.pos_embed(hidden_states) # alrady add positional embeddings
hidden_states = self.pos_embed(hidden_states) # already add positional embeddings

added_cond_kwargs = {"resolution": None, "aspect_ratio": None}
timestep, embedded_timestep = self.adaln_single(
@ -43,7 +43,7 @@ class LuminaNextDiTBlock(nn.Module):
num_kv_heads (`int`):
Number of attention heads in key and value features (if using GQA), or set to None for the same as query.
multiple_of (`int`): The number of multiple of ffn layer.
ffn_dim_multiplier (`float`): The multipier factor of ffn layer dimension.
ffn_dim_multiplier (`float`): The multiplier factor of ffn layer dimension.
norm_eps (`float`): The eps for norm layer.
qk_norm (`bool`): normalization for query and key.
cross_attention_dim (`int`): Cross attention embedding dimension of the input text prompt hidden_states.

@ -154,7 +154,7 @@ class I2VGenXLUNet(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
# of that, we used `num_attention_heads` for arguments that actually denote attention head dimension. This
# is why we ignore `num_attention_heads` and calculate it from `attention_head_dims` below.
# This is still an incorrect way of calculating `num_attention_heads` but we need to stick to it
# without running proper depcrecation cycles for the {down,mid,up} blocks which are a
# without running proper deprecation cycles for the {down,mid,up} blocks which are a
# part of the public API.
num_attention_heads = attention_head_dim
@ -131,7 +131,7 @@ class AmusedPipeline(DiffusionPipeline):
generation deterministic.
latents (`torch.IntTensor`, *optional*):
Pre-generated tokens representing latent vectors in `self.vqvae`, to be used as inputs for image
gneration. If not provided, the starting latents will be completely masked.
generation. If not provided, the starting latents will be completely masked.
prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
provided, text embeddings are generated from the `prompt` input argument. A single vector from the
@ -373,7 +373,7 @@ class AudioLDM2Pipeline(DiffusionPipeline):
*e.g.* prompt weighting. If not provided, negative_prompt_embeds will be computed from
`negative_prompt` input argument.
generated_prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated text embeddings from the GPT2 langauge model. Can be used to easily tweak text inputs,
Pre-generated text embeddings from the GPT2 language model. Can be used to easily tweak text inputs,
*e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input
argument.
negative_generated_prompt_embeds (`torch.Tensor`, *optional*):
@ -394,7 +394,7 @@ class AudioLDM2Pipeline(DiffusionPipeline):
attention_mask (`torch.LongTensor`):
Attention mask to be applied to the `prompt_embeds`.
generated_prompt_embeds (`torch.Tensor`):
Text embeddings generated from the GPT2 langauge model.
Text embeddings generated from the GPT2 language model.

Example:
@ -904,7 +904,7 @@ class AudioLDM2Pipeline(DiffusionPipeline):
Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
generated_prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated text embeddings from the GPT2 langauge model. Can be used to easily tweak text inputs,
Pre-generated text embeddings from the GPT2 language model. Can be used to easily tweak text inputs,
*e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input
argument.
negative_generated_prompt_embeds (`torch.Tensor`, *optional*):
@ -138,7 +138,7 @@ class BlipDiffusionPipeline(DiffusionPipeline):
def get_query_embeddings(self, input_image, src_subject):
return self.qformer(image_input=input_image, text_input=src_subject, return_dict=False)

# from the original Blip Diffusion code, speciefies the target subject and augments the prompt by repeating it
# from the original Blip Diffusion code, specifies the target subject and augments the prompt by repeating it
def _build_prompt(self, prompts, tgt_subjects, prompt_strength=1.0, prompt_reps=20):
rv = []
for prompt, tgt_subject in zip(prompts, tgt_subjects):

@ -149,7 +149,7 @@ class BlipDiffusionControlNetPipeline(DiffusionPipeline):
def get_query_embeddings(self, input_image, src_subject):
return self.qformer(image_input=input_image, text_input=src_subject, return_dict=False)

# from the original Blip Diffusion code, speciefies the target subject and augments the prompt by repeating it
# from the original Blip Diffusion code, specifies the target subject and augments the prompt by repeating it
def _build_prompt(self, prompts, tgt_subjects, prompt_strength=1.0, prompt_reps=20):
rv = []
for prompt, tgt_subject in zip(prompts, tgt_subjects):
@ -739,7 +739,7 @@ class StableDiffusionControlNetXSPipeline(
callback_on_step_end_tensor_inputs (`List`, *optional*):
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
`._callback_tensor_inputs` attribute of your pipeine class.
`._callback_tensor_inputs` attribute of your pipeline class.
Examples:

Returns:

@ -880,7 +880,7 @@ class StableDiffusionXLControlNetXSPipeline(
callback_on_step_end_tensor_inputs (`List`, *optional*):
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
`._callback_tensor_inputs` attribute of your pipeine class.
`._callback_tensor_inputs` attribute of your pipeline class.

Examples:
@ -97,7 +97,7 @@ class DanceDiffusionPipeline(DiffusionPipeline):
for i, audio in enumerate(audios):
write(f"maestro_test_{i}.wav", pipe.unet.sample_rate, audio.transpose())

# To dislay in google colab
# To display in google colab
import IPython.display as ipd

for audio in audios:
@ -509,7 +509,8 @@ class StableDiffusionModelEditingPipeline(
The destination prompt. Must contain all words from `source_prompt` with additional ones to specify the
target edit.
lamb (`float`, *optional*, defaults to 0.1):
The lambda parameter specifying the regularization intesity. Smaller values increase the editing power.
The lambda parameter specifying the regularization intensity. Smaller values increase the editing
power.
restart_params (`bool`, *optional*, defaults to True):
Restart the model parameters to their pre-trained version before editing. This is done to avoid edit
compounding. When it is `False`, edits accumulate.
@ -1097,7 +1097,7 @@ class UNetFlatConditionModel(ModelMixin, ConfigMixin):
cross_attention_kwargs (`dict`, *optional*):
A kwargs dictionary that if specified is passed along to the [`AttnProcessor`].
added_cond_kwargs: (`dict`, *optional*):
A kwargs dictionary containin additional embeddings that if specified are added to the embeddings that
A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
are passed along to the UNet blocks.
down_block_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
additional residuals to be added to UNet long skip connections from down blocks to up blocks for
@ -478,7 +478,7 @@ class AnimateDiffFreeNoiseMixin:
Must be one of ["shuffle_context", "repeat_context", "random"].
- "shuffle_context"
Shuffles a fixed batch of `context_length` latents to create a final latent of size
`num_frames`. This is usually the best setting for most generation scenarious. However, there
`num_frames`. This is usually the best setting for most generation scenarios. However, there
might be visible repetition noticeable in the kinds of motion/animation generated.
- "repeated_context"
Repeats a fixed batch of `context_length` latents to create a final latent of size
@ -462,7 +462,7 @@ class I2VGenXLPipeline(
image_latents = image_latents.unsqueeze(2)

# Append a position mask for each subsequent frame
# after the intial image latent frame
# after the initial image latent frame
frame_position_mask = []
for frame_idx in range(num_frames - 1):
scale = (frame_idx + 1) / (num_frames - 1)
@ -496,7 +496,7 @@ class KandinskyInpaintPipeline(DiffusionPipeline):
"As of diffusers==0.19.0 this behavior has been inverted. Now white pixels are repainted and black pixels are preserved. "
"This way, Kandinsky's masking behavior is aligned with Stable Diffusion. "
"THIS means that you HAVE to invert the input mask to have the same behavior as before as explained in https://github.com/huggingface/diffusers/pull/4207. "
"This warning will be surpressed after the first inference call and will be removed in diffusers>0.23.0"
"This warning will be suppressed after the first inference call and will be removed in diffusers>0.23.0"
)
self._warn_has_been_called = True

@ -386,7 +386,7 @@ class KandinskyV22InpaintPipeline(DiffusionPipeline):
"As of diffusers==0.19.0 this behavior has been inverted. Now white pixels are repainted and black pixels are preserved. "
"This way, Kandinsky's masking behavior is aligned with Stable Diffusion. "
"THIS means that you HAVE to invert the input mask to have the same behavior as before as explained in https://github.com/huggingface/diffusers/pull/4207. "
"This warning will be surpressed after the first inference call and will be removed in diffusers>0.23.0"
"This warning will be suppressed after the first inference call and will be removed in diffusers>0.23.0"
)
self._warn_has_been_called = True
@ -668,7 +668,7 @@ class Embedding(torch.nn.Module):
# Embeddings.
words_embeddings = self.word_embeddings(input_ids)
embeddings = words_embeddings
# Data format change to avoid explicit tranposes : [b s h] --> [s b h].
# Data format change to avoid explicit transposes : [b s h] --> [s b h].
embeddings = embeddings.transpose(0, 1).contiguous()
# If the input flag for fp32 residual connection is set, convert for float.
if self.fp32_residual_connection:
@ -1458,7 +1458,7 @@ def compute_noise_ddim(scheduler, prev_latents, latents, timestep, noise_pred, e
# 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * noise_pred

# modifed so that updated xtm1 is returned as well (to avoid error accumulation)
# modified so that updated xtm1 is returned as well (to avoid error accumulation)
mu_xt = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction
if variance > 0.0:
noise = (prev_latents - mu_xt) / (variance ** (0.5) * eta)

@ -1742,7 +1742,7 @@ def compute_noise_ddim(scheduler, prev_latents, latents, timestep, noise_pred, e
# 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * noise_pred

# modifed so that updated xtm1 is returned as well (to avoid error accumulation)
# modified so that updated xtm1 is returned as well (to avoid error accumulation)
mu_xt = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction
if variance > 0.0:
noise = (prev_latents - mu_xt) / (variance ** (0.5) * eta)
@ -426,7 +426,7 @@ class MarigoldImageProcessor(ConfigMixin):
if isinstance(img, np.ndarray):
img = torch.from_numpy(img)
if not torch.is_floating_point(img):
raise ValueError(f"{prefix}: unexected dtype={img.dtype}.")
raise ValueError(f"{prefix}: unexpected dtype={img.dtype}.")
else:
raise ValueError(f"{prefix}: unexpected type={type(img)}.")
if val_min != 0.0 or val_max != 1.0:
@ -464,7 +464,7 @@ class MarigoldImageProcessor(ConfigMixin):
if torch.is_tensor(img):
img = img.cpu().numpy()
if not np.issubdtype(img.dtype, np.floating):
raise ValueError(f"{prefix}: unexected dtype={img.dtype}.")
raise ValueError(f"{prefix}: unexpected dtype={img.dtype}.")
if val_min != 0.0 or val_max != 1.0:
img = (img - val_min) / (val_max - val_min)
img = (img * (2**16 - 1)).astype(np.uint16)
@ -176,7 +176,7 @@ class OmniGenPipeline(
get the continue embedding of input images by VAE

Args:
input_pixel_values: normlized pixel of input images
input_pixel_values: normalized pixel of input images
device:
Returns: torch.Tensor
"""
@ -115,7 +115,7 @@ EXAMPLE_DOC_STRING = """
... with torch.no_grad(), torch.autocast("cuda"):
... depth_map = depth_estimator(image).predicted_depth

... depth_map = torch.nn.fuctional.interpolate(
... depth_map = torch.nn.functional.interpolate(
... depth_map.unsqueeze(1),
... size=(1024, 1024),
... mode="bicubic",
@ -1038,7 +1038,7 @@ class ShapERenderer(ModelMixin, ConfigMixin):
textures = _convert_srgb_to_linear(textures)
textures = textures.float()

# 3.3 augument the mesh with texture data
# 3.3 augment the mesh with texture data
assert len(textures.shape) == 3 and textures.shape[-1] == len(texture_channels), (
f"expected [meta_batch x inner_batch x texture_channels] field results, but got {textures.shape}"
)
@ -524,9 +524,9 @@ class StableCascadeDecoderPipeline(DiffusionPipeline):
latents = self.vqgan.config.scale_factor * latents
images = self.vqgan.decode(latents).sample.clamp(0, 1)
if output_type == "np":
images = images.permute(0, 2, 3, 1).cpu().float().numpy() # float() as bfloat16-> numpy doesnt work
images = images.permute(0, 2, 3, 1).cpu().float().numpy() # float() as bfloat16-> numpy doesn't work
elif output_type == "pil":
images = images.permute(0, 2, 3, 1).cpu().float().numpy() # float() as bfloat16-> numpy doesnt work
images = images.permute(0, 2, 3, 1).cpu().float().numpy() # float() as bfloat16-> numpy doesn't work
images = self.numpy_to_pil(images)
else:
images = latents
@ -626,11 +626,11 @@ class StableCascadePriorPipeline(DiffusionPipeline):
self.maybe_free_model_hooks()

if output_type == "np":
latents = latents.cpu().float().numpy() # float() as bfloat16-> numpy doesnt work
prompt_embeds = prompt_embeds.cpu().float().numpy() # float() as bfloat16-> numpy doesnt work
latents = latents.cpu().float().numpy() # float() as bfloat16-> numpy doesn't work
prompt_embeds = prompt_embeds.cpu().float().numpy() # float() as bfloat16-> numpy doesn't work
negative_prompt_embeds = (
negative_prompt_embeds.cpu().float().numpy() if negative_prompt_embeds is not None else None
) # float() as bfloat16-> numpy doesnt work
) # float() as bfloat16-> numpy doesn't work

if not return_dict:
return (
@ -1047,7 +1047,7 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, StableDiffusionM
class GaussianSmoothing(torch.nn.Module):
"""
Arguments:
Apply gaussian smoothing on a 1d, 2d or 3d tensor. Filtering is performed seperately for each channel in the input
Apply gaussian smoothing on a 1d, 2d or 3d tensor. Filtering is performed separately for each channel in the input
using a depthwise convolution.
channels (int, sequence): Number of channels of the input tensors. Output will
have this number of channels as well.
@ -123,7 +123,7 @@ class StableDiffusionKDiffusionPipeline(
super().__init__()

logger.info(
f"{self.__class__} is an experimntal pipeline and is likely to change in the future. We recommend to use"
f"{self.__class__} is an experimental pipeline and is likely to change in the future. We recommend to use"
" this pipeline for fast experimentation / iteration if needed, but advice to rely on existing pipelines"
" as defined in https://huggingface.co/docs/diffusers/api/schedulers#implemented-schedulers for"
" production settings."
Some files were not shown because too many files have changed in this diff.