Compare commits

...

11 Commits

Author SHA1 Message Date
Sayak Paul 7394b99047 Merge branch 'main' into fix-casting-training-params 2024-06-18 14:45:33 +01:00
Sayak Paul 4edde134f6 [SD3 training] refactor the density and weighting utilities. (#8591)
refactor the density and weighting utilities.
2024-06-18 14:44:38 +01:00
Sayak Paul 4716a413bf Merge branch 'main' into fix-casting-training-params 2024-06-18 14:18:05 +01:00
Bagheera 074a7cc3c5 SD3: update default training timestep / loss weighting distribution to logit_normal (#8592)
Co-authored-by: bghira <bghira@users.github.com>
Co-authored-by: Kashif Rasul <kashif.rasul@gmail.com>
2024-06-18 14:15:19 +01:00
Álvaro Somoza 6bfd13f07a [SD3 Training] T5 token limit (#8564)
* initial commit

* default back to 77

* better text

* text correction

---------

Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>
2024-06-17 16:32:56 -04:00
Sayak Paul c843fb25ec Merge branch 'main' into fix-casting-training-params 2024-06-17 20:43:54 +01:00
AmosDinh eeb70033a6 Syntax error in readme example "pipe" -> "pipeline" (#8601)
Update controlnet.md

Syntax error pipe -> pipeline
2024-06-17 11:02:07 -07:00
Dhruv Nair c4a4750cb3 Temporarily pin Numpy in the CI (#8603)
temp pin numpy
2024-06-17 19:32:38 +05:30
Sayak Paul 38d768a876 Merge branch 'main' into fix-casting-training-params 2024-06-11 13:09:41 +01:00
Sayak Paul 2c35ea66d9 Merge branch 'main' into fix-casting-training-params 2024-06-10 13:57:27 +01:00
sayakpaul 34fbd5526a fix the position of param casting when loading them 2024-06-10 13:53:59 +01:00
16 changed files with 134 additions and 68 deletions
+1 -1
View File
@@ -42,7 +42,7 @@ RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
huggingface-hub \
Jinja2 \
librosa \
numpy \
numpy==1.26.4 \
scipy \
tensorboard \
transformers \
+1 -1
View File
@@ -40,7 +40,7 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
huggingface-hub \
Jinja2 \
librosa \
numpy \
numpy==1.26.4 \
scipy \
tensorboard \
transformers
+2 -2
View File
@@ -41,8 +41,8 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
hf-doc-builder \
huggingface-hub \
Jinja2 \
librosa \
numpy \
librosa \
numpy==1.26.4 \
scipy \
tensorboard \
transformers
+1 -1
View File
@@ -40,7 +40,7 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
huggingface-hub \
Jinja2 \
librosa \
numpy \
numpy==1.26.4 \
scipy \
tensorboard \
transformers
+1 -1
View File
@@ -40,7 +40,7 @@ RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
huggingface-hub \
Jinja2 \
librosa \
numpy \
numpy==1.26.4 \
scipy \
tensorboard \
transformers
@@ -39,7 +39,7 @@ RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
huggingface-hub \
Jinja2 \
librosa \
numpy \
numpy==1.26.4 \
scipy \
tensorboard \
transformers
+1 -1
View File
@@ -40,7 +40,7 @@ RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
huggingface-hub \
Jinja2 \
librosa \
numpy \
numpy==1.26.4 \
scipy \
tensorboard \
transformers matplotlib
+1 -1
View File
@@ -39,7 +39,7 @@ RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
huggingface-hub \
Jinja2 \
librosa \
numpy \
numpy==1.26.4 \
scipy \
tensorboard \
transformers \
@@ -39,7 +39,7 @@ RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
huggingface-hub \
Jinja2 \
librosa \
numpy \
numpy==1.26.4 \
scipy \
tensorboard \
transformers \
+2 -2
View File
@@ -349,7 +349,7 @@ control_image = load_image("./conditioning_image_1.png")
prompt = "pale golden rod circle with old lace background"
generator = torch.manual_seed(0)
image = pipe(prompt, num_inference_steps=20, generator=generator, image=control_image).images[0]
image = pipeline(prompt, num_inference_steps=20, generator=generator, image=control_image).images[0]
image.save("./output.png")
```
@@ -363,4 +363,4 @@ The SDXL training script is discussed in more detail in the [SDXL training](sdxl
Congratulations on training your own ControlNet! To learn more about how to use your new model, the following guides may be helpful:
- Learn how to [use a ControlNet](../using-diffusers/controlnet) for inference on a variety of tasks.
- Learn how to [use a ControlNet](../using-diffusers/controlnet) for inference on a variety of tasks.
+3
View File
@@ -106,6 +106,9 @@ To better track our training experiments, we're using the following flags in the
* `report_to="wandb` will ensure the training runs are tracked on Weights and Biases. To use it, be sure to install `wandb` with `pip install wandb`.
* `validation_prompt` and `validation_epochs` to allow the script to do a few validation inference runs. This allows us to qualitatively check if the training is progressing as expected.
> [!NOTE]
> If you want to train using long prompts with the T5 text encoder, you can use `--max_sequence_length` to set the token limit. The default is 77, but it can be increased to as high as 512. Note that this will use more resources and may slow down the training in some cases.
> [!TIP]
> You can pass `--use_8bit_adam` to reduce the memory requirements of training. Make sure to install `bitsandbytes` if you want to do so.
@@ -53,7 +53,11 @@ from diffusers import (
StableDiffusion3Pipeline,
)
from diffusers.optimization import get_scheduler
from diffusers.training_utils import cast_training_params
from diffusers.training_utils import (
cast_training_params,
compute_density_for_timestep_sampling,
compute_loss_weighting_for_sd3,
)
from diffusers.utils import (
check_min_version,
convert_unet_state_dict_to_peft,
@@ -298,6 +302,12 @@ def parse_args(input_args=None):
default=None,
help="The prompt to specify images in the same class as provided instance images.",
)
parser.add_argument(
"--max_sequence_length",
type=int,
default=77,
help="Maximum sequence length to use with with the T5 text encoder",
)
parser.add_argument(
"--validation_prompt",
type=str,
@@ -467,11 +477,20 @@ def parse_args(input_args=None):
),
)
parser.add_argument(
"--weighting_scheme", type=str, default="sigma_sqrt", choices=["sigma_sqrt", "logit_normal", "mode"]
"--weighting_scheme", type=str, default="sigma_sqrt", choices=["sigma_sqrt", "logit_normal", "mode", "cosmap"]
)
parser.add_argument(
"--logit_mean", type=float, default=0.0, help="mean to use when using the `'logit_normal'` weighting scheme."
)
parser.add_argument(
"--logit_std", type=float, default=1.0, help="std to use when using the `'logit_normal'` weighting scheme."
)
parser.add_argument(
"--mode_scale",
type=float,
default=1.29,
help="Scale of mode weighting scheme. Only effective when using the `'mode'` as the `weighting_scheme`.",
)
parser.add_argument("--logit_mean", type=float, default=0.0)
parser.add_argument("--logit_std", type=float, default=1.0)
parser.add_argument("--mode_scale", type=float, default=1.29)
parser.add_argument(
"--optimizer",
type=str,
@@ -830,6 +849,7 @@ def tokenize_prompt(tokenizer, prompt):
def _encode_prompt_with_t5(
text_encoder,
tokenizer,
max_sequence_length,
prompt=None,
num_images_per_prompt=1,
device=None,
@@ -840,7 +860,7 @@ def _encode_prompt_with_t5(
text_inputs = tokenizer(
prompt,
padding="max_length",
max_length=77,
max_length=max_sequence_length,
truncation=True,
add_special_tokens=True,
return_tensors="pt",
@@ -897,6 +917,7 @@ def encode_prompt(
text_encoders,
tokenizers,
prompt: str,
max_sequence_length,
device=None,
num_images_per_prompt: int = 1,
):
@@ -924,6 +945,7 @@ def encode_prompt(
t5_prompt_embed = _encode_prompt_with_t5(
text_encoders[-1],
tokenizers[-1],
max_sequence_length,
prompt=prompt,
num_images_per_prompt=num_images_per_prompt,
device=device if device is not None else text_encoders[-1].device,
@@ -1297,7 +1319,9 @@ def main(args):
def compute_text_embeddings(prompt, text_encoders, tokenizers):
with torch.no_grad():
prompt_embeds, pooled_prompt_embeds = encode_prompt(text_encoders, tokenizers, prompt)
prompt_embeds, pooled_prompt_embeds = encode_prompt(
text_encoders, tokenizers, prompt, args.max_sequence_length
)
prompt_embeds = prompt_embeds.to(accelerator.device)
pooled_prompt_embeds = pooled_prompt_embeds.to(accelerator.device)
return prompt_embeds, pooled_prompt_embeds
@@ -1466,16 +1490,13 @@ def main(args):
# Sample a random timestep for each image
# for weighting schemes where we sample timesteps non-uniformly
if args.weighting_scheme == "logit_normal":
# See 3.1 in the SD3 paper ($rf/lognorm(0.00,1.00)$).
u = torch.normal(mean=args.logit_mean, std=args.logit_std, size=(bsz,), device="cpu")
u = torch.nn.functional.sigmoid(u)
elif args.weighting_scheme == "mode":
u = torch.rand(size=(bsz,), device="cpu")
u = 1 - u - args.mode_scale * (torch.cos(math.pi * u / 2) ** 2 - 1 + u)
else:
u = torch.rand(size=(bsz,), device="cpu")
u = compute_density_for_timestep_sampling(
weighting_scheme=args.weighting_scheme,
batch_size=bsz,
logit_mean=args.logit_mean,
logit_std=args.logit_std,
mode_scale=args.mode_scale,
)
indices = (u * noise_scheduler_copy.config.num_train_timesteps).long()
timesteps = noise_scheduler_copy.timesteps[indices].to(device=model_input.device)
@@ -1496,19 +1517,11 @@ def main(args):
# Preconditioning of the model outputs.
model_pred = model_pred * (-sigmas) + noisy_model_input
# TODO (kashif, sayakpaul): weighting sceme needs to be experimented with :)
# these weighting schemes use a uniform timestep sampling
# and instead post-weight the loss
if args.weighting_scheme == "sigma_sqrt":
weighting = (sigmas**-2.0).float()
elif args.weighting_scheme == "cosmap":
bot = 1 - 2 * sigmas + 2 * sigmas**2
weighting = 2 / (math.pi * bot)
else:
weighting = torch.ones_like(sigmas)
weighting = compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas)
# simplified flow matching aka 0-rectified flow matching loss
# target = model_input - noise
# flow matching loss
target = model_input
if args.with_prior_preservation:
@@ -1289,8 +1289,8 @@ def main(args):
models = [unet_]
if args.train_text_encoder:
models.extend([text_encoder_one_, text_encoder_two_])
# only upcast trainable parameters (LoRA) into fp32
cast_training_params(models)
# only upcast trainable parameters (LoRA) into fp32
cast_training_params(models)
accelerator.register_save_state_pre_hook(save_model_hook)
accelerator.register_load_state_pre_hook(load_model_hook)
+36 -25
View File
@@ -51,6 +51,7 @@ from diffusers import (
StableDiffusion3Pipeline,
)
from diffusers.optimization import get_scheduler
from diffusers.training_utils import compute_density_for_timestep_sampling, compute_loss_weighting_for_sd3
from diffusers.utils import (
check_min_version,
is_wandb_available,
@@ -297,6 +298,12 @@ def parse_args(input_args=None):
default=None,
help="The prompt to specify images in the same class as provided instance images.",
)
parser.add_argument(
"--max_sequence_length",
type=int,
default=77,
help="Maximum sequence length to use with with the T5 text encoder",
)
parser.add_argument(
"--validation_prompt",
type=str,
@@ -465,11 +472,20 @@ def parse_args(input_args=None):
),
)
parser.add_argument(
"--weighting_scheme", type=str, default="sigma_sqrt", choices=["sigma_sqrt", "logit_normal", "mode"]
"--weighting_scheme", type=str, default="sigma_sqrt", choices=["sigma_sqrt", "logit_normal", "mode", "cosmap"]
)
parser.add_argument(
"--logit_mean", type=float, default=0.0, help="mean to use when using the `'logit_normal'` weighting scheme."
)
parser.add_argument(
"--logit_std", type=float, default=1.0, help="std to use when using the `'logit_normal'` weighting scheme."
)
parser.add_argument(
"--mode_scale",
type=float,
default=1.29,
help="Scale of mode weighting scheme. Only effective when using the `'mode'` as the `weighting_scheme`.",
)
parser.add_argument("--logit_mean", type=float, default=0.0)
parser.add_argument("--logit_std", type=float, default=1.0)
parser.add_argument("--mode_scale", type=float, default=1.29)
parser.add_argument(
"--optimizer",
type=str,
@@ -828,6 +844,7 @@ def tokenize_prompt(tokenizer, prompt):
def _encode_prompt_with_t5(
text_encoder,
tokenizer,
max_sequence_length,
prompt=None,
num_images_per_prompt=1,
device=None,
@@ -838,7 +855,7 @@ def _encode_prompt_with_t5(
text_inputs = tokenizer(
prompt,
padding="max_length",
max_length=77,
max_length=max_sequence_length,
truncation=True,
add_special_tokens=True,
return_tensors="pt",
@@ -895,6 +912,7 @@ def encode_prompt(
text_encoders,
tokenizers,
prompt: str,
max_sequence_length,
device=None,
num_images_per_prompt: int = 1,
):
@@ -922,6 +940,7 @@ def encode_prompt(
t5_prompt_embed = _encode_prompt_with_t5(
text_encoders[-1],
tokenizers[-1],
max_sequence_length,
prompt=prompt,
num_images_per_prompt=num_images_per_prompt,
device=device if device is not None else text_encoders[-1].device,
@@ -1324,7 +1343,9 @@ def main(args):
def compute_text_embeddings(prompt, text_encoders, tokenizers):
with torch.no_grad():
prompt_embeds, pooled_prompt_embeds = encode_prompt(text_encoders, tokenizers, prompt)
prompt_embeds, pooled_prompt_embeds = encode_prompt(
text_encoders, tokenizers, prompt, args.max_sequence_length
)
prompt_embeds = prompt_embeds.to(accelerator.device)
pooled_prompt_embeds = pooled_prompt_embeds.to(accelerator.device)
return prompt_embeds, pooled_prompt_embeds
@@ -1530,16 +1551,13 @@ def main(args):
# Sample a random timestep for each image
# for weighting schemes where we sample timesteps non-uniformly
if args.weighting_scheme == "logit_normal":
# See 3.1 in the SD3 paper ($rf/lognorm(0.00,1.00)$).
u = torch.normal(mean=args.logit_mean, std=args.logit_std, size=(bsz,), device="cpu")
u = torch.nn.functional.sigmoid(u)
elif args.weighting_scheme == "mode":
u = torch.rand(size=(bsz,), device="cpu")
u = 1 - u - args.mode_scale * (torch.cos(math.pi * u / 2) ** 2 - 1 + u)
else:
u = torch.rand(size=(bsz,), device="cpu")
u = compute_density_for_timestep_sampling(
weighting_scheme=args.weighting_scheme,
batch_size=bsz,
logit_mean=args.logit_mean,
logit_std=args.logit_std,
mode_scale=args.mode_scale,
)
indices = (u * noise_scheduler_copy.config.num_train_timesteps).long()
timesteps = noise_scheduler_copy.timesteps[indices].to(device=model_input.device)
@@ -1576,16 +1594,9 @@ def main(args):
model_pred = model_pred * (-sigmas) + noisy_model_input
# these weighting schemes use a uniform timestep sampling
# and instead post-weight the loss
if args.weighting_scheme == "sigma_sqrt":
weighting = (sigmas**-2.0).float()
elif args.weighting_scheme == "cosmap":
bot = 1 - 2 * sigmas + 2 * sigmas**2
weighting = 2 / (math.pi * bot)
else:
weighting = torch.ones_like(sigmas)
weighting = compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas)
# simplified flow matching aka 0-rectified flow matching loss
# target = model_input - noise
# flow matching loss
target = model_input
if args.with_prior_preservation:
@@ -1363,8 +1363,8 @@ def main(args):
models = [unet_]
if args.train_text_encoder:
models.extend([text_encoder_one_, text_encoder_two_])
# only upcast trainable parameters (LoRA) into fp32
cast_training_params(models)
# only upcast trainable parameters (LoRA) into fp32
cast_training_params(models)
accelerator.register_save_state_pre_hook(save_model_hook)
accelerator.register_load_state_pre_hook(load_model_hook)
+39
View File
@@ -1,5 +1,6 @@
import contextlib
import copy
import math
import random
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
@@ -220,6 +221,44 @@ def _set_state_dict_into_text_encoder(
set_peft_model_state_dict(text_encoder, text_encoder_state_dict, adapter_name="default")
def compute_density_for_timestep_sampling(
weighting_scheme: str, batch_size: int, logit_mean: float = None, logit_std: float = None, mode_scale: float = None
):
"""Compute the density for sampling the timesteps when doing SD3 training.
Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528.
SD3 paper reference: https://arxiv.org/abs/2403.03206v1.
"""
if weighting_scheme == "logit_normal":
# See 3.1 in the SD3 paper ($rf/lognorm(0.00,1.00)$).
u = torch.normal(mean=logit_mean, std=logit_std, size=(batch_size,), device="cpu")
u = torch.nn.functional.sigmoid(u)
elif weighting_scheme == "mode":
u = torch.rand(size=(batch_size,), device="cpu")
u = 1 - u - mode_scale * (torch.cos(math.pi * u / 2) ** 2 - 1 + u)
else:
u = torch.rand(size=(batch_size,), device="cpu")
return u
def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None):
"""Computes loss weighting scheme for SD3 training.
Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528.
SD3 paper reference: https://arxiv.org/abs/2403.03206v1.
"""
if weighting_scheme == "sigma_sqrt":
weighting = (sigmas**-2.0).float()
elif weighting_scheme == "cosmap":
bot = 1 - 2 * sigmas + 2 * sigmas**2
weighting = 2 / (math.pi * bot)
else:
weighting = torch.ones_like(sigmas)
return weighting
# Adapted from torch-ema https://github.com/fadel/pytorch_ema/blob/master/torch_ema/ema.py#L14
class EMAModel:
"""