Compare commits

..

1 Commits

Author SHA1 Message Date
Dhruv Nair bde39cb9d7 update 2024-08-29 12:24:44 +00:00
8 changed files with 6 additions and 32 deletions
-11
View File
@@ -77,21 +77,10 @@ CogVideoX-2b requires about 19 GB of GPU memory to decode 49 frames (6 seconds o
- `pipe.enable_model_cpu_offload()`:
- Without enabling cpu offloading, memory usage is `33 GB`
- With enabling cpu offloading, memory usage is `19 GB`
- `pipe.enable_sequential_cpu_offload()`:
- Similar to `enable_model_cpu_offload` but can significantly reduce memory usage at the cost of slow inference
- When enabled, memory usage is under `4 GB`
- `pipe.vae.enable_tiling()`:
- With enabling cpu offloading and tiling, memory usage is `11 GB`
- `pipe.vae.enable_slicing()`
### Quantized inference
[torchao](https://github.com/pytorch/ao) and [optimum-quanto](https://github.com/huggingface/optimum-quanto/) can be used to quantize the text encoder, transformer and VAE modules to lower the memory requirements. This makes it possible to run the model on a free-tier T4 Colab or lower VRAM GPUs!
It is also worth noting that torchao quantization is fully compatible with [torch.compile](/optimization/torch2.0#torchcompile), which allows for much faster inference speed. Additionally, models can be serialized and stored in a quantized datatype to save disk space with torchao. Find examples and benchmarks in the gists below.
- [torchao](https://gist.github.com/a-r-r-o-w/4d9732d17412888c885480c6521a9897)
- [quanto](https://gist.github.com/a-r-r-o-w/31be62828b00a9292821b85c1017effa)
## CogVideoXPipeline
[[autodoc]] CogVideoXPipeline
+2 -2
View File
@@ -91,11 +91,11 @@ DIFFUSERS_DEFAULT_PIPELINE_PATHS = {
"xl_inpaint": {"pretrained_model_name_or_path": "diffusers/stable-diffusion-xl-1.0-inpainting-0.1"},
"playground-v2-5": {"pretrained_model_name_or_path": "playgroundai/playground-v2.5-1024px-aesthetic"},
"upscale": {"pretrained_model_name_or_path": "stabilityai/stable-diffusion-x4-upscaler"},
"inpainting": {"pretrained_model_name_or_path": "Lykon/dreamshaper-8-inpainting"},
"inpainting": {"pretrained_model_name_or_path": "runwayml/stable-diffusion-inpainting"},
"inpainting_v2": {"pretrained_model_name_or_path": "stabilityai/stable-diffusion-2-inpainting"},
"controlnet": {"pretrained_model_name_or_path": "lllyasviel/control_v11p_sd15_canny"},
"v2": {"pretrained_model_name_or_path": "stabilityai/stable-diffusion-2-1"},
"v1": {"pretrained_model_name_or_path": "Lykon/dreamshaper-8"},
"v1": {"pretrained_model_name_or_path": "runwayml/stable-diffusion-v1-5"},
"stable_cascade_stage_b": {"pretrained_model_name_or_path": "stabilityai/stable-cascade", "subfolder": "decoder"},
"stable_cascade_stage_b_lite": {
"pretrained_model_name_or_path": "stabilityai/stable-cascade",
+4 -7
View File
@@ -545,14 +545,11 @@ def get_1d_rotary_pos_embed(
assert dim % 2 == 0
if isinstance(pos, int):
pos = torch.arange(pos)
if isinstance(pos, np.ndarray):
pos = torch.from_numpy(pos) # type: ignore # [S]
pos = np.arange(pos)
theta = theta * ntk_factor
freqs = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=freqs_dtype)[: (dim // 2)] / dim)) / linear_factor # [D/2]
freqs = freqs.to(pos.device)
freqs = torch.outer(pos, freqs) # type: ignore # [S, D/2]
t = torch.from_numpy(pos).to(freqs.device) # type: ignore # [S]
freqs = torch.outer(t, freqs) # type: ignore # [S, D/2]
if use_real and repeat_interleave_real:
# flux, hunyuan-dit, cogvideox
freqs_cos = freqs.cos().repeat_interleave(2, dim=1).float() # [S, D]
@@ -629,7 +626,7 @@ class FluxPosEmbed(nn.Module):
n_axes = ids.shape[-1]
cos_out = []
sin_out = []
pos = ids.squeeze().float()
pos = ids.squeeze().float().cpu().numpy()
is_mps = ids.device.type == "mps"
freqs_dtype = torch.float32 if is_mps else torch.float64
for i in range(n_axes):
-3
View File
@@ -417,9 +417,6 @@ class ModelTesterMixin:
@require_torch_gpu
def test_set_attn_processor_for_determinism(self):
if self.uses_custom_attn_processor:
return
torch.use_deterministic_algorithms(False)
if self.forward_requires_fresh_args:
model = self.model_class(**self.init_dict)
@@ -32,9 +32,6 @@ class FluxTransformerTests(ModelTesterMixin, unittest.TestCase):
# We override the items here because the transformer under consideration is small.
model_split_percents = [0.7, 0.6, 0.6]
# Skip setting testing with default: AttnProcessor
uses_custom_attn_processor = True
@property
def dummy_input(self):
batch_size = 1
@@ -25,9 +25,6 @@ class FluxPipelineFastTests(unittest.TestCase, PipelineTesterMixin):
params = frozenset(["prompt", "height", "width", "guidance_scale", "prompt_embeds", "pooled_prompt_embeds"])
batch_params = frozenset(["prompt"])
# there is no xformers processor for Flux
test_xformers_attention = False
def get_dummy_components(self):
torch.manual_seed(0)
transformer = FluxTransformer2DModel(
-1
View File
@@ -37,7 +37,6 @@ class StableDiffusion3PAGPipelineFastTests(unittest.TestCase, PipelineTesterMixi
]
)
batch_params = frozenset(["prompt", "negative_prompt"])
test_xformers_attention = False
def get_dummy_components(self):
torch.manual_seed(0)
@@ -68,8 +68,6 @@ class StableAudioPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
"callback_steps",
]
)
# There is not xformers version of the StableAudioPipeline custom attention processor
test_xformers_attention = False
def get_dummy_components(self):
torch.manual_seed(0)