update

2024-08-29 12:24:44 +00:00
8 changed files with 6 additions and 32 deletions
@@ -77,21 +77,10 @@ CogVideoX-2b requires about 19 GB of GPU memory to decode 49 frames (6 seconds o
 - `pipe.enable_model_cpu_offload()`:
  - Without enabling cpu offloading, memory usage is `33 GB`
  - With cpu offloading enabled, memory usage is `19 GB`
- `pipe.enable_sequential_cpu_offload()`:
-  - Similar to `enable_model_cpu_offload` but can significantly reduce memory usage at the cost of slow inference
-  - When enabled, memory usage is under `4 GB`
 - `pipe.vae.enable_tiling()`:
  - With cpu offloading and tiling enabled, memory usage is `11 GB`
 - `pipe.vae.enable_slicing()`

-### Quantized inference
-
-[torchao](https://github.com/pytorch/ao) and [optimum-quanto](https://github.com/huggingface/optimum-quanto/) can be used to quantize the text encoder, transformer and VAE modules to lower the memory requirements. This makes it possible to run the model on a free-tier T4 Colab or lower VRAM GPUs!
-
-It is also worth noting that torchao quantization is fully compatible with [torch.compile](/optimization/torch2.0#torchcompile), which allows for much faster inference speed. Additionally, models can be serialized and stored in a quantized datatype to save disk space with torchao. Find examples and benchmarks in the gists below.
- [torchao](https://gist.github.com/a-r-r-o-w/4d9732d17412888c885480c6521a9897)
- [quanto](https://gist.github.com/a-r-r-o-w/31be62828b00a9292821b85c1017effa)
-
 ## CogVideoXPipeline

 [[autodoc]] CogVideoXPipeline
@@ -91,11 +91,11 @@ DIFFUSERS_DEFAULT_PIPELINE_PATHS = {
    "xl_inpaint": {"pretrained_model_name_or_path": "diffusers/stable-diffusion-xl-1.0-inpainting-0.1"},
    "playground-v2-5": {"pretrained_model_name_or_path": "playgroundai/playground-v2.5-1024px-aesthetic"},
    "upscale": {"pretrained_model_name_or_path": "stabilityai/stable-diffusion-x4-upscaler"},
-    "inpainting": {"pretrained_model_name_or_path": "Lykon/dreamshaper-8-inpainting"},
+    "inpainting": {"pretrained_model_name_or_path": "runwayml/stable-diffusion-inpainting"},
    "inpainting_v2": {"pretrained_model_name_or_path": "stabilityai/stable-diffusion-2-inpainting"},
    "controlnet": {"pretrained_model_name_or_path": "lllyasviel/control_v11p_sd15_canny"},
    "v2": {"pretrained_model_name_or_path": "stabilityai/stable-diffusion-2-1"},
-    "v1": {"pretrained_model_name_or_path": "Lykon/dreamshaper-8"},
+    "v1": {"pretrained_model_name_or_path": "runwayml/stable-diffusion-v1-5"},
    "stable_cascade_stage_b": {"pretrained_model_name_or_path": "stabilityai/stable-cascade", "subfolder": "decoder"},
    "stable_cascade_stage_b_lite": {
        "pretrained_model_name_or_path": "stabilityai/stable-cascade",
@@ -545,14 +545,11 @@ def get_1d_rotary_pos_embed(
    assert dim % 2 == 0

    if isinstance(pos, int):
-        pos = torch.arange(pos)
-    if isinstance(pos, np.ndarray):
-        pos = torch.from_numpy(pos)  # type: ignore  # [S]
-
+        pos = np.arange(pos)
    theta = theta * ntk_factor
    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=freqs_dtype)[: (dim // 2)] / dim)) / linear_factor  # [D/2]
-    freqs = freqs.to(pos.device)
-    freqs = torch.outer(pos, freqs)  # type: ignore   # [S, D/2]
+    t = torch.from_numpy(pos).to(freqs.device)  # type: ignore  # [S]
+    freqs = torch.outer(t, freqs)  # type: ignore   # [S, D/2]
    if use_real and repeat_interleave_real:
        # flux, hunyuan-dit, cogvideox
        freqs_cos = freqs.cos().repeat_interleave(2, dim=1).float()  # [S, D]
@@ -629,7 +626,7 @@ class FluxPosEmbed(nn.Module):
        n_axes = ids.shape[-1]
        cos_out = []
        sin_out = []
-        pos = ids.squeeze().float()
+        pos = ids.squeeze().float().cpu().numpy()
        is_mps = ids.device.type == "mps"
        freqs_dtype = torch.float32 if is_mps else torch.float64
        for i in range(n_axes):
@@ -417,9 +417,6 @@ class ModelTesterMixin:

    @require_torch_gpu
    def test_set_attn_processor_for_determinism(self):
-        if self.uses_custom_attn_processor:
-            return
-
        torch.use_deterministic_algorithms(False)
        if self.forward_requires_fresh_args:
            model = self.model_class(**self.init_dict)
@@ -32,9 +32,6 @@ class FluxTransformerTests(ModelTesterMixin, unittest.TestCase):
    # We override the items here because the transformer under consideration is small.
    model_split_percents = [0.7, 0.6, 0.6]

-    # Skip setting testing with default: AttnProcessor
-    uses_custom_attn_processor = True
-
    @property
    def dummy_input(self):
        batch_size = 1
@@ -25,9 +25,6 @@ class FluxPipelineFastTests(unittest.TestCase, PipelineTesterMixin):
    params = frozenset(["prompt", "height", "width", "guidance_scale", "prompt_embeds", "pooled_prompt_embeds"])
    batch_params = frozenset(["prompt"])

-    # there is no xformers processor for Flux
-    test_xformers_attention = False
-
    def get_dummy_components(self):
        torch.manual_seed(0)
        transformer = FluxTransformer2DModel(
@@ -37,7 +37,6 @@ class StableDiffusion3PAGPipelineFastTests(unittest.TestCase, PipelineTesterMixi
        ]
    )
    batch_params = frozenset(["prompt", "negative_prompt"])
-    test_xformers_attention = False

    def get_dummy_components(self):
        torch.manual_seed(0)
@@ -68,8 +68,6 @@ class StableAudioPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
            "callback_steps",
        ]
    )
-    # There is no xformers version of the StableAudioPipeline custom attention processor
-    test_xformers_attention = False

    def get_dummy_components(self):
        torch.manual_seed(0)