update

2025-04-01 05:16:54 +02:00 · 2025-04-01 05:05:29 +02:00 · 2025-04-01 04:25:52 +02:00 · 2025-03-31 12:37:36 +02:00 · 2025-03-31 11:58:29 +02:00 · 2025-03-24 18:01:06 +01:00
8 changed files with 15 additions and 57 deletions
@@ -161,10 +161,10 @@ Your Python environment will find the `main` version of 🤗 Diffusers on the ne

 Model weights and files are downloaded from the Hub to a cache which is usually your home directory. You can change the cache location by specifying the `HF_HOME` or `HUGGINGFACE_HUB_CACHE` environment variables or configuring the `cache_dir` parameter in methods like [`~DiffusionPipeline.from_pretrained`].

-Cached files allow you to run 🤗 Diffusers offline. To prevent 🤗 Diffusers from connecting to the internet, set the `HF_HUB_OFFLINE` environment variable to `1` and 🤗 Diffusers will only load previously downloaded files in the cache.
+Cached files allow you to run 🤗 Diffusers offline. To prevent 🤗 Diffusers from connecting to the internet, set the `HF_HUB_OFFLINE` environment variable to `True` and 🤗 Diffusers will only load previously downloaded files in the cache.

 ```shell
-export HF_HUB_OFFLINE=1
+export HF_HUB_OFFLINE=True
 ```

 For more details about managing and cleaning the cache, take a look at the [caching](https://huggingface.co/docs/huggingface_hub/guides/manage-cache) guide.
@@ -179,16 +179,14 @@ Telemetry is only sent when loading models and pipelines from the Hub,
 and it is not collected if you're loading local files.

 We understand that not everyone wants to share additional information, and we respect your privacy.
-You can disable telemetry collection by setting the `HF_HUB_DISABLE_TELEMETRY` environment variable from your terminal:
+You can disable telemetry collection by setting the `DISABLE_TELEMETRY` environment variable from your terminal:

 On Linux/MacOS:
-
 ```bash
-export HF_HUB_DISABLE_TELEMETRY=1
+export DISABLE_TELEMETRY=YES
 ```

 On Windows:
-
 ```bash
-set HF_HUB_DISABLE_TELEMETRY=1
+set DISABLE_TELEMETRY=YES
 ```
@@ -1773,7 +1773,7 @@ class SDXLLongPromptWeightingPipeline(
                        f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
                        f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
                        f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
-                        f" = {num_channels_latents + num_channels_masked_image + num_channels_mask}. Please verify the config of"
+                        f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
                        " `pipeline.unet` or your `mask_image` or `image` input."
                    )
            elif num_channels_unet != 4:
@@ -1924,22 +1924,7 @@ class SDXLLongPromptWeightingPipeline(
                self.upcast_vae()
                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)

-            # unscale/denormalize the latents
-            # denormalize with the mean and std if available and not None
-            has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None
-            has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None
-            if has_latents_mean and has_latents_std:
-                latents_mean = (
-                    torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype)
-                )
-                latents_std = (
-                    torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype)
-                )
-                latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean
-            else:
-                latents = latents / self.vae.config.scaling_factor
-
-            image = self.vae.decode(latents, return_dict=False)[0]
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]

            # cast back to fp16 if needed
            if needs_upcasting:
@@ -26,7 +26,6 @@ from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_
 from ...utils.torch_utils import maybe_allow_in_graph
 from ..attention import FeedForward
 from ..attention_processor import Attention
-from ..cache_utils import CacheMixin
 from ..embeddings import PixArtAlphaTextProjection
 from ..modeling_outputs import Transformer2DModelOutput
 from ..modeling_utils import ModelMixin
@@ -299,7 +298,7 @@ class LTXVideoTransformerBlock(nn.Module):


@maybe_allow_in_graph
-class LTXVideoTransformer3DModel(ModelMixin, ConfigMixin, FromOriginalModelMixin, PeftAdapterMixin, CacheMixin):
+class LTXVideoTransformer3DModel(ModelMixin, ConfigMixin, FromOriginalModelMixin, PeftAdapterMixin):
    r"""
    A Transformer model for video-like data used in [LTX](https://huggingface.co/Lightricks/LTX-Video).

@@ -24,7 +24,6 @@ from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
 from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
 from ..attention import FeedForward
 from ..attention_processor import Attention
-from ..cache_utils import CacheMixin
 from ..embeddings import PixArtAlphaTextProjection, TimestepEmbedding, Timesteps, get_1d_rotary_pos_embed
 from ..modeling_outputs import Transformer2DModelOutput
 from ..modeling_utils import ModelMixin
@@ -289,7 +288,7 @@ class WanTransformerBlock(nn.Module):
        return hidden_states


-class WanTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, CacheMixin):
+class WanTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
    r"""
    A Transformer model for video-like data used in the Wan model.

@@ -489,10 +489,6 @@ class LTXPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixi
    def num_timesteps(self):
        return self._num_timesteps

-    @property
-    def current_timestep(self):
-        return self._current_timestep
-
    @property
    def attention_kwargs(self):
        return self._attention_kwargs
@@ -626,7 +622,6 @@ class LTXPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixi
        self._guidance_scale = guidance_scale
        self._attention_kwargs = attention_kwargs
        self._interrupt = False
-        self._current_timestep = None

        # 2. Define call parameters
        if prompt is not None and isinstance(prompt, str):
@@ -711,8 +706,6 @@ class LTXPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixi
                if self.interrupt:
                    continue

-                self._current_timestep = t
-
                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
                latent_model_input = latent_model_input.to(prompt_embeds.dtype)

@@ -774,10 +774,6 @@ class LTXConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraL
    def num_timesteps(self):
        return self._num_timesteps

-    @property
-    def current_timestep(self):
-        return self._current_timestep
-
    @property
    def attention_kwargs(self):
        return self._attention_kwargs
@@ -937,7 +933,6 @@ class LTXConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraL
        self._guidance_scale = guidance_scale
        self._attention_kwargs = attention_kwargs
        self._interrupt = False
-        self._current_timestep = None

        # 2. Define call parameters
        if prompt is not None and isinstance(prompt, str):
@@ -1071,8 +1066,6 @@ class LTXConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraL
                if self.interrupt:
                    continue

-                self._current_timestep = t
-
                if image_cond_noise_scale > 0:
                    # Add timestep-dependent noise to the hard-conditioning latents
                    # This helps with motion continuity, especially when conditioned on a single frame
@@ -487,21 +487,19 @@ class LTXImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLo
    ) -> torch.Tensor:
        height = height // self.vae_spatial_compression_ratio
        width = width // self.vae_spatial_compression_ratio
-        num_frames = (num_frames - 1) // self.vae_temporal_compression_ratio + 1
+        num_frames = (
+            (num_frames - 1) // self.vae_temporal_compression_ratio + 1 if latents is None else latents.size(2)
+        )

        shape = (batch_size, num_channels_latents, num_frames, height, width)
        mask_shape = (batch_size, 1, num_frames, height, width)

        if latents is not None:
-            conditioning_mask = latents.new_zeros(mask_shape)
+            conditioning_mask = latents.new_zeros(shape)
            conditioning_mask[:, :, 0] = 1.0
            conditioning_mask = self._pack_latents(
                conditioning_mask, self.transformer_spatial_patch_size, self.transformer_temporal_patch_size
-            ).squeeze(-1)
-            if latents.ndim != 3 or latents.shape[:2] != conditioning_mask.shape:
-                raise ValueError(
-                    f"Provided `latents` tensor has shape {latents.shape}, but the expected shape is {conditioning_mask.shape + (num_channels_latents,)}."
-                )
+            )
            return latents.to(device=device, dtype=dtype), conditioning_mask

        if isinstance(generator, list):
@@ -550,10 +548,6 @@ class LTXImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLo
    def num_timesteps(self):
        return self._num_timesteps

-    @property
-    def current_timestep(self):
-        return self._current_timestep
-
    @property
    def attention_kwargs(self):
        return self._attention_kwargs
@@ -690,7 +684,6 @@ class LTXImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLo
        self._guidance_scale = guidance_scale
        self._attention_kwargs = attention_kwargs
        self._interrupt = False
-        self._current_timestep = None

        # 2. Define call parameters
        if prompt is not None and isinstance(prompt, str):
@@ -783,8 +776,6 @@ class LTXImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLo
                if self.interrupt:
                    continue

-                self._current_timestep = t
-
                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
                latent_model_input = latent_model_input.to(prompt_embeds.dtype)

@@ -315,7 +315,7 @@ class BnB8bitBasicTests(Base8bitTests):
        _ = self.model_fp16.float()

        # Check that this does not throw an error
-        _ = self.model_fp16.to(torch_device)
+        _ = self.model_fp16.cuda()


 class Bnb8bitDeviceTests(Base8bitTests):
Author	SHA1	Message	Date
Dhruv Nair	20d69cb7e0	update	2025-04-01 05:16:54 +02:00
Dhruv Nair	2931139588	update	2025-04-01 05:05:29 +02:00
Dhruv Nair	12ce0d0bca	update	2025-04-01 04:25:52 +02:00
Dhruv Nair	13ac0b832a	update	2025-03-31 12:37:36 +02:00
Dhruv Nair	150ee4db8e	Merge branch 'main' into wan-v2v	2025-03-31 11:58:29 +02:00
Dhruv Nair	a23c9812ba	update	2025-03-24 18:01:06 +01:00
Dhruv Nair	8ddbcad714	update	2025-03-14 12:33:53 +01:00
Dhruv Nair	6d0d68aee3	update	2025-03-14 12:19:38 +01:00
Dhruv Nair	598ca27dca	update	2025-03-14 11:20:57 +01:00
DN6	aa2c37a86a	update	2025-03-13 22:10:42 +05:30
DN6	558de5cba3	update	2025-03-13 22:05:53 +05:30
DN6	89e956b277	update	2025-03-13 21:53:43 +05:30
DN6	515fe28897	update	2025-03-12 07:50:47 +05:30
DN6	09f9acb473	update	2025-03-12 07:50:35 +05:30
DN6	bfdf495507	update	2025-03-11 17:09:46 +05:30
DN6	8639a1735c	update	2025-03-11 17:07:31 +05:30
DN6	c9971ebcfb	update	2025-03-11 16:10:07 +05:30