Compare commits

..

8 Commits

Author SHA1 Message Date
Dhruv Nair ca5cfbd37e update 2025-04-02 20:33:17 +02:00
DN6 5ac65c4513 update 2025-04-01 23:00:21 +05:30
DN6 8c2b2cdc52 update 2025-04-01 22:20:28 +05:30
Dhruv Nair df1d7b01f1 [WIP] Add Wan Video2Video (#11053)
* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update
2025-04-01 17:22:11 +05:30
Fanli Lin 5a6edac087 [tests] no hard-coded cuda (#11186)
no cuda only
2025-04-01 12:14:31 +01:00
kakukakujirori e8fc8b1f81 Bug fix in LTXImageToVideoPipeline.prepare_latents() when latents is already set (#10918)
* Bug fix in ltx

* Assume packed latents.

---------

Co-authored-by: Dhruv Nair <dhruv.nair@gmail.com>
Co-authored-by: YiYi Xu <yixu310@gmail.com>
2025-03-31 12:15:43 -10:00
hlky d6f4774c1c Add latents_mean and latents_std to SDXLLongPromptWeightingPipeline (#11034) 2025-03-31 11:32:29 -10:00
Mark eb50defff2 [Docs] Fix environment variables in installation.md (#11179) 2025-03-31 09:15:25 -07:00
8 changed files with 57 additions and 15 deletions
+7 -5
View File
@@ -161,10 +161,10 @@ Your Python environment will find the `main` version of 🤗 Diffusers on the ne
Model weights and files are downloaded from the Hub to a cache which is usually your home directory. You can change the cache location by specifying the `HF_HOME` or `HUGGINFACE_HUB_CACHE` environment variables or configuring the `cache_dir` parameter in methods like [`~DiffusionPipeline.from_pretrained`].
Cached files allow you to run 🤗 Diffusers offline. To prevent 🤗 Diffusers from connecting to the internet, set the `HF_HUB_OFFLINE` environment variable to `True` and 🤗 Diffusers will only load previously downloaded files in the cache.
Cached files allow you to run 🤗 Diffusers offline. To prevent 🤗 Diffusers from connecting to the internet, set the `HF_HUB_OFFLINE` environment variable to `1` and 🤗 Diffusers will only load previously downloaded files in the cache.
```shell
export HF_HUB_OFFLINE=True
export HF_HUB_OFFLINE=1
```
For more details about managing and cleaning the cache, take a look at the [caching](https://huggingface.co/docs/huggingface_hub/guides/manage-cache) guide.
@@ -179,14 +179,16 @@ Telemetry is only sent when loading models and pipelines from the Hub,
and it is not collected if you're loading local files.
We understand that not everyone wants to share additional information,and we respect your privacy.
You can disable telemetry collection by setting the `DISABLE_TELEMETRY` environment variable from your terminal:
You can disable telemetry collection by setting the `HF_HUB_DISABLE_TELEMETRY` environment variable from your terminal:
On Linux/MacOS:
```bash
export DISABLE_TELEMETRY=YES
export HF_HUB_DISABLE_TELEMETRY=1
```
On Windows:
```bash
set DISABLE_TELEMETRY=YES
set HF_HUB_DISABLE_TELEMETRY=1
```
+17 -2
View File
@@ -1773,7 +1773,7 @@ class SDXLLongPromptWeightingPipeline(
f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
f" = {num_channels_latents + num_channels_masked_image + num_channels_mask}. Please verify the config of"
" `pipeline.unet` or your `mask_image` or `image` input."
)
elif num_channels_unet != 4:
@@ -1924,7 +1924,22 @@ class SDXLLongPromptWeightingPipeline(
self.upcast_vae()
latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
# unscale/denormalize the latents
# denormalize with the mean and std if available and not None
has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None
has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None
if has_latents_mean and has_latents_std:
latents_mean = (
torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype)
)
latents_std = (
torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype)
)
latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean
else:
latents = latents / self.vae.config.scaling_factor
image = self.vae.decode(latents, return_dict=False)[0]
# cast back to fp16 if needed
if needs_upcasting:
@@ -26,6 +26,7 @@ from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_
from ...utils.torch_utils import maybe_allow_in_graph
from ..attention import FeedForward
from ..attention_processor import Attention
from ..cache_utils import CacheMixin
from ..embeddings import PixArtAlphaTextProjection
from ..modeling_outputs import Transformer2DModelOutput
from ..modeling_utils import ModelMixin
@@ -298,7 +299,7 @@ class LTXVideoTransformerBlock(nn.Module):
@maybe_allow_in_graph
class LTXVideoTransformer3DModel(ModelMixin, ConfigMixin, FromOriginalModelMixin, PeftAdapterMixin):
class LTXVideoTransformer3DModel(ModelMixin, ConfigMixin, FromOriginalModelMixin, PeftAdapterMixin, CacheMixin):
r"""
A Transformer model for video-like data used in [LTX](https://huggingface.co/Lightricks/LTX-Video).
@@ -24,6 +24,7 @@ from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
from ..attention import FeedForward
from ..attention_processor import Attention
from ..cache_utils import CacheMixin
from ..embeddings import PixArtAlphaTextProjection, TimestepEmbedding, Timesteps, get_1d_rotary_pos_embed
from ..modeling_outputs import Transformer2DModelOutput
from ..modeling_utils import ModelMixin
@@ -288,7 +289,7 @@ class WanTransformerBlock(nn.Module):
return hidden_states
class WanTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
class WanTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, CacheMixin):
r"""
A Transformer model for video-like data used in the Wan model.
@@ -489,6 +489,10 @@ class LTXPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixi
def num_timesteps(self):
return self._num_timesteps
@property
def current_timestep(self):
return self._current_timestep
@property
def attention_kwargs(self):
return self._attention_kwargs
@@ -622,6 +626,7 @@ class LTXPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixi
self._guidance_scale = guidance_scale
self._attention_kwargs = attention_kwargs
self._interrupt = False
self._current_timestep = None
# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
@@ -706,6 +711,8 @@ class LTXPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixi
if self.interrupt:
continue
self._current_timestep = t
latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
latent_model_input = latent_model_input.to(prompt_embeds.dtype)
@@ -774,6 +774,10 @@ class LTXConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraL
def num_timesteps(self):
return self._num_timesteps
@property
def current_timestep(self):
return self._current_timestep
@property
def attention_kwargs(self):
return self._attention_kwargs
@@ -933,6 +937,7 @@ class LTXConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraL
self._guidance_scale = guidance_scale
self._attention_kwargs = attention_kwargs
self._interrupt = False
self._current_timestep = None
# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
@@ -1066,6 +1071,8 @@ class LTXConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraL
if self.interrupt:
continue
self._current_timestep = t
if image_cond_noise_scale > 0:
# Add timestep-dependent noise to the hard-conditioning latents
# This helps with motion continuity, especially when conditioned on a single frame
@@ -487,19 +487,21 @@ class LTXImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLo
) -> torch.Tensor:
height = height // self.vae_spatial_compression_ratio
width = width // self.vae_spatial_compression_ratio
num_frames = (
(num_frames - 1) // self.vae_temporal_compression_ratio + 1 if latents is None else latents.size(2)
)
num_frames = (num_frames - 1) // self.vae_temporal_compression_ratio + 1
shape = (batch_size, num_channels_latents, num_frames, height, width)
mask_shape = (batch_size, 1, num_frames, height, width)
if latents is not None:
conditioning_mask = latents.new_zeros(shape)
conditioning_mask = latents.new_zeros(mask_shape)
conditioning_mask[:, :, 0] = 1.0
conditioning_mask = self._pack_latents(
conditioning_mask, self.transformer_spatial_patch_size, self.transformer_temporal_patch_size
)
).squeeze(-1)
if latents.ndim != 3 or latents.shape[:2] != conditioning_mask.shape:
raise ValueError(
f"Provided `latents` tensor has shape {latents.shape}, but the expected shape is {conditioning_mask.shape + (num_channels_latents,)}."
)
return latents.to(device=device, dtype=dtype), conditioning_mask
if isinstance(generator, list):
@@ -548,6 +550,10 @@ class LTXImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLo
def num_timesteps(self):
return self._num_timesteps
@property
def current_timestep(self):
return self._current_timestep
@property
def attention_kwargs(self):
return self._attention_kwargs
@@ -684,6 +690,7 @@ class LTXImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLo
self._guidance_scale = guidance_scale
self._attention_kwargs = attention_kwargs
self._interrupt = False
self._current_timestep = None
# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
@@ -776,6 +783,8 @@ class LTXImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLo
if self.interrupt:
continue
self._current_timestep = t
latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
latent_model_input = latent_model_input.to(prompt_embeds.dtype)
+1 -1
View File
@@ -315,7 +315,7 @@ class BnB8bitBasicTests(Base8bitTests):
_ = self.model_fp16.float()
# Check that this does not throw an error
_ = self.model_fp16.cuda()
_ = self.model_fp16.to(torch_device)
class Bnb8bitDeviceTests(Base8bitTests):