let's go Flux2 🚀 (#12711)

* add vae

* Initial commit for Flux 2 Transformer implementation

* add pipeline part

* small edits to the pipeline and conversion

* update conversion script

* fix

* up up

* finish pipeline

* Remove Flux IP Adapter logic for now

* Remove deprecated 3D id logic

* Remove ControlNet logic for now

* Add link to ViT-22B paper as reference for parallel transformer blocks such as the Flux 2 single stream block

* update pipeline

* Don't use biases for input projs and output AdaNorm

* up

* Remove bias for double stream block text QKV projections

* Add script to convert Flux 2 transformer to diffusers

* make style and make quality

* fix a few things.

* allow sft files to go.

* fix image processor

* fix batch

* style a bit

* Fix some bugs in Flux 2 transformer implementation

* Fix dummy input preparation and fix some test bugs

* fix dtype casting in timestep guidance module.

* resolve conflicts.,

* remove ip adapter stuff.

* Fix Flux 2 transformer consistency test

* Fix bug in Flux2TransformerBlock (double stream block)

* Get remaining Flux 2 transformer tests passing

* make style; make quality; make fix-copies

* remove stuff.

* fix type annotaton.

* remove unneeded stuff from tests

* tests

* up

* up

* add sf support

* Remove unused IP Adapter and ControlNet logic from transformer (#9)

* copied from

* Apply suggestions from code review

Co-authored-by: YiYi Xu <yixu310@gmail.com>
Co-authored-by: apolinário <joaopaulo.passos@gmail.com>

* up

* up

* up

* up

* up

* Refactor Flux2Attention into separate classes for double stream and single stream attention

* Add _supports_qkv_fusion to AttentionModuleMixin to allow subclasses to disable QKV fusion

* Have Flux2ParallelSelfAttention inherit from AttentionModuleMixin with _supports_qkv_fusion=False

* Log debug message when calling fuse_projections on a AttentionModuleMixin subclass that does not support QKV fusion

* Address review comments

* Update src/diffusers/pipelines/flux2/pipeline_flux2.py

Co-authored-by: YiYi Xu <yixu310@gmail.com>

* up

* Remove maybe_allow_in_graph decorators for Flux 2 transformer blocks (#12)

* up

* support ostris loras. (#13)

* up

* update schdule

* up

* up (#17)

* add training scripts (#16)

* add training scripts

Co-authored-by: Linoy Tsaban <linoytsaban@gmail.com>

* model cpu offload in validation.

* add flux.2 readme

* add img2img and tests

* cpu offload in log validation

* Apply suggestions from code review

* fix

* up

* fixes

* remove i2i training tests for now.

---------

Co-authored-by: Linoy Tsaban <linoytsaban@gmail.com>
Co-authored-by: linoytsaban <linoy@huggingface.co>

* up

---------

Co-authored-by: yiyixuxu <yixu310@gmail.com>
Co-authored-by: Daniel Gu <dgu8957@gmail.com>
Co-authored-by: yiyi@huggingface.co <yiyi@ip-10-53-87-203.ec2.internal>
Co-authored-by: dg845 <58458699+dg845@users.noreply.github.com>
Co-authored-by: Dhruv Nair <dhruv.nair@gmail.com>
Co-authored-by: apolinário <joaopaulo.passos@gmail.com>
Co-authored-by: yiyi@huggingface.co <yiyi@ip-26-0-160-103.ec2.internal>
Co-authored-by: Linoy Tsaban <linoytsaban@gmail.com>
Co-authored-by: linoytsaban <linoy@huggingface.co>
This commit is contained in:
Sayak Paul
2025-11-25 21:49:04 +05:30
committed by GitHub
parent 4088e8a851
commit 5ffb73d4ae
34 changed files with 8427 additions and 4 deletions
+127
View File
@@ -0,0 +1,127 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import unittest
import torch
from transformers import AutoProcessor, Mistral3ForConditionalGeneration
from diffusers import AutoencoderKLFlux2, FlowMatchEulerDiscreteScheduler, Flux2Pipeline, Flux2Transformer2DModel
from ..testing_utils import floats_tensor, require_peft_backend
sys.path.append(".")
from .utils import PeftLoraLoaderMixinTests # noqa: E402
@require_peft_backend
class Flux2LoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):
pipeline_class = Flux2Pipeline
scheduler_cls = FlowMatchEulerDiscreteScheduler
scheduler_kwargs = {}
transformer_kwargs = {
"patch_size": 1,
"in_channels": 4,
"num_layers": 1,
"num_single_layers": 1,
"attention_head_dim": 16,
"num_attention_heads": 2,
"joint_attention_dim": 16,
"timestep_guidance_channels": 256,
"axes_dims_rope": [4, 4, 4, 4],
}
transformer_cls = Flux2Transformer2DModel
vae_kwargs = {
"sample_size": 32,
"in_channels": 3,
"out_channels": 3,
"down_block_types": ("DownEncoderBlock2D",),
"up_block_types": ("UpDecoderBlock2D",),
"block_out_channels": (4,),
"layers_per_block": 1,
"latent_channels": 1,
"norm_num_groups": 1,
"use_quant_conv": False,
"use_post_quant_conv": False,
}
vae_cls = AutoencoderKLFlux2
tokenizer_cls, tokenizer_id = AutoProcessor, "hf-internal-testing/tiny-mistral3-diffusers"
text_encoder_cls, text_encoder_id = Mistral3ForConditionalGeneration, "hf-internal-testing/tiny-mistral3-diffusers"
denoiser_target_modules = ["to_qkv_mlp_proj", "to_k"]
@property
def output_shape(self):
return (1, 8, 8, 3)
def get_dummy_inputs(self, with_generator=True):
batch_size = 1
sequence_length = 10
num_channels = 4
sizes = (32, 32)
generator = torch.manual_seed(0)
noise = floats_tensor((batch_size, num_channels) + sizes)
input_ids = torch.randint(1, sequence_length, size=(batch_size, sequence_length), generator=generator)
pipeline_inputs = {
"prompt": "a dog is dancing",
"num_inference_steps": 2,
"guidance_scale": 5.0,
"height": 8,
"width": 8,
"max_sequence_length": 8,
"output_type": "np",
"text_encoder_out_layers": (1,),
}
if with_generator:
pipeline_inputs.update({"generator": generator})
return noise, input_ids, pipeline_inputs
@unittest.skip("Not supported in Flux2.")
def test_simple_inference_with_text_denoiser_block_scale(self):
pass
@unittest.skip("Not supported in Flux2.")
def test_simple_inference_with_text_denoiser_block_scale_for_all_dict_options(self):
pass
@unittest.skip("Not supported in Flux2.")
def test_modify_padding_mode(self):
pass
@unittest.skip("Text encoder LoRA is not supported in Flux2.")
def test_simple_inference_with_partial_text_lora(self):
pass
@unittest.skip("Text encoder LoRA is not supported in Flux2.")
def test_simple_inference_with_text_lora(self):
pass
@unittest.skip("Text encoder LoRA is not supported in Flux2.")
def test_simple_inference_with_text_lora_and_scale(self):
pass
@unittest.skip("Text encoder LoRA is not supported in Flux2.")
def test_simple_inference_with_text_lora_fused(self):
pass
@unittest.skip("Text encoder LoRA is not supported in Flux2.")
def test_simple_inference_with_text_lora_save_load(self):
pass
@@ -0,0 +1,162 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import torch
from diffusers import Flux2Transformer2DModel, attention_backend
from ...testing_utils import enable_full_determinism, torch_device
from ..test_modeling_common import LoraHotSwappingForModelTesterMixin, ModelTesterMixin, TorchCompileTesterMixin
enable_full_determinism()
class Flux2TransformerTests(ModelTesterMixin, unittest.TestCase):
model_class = Flux2Transformer2DModel
main_input_name = "hidden_states"
# We override the items here because the transformer under consideration is small.
model_split_percents = [0.7, 0.6, 0.6]
# Skip setting testing with default: AttnProcessor
uses_custom_attn_processor = True
@property
def dummy_input(self):
return self.prepare_dummy_input()
@property
def input_shape(self):
return (16, 4)
@property
def output_shape(self):
return (16, 4)
def prepare_dummy_input(self, height=4, width=4):
batch_size = 1
num_latent_channels = 4
sequence_length = 48
embedding_dim = 32
hidden_states = torch.randn((batch_size, height * width, num_latent_channels)).to(torch_device)
encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)
t_coords = torch.arange(1)
h_coords = torch.arange(height)
w_coords = torch.arange(width)
l_coords = torch.arange(1)
image_ids = torch.cartesian_prod(t_coords, h_coords, w_coords, l_coords) # [height * width, 4]
image_ids = image_ids.unsqueeze(0).expand(batch_size, -1, -1).to(torch_device)
text_t_coords = torch.arange(1)
text_h_coords = torch.arange(1)
text_w_coords = torch.arange(1)
text_l_coords = torch.arange(sequence_length)
text_ids = torch.cartesian_prod(text_t_coords, text_h_coords, text_w_coords, text_l_coords)
text_ids = text_ids.unsqueeze(0).expand(batch_size, -1, -1).to(torch_device)
timestep = torch.tensor([1.0]).to(torch_device).expand(batch_size)
guidance = torch.tensor([1.0]).to(torch_device).expand(batch_size)
return {
"hidden_states": hidden_states,
"encoder_hidden_states": encoder_hidden_states,
"img_ids": image_ids,
"txt_ids": text_ids,
"timestep": timestep,
"guidance": guidance,
}
def prepare_init_args_and_inputs_for_common(self):
init_dict = {
"patch_size": 1,
"in_channels": 4,
"num_layers": 1,
"num_single_layers": 1,
"attention_head_dim": 16,
"num_attention_heads": 2,
"joint_attention_dim": 32,
"timestep_guidance_channels": 256, # Hardcoded in original code
"axes_dims_rope": [4, 4, 4, 4],
}
inputs_dict = self.dummy_input
return init_dict, inputs_dict
# TODO (Daniel, Sayak): We can remove this test.
def test_flux2_consistency(self, seed=0):
torch.manual_seed(seed)
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
torch.manual_seed(seed)
model = self.model_class(**init_dict)
# state_dict = model.state_dict()
# for key, param in state_dict.items():
# print(f"{key} | {param.shape}")
# torch.save(state_dict, "/raid/daniel_gu/test_flux2_params/diffusers.pt")
model.to(torch_device)
model.eval()
with attention_backend("native"):
with torch.no_grad():
output = model(**inputs_dict)
if isinstance(output, dict):
output = output.to_tuple()[0]
self.assertIsNotNone(output)
# input & output have to have the same shape
input_tensor = inputs_dict[self.main_input_name]
expected_shape = input_tensor.shape
self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
# Check against expected slice
# fmt: off
expected_slice = torch.tensor([-0.3662, 0.4844, 0.6334, -0.3497, 0.2162, 0.0188, 0.0521, -0.2061, -0.2041, -0.0342, -0.7107, 0.4797, -0.3280, 0.7059, -0.0849, 0.4416])
# fmt: on
flat_output = output.cpu().flatten()
generated_slice = torch.cat([flat_output[:8], flat_output[-8:]])
self.assertTrue(torch.allclose(generated_slice, expected_slice, atol=1e-4))
def test_gradient_checkpointing_is_applied(self):
expected_set = {"Flux2Transformer2DModel"}
super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
class Flux2TransformerCompileTests(TorchCompileTesterMixin, unittest.TestCase):
model_class = Flux2Transformer2DModel
different_shapes_for_compilation = [(4, 4), (4, 8), (8, 8)]
def prepare_init_args_and_inputs_for_common(self):
return Flux2TransformerTests().prepare_init_args_and_inputs_for_common()
def prepare_dummy_input(self, height, width):
return Flux2TransformerTests().prepare_dummy_input(height=height, width=width)
class Flux2TransformerLoRAHotSwapTests(LoraHotSwappingForModelTesterMixin, unittest.TestCase):
model_class = Flux2Transformer2DModel
different_shapes_for_compilation = [(4, 4), (4, 8), (8, 8)]
def prepare_init_args_and_inputs_for_common(self):
return Flux2TransformerTests().prepare_init_args_and_inputs_for_common()
def prepare_dummy_input(self, height, width):
return Flux2TransformerTests().prepare_dummy_input(height=height, width=width)
View File
@@ -0,0 +1,190 @@
import unittest
import numpy as np
import torch
from transformers import AutoProcessor, Mistral3Config, Mistral3ForConditionalGeneration
from diffusers import (
AutoencoderKLFlux2,
FlowMatchEulerDiscreteScheduler,
Flux2Pipeline,
Flux2Transformer2DModel,
)
from ...testing_utils import (
torch_device,
)
from ..test_pipelines_common import (
PipelineTesterMixin,
check_qkv_fused_layers_exist,
)
class Flux2PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = Flux2Pipeline
params = frozenset(["prompt", "height", "width", "guidance_scale", "prompt_embeds"])
batch_params = frozenset(["prompt"])
test_xformers_attention = False
test_layerwise_casting = True
test_group_offloading = True
supports_dduf = False
def get_dummy_components(self, num_layers: int = 1, num_single_layers: int = 1):
torch.manual_seed(0)
transformer = Flux2Transformer2DModel(
patch_size=1,
in_channels=4,
num_layers=num_layers,
num_single_layers=num_single_layers,
attention_head_dim=16,
num_attention_heads=2,
joint_attention_dim=16,
timestep_guidance_channels=256, # Hardcoded in original code
axes_dims_rope=[4, 4, 4, 4],
)
config = Mistral3Config(
text_config={
"model_type": "mistral",
"vocab_size": 32000,
"hidden_size": 16,
"intermediate_size": 37,
"max_position_embeddings": 512,
"num_attention_heads": 4,
"num_hidden_layers": 1,
"num_key_value_heads": 2,
"rms_norm_eps": 1e-05,
"rope_theta": 1000000000.0,
"sliding_window": None,
"bos_token_id": 2,
"eos_token_id": 3,
"pad_token_id": 4,
},
vision_config={
"model_type": "pixtral",
"hidden_size": 16,
"num_hidden_layers": 1,
"num_attention_heads": 4,
"intermediate_size": 37,
"image_size": 30,
"patch_size": 6,
"num_channels": 3,
},
bos_token_id=2,
eos_token_id=3,
pad_token_id=4,
model_dtype="mistral3",
image_seq_length=4,
vision_feature_layer=-1,
image_token_index=1,
)
torch.manual_seed(0)
text_encoder = Mistral3ForConditionalGeneration(config)
tokenizer = AutoProcessor.from_pretrained(
"hf-internal-testing/Mistral-Small-3.1-24B-Instruct-2503-only-processor"
)
torch.manual_seed(0)
vae = AutoencoderKLFlux2(
sample_size=32,
in_channels=3,
out_channels=3,
down_block_types=("DownEncoderBlock2D",),
up_block_types=("UpDecoderBlock2D",),
block_out_channels=(4,),
layers_per_block=1,
latent_channels=1,
norm_num_groups=1,
use_quant_conv=False,
use_post_quant_conv=False,
)
scheduler = FlowMatchEulerDiscreteScheduler()
return {
"scheduler": scheduler,
"text_encoder": text_encoder,
"tokenizer": tokenizer,
"transformer": transformer,
"vae": vae,
}
def get_dummy_inputs(self, device, seed=0):
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device="cpu").manual_seed(seed)
inputs = {
"prompt": "a dog is dancing",
"generator": generator,
"num_inference_steps": 2,
"guidance_scale": 5.0,
"height": 8,
"width": 8,
"max_sequence_length": 8,
"output_type": "np",
"text_encoder_out_layers": (1,),
}
return inputs
def test_fused_qkv_projections(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe = pipe.to(device)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
image = pipe(**inputs).images
original_image_slice = image[0, -3:, -3:, -1]
# TODO (sayakpaul): will refactor this once `fuse_qkv_projections()` has been added
# to the pipeline level.
pipe.transformer.fuse_qkv_projections()
self.assertTrue(
check_qkv_fused_layers_exist(pipe.transformer, ["to_qkv"]),
("Something wrong with the fused attention layers. Expected all the attention projections to be fused."),
)
inputs = self.get_dummy_inputs(device)
image = pipe(**inputs).images
image_slice_fused = image[0, -3:, -3:, -1]
pipe.transformer.unfuse_qkv_projections()
inputs = self.get_dummy_inputs(device)
image = pipe(**inputs).images
image_slice_disabled = image[0, -3:, -3:, -1]
self.assertTrue(
np.allclose(original_image_slice, image_slice_fused, atol=1e-3, rtol=1e-3),
("Fusion of QKV projections shouldn't affect the outputs."),
)
self.assertTrue(
np.allclose(image_slice_fused, image_slice_disabled, atol=1e-3, rtol=1e-3),
("Outputs, with QKV projection fusion enabled, shouldn't change when fused QKV projections are disabled."),
)
self.assertTrue(
np.allclose(original_image_slice, image_slice_disabled, atol=1e-2, rtol=1e-2),
("Original outputs should match when fused QKV projections are disabled."),
)
def test_flux_image_output_shape(self):
pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device)
inputs = self.get_dummy_inputs(torch_device)
height_width_pairs = [(32, 32), (72, 57)]
for height, width in height_width_pairs:
expected_height = height - height % (pipe.vae_scale_factor * 2)
expected_width = width - width % (pipe.vae_scale_factor * 2)
inputs.update({"height": height, "width": width})
image = pipe(**inputs).images[0]
output_height, output_width, _ = image.shape
self.assertEqual(
(output_height, output_width),
(expected_height, expected_width),
f"Output shape {image.shape} does not match expected shape {(expected_height, expected_width)}",
)
+1 -1
View File
@@ -103,7 +103,7 @@ def check_qkv_fusion_processors_exist(model):
def check_qkv_fused_layers_exist(model, layer_names):
is_fused_submodules = []
for submodule in model.modules():
if not isinstance(submodule, AttentionModuleMixin):
if not isinstance(submodule, AttentionModuleMixin) or not submodule._supports_qkv_fusion:
continue
is_fused_attribute_set = submodule.fused_projections
is_fused_layer = True