diff --git a/tensorrt_llm/_torch/models/modeling_llama.py b/tensorrt_llm/_torch/models/modeling_llama.py
index dab230ec86..a6c32766af 100644
--- a/tensorrt_llm/_torch/models/modeling_llama.py
+++ b/tensorrt_llm/_torch/models/modeling_llama.py
@@ -778,21 +778,32 @@ class LlamaDecoderLayer(DecoderLayer):
         )
 
         # Fully Connected
         if self.PRE_MLP_FUSION:
+            has_lora = bool(kwargs.get('lora_params'))
+
             if self.is_nvfp4 or self.is_fp8_quant:
-                scale = self.mlp.gate_up_proj.input_scale
+                # WAR: Skip FP8/NVFP4 quantization when LoRA is active
+                # since LoRA grouped_gemm does not support FP8 yet
+                # see: cpp/tensorrt_llm/thop/loraOp.cpp::lora_grouped_gemm
+                if has_lora:
+                    scale = None  # To prevent quantization
+                    fusion_op = AllReduceFusionOp.RESIDUAL_RMS_NORM  # Use non-quantizing fusion
+                else:
+                    scale = self.mlp.gate_up_proj.input_scale
+                    fusion_op = self.pre_mlp_fusion_op
             else:
                 scale = None
+                fusion_op = self.pre_mlp_fusion_op
             all_reduce_output = self.all_reduce(
                 hidden_states,
                 all_reduce_params=AllReduceParams(
-                    fusion_op=self.pre_mlp_fusion_op,
+                    fusion_op=fusion_op,
                     residual=residual,
                     norm_weight=self.post_attention_layernorm.weight,
                     scale=scale,
                     eps=self.post_attention_layernorm.variance_epsilon,
                 ))
-            if self.is_nvfp4:
+            if fusion_op == AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4:
                 act_fp4, act_sf, residual = all_reduce_output
                 hidden_states = Fp4QuantizedTensor(act_fp4, act_sf)
             else:
@@ -841,24 +852,30 @@ class LlamaDecoderLayer(DecoderLayer):
             # The next layernorm exists but it could be the last decoder layer.
             # Adjust the scale and fusion pattern.
+            has_lora = bool(kwargs.get('lora_params'))
+
+            # WAR: Skip FP8/NVFP4 quantization when LoRA is active
+            # since LoRA grouped_gemm does not support FP8 yet
             if not (self.next_attn is not None and (self.is_nvfp4 or self.is_fp8_quant)) \
-                    or not hasattr(self.next_attn.qkv_proj, 'input_scale'):
+                    or not hasattr(self.next_attn.qkv_proj, 'input_scale') \
+                    or has_lora:
                 scale = None
-                self.post_mlp_fusion_op = AllReduceFusionOp.RESIDUAL_RMS_NORM
+                post_fusion_op = AllReduceFusionOp.RESIDUAL_RMS_NORM
             else:
                 scale = self.next_attn.qkv_proj.input_scale
+                post_fusion_op = self.post_mlp_fusion_op
             all_reduce_output = self.all_reduce(
                 hidden_states,
                 all_reduce_params=AllReduceParams(
-                    fusion_op=self.post_mlp_fusion_op,
+                    fusion_op=post_fusion_op,
                     residual=residual,
                     norm_weight=self.next_layer_layernorm.weight,
                     scale=scale,
                     eps=self.next_layer_layernorm.variance_epsilon,
                 ))
-            if self.post_mlp_fusion_op == AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4:
+            if post_fusion_op == AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4:
                 act_fp4, act_sf, residual = all_reduce_output
                 hidden_states = Fp4QuantizedTensor(act_fp4, act_sf)
             else:
diff --git a/tensorrt_llm/_torch/modules/attention.py b/tensorrt_llm/_torch/modules/attention.py
index 5b1ec88351..3f571d9d7e 100644
--- a/tensorrt_llm/_torch/modules/attention.py
+++ b/tensorrt_llm/_torch/modules/attention.py
@@ -433,6 +433,7 @@ class Attention(nn.Module):
         output: Optional[torch.Tensor] = None,
         output_sf: Optional[torch.Tensor] = None,
         attention_sinks: Optional[torch.Tensor] = None,
+        has_lora: bool = False,
     ):
         num_tokens = attn_metadata.num_tokens
 
@@ -446,7 +447,8 @@ class Attention(nn.Module):
         out_scale_sf = None
         # Don't set out_scale if o_proj has pre_quant_scale - this prevents FP8/FP4 output
         # and keeps attention output in BF16 for better precision when applying pre_quant_scale
-        if self._use_quantize_output():
+        # Also don't set out_scale if LoRA is active - LoRA grouped_gemm doesn't support FP8
+        if self._use_quantize_output() and not has_lora:
             out_scale = self.o_proj.inv_input_scale
             out_scale_sf = self.o_proj.input_scale
 
@@ -499,6 +501,7 @@ class Attention(nn.Module):
         attention_mask_data: Optional[torch.Tensor],
         mrope_config: Optional[dict],
         attention_sinks: Optional[torch.Tensor] = None,
+        has_lora: bool = False,
     ):
         mrope_rotary_cos_sin = None
         mrope_position_deltas = None
@@ -544,7 +547,8 @@ class Attention(nn.Module):
             mrope_position_deltas,
             attention_window_size,
             attention_mask_data,
-            attention_sinks=attention_sinks)
+            attention_sinks=attention_sinks,
+            has_lora=has_lora)
 
         if output_sf is not None:
             output = Fp4QuantizedTensor(output, output_sf)
@@ -619,7 +623,8 @@ class Attention(nn.Module):
             attention_window_size,
             attention_mask_data,
             mrope_config=mrope_config,
-            attention_sinks=attention_sinks)
+            attention_sinks=attention_sinks,
+            has_lora=bool(lora_params))
 
         if self.attn_output_gate:
             gate = torch.sigmoid(gate)
diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py
index e5ee03da65..b7ffce9027 100644
--- a/tests/unittest/llmapi/test_llm_pytorch.py
+++ b/tests/unittest/llmapi/test_llm_pytorch.py
@@ -30,8 +30,8 @@ from .test_llm import (_test_llm_capture_request_error, get_model_path,
                        prompts, run_llm_abort_request,
                        run_llm_with_postprocess_parallel_and_result_handler,
                        tinyllama_logits_processor_test_harness)
-from utils.util import (force_ampere, similar, skip_fp8_pre_ada,
-                        skip_gpu_memory_less_than_40gb,
+from utils.util import (force_ampere, similar, similarity_score,
+                        skip_fp8_pre_ada, skip_gpu_memory_less_than_40gb,
                         skip_gpu_memory_less_than_80gb,
                         skip_gpu_memory_less_than_138gb, skip_ray)
 from utils.llm_data import llm_models_root
@@ -629,6 +629,42 @@ def test_llama_3_1_8b_fp8_with_bf16_lora(cuda_graph_config) -> None:
     assert similar(output.outputs[0].text, reference)
 
 
+@skip_ray  # https://nvbugs/5682551
+@skip_gpu_memory_less_than_80gb
+def test_llama_3_3_70b_fp8_with_squad_lora_tp2() -> None:
+    skip_fp8_pre_ada(use_fp8=True)
+
+    model_dir = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct-FP8"
+    lora_dir = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct-FP8-lora-adapter_NIM_r8"
+
+    prompt = "What is the capital of the United States?"
+    expected_output = " Washington, D.C.\nWhat is the capital of the United States? Washington, D.C."
+
+    lora_config = LoraConfig(lora_dir=[lora_dir],
+                             max_lora_rank=8,
+                             max_loras=2,
+                             max_cpu_loras=2)
+    lora_req = LoRARequest("squad-lora", 0, lora_dir)
+
+    llm = LLM(model_dir,
+              tensor_parallel_size=2,
+              lora_config=lora_config,
+              cuda_graph_config=None)
+
+    try:
+        output = llm.generate(prompt,
+                              SamplingParams(max_tokens=50, temperature=0.0),
+                              lora_request=[lora_req])
+        generated_text = output.outputs[0].text
+        print(f"Generated output: {repr(generated_text)}")
+
+        similarity = similarity_score(generated_text, expected_output)
+        assert similar(generated_text, expected_output, threshold=0.8), \
+            f"Output similarity too low (similarity={similarity:.2%})!\nExpected: {repr(expected_output)}\nGot: {repr(generated_text)}"
+    finally:
+        llm.shutdown()
+
+
 @skip_gpu_memory_less_than_80gb
 @pytest.mark.part2
 @test_lora_with_and_without_cuda_graph
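A minimal standalone sketch of the fallback this patch applies in both the decoder layer and the attention module: when LoRA is active, the quantizing all-reduce fusion and the FP8/NVFP4 scale are skipped so activations stay in BF16. The `AllReduceFusionOp` enum below is a stub and `select_mlp_fusion` is a hypothetical helper; neither is part of the patch or of tensorrt_llm.

```python
from enum import Enum, auto
from typing import Optional, Tuple


class AllReduceFusionOp(Enum):
    # Minimal stand-in for tensorrt_llm's AllReduceFusionOp enum.
    RESIDUAL_RMS_NORM = auto()
    RESIDUAL_RMS_NORM_QUANT_FP8 = auto()
    RESIDUAL_RMS_NORM_QUANT_NVFP4 = auto()


def select_mlp_fusion(
    quantized: bool,  # layer runs FP8/NVFP4 (is_nvfp4 or is_fp8_quant)
    has_lora: bool,  # lora_params were passed for this request
    quant_fusion_op: AllReduceFusionOp,
    input_scale: Optional[float],
) -> Tuple[AllReduceFusionOp, Optional[float]]:
    """Pick the all-reduce fusion op and activation scale.

    Mirrors the WAR's intent: with LoRA active, the quantizing fusion is
    skipped (LoRA grouped_gemm has no FP8 path yet), so the all-reduce
    falls back to plain RESIDUAL_RMS_NORM and no scale is passed.
    """
    if quantized and not has_lora:
        return quant_fusion_op, input_scale
    return AllReduceFusionOp.RESIDUAL_RMS_NORM, None


if __name__ == "__main__":
    # FP8/NVFP4 layer without LoRA: keep the quantizing fusion and scale.
    print(select_mlp_fusion(True, False,
                            AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4,
                            0.5))
    # Same layer with LoRA: fall back to the non-quantizing fusion, no scale.
    print(select_mlp_fusion(True, True,
                            AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4,
                            0.5))
```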