From 64ff5cac520650813d8926788ccf7712ba3361de Mon Sep 17 00:00:00 2001
From: SamareshSingh <97642706+ssam18@users.noreply.github.com>
Date: Mon, 19 Jan 2026 03:38:00 -0600
Subject: [PATCH] [None][chore] docs: clarify LoRA is not supported with --use_fp8_rowwise in Fp8RowwiseAttention (see #2603) (#10320)

Signed-off-by: Samaresh Kumar Singh
Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
Co-authored-by: Kanghwan <861393+karljang@users.noreply.github.com>
Co-authored-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
---
 tensorrt_llm/quantization/layers.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tensorrt_llm/quantization/layers.py b/tensorrt_llm/quantization/layers.py
index 7aa8e80800..28f4cbf2e8 100644
--- a/tensorrt_llm/quantization/layers.py
+++ b/tensorrt_llm/quantization/layers.py
@@ -1967,7 +1967,11 @@ class Fp8RowwiseAttention(Module):
         lora_layer_params=None,
         all_reduce_params: Optional[AllReduceParams] = None,
     ):
-        assert lora_layer_params is None, f"lora is not supported on {self.__class__.__name__} now"
+        assert lora_layer_params is None, (
+            f"LoRA is not supported by {self.__class__.__name__} (e.g., --use_fp8_rowwise). "
+            "If you need LoRA support, please use a non-quantized (e.g., bf16) attention implementation. "
+            "See https://github.com/NVIDIA/TensorRT-LLM/issues/2603 for details."
+        )
         qkv = self.qkv(hidden_states)
 
         alibi_slopes = None
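
For reference, below is a minimal, self-contained sketch of the fail-fast guard pattern this patch documents. The class and argument names here (DummyFp8RowwiseAttention, lora_layer_params, hidden_states) are illustrative stand-ins, not the actual TensorRT-LLM API; only the assert-with-actionable-message idea is taken from the patch.

class DummyFp8RowwiseAttention:
    def forward(self, hidden_states, lora_layer_params=None):
        # Fail fast with an actionable message instead of a cryptic failure
        # deeper inside the quantized attention path.
        assert lora_layer_params is None, (
            f"LoRA is not supported by {self.__class__.__name__}. "
            "Use a non-quantized (e.g., bf16) attention implementation instead."
        )
        # A real implementation would compute attention here.
        return hidden_states


attn = DummyFp8RowwiseAttention()
attn.forward("hidden")  # fine: no LoRA params passed

try:
    attn.forward("hidden", lora_layer_params={"rank": 8})
except AssertionError as err:
    print(err)  # surfaces the clarified error message

The point of the change is purely diagnostic: the behavior (rejecting LoRA inputs) is unchanged, but the message now names the quantization mode and points to issue #2603 and a workaround.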