From 35010e807360e2ff86f618081089e0e85d48ab48 Mon Sep 17 00:00:00 2001
From: tomeras91 <57313761+tomeras91@users.noreply.github.com>
Date: Tue, 29 Apr 2025 18:51:43 +0300
Subject: [PATCH] Support NemotronH FP8 Quantization (1) match quant exclude
 module names to TRTLLM names (2) No need for any special weight loading for
 quantization scale weights (#3891)

Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com>
---
 tensorrt_llm/_torch/models/modeling_nemotron_h.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/tensorrt_llm/_torch/models/modeling_nemotron_h.py b/tensorrt_llm/_torch/models/modeling_nemotron_h.py
index 9fc148dbfa..4aebb419cd 100644
--- a/tensorrt_llm/_torch/models/modeling_nemotron_h.py
+++ b/tensorrt_llm/_torch/models/modeling_nemotron_h.py
@@ -232,6 +232,13 @@ class NemotronHForCausalLM(DecoderModelForCausalLM[NemotronHModel,
     ):
         if not model_config.mapping.tp_size in [1, 2, 4, 8]:
             raise ValueError("TP has to be either 1, 2, 4 or 8")
+
+        if model_config.quant_config.exclude_modules is not None:
+            model_config.quant_config.exclude_modules = [
+                k.replace('model.layers.backbone', 'model')
+                for k in model_config.quant_config.exclude_modules
+            ]
+
         super().__init__(
             NemotronHModel(model_config),
             config=model_config,
@@ -263,7 +270,9 @@ class NemotronHForCausalLM(DecoderModelForCausalLM[NemotronHModel,
             if "A_log" in key:
                 key = key.replace("A_log", "A")
 
-            if "A" in key:
+            if "_scale" in key and weights[name].dim() == 0:
+                new_weights[key] = weights[name]
+            elif "A" in key:
                 w = split(weights[name], tp_size, tp_rank)
                 w = w.to(torch.float32)
                 w = -torch.exp(w)
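
For context, a minimal standalone sketch of the two behaviors this patch adds. The checkpoint key `model.layers.backbone.embeddings` and the scale value below are hypothetical illustrations, not entries from a real NemotronH checkpoint; the variable names mirror the patch:

```python
import torch

# (1) Exclude-list renaming: the quantized checkpoint's exclude list uses a
# "model.layers.backbone...." prefix, while TRT-LLM's NemotronH modules live
# under "model....". Rewriting the prefix lets the exclusions actually match
# TRT-LLM module names.
exclude_modules = ["model.layers.backbone.embeddings"]  # hypothetical entry
exclude_modules = [
    k.replace("model.layers.backbone", "model") for k in exclude_modules
]
assert exclude_modules == ["model.embeddings"]

# (2) Scale passthrough: per-tensor quantization scales are 0-dim tensors,
# so unlike the Mamba "A" weights they need no TP split, dtype cast, or
# sign flip; they are copied into new_weights unchanged.
weights = {
    "backbone.layers.0.mixer.in_proj.weight_scale":  # hypothetical key
    torch.tensor(0.02)
}
new_weights = {}
for name, w in weights.items():
    key = name  # the real loader also applies renames such as A_log -> A
    if "_scale" in key and w.dim() == 0:
        new_weights[key] = w  # scalar quantization scale: pass through as-is
```

This is why the loader needs no dedicated branch per scale tensor: the single `"_scale" in key and dim() == 0` check catches every per-tensor scale before the `"A"` substring match (which would otherwise fire spuriously on scale keys) is evaluated.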