From ba1037ca4aa2e134e28b98f987495177fee6c97e Mon Sep 17 00:00:00 2001
From: xxi <95731198+xxi-nv@users.noreply.github.com>
Date: Tue, 13 Jan 2026 09:21:01 +0800
Subject: [PATCH] [https://nvbugs/5762336][fix] support parsing the
 modules_to_not_convert keyword of the HF model config (#10527)

Signed-off-by: xxi
---
 tensorrt_llm/_torch/model_config.py | 34 ++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/tensorrt_llm/_torch/model_config.py b/tensorrt_llm/_torch/model_config.py
index 459034ec0e..2fff7e47eb 100644
--- a/tensorrt_llm/_torch/model_config.py
+++ b/tensorrt_llm/_torch/model_config.py
@@ -316,32 +316,50 @@ class ModelConfig(Generic[TConfig]):
 
         quant_config = QuantConfig()
         layer_quant_config = None
+        # Read exclude_modules from HF config if present (HF format module names)
+        hf_exclude_modules = hf_quant_config.get('modules_to_not_convert', None)
+
         # DeepSeek V3 FP8 ckpt
         if hf_quant_config.get("quant_method") == "fp8" and hf_quant_config.get(
                 "weight_block_size", []):
             quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
-            if moe_backend == 'TRTLLM':
-                # TODO: This is a hack. Remove after fp8 bmm is integrated.
-                quant_config.exclude_modules = [
-                    "*kv_b_proj*", "*k_b_proj*", "*eh_proj"
-                ]
-            else:
-                quant_config.exclude_modules = ["*eh_proj"]
             block_size = hf_quant_config.get("weight_block_size", [])
             assert tuple(block_size) == (
                 128,
                 128), "FP8_BLOCK_SCALES only supports block_size=(128,128)"
             quant_config.group_size = block_size[0]
+
+            # Set default exclude_modules for FP8_BLOCK_SCALES
+            if moe_backend == 'TRTLLM':
+                default_exclude = ["*kv_b_proj*", "*k_b_proj*", "*eh_proj"]
+            else:
+                default_exclude = ["*eh_proj"]
+
+            # Merge HF config's modules_to_not_convert with default exclude_modules
+            if hf_exclude_modules is not None:
+                quant_config.exclude_modules = list(
+                    set(hf_exclude_modules + default_exclude))
+            else:
+                quant_config.exclude_modules = default_exclude
 
         # MXFP4 checkpoints.
         elif hf_quant_config.get("quant_method") == "mxfp4":
             quant_config.quant_algo = ModelConfig.get_mxfp4_quant_algo(
                 moe_backend)
             quant_config.group_size = 32
-            quant_config.exclude_modules = [
+
+            # Default exclude_modules for MXFP4 (TRTLLM internal format)
+            default_exclude = [
                 'block.*.attn.out', 'block.*.mlp.gate', 'block.*.attn.qkv',
                 'embedding', 'unembedding'
             ]
+            # Merge HF config's modules_to_not_convert with default exclude_modules
+            if hf_exclude_modules is not None:
+                quant_config.exclude_modules = list(
+                    set(hf_exclude_modules + default_exclude))
+            else:
+                quant_config.exclude_modules = default_exclude
+
         return quant_config, layer_quant_config
 
     @staticmethod
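
For illustration, a minimal standalone Python sketch of the merge behavior this
patch introduces: exclude patterns listed under modules_to_not_convert in the HF
quantization config are combined with the backend-specific defaults. The helper
name merge_exclude_modules and the sample config values are hypothetical and not
part of the TensorRT-LLM API; the actual logic is the inline code added to
ModelConfig above.

    # Minimal sketch of the exclude_modules merge added by this patch.
    # Helper name and sample config are illustrative only.
    def merge_exclude_modules(hf_quant_config: dict,
                              default_exclude: list) -> list:
        # HF checkpoints may list layers to keep unquantized under
        # "modules_to_not_convert"; merge them with the backend defaults
        # and de-duplicate (order is not significant).
        hf_exclude_modules = hf_quant_config.get('modules_to_not_convert', None)
        if hf_exclude_modules is not None:
            return list(set(hf_exclude_modules + default_exclude))
        return default_exclude

    # Hypothetical FP8 block-scale config carrying modules_to_not_convert.
    sample_hf_quant_config = {
        "quant_method": "fp8",
        "weight_block_size": [128, 128],
        "modules_to_not_convert": ["lm_head", "*eh_proj"],
    }
    # Defaults used when moe_backend != 'TRTLLM' in the patched code.
    print(merge_exclude_modules(sample_hf_quant_config, ["*eh_proj"]))
    # -> ['lm_head', '*eh_proj'] (set iteration order may vary)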