Signed-off-by: Lucas Liebenwein <11156568+lucaslie@users.noreply.github.com>
parent d9fba85396
commit a1964bcbbc
@@ -13,8 +13,27 @@ kv_cache_config:
  enable_block_reuse: false
transforms:
  detect_sharding:
    sharding_source: ['factory', 'heuristic']
    sharding_dims: ['ep', 'bmm']
    manual_config:
      head_dim: 128
      tp_plan:
        # mamba SSM layer
        "in_proj": "mamba"
        "out_proj": "rowwise"
        # attention layer
        "q_proj": "colwise"
        "k_proj": "colwise"
        "v_proj": "colwise"
        "o_proj": "rowwise"
        # NOTE: consider not sharding shared experts and/or
        # latent projections at all, keeping them replicated.
        # To do so, comment out the corresponding entries.
        # moe layer: SHARED experts
        "up_proj": "colwise"
        "down_proj": "rowwise"
        # MoLE: latent projections: simple shard
        "fc1_latent_proj": "gather"
        "fc2_latent_proj": "gather"
  multi_stream_moe:
    stage: compile
    enabled: true
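For reference, the tp_plan above is a plain mapping from module-name suffixes to sharding strategies ("colwise", "rowwise", "gather", "mamba"). Below is a minimal, illustrative sketch of how such a mapping could be loaded and resolved per module; the trimmed YAML snippet, the sharding_for helper, and the last-component matching rule are assumptions for illustration only, not the detect_sharding implementation in TensorRT-LLM.

# Illustrative sketch only: load a trimmed tp_plan with PyYAML and look up a
# per-module strategy by the module name's last component.
# `sharding_for` is a hypothetical helper, not a TensorRT-LLM API.
import yaml

_EXAMPLE_TP_PLAN_YAML = """
tp_plan:
  "in_proj": "mamba"
  "out_proj": "rowwise"
  "q_proj": "colwise"
  "o_proj": "rowwise"
  "up_proj": "colwise"
  "down_proj": "rowwise"
"""

tp_plan = yaml.safe_load(_EXAMPLE_TP_PLAN_YAML)["tp_plan"]

def sharding_for(module_name):
    """Return the strategy whose key matches the module name's last component."""
    leaf = module_name.rsplit(".", 1)[-1]
    return tp_plan.get(leaf)  # None -> leave the module replicated

print(sharding_for("backbone.layers.0.mixer.in_proj"))     # -> "mamba"
print(sharding_for("backbone.layers.3.self_attn.o_proj"))  # -> "rowwise"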
@@ -190,42 +190,6 @@ def get_model_from_config_patched(config, **kwargs):

# TODO: figure out how this can be incorporated into the export patch system
AutoModelForCausalLM.from_config = get_model_from_config_patched


# _config_from_pretrained_original = AutoConfig.from_pretrained
# _nemotron_h_base_model_tp_plan = {
#     # mamba SSM layer
#     "in_proj": "mamba",
#     "out_proj": "rowwise",
#     # attention layer
#     "q_proj": "colwise",
#     "k_proj": "colwise",
#     "v_proj": "colwise",
#     "o_proj": "rowwise",
#     # NOTE: consider not sharding shared experts and/or
#     # latent projections at all, keeping them replicated.
#     # To do so, comment out the corresponding entries.
#     # moe layer: SHARED experts
#     "up_proj": "colwise",
#     "down_proj": "rowwise",
#     # MoLE: latent projections: simple shard
#     "fc1_latent_proj": "gather",
#     "fc2_latent_proj": "gather",
# }


# def get_config_from_pretrained_patched(*args, **kwargs):
#     ret = _config_from_pretrained_original(*args, **kwargs)
#     config = ret[0] if isinstance(ret, tuple) else ret
#     # heuristic to check if it's a NemotronH MoE Model
#     model_type = getattr(config, "model_type", None)
#     num_moe_layers = getattr(config, "layers_block_type", []).count("moe")
#     if model_type == "nemotron_h" and num_moe_layers > 0:
#         config.base_model_tp_plan = _nemotron_h_base_model_tp_plan
#     return (config, *ret[1:]) if isinstance(ret, tuple) else config


# # TODO: figure out how this can be incorporated into the export patch system
# AutoConfig.from_pretrained = get_config_from_pretrained_patched


# TODO: figure out how this can be incorporated into the export patch system
# Only patch if the module isn't available
_mamba_ssm_module = "mamba_ssm"
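The remainder of this hunk is not shown. For context only, gating a patch on whether mamba_ssm is importable is typically done with importlib.util.find_spec from the standard library; the sketch below is an assumption about how _mamba_ssm_module might be consumed, not the code that actually follows in the file.

# Sketch under the assumption above: only install a fallback when mamba_ssm
# cannot be imported. importlib.util.find_spec returns None for missing modules.
import importlib.util

_mamba_ssm_module = "mamba_ssm"

if importlib.util.find_spec(_mamba_ssm_module) is None:
    # mamba_ssm is unavailable: register the patched/fallback implementation here.
    pass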