[#9643][fix] AutoDeploy: fix nano sharding config (#9668)

Signed-off-by: Lucas Liebenwein <11156568+lucaslie@users.noreply.github.com>
Lucas Liebenwein 2025-12-03 14:10:25 -05:00 committed by GitHub
parent d9fba85396
commit a1964bcbbc
2 changed files with 20 additions and 37 deletions


@@ -13,8 +13,27 @@ kv_cache_config:
enable_block_reuse: false
transforms:
detect_sharding:
sharding_source: ['factory', 'heuristic']
sharding_dims: ['ep', 'bmm']
manual_config:
head_dim: 128
tp_plan:
# mamba SSM layer
"in_proj": "mamba"
"out_proj": "rowwise"
# attention layer
"q_proj": "colwise"
"k_proj": "colwise"
"v_proj": "colwise"
"o_proj": "rowwise"
# NOTE: consider not sharding shared experts and/or
# latent projections at all, keeping them replicated.
# To do so, comment out the corresponding entries.
# moe layer: SHARED experts
"up_proj": "colwise"
"down_proj": "rowwise"
# MoLE: latent projections: simple shard
"fc1_latent_proj": "gather"
"fc2_latent_proj": "gather"
multi_stream_moe:
stage: compile
enabled: true
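For orientation, here is a minimal sketch of how the `manual_config.tp_plan` mapping above can be read and matched against module names. The file name `nano_v3.yaml`, the suffix-matching helper `sharding_for`, and the example module names are illustrative assumptions rather than AutoDeploy's actual loader; the sharding strategies ("mamba", "colwise", "rowwise", "gather") come straight from the config above.

# Minimal sketch, not AutoDeploy's loader: parse the YAML config and look up
# the sharding strategy for a fully qualified module name by suffix match.
# Assumptions: the config lives at "nano_v3.yaml" (hypothetical path) and
# tp_plan keys are treated as module-name suffixes.
import yaml

with open("nano_v3.yaml") as f:
    cfg = yaml.safe_load(f)

tp_plan = cfg["transforms"]["detect_sharding"]["manual_config"]["tp_plan"]

def sharding_for(module_name: str) -> str | None:
    """Return the first tp_plan entry whose key is a suffix of module_name."""
    for suffix, strategy in tp_plan.items():
        if module_name.endswith(suffix):
            return strategy
    return None  # no match: leave the module replicated / unsharded

# Illustrative module names only:
print(sharding_for("model.layers.0.mixer.in_proj"))     # "mamba"
print(sharding_for("model.layers.1.self_attn.q_proj"))  # "colwise"
print(sharding_for("model.layers.2.mlp.down_proj"))     # "rowwise"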


@@ -190,42 +190,6 @@ def get_model_from_config_patched(config, **kwargs):
# TODO: figure out how this can be incorporated into the export patch system
AutoModelForCausalLM.from_config = get_model_from_config_patched
# _config_from_pretrained_original = AutoConfig.from_pretrained
# _nemotron_h_base_model_tp_plan = {
# # mamba SSM layer
# "in_proj": "mamba",
# "out_proj": "rowwise",
# # attention layer
# "q_proj": "colwise",
# "k_proj": "colwise",
# "v_proj": "colwise",
# "o_proj": "rowwise",
# # NOTE: consider not sharding shared experts and/or
# # latent projections at all, keeping them replicated.
# # To do so, comment out the corresponding entries.
# # moe layer: SHARED experts
# "up_proj": "colwise",
# "down_proj": "rowwise",
# # MoLE: latent projections: simple shard
# "fc1_latent_proj": "gather",
# "fc2_latent_proj": "gather",
# }
# def get_config_from_pretrained_patched(*args, **kwargs):
# ret = _config_from_pretrained_original(*args, **kwargs)
# config = ret[0] if isinstance(ret, tuple) else ret
# # heuristic to check if it's a NemotronH MoE Model
# model_type = getattr(config, "model_type", None)
# num_moe_layers = getattr(config, "layers_block_type", []).count("moe")
# if model_type == "nemotron_h" and num_moe_layers > 0:
# config.base_model_tp_plan = _nemotron_h_base_model_tp_plan
# return (config, *ret[1:]) if isinstance(ret, tuple) else config
# # TODO: figure out how this can be incorporated into the export patch system
# AutoConfig.from_pretrained = get_config_from_pretrained_patched
# TODO: figure out how this can be incorporated into the export patch system
# Only patch if the module isn't available
_mamba_ssm_module = "mamba_ssm"
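The trailing context ("Only patch if the module isn't available") refers to a guarded patch for `mamba_ssm`. A rough, hedged sketch of that guard pattern is below; the stub body is hypothetical and does not reproduce the real patch, which registers whatever symbols AutoDeploy actually needs.

# Hedged sketch of an "only patch if the module isn't available" guard.
# The stub is a placeholder; the real patch installs the required symbols.
import importlib.util
import sys
import types

_mamba_ssm_module = "mamba_ssm"

if importlib.util.find_spec(_mamba_ssm_module) is None:
    # mamba_ssm is not installed: register an empty placeholder module so that
    # `import mamba_ssm` in downstream code does not fail at import time.
    sys.modules[_mamba_ssm_module] = types.ModuleType(_mamba_ssm_module)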