Signed-off-by: Lucas Liebenwein <11156568+lucaslie@users.noreply.github.com>
parent d9fba85396
commit a1964bcbbc
@@ -13,8 +13,27 @@ kv_cache_config:
  enable_block_reuse: false
transforms:
  detect_sharding:
    sharding_source: ['factory', 'heuristic']
    sharding_dims: ['ep', 'bmm']
    manual_config:
      head_dim: 128
      tp_plan:
        # mamba SSM layer
        "in_proj": "mamba"
        "out_proj": "rowwise"
        # attention layer
        "q_proj": "colwise"
        "k_proj": "colwise"
        "v_proj": "colwise"
        "o_proj": "rowwise"
        # NOTE: consider not sharding shared experts and/or
        # latent projections at all, keeping them replicated.
        # To do so, comment out the corresponding entries.
        # moe layer: SHARED experts
        "up_proj": "colwise"
        "down_proj": "rowwise"
        # MoLE: latent projections: simple shard
        "fc1_latent_proj": "gather"
        "fc2_latent_proj": "gather"
  multi_stream_moe:
    stage: compile
    enabled: true
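For reference, the tp_plan above is a plain mapping from module-name suffixes to sharding strategies ("colwise", "rowwise", "gather", "mamba"). Below is a minimal, illustrative sketch of how such a mapping could be loaded and resolved per module; the trimmed YAML snippet, the sharding_for helper, and the last-component matching rule are assumptions for illustration only, not the detect_sharding implementation in TensorRT-LLM.

# Illustrative sketch only: load a trimmed tp_plan with PyYAML and look up a
# per-module strategy by the module name's last component.
# `sharding_for` is a hypothetical helper, not a TensorRT-LLM API.
import yaml

_EXAMPLE_TP_PLAN_YAML = """
tp_plan:
  "in_proj": "mamba"
  "out_proj": "rowwise"
  "q_proj": "colwise"
  "o_proj": "rowwise"
  "up_proj": "colwise"
  "down_proj": "rowwise"
"""

tp_plan = yaml.safe_load(_EXAMPLE_TP_PLAN_YAML)["tp_plan"]

def sharding_for(module_name):
    """Return the strategy whose key matches the module name's last component."""
    leaf = module_name.rsplit(".", 1)[-1]
    return tp_plan.get(leaf)  # None -> leave the module replicated

print(sharding_for("backbone.layers.0.mixer.in_proj"))     # -> "mamba"
print(sharding_for("backbone.layers.3.self_attn.o_proj"))  # -> "rowwise"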
@@ -190,42 +190,6 @@ def get_model_from_config_patched(config, **kwargs):

# TODO: figure out how this can be incorporated into the export patch system
AutoModelForCausalLM.from_config = get_model_from_config_patched


# _config_from_pretrained_original = AutoConfig.from_pretrained
# _nemotron_h_base_model_tp_plan = {
#     # mamba SSM layer
#     "in_proj": "mamba",
#     "out_proj": "rowwise",
#     # attention layer
#     "q_proj": "colwise",
#     "k_proj": "colwise",
#     "v_proj": "colwise",
#     "o_proj": "rowwise",
#     # NOTE: consider not sharding shared experts and/or
#     # latent projections at all, keeping them replicated.
#     # To do so, comment out the corresponding entries.
#     # moe layer: SHARED experts
#     "up_proj": "colwise",
#     "down_proj": "rowwise",
#     # MoLE: latent projections: simple shard
#     "fc1_latent_proj": "gather",
#     "fc2_latent_proj": "gather",
# }


# def get_config_from_pretrained_patched(*args, **kwargs):
#     ret = _config_from_pretrained_original(*args, **kwargs)
#     config = ret[0] if isinstance(ret, tuple) else ret
#     # heuristic to check if it's a NemotronH MoE Model
#     model_type = getattr(config, "model_type", None)
#     num_moe_layers = getattr(config, "layers_block_type", []).count("moe")
#     if model_type == "nemotron_h" and num_moe_layers > 0:
#         config.base_model_tp_plan = _nemotron_h_base_model_tp_plan
#     return (config, *ret[1:]) if isinstance(ret, tuple) else config


# # TODO: figure out how this can be incorporated into the export patch system
# AutoConfig.from_pretrained = get_config_from_pretrained_patched


# TODO: figure out how this can be incorporated into the export patch system
# Only patch if the module isn't available
_mamba_ssm_module = "mamba_ssm"
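The remainder of this hunk is not shown. For context only, gating a patch on whether mamba_ssm is importable is typically done with importlib.util.find_spec from the standard library; the sketch below is an assumption about how _mamba_ssm_module might be consumed, not the code that actually follows in the file.

# Sketch under the assumption above: only install a fallback when mamba_ssm
# cannot be imported. importlib.util.find_spec returns None for missing modules.
import importlib.util

_mamba_ssm_module = "mamba_ssm"

if importlib.util.find_spec(_mamba_ssm_module) is None:
    # mamba_ssm is unavailable: register the patched/fallback implementation here.
    pass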