[None][chore] added AutoDeploy nano_v3_scale.yaml (#10845)
Signed-off-by: Eran Geva <19514940+MrGeva@users.noreply.github.com>
parent 219195688c
commit 31314b9fed
51  examples/auto_deploy/nano_v3_multi_device.yaml  (new file)
@@ -0,0 +1,51 @@
runtime: trtllm
compile_backend: torch-cudagraph
max_batch_size: 384
max_seq_len: 65536 # tunable
enable_chunked_prefill: true
attn_backend: flashinfer
model_factory: AutoModelForCausalLM
skip_loading_weights: false
sampler_type: "TRTLLMSampler"
cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 24, 32, 64, 128, 256, 320, 384]
kv_cache_config:
  free_gpu_memory_fraction: 0.88
  # tunable mamba cache dtype
  # --> use float32 for accuracy and default (auto) for speed
  mamba_ssm_cache_dtype: auto
transforms:
  detect_sharding:
    allreduce_strategy: SYMM_MEM
    sharding_dims: ['tp', 'ep', 'bmm']
    process_grid: {'tp': 8, 'ep': 1}
    manual_config:
      head_dim: 128
      tp_plan:
        # mamba SSM layer
        "in_proj": "mamba"
        "out_proj": "rowwise"
        # attention layer
        "q_proj": "colwise"
        "k_proj": "colwise"
        "v_proj": "colwise"
        "o_proj": "rowwise"
        # NOTE: consider not sharding shared experts and/or
        # latent projections at all, keeping them replicated.
        # To do so, comment out the corresponding entries.
        # moe layer: SHARED experts
        # "up_proj": "colwise"
        # "down_proj": "rowwise"
        # MoLE: latent projections: simple shard
        # "fc1_latent_proj": "gather"
        # "fc2_latent_proj": "gather"
  multi_stream_moe:
    stage: compile
    enabled: false
  gather_logits_before_lm_head:
    # TODO: fix https://github.com/NVIDIA/TensorRT-LLM/issues/9878 to enable by default
    enabled: true
  fuse_mamba_a_log:
    stage: post_load_fusion
    enabled: true
  insert_cached_ssm_attention:
    backend: flashinfer_ssm
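The tp_plan above follows the conventional Megatron-style split: "colwise" entries shard a weight's output dimension (q/k/v_proj), "rowwise" entries shard the input dimension (o_proj, out_proj), and the rowwise partial results are summed with an all-reduce, which is the step that allreduce_strategy: SYMM_MEM configures. A minimal single-device sketch of that sharding math, illustrative only and not TensorRT-LLM's actual sharding code:

# Illustrative sketch of the "colwise"/"rowwise" semantics above.
# This is NOT TensorRT-LLM's sharding code; it only checks the math
# on a single device with tp = 8 shards (process_grid: {'tp': 8}).
import torch

tp = 8
x = torch.randn(4, 256)        # (batch, hidden)
w = torch.randn(512, 256)      # full projection weight (out_dim, in_dim)

# colwise (q/k/v_proj): shard the output dim; each rank emits a slice
# of the outputs and no reduction is needed yet.
col_out = torch.cat([x @ s.T for s in w.chunk(tp, dim=0)], dim=-1)

# rowwise (o_proj / out_proj): shard the input dim; each rank emits a
# partial sum over its input slice, and the partials are combined with
# an all-reduce (the kernel allreduce_strategy: SYMM_MEM selects).
row_out = sum(xs @ s.T for xs, s in zip(x.chunk(tp, dim=1), w.chunk(tp, dim=1)))

torch.testing.assert_close(col_out, x @ w.T)
torch.testing.assert_close(row_out, x @ w.T)

The "mamba" entry for in_proj is presumably a Mamba-specific split of the fused SSM projection (which packs several per-head tensors), rather than a plain column split.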
Loading…
Reference in New Issue
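cuda_graph_batch_sizes pins the batch sizes at which CUDA graphs are captured, ending exactly at max_batch_size: 384. Runtimes that replay fixed-shape graphs typically pad each batch up to the nearest captured size, so the list trades capture time and memory against padding waste. A rough sketch of that bucketing, an assumption about the mechanism rather than the runtime's actual code:

import bisect

CAPTURED = [1, 2, 4, 8, 16, 24, 32, 64, 128, 256, 320, 384]

def graph_bucket(batch_size: int) -> int:
    """Smallest captured graph size that can hold the batch."""
    i = bisect.bisect_left(CAPTURED, batch_size)
    if i == len(CAPTURED):
        raise ValueError("batch exceeds max_batch_size (384)")
    return CAPTURED[i]

assert graph_bucket(20) == 24     # padded up; 4 slots wasted
assert graph_bucket(384) == 384   # the largest batch still has a graph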
Block a user
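How such a file is consumed is not shown in the commit; a hypothetical usage sketch, assuming the LLM import path from the examples/auto_deploy docs and assuming the top-level YAML keys map directly onto LLM keyword arguments (neither verified here):

import yaml

# Import path per examples/auto_deploy docs; treat as an assumption.
from tensorrt_llm._torch.auto_deploy import LLM

with open("examples/auto_deploy/nano_v3_multi_device.yaml") as f:
    config = yaml.safe_load(f)

# Hypothetical: assumes each top-level key is a valid LLM kwarg.
llm = LLM(model="<hf-id-or-local-checkpoint>", **config)
out = llm.generate(["Hello"])
print(out[0].outputs[0].text)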