# TensorRT-LLM/examples/auto_deploy/super_v3.yaml
# AutoDeploy "Super v3" config (added in PR #10397, TRTLLM-10053).
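# Illustrative usage (a sketch only; exact CLI flags and backend names vary by
# TensorRT-LLM version -- "--backend _autodeploy" and "--extra_llm_api_options"
# are assumptions here, so check the AutoDeploy examples README for your build):
#   trtllm-serve <hf-model-or-checkpoint> \
#     --backend _autodeploy \
#     --extra_llm_api_options super_v3.yaml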

runtime: trtllm
compile_backend: torch-cudagraph
max_batch_size: 384
max_seq_len: 65536 # tunable
enable_chunked_prefill: true
attn_backend: flashinfer
model_factory: AutoModelForCausalLM
skip_loading_weights: false
free_mem_ratio: 0.9
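# CUDA graphs are captured for the decode batch sizes listed below; the list
# tops out at 384, matching max_batch_size above.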
cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 24, 32, 64, 128, 256, 320, 384]
kv_cache_config:
  # disable kv_cache reuse since not supported for hybrid/ssm models
  enable_block_reuse: false
transforms:
  detect_sharding:
    allreduce_strategy: SYMM_MEM
    sharding_dims: ['ep', 'bmm']
    manual_config:
      head_dim: 128
      tp_plan:
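        # Informal notes on the values below (the detect_sharding transform
        # defines the exact semantics): "colwise"/"rowwise" follow the usual
        # Megatron-style pairing, so each colwise -> rowwise pair needs only
        # one all-reduce; "gather" is assumed here to mean a simple shard
        # whose output is all-gathered; "mamba" is assumed to select an
        # SSM-aware split of the packed in_proj.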
        # mamba SSM layer
        "in_proj": "mamba"
        "out_proj": "rowwise"
        # attention layer
        "q_proj": "colwise"
        "k_proj": "colwise"
        "v_proj": "colwise"
        "o_proj": "rowwise"
        # NOTE: consider not sharding shared experts and/or
        # latent projections at all, keeping them replicated.
        # To do so, comment out the corresponding entries.
        # moe layer: SHARED experts
        "up_proj": "colwise"
        "down_proj": "rowwise"
        # MoLE: latent projections: simple shard
        "fc1_latent_proj": "gather"
        "fc2_latent_proj": "gather"
  multi_stream_moe:
    stage: compile
    enabled: false
  # tunable mamba cache dtype
  # --> use float32 for accuracy and default (null) for speed
  insert_cached_ssm_attention:
    cache_config:
      # mamba_dtype: float32
      mamba_dtype: null
  gather_logits_before_lm_head:
    # TODO: fix https://github.com/NVIDIA/TensorRT-LLM/issues/9878 to enable by default
    enabled: true
  fuse_mamba_a_log:
    stage: post_load_fusion
    enabled: true