TensorRT-LLMs/examples/auto_deploy/nano_v3.yaml
Chenghao Zhang 18fbda5cdb
[None][feat] AutoDeploy: Add A_log fusion for Mamba layers (#9422)
Signed-off-by: Chenghao Zhang <211069071+nvchenghaoz@users.noreply.github.com>
2025-11-26 14:39:20 -08:00

30 lines
856 B
YAML

# Top-level settings for the AutoDeploy example config (nano_v3.yaml).
# NOTE(review): the exact meaning of each key is defined by the consuming
# tool, not by this file — confirm against the AutoDeploy configuration
# reference before tuning.
runtime: trtllm
compile_backend: torch-cudagraph
max_batch_size: 384
max_seq_len: 65536 # tunable
enable_chunked_prefill: true
attn_backend: flashinfer
model_factory: AutoModelForCausalLM
skip_loading_weights: false
# presumably a fraction in [0, 1] of free device memory — TODO confirm
free_mem_ratio: 0.9
# Listed in ascending order; the largest entry (384) equals max_batch_size
# above. Presumably the two should be kept in sync when tuning — verify.
cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 24, 32, 64, 128, 256, 320, 384]
# KV-cache options. The option below must be nested under kv_cache_config;
# at column 0 it would become an unrelated top-level key and leave
# kv_cache_config itself null.
kv_cache_config:
  # Disable KV-cache block reuse: it is not supported for hybrid/SSM models.
  enable_block_reuse: false
# Per-transform overrides, keyed by transform name. Each transform owns its
# own nested options (e.g. `stage`, `enabled`), so these must stay indented
# under their transform; flattened, the repeated `stage`/`enabled` keys would
# collide as duplicates at the top level.
transforms:
  detect_sharding:
    sharding_source: ['factory', 'heuristic']
    sharding_dims: ['ep', 'bmm']
  multi_stream_moe:
    stage: compile
    enabled: true
  # Tunable mamba cache dtype:
  # use float32 for accuracy, or the default (null) for speed.
  insert_cached_ssm_attention:
    cache_config:
      # mamba_dtype: float32
      mamba_dtype: null
  # A_log fusion for Mamba layers.
  fuse_mamba_a_log:
    stage: post_load_fusion
    enabled: true