# TensorRT-LLM/examples/auto_deploy/nemotron_flash.yaml
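# Example AutoDeploy configuration for Nemotron Flash, a hybrid attention/SSM model.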

compile_backend: torch-cudagraph  # torch backend with CUDA graph capture
max_batch_size: 384
max_seq_len: 2097152  # 2^21 tokens (2M context)
max_num_tokens: 8192  # token budget per forward pass; with chunked prefill, long prompts are processed in chunks of this size
enable_chunked_prefill: true
model_factory: NemotronFlashForCausalLM
free_mem_ratio: 0.9  # fraction of free GPU memory to allocate to the cache
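# Batch sizes to pre-capture CUDA graphs for; at runtime a batch is typically padded
# up to the nearest captured size, so the list should cover the expected batch range.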
cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 24, 32, 64, 96, 128, 256, 320, 384]
kv_cache_config:
  # disable kv_cache reuse since it is not supported for hybrid/ssm models
  enable_block_reuse: false
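# Per-transform overrides for AutoDeploy's graph-transform pipeline.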
transforms:
  gather_logits_before_lm_head:
    # TODO: fix https://github.com/NVIDIA/TensorRT-LLM/issues/9878 to enable by default
    enabled: true
  fuse_mamba_a_log:
    stage: post_load_fusion
    enabled: true
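
# Usage sketch (an assumption, not taken from this file; flag names and the model id
# may differ across TensorRT-LLM versions -- check the AutoDeploy docs before running):
#
#   trtllm-serve <model-id-or-path> \
#     --backend _autodeploy \
#     --extra_llm_api_options nemotron_flash.yaml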