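# Config for NemotronFlashForCausalLM, a hybrid attention/SSM model, apparently
# targeting TensorRT-LLM's AutoDeploy path (suggested by the compile_backend,
# model_factory, and transforms fields below).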
compile_backend: torch-cudagraph
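# Runtime limits: up to 384 in-flight sequences, sequence lengths up to
# 2097152 (2**21) tokens, and at most 8192 tokens per forward pass; chunked
# prefill splits long prompts into max_num_tokens-sized chunks to respect
# that per-step budget.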
max_batch_size: 384
max_seq_len: 2097152
max_num_tokens: 8192
enable_chunked_prefill: true
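# Factory class used to instantiate the model.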
model_factory: NemotronFlashForCausalLM
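# Fraction of free GPU memory to allocate to the KV/state caches (0.9 = 90%).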
free_mem_ratio: 0.9
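# Batch sizes for which CUDA graphs are captured ahead of time; other runtime
# batch sizes are typically padded up to the nearest captured size.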
cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 24, 32, 64, 96, 128, 256, 320, 384]
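# KV cache behavior; block reuse is TRT-LLM's cross-request prefix caching.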
kv_cache_config:
  # disable kv_cache reuse since not supported for hybrid/ssm models
  enable_block_reuse: false
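# Graph transforms applied to the exported model graph.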
transforms:
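  # gather only the token positions whose logits are needed before the LM head
  # runs (inferred from the transform name); the TODO below tracks enabling
  # this by default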
  gather_logits_before_lm_head:
    # TODO: fix https://github.com/NVIDIA/TensorRT-LLM/issues/9878 to enable by default
    enabled: true
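  # fuse the Mamba A_log computation during the post_load_fusion stage
  # (inferred from the transform name and its stage field)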
  fuse_mamba_a_log:
    stage: post_load_fusion
    enabled: true
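# Usage sketch (assumed invocation; flag names per the trtllm-serve / AutoDeploy
# docs, verify against your TensorRT-LLM version):
#   trtllm-serve <model> --backend _autodeploy --extra_llm_api_options <this_file>.yaml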