mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
Signed-off-by: Frank Di Natale <3429989+FrankD412@users.noreply.github.com> Signed-off-by: Venky Ganesh <23023424+venkywonka@users.noreply.github.com> Co-authored-by: Frank Di Natale <3429989+FrankD412@users.noreply.github.com>
20 lines
423 B
YAML
20 lines
423 B
YAML
# TensorRT-LLM configuration (flat options plus four nested sub-configs).
# NOTE(review): the meaning of each option is inferred from its key name
# only -- confirm against the schema of the tool that consumes this file.

# Top-level batch/token limits and KV-cache memory budget.
max_batch_size: 1024
max_num_tokens: 3200
kv_cache_free_gpu_memory_fraction: 0.8

# Parallelism layout.
tensor_parallel_size: 8
moe_expert_parallel_size: 8

# NOTE(review): presumably allows running code shipped with the model
# repository; keep enabled only for trusted model sources -- confirm.
trust_remote_code: true
enable_attention_dp: true

# This max_batch_size (128) is scoped to cuda_graph_config and is a
# different key from the top-level max_batch_size (1024).
cuda_graph_config:
  enable_padding: true
  max_batch_size: 128

kv_cache_config:
  dtype: fp8

stream_interval: 10

speculative_config:
  decoding_type: MTP
  num_nextn_predict_layers: 1

# This max_num_tokens is scoped to moe_config; it currently carries the
# same value (3200) as the top-level max_num_tokens.
moe_config:
  backend: DEEPGEMM
  max_num_tokens: 3200