# Mirror of https://github.com/NVIDIA/TensorRT-LLM.git
# Synced 2026-02-18 00:35:04 +08:00
#
# NOTE(review): this file was recovered from a flattened copy in which all
# indentation was lost. The nesting under the three `*_config` keys below is
# reconstructed from the key names -- confirm against the upstream file.

max_batch_size: 16
max_num_tokens: 4096
tensor_parallel_size: 4
moe_expert_parallel_size: 4
trust_remote_code: true
enable_attention_dp: false

cuda_graph_config:
  enable_padding: true
  # NOTE(review): 720 is much larger than the top-level max_batch_size (16);
  # presumably intentional and scoped to this section only -- TODO confirm.
  max_batch_size: 720

moe_config:
  backend: TRTLLM

stream_interval: 20
num_postprocess_workers: 4

kv_cache_config:
  enable_block_reuse: false
  free_gpu_memory_fraction: 0.9