# TensorRT-LLMs/examples/configs/curated/qwen3-next.yaml
#
# 17 lines
# 363 B
# YAML

# Serving configuration for Qwen3-Next (per the file path).
# Nested options are indented under their parent `*_config` mapping; if they
# are left at column 0, the parent parses as null and `max_batch_size` is
# defined twice at the top level.

# Top-level batching limits.
max_batch_size: 16
max_num_tokens: 4096

# Parallelism layout.
tensor_parallel_size: 4
moe_expert_parallel_size: 4

trust_remote_code: true
enable_attention_dp: false

cuda_graph_config:
  enable_padding: true
  # NOTE(review): this is larger than the top-level max_batch_size (16);
  # confirm the value is intended for the CUDA graph settings.
  max_batch_size: 720

moe_config:
  backend: TRTLLM

# NOTE(review): assumed to be top-level options rather than moe_config
# members — confirm against the consumer's schema.
stream_interval: 20
num_postprocess_workers: 4

kv_cache_config:
  enable_block_reuse: false
  free_gpu_memory_fraction: 0.9