# TensorRT-LLM/tests/scripts/perf-sanity/k2_thinking_fp4_blackwell.yaml
# chenfeiz0326 56073f501a
# [TRTLLM-8263][feat] Add Aggregated Perf Tests (#10598)
# Signed-off-by: Chenfei Zhang <chenfeiz@nvidia.com>
# 2026-01-17 13:16:36 +08:00
#
# 135 lines
# 3.4 KiB
# YAML

---
# Perf-sanity metadata: identifies the model under test and the GPU
# architectures this config is valid for.
metadata:
  model_name: k2_thinking_fp4
  supported_gpus:
  - B200
server_configs:
# 8k1k configs - TEP8 with TRTLLM
# Low-concurrency latency case: tensor/expert parallel across 8 GPUs,
# attention-DP disabled, TRTLLM MoE backend.
- name: "k2_thinking_fp4_tep8_8k1k"
  model_name: "k2_thinking_fp4"
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  pipeline_parallel_size: 1
  max_batch_size: 4
  # 8192 ISL + 1024 OSL fits within the 12288-token budget per iteration.
  max_num_tokens: 12288
  trust_remote_code: true
  attn_backend: "TRTLLM"
  enable_attention_dp: false
  moe_config:
    backend: "TRTLLM"
  cuda_graph_config:
    enable_padding: true
    # Keep CUDA-graph capture aligned with the server max_batch_size.
    max_batch_size: 4
  kv_cache_config:
    dtype: "fp8"
    # Reuse disabled so perf numbers are not skewed by prefix-cache hits.
    enable_block_reuse: false
    free_gpu_memory_fraction: 0.6
  client_configs:
  - name: "con2_iter10_8k1k"
    concurrency: 2
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.2
    backend: "openai"
    trust_remote_code: true
# 8k1k configs - DEP8 with CUTLASS
# High-concurrency throughput case: attention data parallelism enabled
# (DEP8), CUTLASS MoE backend, batch size raised to 64.
- name: "k2_thinking_fp4_dep8_8k1k"
  model_name: "k2_thinking_fp4"
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  pipeline_parallel_size: 1
  max_batch_size: 64
  max_num_tokens: 12288
  trust_remote_code: true
  attn_backend: "TRTLLM"
  enable_attention_dp: true
  attention_dp_config:
    # Do not wait extra iterations before batching; balance ranks instead.
    batching_wait_iters: 0
    enable_balance: true
    timeout_iters: 60
  moe_config:
    backend: "CUTLASS"
  cuda_graph_config:
    enable_padding: true
    # Keep CUDA-graph capture aligned with the server max_batch_size.
    max_batch_size: 64
  kv_cache_config:
    dtype: "fp8"
    # Reuse disabled so perf numbers are not skewed by prefix-cache hits.
    enable_block_reuse: false
    free_gpu_memory_fraction: 0.6
  client_configs:
  - name: "con512_iter10_8k1k"
    concurrency: 512
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.2
    backend: "openai"
    trust_remote_code: true
# 32k8k configs - TEP8 with TRTLLM
# Long-context latency case: 32k ISL requires chunked prefill since
# max_num_tokens (8192) is below the input length.
- name: "k2_thinking_fp4_tep8_32k8k"
  model_name: "k2_thinking_fp4"
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  pipeline_parallel_size: 1
  max_batch_size: 4
  max_num_tokens: 8192
  trust_remote_code: true
  attn_backend: "TRTLLM"
  enable_chunked_prefill: true
  enable_attention_dp: false
  moe_config:
    backend: "TRTLLM"
  cuda_graph_config:
    enable_padding: true
    # Keep CUDA-graph capture aligned with the server max_batch_size.
    max_batch_size: 4
  kv_cache_config:
    dtype: "fp8"
    # Reuse disabled so perf numbers are not skewed by prefix-cache hits.
    enable_block_reuse: false
    free_gpu_memory_fraction: 0.6
  client_configs:
  - name: "con2_iter10_32k8k"
    concurrency: 2
    iterations: 10
    isl: 32768
    osl: 8192
    random_range_ratio: 0.2
    backend: "openai"
    trust_remote_code: true
# 32k8k configs - DEP8 with CUTLASS
# Long-context throughput case: attention DP with chunked prefill;
# batch size capped at 32 to fit the longer KV footprint.
- name: "k2_thinking_fp4_dep8_32k8k"
  model_name: "k2_thinking_fp4"
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  pipeline_parallel_size: 1
  max_batch_size: 32
  max_num_tokens: 8192
  trust_remote_code: true
  attn_backend: "TRTLLM"
  enable_chunked_prefill: true
  enable_attention_dp: true
  attention_dp_config:
    # Do not wait extra iterations before batching; balance ranks instead.
    batching_wait_iters: 0
    enable_balance: true
    timeout_iters: 60
  moe_config:
    backend: "CUTLASS"
  cuda_graph_config:
    enable_padding: true
    # Keep CUDA-graph capture aligned with the server max_batch_size.
    max_batch_size: 32
  kv_cache_config:
    dtype: "fp8"
    # Reuse disabled so perf numbers are not skewed by prefix-cache hits.
    enable_block_reuse: false
    free_gpu_memory_fraction: 0.6
  client_configs:
  - name: "con128_iter10_32k8k"
    concurrency: 128
    iterations: 10
    isl: 32768
    osl: 8192
    random_range_ratio: 0.2
    backend: "openai"
    trust_remote_code: true