# TensorRT-LLMs/tests/scripts/perf-sanity/benchmark_config.yaml
# chenfeiz0326 5cd8c0f6cc
# [None][test] Add perf-sweep scripts (#6738)
# Signed-off-by: Chenfei Zhang <chenfeiz@nvidia.com>
# Signed-off-by: Perkz Zheng <67892460+PerkzZheng@users.noreply.github.com>
# Co-authored-by: Perkz Zheng <67892460+PerkzZheng@users.noreply.github.com>
# 2025-08-14 14:04:47 +08:00

# 469 lines
# 8.6 KiB
# YAML

# Benchmark matrix: every list entry under test_cases is one test case.
# NOTE(review): field semantics are inferred from key names, not from the
# consuming script. isl/osl look like input/output sequence lengths, and each
# concurrency_iterations pair looks like [concurrency, iterations] - confirm.
# NOTE(review): "" appears to stand for "not set" (moe_backend,
# moe_max_num_tokens) - confirm before replacing it with null.
test_cases:
# Case 1: 70B-FP8 on 1 GPU (tp=1, ep=1), isl=1024 / osl=1024.
- id: 1
  model: "70B-FP8"
  gpus: 1
  tp: 1
  ep: 1
  attn_backend: "TRTLLM"
  moe_backend: ""
  enable_attention_dp: false
  free_gpu_mem_fraction: 0.9
  max_batch_size: 1024
  isl: 1024
  osl: 1024
  max_num_tokens: 16384
  moe_max_num_tokens: ""
  concurrency_iterations:
  - [1, 10]
  - [8, 10]
  - [64, 5]
  - [512, 2]
# Case 2: 70B-FP8 on 1 GPU (tp=1, ep=1), isl=8192 / osl=1024.
- id: 2
  model: "70B-FP8"
  gpus: 1
  tp: 1
  ep: 1
  attn_backend: "TRTLLM"
  moe_backend: ""
  enable_attention_dp: false
  free_gpu_mem_fraction: 0.9
  max_batch_size: 1024
  isl: 8192
  osl: 1024
  max_num_tokens: 16384
  moe_max_num_tokens: ""
  concurrency_iterations:
  - [1, 10]
  - [8, 10]
  - [64, 5]
  - [512, 2]
# Case 3: 70B-FP8 on 4 GPUs (tp=4, ep=1), isl=1024 / osl=1024.
- id: 3
  model: "70B-FP8"
  gpus: 4
  tp: 4
  ep: 1
  attn_backend: "TRTLLM"
  moe_backend: ""
  enable_attention_dp: false
  free_gpu_mem_fraction: 0.9
  max_batch_size: 1024
  isl: 1024
  osl: 1024
  max_num_tokens: 16384
  moe_max_num_tokens: ""
  concurrency_iterations:
  - [1, 10]
  - [8, 10]
  - [64, 5]
  - [512, 2]
# Case 4: 70B-FP8 on 4 GPUs (tp=4, ep=1), isl=8192 / osl=1024.
- id: 4
  model: "70B-FP8"
  gpus: 4
  tp: 4
  ep: 1
  attn_backend: "TRTLLM"
  moe_backend: ""
  enable_attention_dp: false
  free_gpu_mem_fraction: 0.9
  max_batch_size: 1024
  isl: 8192
  osl: 1024
  max_num_tokens: 16384
  moe_max_num_tokens: ""
  concurrency_iterations:
  - [1, 10]
  - [8, 10]
  - [64, 5]
  - [512, 2]
# Case 5: 70B-FP4 on 1 GPU (tp=1, ep=1), isl=1024 / osl=1024.
- id: 5
  model: "70B-FP4"
  gpus: 1
  tp: 1
  ep: 1
  attn_backend: "TRTLLM"
  moe_backend: ""
  enable_attention_dp: false
  free_gpu_mem_fraction: 0.9
  max_batch_size: 1024
  isl: 1024
  osl: 1024
  max_num_tokens: 16384
  moe_max_num_tokens: ""
  concurrency_iterations:
  - [1, 10]
  - [8, 10]
  - [64, 5]
  - [512, 2]
# Case 6: 70B-FP4 on 1 GPU (tp=1, ep=1), isl=8192 / osl=1024.
- id: 6
  model: "70B-FP4"
  gpus: 1
  tp: 1
  ep: 1
  attn_backend: "TRTLLM"
  moe_backend: ""
  enable_attention_dp: false
  free_gpu_mem_fraction: 0.9
  max_batch_size: 1024
  isl: 8192
  osl: 1024
  max_num_tokens: 16384
  moe_max_num_tokens: ""
  concurrency_iterations:
  - [1, 10]
  - [8, 10]
  - [64, 5]
  - [512, 2]
# Case 7: 70B-FP4 on 4 GPUs (tp=4, ep=1), isl=1024 / osl=1024.
- id: 7
  model: "70B-FP4"
  gpus: 4
  tp: 4
  ep: 1
  attn_backend: "TRTLLM"
  moe_backend: ""
  enable_attention_dp: false
  free_gpu_mem_fraction: 0.9
  max_batch_size: 1024
  isl: 1024
  osl: 1024
  max_num_tokens: 16384
  moe_max_num_tokens: ""
  concurrency_iterations:
  - [1, 10]
  - [8, 10]
  - [64, 5]
  - [512, 2]
# Case 8: 70B-FP4 on 4 GPUs (tp=4, ep=1), isl=8192 / osl=1024.
- id: 8
  model: "70B-FP4"
  gpus: 4
  tp: 4
  ep: 1
  attn_backend: "TRTLLM"
  moe_backend: ""
  enable_attention_dp: false
  free_gpu_mem_fraction: 0.9
  max_batch_size: 1024
  isl: 8192
  osl: 1024
  max_num_tokens: 16384
  moe_max_num_tokens: ""
  concurrency_iterations:
  - [1, 10]
  - [8, 10]
  - [64, 5]
  - [512, 2]
# Case 9: Scout-FP8 on 1 GPU (tp=1, ep=1), isl=1024 / osl=1024.
- id: 9
  model: "Scout-FP8"
  gpus: 1
  tp: 1
  ep: 1
  attn_backend: "TRTLLM"
  moe_backend: ""
  enable_attention_dp: false
  free_gpu_mem_fraction: 0.8
  max_batch_size: 512
  isl: 1024
  osl: 1024
  max_num_tokens: 2176
  moe_max_num_tokens: ""
  concurrency_iterations:
  - [1, 10]
  - [8, 10]
  - [64, 5]
  - [512, 2]
# Case 10: Scout-FP8 on 1 GPU (tp=1, ep=1), isl=8192 / osl=1024.
- id: 10
  model: "Scout-FP8"
  gpus: 1
  tp: 1
  ep: 1
  attn_backend: "TRTLLM"
  moe_backend: ""
  enable_attention_dp: false
  free_gpu_mem_fraction: 0.8
  max_batch_size: 512
  isl: 8192
  osl: 1024
  # 8192 + 1024 + 128 = 9344, the value the R1 isl=8192 cases in this file
  # use; the previous 9334 broke that pattern (2176 = 1024 + 1024 + 128 for
  # isl=1024) and looked like a typo. NOTE(review): confirm the intended value.
  max_num_tokens: 9344
  moe_max_num_tokens: ""
  concurrency_iterations:
  - [1, 10]
  - [8, 10]
  - [64, 5]
  - [512, 2]
# Case 11: Scout-FP8 on 4 GPUs (tp=4, ep=1), isl=1024 / osl=1024.
- id: 11
  model: "Scout-FP8"
  gpus: 4
  tp: 4
  ep: 1
  attn_backend: "TRTLLM"
  moe_backend: ""
  enable_attention_dp: false
  free_gpu_mem_fraction: 0.8
  max_batch_size: 512
  isl: 1024
  osl: 1024
  max_num_tokens: 2176
  moe_max_num_tokens: ""
  concurrency_iterations:
  - [1, 10]
  - [8, 10]
  - [64, 5]
  - [512, 2]
# Case 12: Scout-FP8 on 4 GPUs (tp=4, ep=1), isl=8192 / osl=1024.
- id: 12
  model: "Scout-FP8"
  gpus: 4
  tp: 4
  ep: 1
  attn_backend: "TRTLLM"
  moe_backend: ""
  enable_attention_dp: false
  free_gpu_mem_fraction: 0.8
  max_batch_size: 512
  isl: 8192
  osl: 1024
  # 8192 + 1024 + 128 = 9344, the value the R1 isl=8192 cases in this file
  # use; the previous 9334 broke that pattern (2176 = 1024 + 1024 + 128 for
  # isl=1024) and looked like a typo. NOTE(review): confirm the intended value.
  max_num_tokens: 9344
  moe_max_num_tokens: ""
  concurrency_iterations:
  - [1, 10]
  - [8, 10]
  - [64, 5]
  - [512, 2]
# Case 13: Scout-FP4 on 1 GPU (tp=1, ep=1), isl=1024 / osl=1024.
- id: 13
  model: "Scout-FP4"
  gpus: 1
  tp: 1
  ep: 1
  attn_backend: "TRTLLM"
  moe_backend: ""
  enable_attention_dp: false
  free_gpu_mem_fraction: 0.8
  max_batch_size: 512
  isl: 1024
  osl: 1024
  max_num_tokens: 2176
  moe_max_num_tokens: ""
  concurrency_iterations:
  - [1, 10]
  - [8, 10]
  - [64, 5]
  - [512, 2]
# Case 14: Scout-FP4 on 1 GPU (tp=1, ep=1), isl=8192 / osl=1024.
- id: 14
  model: "Scout-FP4"
  gpus: 1
  tp: 1
  ep: 1
  attn_backend: "TRTLLM"
  moe_backend: ""
  enable_attention_dp: false
  free_gpu_mem_fraction: 0.8
  max_batch_size: 512
  isl: 8192
  osl: 1024
  # 8192 + 1024 + 128 = 9344, the value the R1 isl=8192 cases in this file
  # use; the previous 9334 broke that pattern (2176 = 1024 + 1024 + 128 for
  # isl=1024) and looked like a typo. NOTE(review): confirm the intended value.
  max_num_tokens: 9344
  moe_max_num_tokens: ""
  concurrency_iterations:
  - [1, 10]
  - [8, 10]
  - [64, 5]
  - [512, 2]
# Case 15: Scout-FP4 on 4 GPUs (tp=4, ep=1), isl=1024 / osl=1024.
- id: 15
  model: "Scout-FP4"
  gpus: 4
  tp: 4
  ep: 1
  attn_backend: "TRTLLM"
  moe_backend: ""
  enable_attention_dp: false
  free_gpu_mem_fraction: 0.8
  max_batch_size: 512
  isl: 1024
  osl: 1024
  max_num_tokens: 2176
  moe_max_num_tokens: ""
  concurrency_iterations:
  - [1, 10]
  - [8, 10]
  - [64, 5]
  - [512, 2]
# Case 16: Scout-FP4 on 4 GPUs (tp=4, ep=1), isl=8192 / osl=1024.
- id: 16
  model: "Scout-FP4"
  gpus: 4
  tp: 4
  ep: 1
  attn_backend: "TRTLLM"
  moe_backend: ""
  enable_attention_dp: false
  free_gpu_mem_fraction: 0.8
  max_batch_size: 512
  isl: 8192
  osl: 1024
  # 8192 + 1024 + 128 = 9344, the value the R1 isl=8192 cases in this file
  # use; the previous 9334 broke that pattern (2176 = 1024 + 1024 + 128 for
  # isl=1024) and looked like a typo. NOTE(review): confirm the intended value.
  max_num_tokens: 9344
  moe_max_num_tokens: ""
  concurrency_iterations:
  - [1, 10]
  - [8, 10]
  - [64, 5]
  - [512, 2]
# Case 17: R1-FP8 on 8 GPUs (tp=8, ep=8), DEEPGEMM MoE backend,
# attention DP off, isl=1024 / osl=1024.
- id: 17
  model: "R1-FP8"
  gpus: 8
  tp: 8
  ep: 8
  attn_backend: "TRTLLM"
  moe_backend: "DEEPGEMM"
  enable_attention_dp: false
  free_gpu_mem_fraction: 0.8
  max_batch_size: 1024
  isl: 1024
  osl: 1024
  max_num_tokens: 2176
  moe_max_num_tokens: ""
  concurrency_iterations:
  - [1, 10]
  - [8, 10]
# Case 18: R1-FP8 on 8 GPUs (tp=8, ep=8), DEEPGEMM MoE backend,
# attention DP off, isl=8192 / osl=1024.
- id: 18
  model: "R1-FP8"
  gpus: 8
  tp: 8
  ep: 8
  attn_backend: "TRTLLM"
  moe_backend: "DEEPGEMM"
  enable_attention_dp: false
  free_gpu_mem_fraction: 0.8
  max_batch_size: 1024
  isl: 8192
  osl: 1024
  max_num_tokens: 9344
  moe_max_num_tokens: ""
  concurrency_iterations:
  - [1, 10]
  - [8, 10]
# Case 19: R1-FP8 on 8 GPUs (tp=8, ep=8), DEEPGEMM MoE backend,
# attention DP on, isl=1024 / osl=1024.
- id: 19
  model: "R1-FP8"
  gpus: 8
  tp: 8
  ep: 8
  attn_backend: "TRTLLM"
  moe_backend: "DEEPGEMM"
  enable_attention_dp: true
  free_gpu_mem_fraction: 0.8
  max_batch_size: 512
  isl: 1024
  osl: 1024
  max_num_tokens: 2176
  moe_max_num_tokens: 37376
  concurrency_iterations:
  - [64, 5]
  - [512, 2]
  - [4096, 2]
# Case 20: R1-FP8 on 8 GPUs (tp=8, ep=8), DEEPGEMM MoE backend,
# attention DP on, isl=8192 / osl=1024.
- id: 20
  model: "R1-FP8"
  gpus: 8
  tp: 8
  ep: 8
  attn_backend: "TRTLLM"
  moe_backend: "DEEPGEMM"
  enable_attention_dp: true
  free_gpu_mem_fraction: 0.8
  max_batch_size: 512
  isl: 8192
  osl: 1024
  max_num_tokens: 9344
  moe_max_num_tokens: 9344
  concurrency_iterations:
  - [64, 5]
  - [512, 2]
  - [4096, 2]
# Case 21: R1-FP4 on 8 GPUs (tp=8, ep=8), TRTLLM MoE backend,
# attention DP off, isl=1024 / osl=1024.
- id: 21
  model: "R1-FP4"
  gpus: 8
  tp: 8
  ep: 8
  attn_backend: "TRTLLM"
  moe_backend: "TRTLLM"
  enable_attention_dp: false
  free_gpu_mem_fraction: 0.8
  max_batch_size: 1024
  isl: 1024
  osl: 1024
  max_num_tokens: 2176
  moe_max_num_tokens: ""
  concurrency_iterations:
  - [1, 10]
  - [8, 10]
# Case 22: R1-FP4 on 8 GPUs (tp=8, ep=8), TRTLLM MoE backend,
# attention DP off, isl=8192 / osl=1024.
- id: 22
  model: "R1-FP4"
  gpus: 8
  tp: 8
  ep: 8
  attn_backend: "TRTLLM"
  moe_backend: "TRTLLM"
  enable_attention_dp: false
  free_gpu_mem_fraction: 0.8
  max_batch_size: 1024
  isl: 8192
  osl: 1024
  max_num_tokens: 9344
  moe_max_num_tokens: ""
  concurrency_iterations:
  - [1, 10]
  - [8, 10]
# Case 23: R1-FP4 on 8 GPUs (tp=8, ep=8), CUTLASS MoE backend,
# attention DP on, isl=1024 / osl=1024.
- id: 23
  model: "R1-FP4"
  gpus: 8
  tp: 8
  ep: 8
  attn_backend: "TRTLLM"
  moe_backend: "CUTLASS"
  enable_attention_dp: true
  free_gpu_mem_fraction: 0.8
  max_batch_size: 512
  isl: 1024
  osl: 1024
  max_num_tokens: 2176
  moe_max_num_tokens: 37376
  concurrency_iterations:
  - [64, 5]
  - [512, 2]
  - [4096, 2]
# Case 24: R1-FP4 on 8 GPUs (tp=8, ep=8), CUTLASS MoE backend,
# attention DP on, isl=8192 / osl=1024.
- id: 24
  model: "R1-FP4"
  gpus: 8
  tp: 8
  ep: 8
  attn_backend: "TRTLLM"
  moe_backend: "CUTLASS"
  enable_attention_dp: true
  free_gpu_mem_fraction: 0.8
  max_batch_size: 512
  isl: 8192
  osl: 1024
  max_num_tokens: 9344
  moe_max_num_tokens: 9344
  concurrency_iterations:
  - [64, 5]
  - [512, 2]
  - [4096, 2]