mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
Signed-off-by: Chenfei Zhang <chenfeiz@nvidia.com> Signed-off-by: Perkz Zheng <67892460+PerkzZheng@users.noreply.github.com> Co-authored-by: Perkz Zheng <67892460+PerkzZheng@users.noreply.github.com>
469 lines
8.6 KiB
YAML
469 lines
8.6 KiB
YAML
test_cases:
|
|
- id: 1
|
|
model: "70B-FP8"
|
|
gpus: 1
|
|
tp: 1
|
|
ep: 1
|
|
attn_backend: "TRTLLM"
|
|
moe_backend: ""
|
|
enable_attention_dp: false
|
|
free_gpu_mem_fraction: 0.9
|
|
max_batch_size: 1024
|
|
isl: 1024
|
|
osl: 1024
|
|
max_num_tokens: 16384
|
|
moe_max_num_tokens: ""
|
|
concurrency_iterations:
|
|
- [1, 10]
|
|
- [8, 10]
|
|
- [64, 5]
|
|
- [512, 2]
|
|
|
|
- id: 2
|
|
model: "70B-FP8"
|
|
gpus: 1
|
|
tp: 1
|
|
ep: 1
|
|
attn_backend: "TRTLLM"
|
|
moe_backend: ""
|
|
enable_attention_dp: false
|
|
free_gpu_mem_fraction: 0.9
|
|
max_batch_size: 1024
|
|
isl: 8192
|
|
osl: 1024
|
|
max_num_tokens: 16384
|
|
moe_max_num_tokens: ""
|
|
concurrency_iterations:
|
|
- [1, 10]
|
|
- [8, 10]
|
|
- [64, 5]
|
|
- [512, 2]
|
|
|
|
- id: 3
|
|
model: "70B-FP8"
|
|
gpus: 4
|
|
tp: 4
|
|
ep: 1
|
|
attn_backend: "TRTLLM"
|
|
moe_backend: ""
|
|
enable_attention_dp: false
|
|
free_gpu_mem_fraction: 0.9
|
|
max_batch_size: 1024
|
|
isl: 1024
|
|
osl: 1024
|
|
max_num_tokens: 16384
|
|
moe_max_num_tokens: ""
|
|
concurrency_iterations:
|
|
- [1, 10]
|
|
- [8, 10]
|
|
- [64, 5]
|
|
- [512, 2]
|
|
|
|
- id: 4
|
|
model: "70B-FP8"
|
|
gpus: 4
|
|
tp: 4
|
|
ep: 1
|
|
attn_backend: "TRTLLM"
|
|
moe_backend: ""
|
|
enable_attention_dp: false
|
|
free_gpu_mem_fraction: 0.9
|
|
max_batch_size: 1024
|
|
isl: 8192
|
|
osl: 1024
|
|
max_num_tokens: 16384
|
|
moe_max_num_tokens: ""
|
|
concurrency_iterations:
|
|
- [1, 10]
|
|
- [8, 10]
|
|
- [64, 5]
|
|
- [512, 2]
|
|
|
|
- id: 5
|
|
model: "70B-FP4"
|
|
gpus: 1
|
|
tp: 1
|
|
ep: 1
|
|
attn_backend: "TRTLLM"
|
|
moe_backend: ""
|
|
enable_attention_dp: false
|
|
free_gpu_mem_fraction: 0.9
|
|
max_batch_size: 1024
|
|
isl: 1024
|
|
osl: 1024
|
|
max_num_tokens: 16384
|
|
moe_max_num_tokens: ""
|
|
concurrency_iterations:
|
|
- [1, 10]
|
|
- [8, 10]
|
|
- [64, 5]
|
|
- [512, 2]
|
|
|
|
- id: 6
|
|
model: "70B-FP4"
|
|
gpus: 1
|
|
tp: 1
|
|
ep: 1
|
|
attn_backend: "TRTLLM"
|
|
moe_backend: ""
|
|
enable_attention_dp: false
|
|
free_gpu_mem_fraction: 0.9
|
|
max_batch_size: 1024
|
|
isl: 8192
|
|
osl: 1024
|
|
max_num_tokens: 16384
|
|
moe_max_num_tokens: ""
|
|
concurrency_iterations:
|
|
- [1, 10]
|
|
- [8, 10]
|
|
- [64, 5]
|
|
- [512, 2]
|
|
|
|
- id: 7
|
|
model: "70B-FP4"
|
|
gpus: 4
|
|
tp: 4
|
|
ep: 1
|
|
attn_backend: "TRTLLM"
|
|
moe_backend: ""
|
|
enable_attention_dp: false
|
|
free_gpu_mem_fraction: 0.9
|
|
max_batch_size: 1024
|
|
isl: 1024
|
|
osl: 1024
|
|
max_num_tokens: 16384
|
|
moe_max_num_tokens: ""
|
|
concurrency_iterations:
|
|
- [1, 10]
|
|
- [8, 10]
|
|
- [64, 5]
|
|
- [512, 2]
|
|
|
|
- id: 8
|
|
model: "70B-FP4"
|
|
gpus: 4
|
|
tp: 4
|
|
ep: 1
|
|
attn_backend: "TRTLLM"
|
|
moe_backend: ""
|
|
enable_attention_dp: false
|
|
free_gpu_mem_fraction: 0.9
|
|
max_batch_size: 1024
|
|
isl: 8192
|
|
osl: 1024
|
|
max_num_tokens: 16384
|
|
moe_max_num_tokens: ""
|
|
concurrency_iterations:
|
|
- [1, 10]
|
|
- [8, 10]
|
|
- [64, 5]
|
|
- [512, 2]
|
|
|
|
- id: 9
|
|
model: "Scout-FP8"
|
|
gpus: 1
|
|
tp: 1
|
|
ep: 1
|
|
attn_backend: "TRTLLM"
|
|
moe_backend: ""
|
|
enable_attention_dp: false
|
|
free_gpu_mem_fraction: 0.8
|
|
max_batch_size: 512
|
|
isl: 1024
|
|
osl: 1024
|
|
max_num_tokens: 2176
|
|
moe_max_num_tokens: ""
|
|
concurrency_iterations:
|
|
- [1, 10]
|
|
- [8, 10]
|
|
- [64, 5]
|
|
- [512, 2]
|
|
|
|
- id: 10
|
|
model: "Scout-FP8"
|
|
gpus: 1
|
|
tp: 1
|
|
ep: 1
|
|
attn_backend: "TRTLLM"
|
|
moe_backend: ""
|
|
enable_attention_dp: false
|
|
free_gpu_mem_fraction: 0.8
|
|
max_batch_size: 512
|
|
isl: 8192
|
|
osl: 1024
|
|
max_num_tokens: 9334
|
|
moe_max_num_tokens: ""
|
|
concurrency_iterations:
|
|
- [1, 10]
|
|
- [8, 10]
|
|
- [64, 5]
|
|
- [512, 2]
|
|
|
|
- id: 11
|
|
model: "Scout-FP8"
|
|
gpus: 4
|
|
tp: 4
|
|
ep: 1
|
|
attn_backend: "TRTLLM"
|
|
moe_backend: ""
|
|
enable_attention_dp: false
|
|
free_gpu_mem_fraction: 0.8
|
|
max_batch_size: 512
|
|
isl: 1024
|
|
osl: 1024
|
|
max_num_tokens: 2176
|
|
moe_max_num_tokens: ""
|
|
concurrency_iterations:
|
|
- [1, 10]
|
|
- [8, 10]
|
|
- [64, 5]
|
|
- [512, 2]
|
|
|
|
- id: 12
|
|
model: "Scout-FP8"
|
|
gpus: 4
|
|
tp: 4
|
|
ep: 1
|
|
attn_backend: "TRTLLM"
|
|
moe_backend: ""
|
|
enable_attention_dp: false
|
|
free_gpu_mem_fraction: 0.8
|
|
max_batch_size: 512
|
|
isl: 8192
|
|
osl: 1024
|
|
max_num_tokens: 9334
|
|
moe_max_num_tokens: ""
|
|
concurrency_iterations:
|
|
- [1, 10]
|
|
- [8, 10]
|
|
- [64, 5]
|
|
- [512, 2]
|
|
|
|
- id: 13
|
|
model: "Scout-FP4"
|
|
gpus: 1
|
|
tp: 1
|
|
ep: 1
|
|
attn_backend: "TRTLLM"
|
|
moe_backend: ""
|
|
enable_attention_dp: false
|
|
free_gpu_mem_fraction: 0.8
|
|
max_batch_size: 512
|
|
isl: 1024
|
|
osl: 1024
|
|
max_num_tokens: 2176
|
|
moe_max_num_tokens: ""
|
|
concurrency_iterations:
|
|
- [1, 10]
|
|
- [8, 10]
|
|
- [64, 5]
|
|
- [512, 2]
|
|
|
|
- id: 14
|
|
model: "Scout-FP4"
|
|
gpus: 1
|
|
tp: 1
|
|
ep: 1
|
|
attn_backend: "TRTLLM"
|
|
moe_backend: ""
|
|
enable_attention_dp: false
|
|
free_gpu_mem_fraction: 0.8
|
|
max_batch_size: 512
|
|
isl: 8192
|
|
osl: 1024
|
|
max_num_tokens: 9334
|
|
moe_max_num_tokens: ""
|
|
concurrency_iterations:
|
|
- [1, 10]
|
|
- [8, 10]
|
|
- [64, 5]
|
|
- [512, 2]
|
|
|
|
- id: 15
|
|
model: "Scout-FP4"
|
|
gpus: 4
|
|
tp: 4
|
|
ep: 1
|
|
attn_backend: "TRTLLM"
|
|
moe_backend: ""
|
|
enable_attention_dp: false
|
|
free_gpu_mem_fraction: 0.8
|
|
max_batch_size: 512
|
|
isl: 1024
|
|
osl: 1024
|
|
max_num_tokens: 2176
|
|
moe_max_num_tokens: ""
|
|
concurrency_iterations:
|
|
- [1, 10]
|
|
- [8, 10]
|
|
- [64, 5]
|
|
- [512, 2]
|
|
|
|
- id: 16
|
|
model: "Scout-FP4"
|
|
gpus: 4
|
|
tp: 4
|
|
ep: 1
|
|
attn_backend: "TRTLLM"
|
|
moe_backend: ""
|
|
enable_attention_dp: false
|
|
free_gpu_mem_fraction: 0.8
|
|
max_batch_size: 512
|
|
isl: 8192
|
|
osl: 1024
|
|
max_num_tokens: 9334
|
|
moe_max_num_tokens: ""
|
|
concurrency_iterations:
|
|
- [1, 10]
|
|
- [8, 10]
|
|
- [64, 5]
|
|
- [512, 2]
|
|
|
|
- id: 17
|
|
model: "R1-FP8"
|
|
gpus: 8
|
|
tp: 8
|
|
ep: 8
|
|
attn_backend: "TRTLLM"
|
|
moe_backend: "DEEPGEMM"
|
|
enable_attention_dp: false
|
|
free_gpu_mem_fraction: 0.8
|
|
max_batch_size: 1024
|
|
isl: 1024
|
|
osl: 1024
|
|
max_num_tokens: 2176
|
|
moe_max_num_tokens: ""
|
|
concurrency_iterations:
|
|
- [1, 10]
|
|
- [8, 10]
|
|
|
|
- id: 18
|
|
model: "R1-FP8"
|
|
gpus: 8
|
|
tp: 8
|
|
ep: 8
|
|
attn_backend: "TRTLLM"
|
|
moe_backend: "DEEPGEMM"
|
|
enable_attention_dp: false
|
|
free_gpu_mem_fraction: 0.8
|
|
max_batch_size: 1024
|
|
isl: 8192
|
|
osl: 1024
|
|
max_num_tokens: 9344
|
|
moe_max_num_tokens: ""
|
|
concurrency_iterations:
|
|
- [1, 10]
|
|
- [8, 10]
|
|
|
|
- id: 19
|
|
model: "R1-FP8"
|
|
gpus: 8
|
|
tp: 8
|
|
ep: 8
|
|
attn_backend: "TRTLLM"
|
|
moe_backend: "DEEPGEMM"
|
|
enable_attention_dp: true
|
|
free_gpu_mem_fraction: 0.8
|
|
max_batch_size: 512
|
|
isl: 1024
|
|
osl: 1024
|
|
max_num_tokens: 2176
|
|
moe_max_num_tokens: 37376
|
|
concurrency_iterations:
|
|
- [64, 5]
|
|
- [512, 2]
|
|
- [4096, 2]
|
|
|
|
- id: 20
|
|
model: "R1-FP8"
|
|
gpus: 8
|
|
tp: 8
|
|
ep: 8
|
|
attn_backend: "TRTLLM"
|
|
moe_backend: "DEEPGEMM"
|
|
enable_attention_dp: true
|
|
free_gpu_mem_fraction: 0.8
|
|
max_batch_size: 512
|
|
isl: 8192
|
|
osl: 1024
|
|
max_num_tokens: 9344
|
|
moe_max_num_tokens: 9344
|
|
concurrency_iterations:
|
|
- [64, 5]
|
|
- [512, 2]
|
|
- [4096, 2]
|
|
|
|
- id: 21
|
|
model: "R1-FP4"
|
|
gpus: 8
|
|
tp: 8
|
|
ep: 8
|
|
attn_backend: "TRTLLM"
|
|
moe_backend: "TRTLLM"
|
|
enable_attention_dp: false
|
|
free_gpu_mem_fraction: 0.8
|
|
max_batch_size: 1024
|
|
isl: 1024
|
|
osl: 1024
|
|
max_num_tokens: 2176
|
|
moe_max_num_tokens: ""
|
|
concurrency_iterations:
|
|
- [1, 10]
|
|
- [8, 10]
|
|
|
|
- id: 22
|
|
model: "R1-FP4"
|
|
gpus: 8
|
|
tp: 8
|
|
ep: 8
|
|
attn_backend: "TRTLLM"
|
|
moe_backend: "TRTLLM"
|
|
enable_attention_dp: false
|
|
free_gpu_mem_fraction: 0.8
|
|
max_batch_size: 1024
|
|
isl: 8192
|
|
osl: 1024
|
|
max_num_tokens: 9344
|
|
moe_max_num_tokens: ""
|
|
concurrency_iterations:
|
|
- [1, 10]
|
|
- [8, 10]
|
|
|
|
- id: 23
|
|
model: "R1-FP4"
|
|
gpus: 8
|
|
tp: 8
|
|
ep: 8
|
|
attn_backend: "TRTLLM"
|
|
moe_backend: "CUTLASS"
|
|
enable_attention_dp: true
|
|
free_gpu_mem_fraction: 0.8
|
|
max_batch_size: 512
|
|
isl: 1024
|
|
osl: 1024
|
|
max_num_tokens: 2176
|
|
moe_max_num_tokens: 37376
|
|
concurrency_iterations:
|
|
- [64, 5]
|
|
- [512, 2]
|
|
- [4096, 2]
|
|
|
|
- id: 24
|
|
model: "R1-FP4"
|
|
gpus: 8
|
|
tp: 8
|
|
ep: 8
|
|
attn_backend: "TRTLLM"
|
|
moe_backend: "CUTLASS"
|
|
enable_attention_dp: true
|
|
free_gpu_mem_fraction: 0.8
|
|
max_batch_size: 512
|
|
isl: 8192
|
|
osl: 1024
|
|
max_num_tokens: 9344
|
|
moe_max_num_tokens: 9344
|
|
concurrency_iterations:
|
|
- [64, 5]
|
|
- [512, 2]
|
|
- [4096, 2]
|