|
|
|
|
@ -3,33 +3,20 @@ llm_perf_nim:
|
|
|
|
|
# ===============================================================================
|
|
|
|
|
# Test Conditions Index
|
|
|
|
|
# ===============================================================================
|
|
|
|
|
# 1: All GPUs common tests
|
|
|
|
|
# 2: A100, L20, L40S, H100, H20, H200
|
|
|
|
|
# 3: A100, L40S, H100, H20, H200
|
|
|
|
|
# 4: A100, H100, H20, H200 test cases
|
|
|
|
|
# 5: L40S, H100, H200, H20, B200, B300 test cases
|
|
|
|
|
# 6: L40S, H100, H200, H20, GB200, GB300 test cases
|
|
|
|
|
# 7: H100, H200, H20 common test cases
|
|
|
|
|
# 8: L20, L40S, H100, H200, H20 common test cases
|
|
|
|
|
# 9: H20, H200 test cases
|
|
|
|
|
# 10: L20, L40S, H100, H200, H20, B200, GB200, B300, GB300 common test cases
|
|
|
|
|
# 11: B200, GB200, B300, GB300, RTX6000-Server common test cases
|
|
|
|
|
# 12: B200, B300, RTX6000-Server test cases
|
|
|
|
|
# 13: B200, GB200, B300, GB300 test cases
|
|
|
|
|
# 14: B200, B300 test cases
|
|
|
|
|
# 1: A100, L20, L40S, H100, H20, H200
|
|
|
|
|
# 2: A100, L40S, H100, H20, H200
|
|
|
|
|
# 3: A100, H100, H20, H200 test cases
|
|
|
|
|
# 4: L40S, H100, H200, H20, GB200, GB300 test cases
|
|
|
|
|
# 5: H100, H200, H20 common test cases
|
|
|
|
|
# 6: L20, L40S, H100, H200, H20 common test cases
|
|
|
|
|
# 7: H20, H200 test cases
|
|
|
|
|
# 8: B200, GB200, B300, GB300, RTX6000-Server common test cases
|
|
|
|
|
# 9: B200, B300, RTX6000-Server test cases
|
|
|
|
|
# 10: B200, GB200, B300, GB300 test cases
|
|
|
|
|
# 11: B200, B300 test cases
|
|
|
|
|
# ===============================================================================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 1: All GPUs common tests
|
|
|
|
|
- condition:
|
|
|
|
|
ranges:
|
|
|
|
|
system_gpu_count:
|
|
|
|
|
gte: 1
|
|
|
|
|
tests:
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 2: A100, L20, L40S, H100, H20, H200
|
|
|
|
|
# 1: A100, L20, L40S, H100, H20, H200
|
|
|
|
|
- condition:
|
|
|
|
|
ranges:
|
|
|
|
|
system_gpu_count:
|
|
|
|
|
@ -37,22 +24,10 @@ llm_perf_nim:
|
|
|
|
|
compute_capability:
|
|
|
|
|
lt: 10.0
|
|
|
|
|
tests:
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-input_output_len:128,128-gpus:2]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.2_1b-bench-pytorch-bfloat16-input_output_len:128,128-gpus:2]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:512,32]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-streaming-bfloat16-input_output_len:128,128]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-streaming-bfloat16-input_output_len:512,32]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-maxnt:5000-input_output_len:5000,500-reqs:10-con:1-gpus:2]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-maxnt:5000-input_output_len:5000,500-reqs:10-con:250-gpus:2]
|
|
|
|
|
- perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-input_output_len:128,128]
|
|
|
|
|
- perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-float16-input_output_len:128,128-loras:8-gpus:2]
|
|
|
|
|
- perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-input_output_len:500,2000-con:250]
|
|
|
|
|
- perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-input_output_len:500,2000-reqs:8-con:1]
|
|
|
|
|
- perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-maxnt:5000-input_output_len:5000,500-con:250]
|
|
|
|
|
- perf/test_perf.py::test_perf[t5-bench-float16-input_output_len:128,20-gpus:2]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 3: A100, L40S, H100, H20, H200
|
|
|
|
|
# 2: A100, L40S, H100, H20, H200
|
|
|
|
|
- condition:
|
|
|
|
|
ranges:
|
|
|
|
|
system_gpu_count:
|
|
|
|
|
@ -65,7 +40,7 @@ llm_perf_nim:
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_70b-bench-streaming-bfloat16-input_output_len:128,128-gpus:4]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_70b-bench-streaming-bfloat16-input_output_len:512,32-gpus:4]
|
|
|
|
|
|
|
|
|
|
# 4: A100, H100, H20, H200 test cases
|
|
|
|
|
# 3: A100, H100, H20, H200 test cases
|
|
|
|
|
# GPU memory > 80GB
|
|
|
|
|
- condition:
|
|
|
|
|
ranges:
|
|
|
|
|
@ -81,33 +56,9 @@ llm_perf_nim:
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_70b_instruct-bench-bfloat16-input_output_len:200,2000-reqs:64-con:200-gpus:8]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_70b_instruct-bench-bfloat16-input_output_len:200,2000-reqs:8-con:1-gpus:8]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_70b_instruct-bench-bfloat16-input_output_len:2000,200-reqs:64-gpus:8]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-input_output_len:128,128-gpus:8]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-maxbs:16-maxnt:5000-input_output_len:5000,500-reqs:64-con:250-gpus:8]
|
|
|
|
|
- perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-float16-input_output_len:128,128-gpus:2]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 5: L40S, H100, H200, H20, B200, B300 test cases
|
|
|
|
|
- condition:
|
|
|
|
|
ranges:
|
|
|
|
|
system_gpu_count:
|
|
|
|
|
gte: 8
|
|
|
|
|
compute_capability:
|
|
|
|
|
gt: 8.0
|
|
|
|
|
lte: 10.3
|
|
|
|
|
tests:
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-maxbs:16-input_output_len:500,2000-quant:fp8-reqs:64-con:250-gpus:8]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-maxbs:16-input_output_len:500,2000-quant:fp8-reqs:8-con:1-gpus:8]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-maxbs:16-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:64-con:250-gpus:8]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-maxbs:16-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:8-con:1-gpus:8]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-input_output_len:128,128-gpus:8]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-input_output_len:512,32-gpus:8]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-maxbs:512-maxnt:2048-input_output_len:500,2000-reqs:400-con:200-gpus:8-extra]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-maxbs:512-maxnt:2048-input_output_len:500,2000-reqs:400-con:200-gpus:8]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-streaming-float8-maxbs:16-input_output_len:512,32-gpus:8]
|
|
|
|
|
- perf/test_perf.py::test_perf[mixtral_8x22b_v0.1-bench-float16-input_output_len:512,512-quant:fp8-tp:4]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 6: L40S, H100, H200, H20, GB200, GB300 test cases
|
|
|
|
|
# 4: L40S, H100, H200, H20, GB200, GB300 test cases
|
|
|
|
|
- condition:
|
|
|
|
|
ranges:
|
|
|
|
|
system_gpu_count:
|
|
|
|
|
@ -117,14 +68,6 @@ llm_perf_nim:
|
|
|
|
|
lte: 10.3
|
|
|
|
|
tests:
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-input_output_len:512,200-quant:fp8-tp:4]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-input_output_len:128,128-tp:4]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-input_output_len:500,2000-con:250-gpus:4]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-maxnt:5000-input_output_len:5000,500-con:250-gpus:4]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-maxnt:5000-input_output_len:5000,500-quant:fp8-con:250-gpus:4]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:64-input_output_len:500,2000-quant:fp8-reqs:4-con:1-gpus:4]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:64-input_output_len:500,2000-reqs:4-con:1-gpus:4]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:64-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:4-con:1-gpus:4]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:64-maxnt:5000-input_output_len:5000,500-reqs:4-con:1-gpus:4]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:1-input_output_len:500,2000-reqs:8-con:1-tp:4-gpus:4]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:1-maxnt:5000-input_output_len:5000,500-reqs:8-con:1-tp:4-gpus:4]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4]
|
|
|
|
|
@ -134,7 +77,7 @@ llm_perf_nim:
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:256-maxnt:5000-input_output_len:5000,500-reqs:250-con:250-tp:4-gpus:4]
|
|
|
|
|
|
|
|
|
|
# 7: H100, H200, H20 common test cases
|
|
|
|
|
# 5: H100, H200, H20 common test cases
|
|
|
|
|
- condition:
|
|
|
|
|
ranges:
|
|
|
|
|
system_gpu_count:
|
|
|
|
|
@ -143,22 +86,6 @@ llm_perf_nim:
|
|
|
|
|
gte: 9.0
|
|
|
|
|
lte: 9.0
|
|
|
|
|
tests:
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:1000,1000-con:250]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:1000,1000-quant:fp8-con:250]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:1000,1000-quant:fp8-reqs:8-con:1]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:1000,1000-reqs:8-con:1]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:500,2000-con:250]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:500,2000-quant:fp8-con:250]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:500,2000-quant:fp8-reqs:8-con:1]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:500,2000-reqs:8-con:1]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:20000-input_output_len:20000,2000-con:250]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:20000-input_output_len:20000,2000-quant:fp8-con:250]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:20000-input_output_len:20000,2000-quant:fp8-reqs:8-con:1]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:20000-input_output_len:20000,2000-reqs:8-con:1]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:5000-input_output_len:5000,500-con:250]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:5000-input_output_len:5000,500-quant:fp8-con:250]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:8-con:1]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:5000-input_output_len:5000,500-reqs:8-con:1]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-input_output_len:1000,1000-reqs:500-con:250]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-input_output_len:1000,1000-reqs:8-con:1]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-input_output_len:500,2000-reqs:500-con:250]
|
|
|
|
|
@ -174,15 +101,8 @@ llm_perf_nim:
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-maxnt:20000-input_output_len:20000,2000-reqs:8-con:1]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-maxnt:5000-input_output_len:5000,500-reqs:500-con:250]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-maxnt:5000-input_output_len:5000,500-reqs:8-con:1]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-input_output_len:500,2000-quant:fp8-con:250-gpus:2]
|
|
|
|
|
- perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-float16-input_output_len:128,128-quant:fp8-gpus:2]
|
|
|
|
|
- perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-float16-input_output_len:512,32-quant:fp8-gpus:2]
|
|
|
|
|
- perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-input_output_len:500,2000-quant:fp8-reqs:10-con:250]
|
|
|
|
|
- perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:10-con:1]
|
|
|
|
|
- perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:10-con:250]
|
|
|
|
|
- perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-maxnt:5000-input_output_len:5000,500-reqs:10-con:1]
|
|
|
|
|
|
|
|
|
|
# 8: L20, L40S, H100, H200, H20 common test cases
|
|
|
|
|
# 6: L20, L40S, H100, H200, H20 common test cases
|
|
|
|
|
- condition:
|
|
|
|
|
ranges:
|
|
|
|
|
system_gpu_count:
|
|
|
|
|
@ -191,16 +111,9 @@ llm_perf_nim:
|
|
|
|
|
gt: 8.0
|
|
|
|
|
lte: 9.0
|
|
|
|
|
tests:
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-input_output_len:128,128-quant:fp8-gpus:2]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-input_output_len:512,200-quant:fp8-gpus:2]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-input_output_len:512,32-quant:fp8-gpus:2]
|
|
|
|
|
- perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-input_output_len:500,2000-quant:fp8-tp:2]
|
|
|
|
|
- perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-input_output_len:1000,1000-quant:fp8-tp:2]
|
|
|
|
|
- perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-pytorch-float16-input_output_len:128,128]
|
|
|
|
|
- perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-input_output_len:500,2000-quant:fp8-con:250]
|
|
|
|
|
- perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-input_output_len:500,2000-quant:fp8-reqs:8-con:1]
|
|
|
|
|
|
|
|
|
|
# 9: H20, H200 test cases
|
|
|
|
|
# 7: H20, H200 test cases
|
|
|
|
|
# gpu_memory > 100GB
|
|
|
|
|
- condition:
|
|
|
|
|
ranges:
|
|
|
|
|
@ -215,36 +128,13 @@ llm_perf_nim:
|
|
|
|
|
- perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:128-maxnt:1127-input_output_len:1000,2000-reqs:5120-con:1024-ep:8-tp:8-gpus:8]
|
|
|
|
|
- perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,2000-reqs:10-con:1-ep:4-tp:8-gpus:8]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-maxnt:20000-input_output_len:20000,2000-reqs:500-con:250]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-bfloat16-maxbs:64-input_output_len:5000,500-quant:fp8-reqs:250-con:250-tp:8-gpus:8]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-bfloat16-maxbs:64-input_output_len:5000,500-quant:fp8-reqs:8-con:1-tp:8-gpus:8]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-bfloat16-maxbs:64-input_output_len:500,2000-quant:fp8-reqs:250-con:250-tp:8-gpus:8]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-bfloat16-maxbs:64-input_output_len:500,2000-quant:fp8-reqs:8-con:1-tp:8-gpus:8]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:500,2000-reqs:8-con:1-tp:8-gpus:8]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:1-maxnt:5000-input_output_len:5000,500-reqs:8-con:1-tp:8-gpus:8]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:8-gpus:8]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:256-maxnt:5000-input_output_len:5000,500-reqs:250-con:250-tp:8-gpus:8]
|
|
|
|
|
- perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-float16-input_output_len:128,128-reqs:10-con:1-gpus:8]
|
|
|
|
|
- perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-float16-input_output_len:128,128-reqs:10-con:50-gpus:8]
|
|
|
|
|
- perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-float16-input_output_len:128,128-reqs:64-gpus:8]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 10: L20, L40S, H100, H200, H20, B200, GB200, B300, GB300 test cases
|
|
|
|
|
- condition:
|
|
|
|
|
ranges:
|
|
|
|
|
system_gpu_count:
|
|
|
|
|
gte: 2
|
|
|
|
|
compute_capability:
|
|
|
|
|
gt: 8.0
|
|
|
|
|
lte: 10.3
|
|
|
|
|
tests:
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-quant:w4a16_awq]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-quant:w4a8_awq]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-maxbs:256-input_output_len:128,128-quant:fp8]
|
|
|
|
|
- perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-maxbs:256-input_output_len:1000,1000-quant:fp8]
|
|
|
|
|
- perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-maxbs:256-input_output_len:500,2000-quant:fp8]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 11: B200, GB200, B300, GB300, RTX6000-Server common test cases
|
|
|
|
|
# 8: B200, GB200, B300, GB300, RTX6000-Server common test cases
|
|
|
|
|
- condition:
|
|
|
|
|
ranges:
|
|
|
|
|
system_gpu_count:
|
|
|
|
|
@ -254,13 +144,10 @@ llm_perf_nim:
|
|
|
|
|
lte: 12.0
|
|
|
|
|
tests:
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-bfloat16-input_output_len:128,128]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-maxbs:256-input_output_len:512,32-quant:fp8]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-streaming-float8-input_output_len:2000,500]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:500,2000]
|
|
|
|
|
- perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-input_output_len:1000,1000-quant:fp8]
|
|
|
|
|
- perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-input_output_len:500,2000-quant:fp8]
|
|
|
|
|
- perf/test_perf.py::test_perf[deepseek_v3_lite_nvfp4-bench-pytorch-streaming-float4-maxbs:2048-maxnt:8192-input_output_len:256,256-reqs:200]
|
|
|
|
|
# Phi-4-multimodal-instruct
|
|
|
|
|
- perf/test_perf.py::test_perf[phi_4_multimodal_instruct-bench-pytorch-bfloat16-input_output_len:500,2000-con:250]
|
|
|
|
|
@ -274,19 +161,14 @@ llm_perf_nim:
|
|
|
|
|
- perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-input_output_len:1000,2000-reqs:500-con:200]
|
|
|
|
|
- perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:1-maxnt:20000-input_output_len:20000,2000-reqs:8-con:1] TIMEOUT(120)
|
|
|
|
|
- perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:4096-maxnt:20000-input_output_len:20000,2000-reqs:300-con:200] TIMEOUT(120)
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-quant:nvfp4-gpus:2]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-maxbs:256-input_output_len:128,128-gpus:2]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-maxbs:256-input_output_len:512,32-gpus:2]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.1_70b_instruct_fp8-bench-pytorch-float8-maxbs:256-input_output_len:128,128-gpus:2]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:256-input_output_len:512,32-gpus:2]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-streaming-float8-maxbs:256-input_output_len:512,32-gpus:2]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v2_13b-bench-float16-input_output_len:128,128-loras:8-gpus:2]
|
|
|
|
|
# Mistral-Small-3.1-24B-Instruct-2503
|
|
|
|
|
- perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,2000-reqs:8-con:1-gpus:2]
|
|
|
|
|
- perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-input_output_len:1000,2000-reqs:500-con:200-gpus:2]
|
|
|
|
|
- perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:1-maxnt:20000-input_output_len:20000,2000-reqs:8-con:1-gpus:2] TIMEOUT(120)
|
|
|
|
|
- perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:4096-maxnt:20000-input_output_len:20000,2000-reqs:300-con:200-gpus:2] TIMEOUT(120)
|
|
|
|
|
- perf/test_perf.py::test_perf[starcoder_15b-bench-float16-input_output_len:512,200-gpus:4]
|
|
|
|
|
- perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:512-input_output_len:128,128-ep:4-tp:4-gpus:4]
|
|
|
|
|
- perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-streaming-float4-maxbs:512-input_output_len:128,128-ep:4-tp:4-gpus:4]
|
|
|
|
|
- perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:1000,1000-reqs:2000-ep:4-tp:4-gpus:4] TIMEOUT(120)
|
|
|
|
|
@ -309,7 +191,7 @@ llm_perf_nim:
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-maxbs:4096-maxnt:20000-kv_frac:0.85-input_output_len:20000,2000-reqs:200-gpus:4] TIMEOUT(120)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 12: B200, B300, RTX6000-Server test cases
|
|
|
|
|
# 9: B200, B300, RTX6000-Server test cases
|
|
|
|
|
- condition:
|
|
|
|
|
ranges:
|
|
|
|
|
system_gpu_count:
|
|
|
|
|
@ -318,7 +200,6 @@ llm_perf_nim:
|
|
|
|
|
gte: 10.0
|
|
|
|
|
lte: 12.0
|
|
|
|
|
tests:
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-input_output_len:500,2000-quant:fp8-con:250-gpus:8]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-bfloat16-input_output_len:500,2000-con:250-gpus:8]
|
|
|
|
|
# llama_v3.3_70b_instruct_fp4
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:500,2000-reqs:3000-tp:8-gpus:8]
|
|
|
|
|
@ -329,8 +210,6 @@ llm_perf_nim:
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:512,32-tp:8-gpus:8]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:500,2000-tp:8-gpus:8]
|
|
|
|
|
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:1000,1000-tp:8-gpus:8]
|
|
|
|
|
- perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:128,128-reqs:80-gpus:8]
|
|
|
|
|
- perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:512,32-reqs:80-gpus:8]
|
|
|
|
|
# deepseek_r1_fp8
|
|
|
|
|
- perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:512-input_output_len:128,128-ep:8-tp:8-gpus:8]
|
|
|
|
|
- perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,2000-reqs:10-ep:4-tp:8-gpus:8] # min latency test
|
|
|
|
|
@ -365,7 +244,7 @@ llm_perf_nim:
|
|
|
|
|
- perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:100-con:32-ep:8-tp:8-gpus:8]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 13: B200, GB200, B300, GB300 test cases
|
|
|
|
|
# 10: B200, GB200, B300, GB300 test cases
|
|
|
|
|
- condition:
|
|
|
|
|
ranges:
|
|
|
|
|
system_gpu_count:
|
|
|
|
|
@ -382,7 +261,7 @@ llm_perf_nim:
|
|
|
|
|
- perf/test_perf.py::test_perf[qwen3_235b_a22b_fp4-bench-pytorch-float4-input_output_len:1000,2000-con:512-ep:4-gpus:4]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 14: B200, B300 test cases
|
|
|
|
|
# 11: B200, B300 test cases
|
|
|
|
|
- condition:
|
|
|
|
|
ranges:
|
|
|
|
|
system_gpu_count:
|
|
|
|
|
|