[TRTLLM-8830][test] Overlap scheduler enhancement perf test: Add qwen3_0.6b and llama3.1 test cases (#10114)

Signed-off-by: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com>
yufeiwu-nv 2025-12-19 17:01:52 +08:00 committed by GitHub
parent cb0444b1b5
commit 52cee573ad
3 changed files with 46 additions and 86 deletions
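
Each entry in the test lists below packs the whole benchmark configuration into a single hyphen-separated test ID. As a reading aid, here is a minimal, hypothetical sketch of that naming convention; parse_perf_test_id() is not code from this repository, and the field meanings noted in the comments (maxbs as max batch size, maxnt as max token budget, con as concurrency, pp/tp as pipeline/tensor parallelism) are assumptions inferred from the names.

# Hypothetical helper, for illustration only: splits an ID such as
# 'llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:1000,1000-reqs:1000-con:200'
# into a model key, plain flags, and key:value options.
# Assumed meanings: maxbs = max batch size, maxnt = max token budget,
# reqs = request count, con = concurrency, pp/tp = pipeline/tensor parallelism.
def parse_perf_test_id(test_id: str) -> dict:
    tokens = test_id.split("-")
    parsed = {"model": tokens[0], "flags": [], "options": {}}
    for token in tokens[1:]:
        if ":" in token:                      # e.g. 'maxbs:256', 'input_output_len:128,128'
            key, value = token.split(":", 1)
            parsed["options"][key] = value
        else:                                 # e.g. 'bench', 'pytorch', 'streaming', 'bfloat16'
            parsed["flags"].append(token)
    return parsed

print(parse_perf_test_id(
    "qwen3_0.6b-bench-pytorch-bfloat16-maxnt:2048-input_output_len:8000,1000-reqs:256-con:1-pp:4-gpus:4"
))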


@@ -111,6 +111,7 @@ MODEL_PATH_DICT = {
"deepseek_v3_lite_nvfp4": "DeepSeek-V3-Lite/nvfp4_moe_only",
"qwen2_7b_instruct": "Qwen2-7B-Instruct",
"qwen_14b_chat": "Qwen-14B-Chat",
"qwen3_0.6b": "Qwen3/Qwen3-0.6B",
"qwen3_4b_eagle3": "Qwen3/Qwen3-4B",
"qwen3_235b_a22b_fp8": "Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf",
"qwen3_235b_a22b_fp4": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",


@@ -94,7 +94,6 @@ llm_perf_core:
- perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:48-maxnt:256-input_output_len:1000,2000-reqs:500-con:200]
- perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:128-maxnt:512-input_output_len:1000,2000-reqs:500-con:200]
#llama_v3.1_8b
- perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-mp-maxbs:256-input_output_len:128,128-pp:2]
- perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:128,128-gpus:2]
- perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-maxbs:256-input_output_len:128,128-gpus:2]
#mixtral_8x7b_v0.1
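
The remaining file, the llm_perf_sanity list, gates each group of tests behind a condition block: terms entries must match exactly (for example supports_fp8: true) and ranges entries set numeric bounds (for example system_gpu_count gte 2, or gpu_memory gt 80000, presumably MiB). Here is an illustrative sketch, not the repository's actual test-selection code, of how such a block could be evaluated against a system description.

# Illustrative only: evaluates a 'condition' block with 'terms' (exact match)
# and 'ranges' (numeric bounds) against a dict describing the current system.
OPS = {
    "gte": lambda value, bound: value >= bound,
    "gt":  lambda value, bound: value > bound,
    "lte": lambda value, bound: value <= bound,
    "lt":  lambda value, bound: value < bound,
}

def condition_matches(condition: dict, system: dict) -> bool:
    for key, expected in condition.get("terms", {}).items():
        if system.get(key) != expected:
            return False
    for key, bounds in condition.get("ranges", {}).items():
        for op, bound in bounds.items():
            if not OPS[op](system.get(key, 0), bound):
                return False
    return True

# Example: the "2+ GPUs, gpu_memory above 80000" group in the list below.
cond = {"ranges": {"system_gpu_count": {"gte": 2}, "gpu_memory": {"gt": 80000}}}
print(condition_matches(cond, {"system_gpu_count": 8, "gpu_memory": 141000}))  # True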


@@ -1,6 +1,18 @@
version: 0.0.1
llm_perf_sanity:
# A100, L40S, L20, H20, H100, H200, Blackwell
# ===============================================================================
# Test Conditions Index
# ===============================================================================
# 1: All GPUs
# 2: L40S, L20, H20, H100, H200, B200, B300, GB200, GB300, RTX6000-Server
# 3: A100, H20, H100, H200, B200, B300, RTX6000-Server
# 4: A100, L40S, H20, H100, H200, B200, B300, GB200, GB300, RTX6000-Server
# 5: L40S, H20, H100, H200, B200, B300, RTX6000-Server
# 6: H20, H100, H200, B200, B300, RTX6000-Server
# 7: H20, H100, H200, B200, B300
# ===============================================================================
# 1: All GPUs
- condition:
ranges:
system_gpu_count:
@@ -28,10 +40,13 @@ llm_perf_sanity:
- perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-pytorch-bfloat16-input_output_len:500,2000]
- perf/test_perf.py::test_perf[nemotron_nano_9b_v2-bench-pytorch-bfloat16-input_output_len:512,512]
- perf/test_perf.py::test_perf[qwen3_4b_eagle3-bench-pytorch-streaming-bfloat16-maxbs:4-kv_frac:0.6-input_output_len:500,100-reqs:200-con:4]
- perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-maxbs:256-input_output_len:128,128-gpus:2]
- perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:128,128-gpus:2]
- perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-maxbs:1-maxnt:256-input_output_len:128,128-reqs:10-gpus:2]
- perf/test_perf.py::test_perf[llama_v3.2_1b-bench-pytorch-bfloat16-input_output_len:128,128-gpus:2]
# FP8 specific tests
# A100, L40S, L20, H20, H100, H200, Blackwell
# 2: L40S, L20, H20, H100, H200, B200, B300, GB200, GB300, RTX6000-Server
- condition:
terms:
supports_fp8: true
@@ -45,49 +60,23 @@ llm_perf_sanity:
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:1000,1000-reqs:8-con:1]
- perf/test_perf.py::test_perf[bielik_11b_v2.2_instruct_fp8-bench-pytorch-float8-input_output_len:1000,1000-con:250]
- perf/test_perf.py::test_perf[ministral_8b_fp8-bench-pytorch-float8-input_output_len:500,2000-reqs:500-con:250]
# Tests for ALL systems with 2+ GPUs
# A100, L40S, L20, H20, H100, H200, Blackwell
- condition:
ranges:
system_gpu_count:
gte: 2
tests:
#llama_v3.1_8b_instruct
#pytorch backend
- perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-maxbs:256-input_output_len:128,128-gpus:2]
- perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:128,128-gpus:2]
- perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-maxbs:1-maxnt:256-input_output_len:128,128-reqs:10-gpus:2]
- perf/test_perf.py::test_perf[llama_v3.2_1b-bench-pytorch-bfloat16-input_output_len:128,128-gpus:2]
# FP8 tests for systems with 2+ GPUs
# A100, L40S, L20, H20, H100, H200, Blackwell
- condition:
terms:
supports_fp8: true
ranges:
system_gpu_count:
gte: 2
tests:
#mixtral_8x7b_v0.1_fp8 pytorch backend
- perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:2]
# Tests for systems with 2+ GPUs and high memory
# A100, L40S, H20, H100, H200, Blackwell
# 3: A100, H20, H100, H200, B200, B300, RTX6000-Server
- condition:
ranges:
system_gpu_count:
gte: 2
gpu_memory:
gt: 80000
tests:
- perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:2]
- perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-pytorch-float16-input_output_len:128,128-gpus:2]
- perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:2000,200-reqs:10-gpus:8]
- perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:200,2000-reqs:10-gpus:8]
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-input_output_len:500,2000-gpus:8]
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-input_output_len:2000,500-gpus:8]
# Tests for systems with 4+ GPUs
# A100, L40S, H20, H100, H200, Blackwell
# 4: A100, L40S, H20, H100, H200, B200, B300, GB200, GB300, RTX6000-Server
- condition:
ranges:
system_gpu_count:
@@ -99,59 +88,29 @@ llm_perf_sanity:
# pytorch backend
- perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-input_output_len:128,128-gpus:4]
- perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:4]
# test overlap scheduler
- perf/test_perf.py::test_perf[qwen3_0.6b-bench-pytorch-bfloat16-maxnt:2048-input_output_len:8000,1000-reqs:256-con:1-pp:4-gpus:4]
- perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:1000,1000-reqs:1000-con:200]
# FP8 specific tests
# L40S, H20, H100, H200, Blackwell
# 5: L40S, H20, H100, H200, B200, B300, RTX6000-Server
- condition:
terms:
supports_fp8: true
ranges:
system_gpu_count:
gte: 4
gte: 8
tests:
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500-gpus:4]
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4]
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4]
# Tests for systems with 8+ GPUs
# A100, L40S, H20, H100, H200, Blackwell
- condition:
ranges:
system_gpu_count:
gte: 8
gpu_memory:
gt: 46000
tests:
#llama_v3.1_70b
#pytorch backend
- perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:2000,200-reqs:10-gpus:8]
- perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:200,2000-reqs:10-gpus:8]
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-input_output_len:500,2000-gpus:8]
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-input_output_len:2000,500-gpus:8]
# FP8 tests for systems with 8+ GPUs
# L40S, H20, H100, H200, Blackwell
- condition:
terms:
supports_fp8: true
ranges:
system_gpu_count:
gte: 8
tests:
#llama_v3.1_70b
#trt backend
- perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-maxbs:1-maxnt:544-input_output_len:512,32-quant:fp8-gpus:8]
#llama_v3.3_70b_instruct_fp8
#pytorch backend
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:8]
- perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-pytorch-float16-input_output_len:128,128-reqs:64-gpus:8]
# FP4, FP8 tests for systems with 8+ GPUs
# H20, H100, H200, Blackwell
# 6: H20, H100, H200, B200, B300, RTX6000-Server
- condition:
ranges:
system_gpu_count:
@@ -174,7 +133,8 @@ llm_perf_sanity:
# gpt_oss_20b_fp4
- perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-float4-input_output_len:512,512]
# gpu_arch > Hopper, exclude GB20X, RTX 6000 for not supported
# 7: H20, H100, H200, B200, B300
- condition:
ranges:
system_gpu_count: