Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-01-13 22:18:36 +08:00
[TRTLLM-8830][test] Overlap scheduler enhancement perf test: Add qwen3_0.6b and llama3.1 test cases (#10114)
Signed-off-by: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com>
parent cb0444b1b5
commit 52cee573ad
@@ -111,6 +111,7 @@ MODEL_PATH_DICT = {
    "deepseek_v3_lite_nvfp4": "DeepSeek-V3-Lite/nvfp4_moe_only",
    "qwen2_7b_instruct": "Qwen2-7B-Instruct",
    "qwen_14b_chat": "Qwen-14B-Chat",
    "qwen3_0.6b": "Qwen3/Qwen3-0.6B",
    "qwen3_4b_eagle3": "Qwen3/Qwen3-4B",
    "qwen3_235b_a22b_fp8": "Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf",
    "qwen3_235b_a22b_fp4": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
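For context, MODEL_PATH_DICT maps the short model labels used in perf-test IDs to checkpoint directories under a shared model root. A minimal sketch of the lookup, assuming a hypothetical resolve_model_path helper and an LLM_MODELS_ROOT environment variable (both illustrative, not the repository's actual code):

```python
# Illustrative sketch only: resolve a perf-test model label to a local path.
# resolve_model_path and the LLM_MODELS_ROOT default are assumptions.
import os

MODEL_PATH_DICT = {
    "qwen3_0.6b": "Qwen3/Qwen3-0.6B",  # entry added by this commit
    "qwen3_4b_eagle3": "Qwen3/Qwen3-4B",
}

def resolve_model_path(label: str) -> str:
    """Join the shared model root with the relative path for a test label."""
    root = os.environ.get("LLM_MODELS_ROOT", "/models")
    if label not in MODEL_PATH_DICT:
        raise ValueError(f"unknown model label: {label}")
    return os.path.join(root, MODEL_PATH_DICT[label])

print(resolve_model_path("qwen3_0.6b"))  # -> /models/Qwen3/Qwen3-0.6B
```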
@@ -94,7 +94,6 @@ llm_perf_core:
  - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:48-maxnt:256-input_output_len:1000,2000-reqs:500-con:200]
  - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:128-maxnt:512-input_output_len:1000,2000-reqs:500-con:200]
  #llama_v3.1_8b
  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-mp-maxbs:256-input_output_len:128,128-pp:2]
  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:128,128-gpus:2]
  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-maxbs:256-input_output_len:128,128-gpus:2]
  #mixtral_8x7b_v0.1
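Each test ID above packs the benchmark configuration into one dash-separated string: the model label, then bare flags (bench, pytorch, streaming, the dtype) and key:value knobs such as maxbs (max batch size), maxnt (max token count), input_output_len, reqs (request count), con (concurrency), and parallelism (gpus, tp, pp, ep). A rough illustrative parser, not the one test_perf.py actually uses:

```python
# Rough sketch of splitting a perf-test ID into its knobs; illustrative only.
def parse_test_label(label: str) -> dict:
    parts = label.split("-")
    cfg = {"model": parts[0], "flags": [], "options": {}}
    for part in parts[1:]:
        if ":" in part:  # key:value knob, e.g. maxbs:256 or input_output_len:128,128
            key, value = part.split(":", 1)
            cfg["options"][key] = value
        else:  # bare flag, e.g. bench, pytorch, streaming, bfloat16
            cfg["flags"].append(part)
    return cfg

cfg = parse_test_label(
    "llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:128,128-gpus:2")
assert cfg["model"] == "llama_v3.1_8b"
assert "streaming" in cfg["flags"]
assert cfg["options"]["gpus"] == "2"
```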
@@ -1,6 +1,18 @@
version: 0.0.1
llm_perf_sanity:
# A100, L40S, L20, H20, H100, H200, Blackwell
# ===============================================================================
# Test Conditions Index
# ===============================================================================
# 1: All GPUs
# 2: L40S, L20, H20, H100, H200, B200, B300, GB200, GB300, RTX6000-Server
# 3: A100, H20, H100, H200, B200, B300, RTX6000-Server
# 4: A100, L40S, H20, H100, H200, B200, B300, GB200, GB300, RTX6000-Server
# 5: L40S, H20, H100, H200, B200, B300, RTX6000-Server
# 6: H20, H100, H200, B200, B300, RTX6000-Server
# 7: H20, H100, H200, B200, B300
# ===============================================================================

# 1: All GPUs
- condition:
    ranges:
      system_gpu_count:
@@ -28,10 +40,13 @@ llm_perf_sanity:
  - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-pytorch-bfloat16-input_output_len:500,2000]
  - perf/test_perf.py::test_perf[nemotron_nano_9b_v2-bench-pytorch-bfloat16-input_output_len:512,512]
  - perf/test_perf.py::test_perf[qwen3_4b_eagle3-bench-pytorch-streaming-bfloat16-maxbs:4-kv_frac:0.6-input_output_len:500,100-reqs:200-con:4]
  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-maxbs:256-input_output_len:128,128-gpus:2]
  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:128,128-gpus:2]
  - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-maxbs:1-maxnt:256-input_output_len:128,128-reqs:10-gpus:2]
  - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-pytorch-bfloat16-input_output_len:128,128-gpus:2]

# FP8 specific tests
# A100, L40S, L20, H20, H100, H200, Blackwell
# 2: L40S, L20, H20, H100, H200, B200, B300, GB200, GB300, RTX6000-Server
- condition:
    terms:
      supports_fp8: true
@@ -45,94 +60,40 @@ llm_perf_sanity:
  - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:1000,1000-reqs:8-con:1]
  - perf/test_perf.py::test_perf[bielik_11b_v2.2_instruct_fp8-bench-pytorch-float8-input_output_len:1000,1000-con:250]
  - perf/test_perf.py::test_perf[ministral_8b_fp8-bench-pytorch-float8-input_output_len:500,2000-reqs:500-con:250]

# Tests for ALL systems with 2+ GPUs
# A100, L40S, L20, H20, H100, H200, Blackwell
- condition:
    ranges:
      system_gpu_count:
        gte: 2
  tests:
  #llama_v3.1_8b_instruct
  #pytorch backend
  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-maxbs:256-input_output_len:128,128-gpus:2]
  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:128,128-gpus:2]
  - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-maxbs:1-maxnt:256-input_output_len:128,128-reqs:10-gpus:2]
  - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-pytorch-bfloat16-input_output_len:128,128-gpus:2]

# FP8 tests for systems with 2+ GPUs
# A100, L40S, L20, H20, H100, H200, Blackwell
- condition:
    terms:
      supports_fp8: true
    ranges:
      system_gpu_count:
        gte: 2

  tests:
  #mixtral_8x7b_v0.1_fp8 pytorch backend
  - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:2]

# Tests for systems with 2+ GPUs and high memory
# A100, L40S, H20, H100, H200, Blackwell

# 3: A100, H20, H100, H200, B200, B300, RTX6000-Server
- condition:
    ranges:
      system_gpu_count:
        gte: 2
      gpu_memory:
        gt: 80000

  tests:
  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:2]
  - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-pytorch-float16-input_output_len:128,128-gpus:2]

# Tests for systems with 4+ GPUs
# A100, L40S, H20, H100, H200, Blackwell
- condition:
    ranges:
      system_gpu_count:
        gte: 4

  tests:
  #llama_v3.1_70b
  #trt backend
  #pytorch backend
  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-input_output_len:128,128-gpus:4]
  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:4]

# FP8 specific tests
# L40S, H20, H100, H200, Blackwell
- condition:
    terms:
      supports_fp8: true
    ranges:
      system_gpu_count:
        gte: 4

  tests:
  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500-gpus:4]
  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4]
  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4]

# Tests for systems with 8+ GPUs
# A100, L40S, H20, H100, H200, Blackwell
- condition:
    ranges:
      system_gpu_count:
        gte: 8
      gpu_memory:
        gt: 46000

  tests:
  #llama_v3.1_70b
  #pytorch backend
  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:2000,200-reqs:10-gpus:8]
  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:200,2000-reqs:10-gpus:8]
  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-input_output_len:500,2000-gpus:8]
  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-input_output_len:2000,500-gpus:8]

# FP8 tests for systems with 8+ GPUs
# L40S, H20, H100, H200, Blackwell
# 4: A100, L40S, H20, H100, H200, B200, B300, GB200, GB300, RTX6000-Server
- condition:
    ranges:
      system_gpu_count:
        gte: 4

  tests:
  # llama_v3.1_70b
  # trt backend
  # pytorch backend
  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-input_output_len:128,128-gpus:4]
  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:4]
  # test overlap scheduler
  - perf/test_perf.py::test_perf[qwen3_0.6b-bench-pytorch-bfloat16-maxnt:2048-input_output_len:8000,1000-reqs:256-con:1-pp:4-gpus:4]
  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:1000,1000-reqs:1000-con:200]

# 5: L40S, H20, H100, H200, B200, B300, RTX6000-Server
- condition:
    terms:
      supports_fp8: true
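The two entries under `# test overlap scheduler` are the point of this commit: a long-context pipeline-parallel case (qwen3_0.6b, 8000/1000 tokens at concurrency 1 with pp:4) and a high-concurrency streaming case (llama_v3.1_8b, 1000 requests at con:200). In the PyTorch backend the overlap scheduler is enabled by default; a minimal sketch of toggling it for an A/B comparison, assuming the LLM API's disable_overlap_scheduler knob (present in recent TensorRT-LLM releases, but verify against your installed version):

```python
# Minimal sketch: run the same prompt with the overlap scheduler on and off.
# disable_overlap_scheduler is assumed to exist in your tensorrt_llm version.
from tensorrt_llm import LLM, SamplingParams

prompts = ["The capital of France is"]
params = SamplingParams(max_tokens=64)

for disable in (False, True):
    llm = LLM(model="Qwen/Qwen3-0.6B",
              disable_overlap_scheduler=disable)
    outputs = llm.generate(prompts, params)
    print(f"overlap scheduler {'off' if disable else 'on'}:",
          outputs[0].outputs[0].text[:40])
```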
@@ -141,17 +102,15 @@ llm_perf_sanity:
        gte: 8

  tests:
  #llama_v3.1_70b
  #trt backend
  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500-gpus:4]
  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4]
  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4]
  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-maxbs:1-maxnt:544-input_output_len:512,32-quant:fp8-gpus:8]
  #llama_v3.3_70b_instruct_fp8
  #pytorch backend
  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:8]
  - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-pytorch-float16-input_output_len:128,128-reqs:64-gpus:8]

# FP4, FP8 tests for systems with 8+ GPUs
# H20, H100, H200, Blackwell
# 6: H20, H100, H200, B200, B300, RTX6000-Server
- condition:
    ranges:
      system_gpu_count:
@@ -171,10 +130,11 @@ llm_perf_sanity:
  - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp8-bench-pytorch-float8-input_output_len:1000,2000-con:256-ep:8-gpus:8] TIMEOUT(60)
  - perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-disagg_server-ctx_dp:4-gen_tp:4]
  - perf/test_perf.py::test_perf[llama_v3.1_8b-disagg_server-ctx_dp:4-gen_tp:4]
  # gpt_oss_20b_fp4
  - perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-float4-input_output_len:512,512]

# gpu_arch > Hopper, exclude GB20X, RTX 6000 for not supported

# 7: H20, H100, H200, B200, B300
- condition:
    ranges:
      system_gpu_count:
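For reference, each `- condition:` block gates its `tests:` list on properties of the machine running the suite: `terms` are exact matches on capability flags such as supports_fp8, while `ranges` compare numeric facts such as system_gpu_count and gpu_memory against gt/gte/lt/lte bounds. An illustrative matcher, not the repository's actual test-database engine:

```python
# Illustrative matcher for the condition blocks above; assumptions throughout.
OPS = {"gt": lambda v, b: v > b, "gte": lambda v, b: v >= b,
       "lt": lambda v, b: v < b, "lte": lambda v, b: v <= b}

def condition_matches(condition: dict, system: dict) -> bool:
    # Boolean capability terms, e.g. supports_fp8: true
    for key, expected in condition.get("terms", {}).items():
        if system.get(key) != expected:
            return False
    # Numeric range terms, e.g. system_gpu_count: {gte: 4}
    for key, bounds in condition.get("ranges", {}).items():
        for op, bound in bounds.items():
            if not OPS[op](system.get(key, 0), bound):
                return False
    return True

h200_node = {"system_gpu_count": 8, "gpu_memory": 141000, "supports_fp8": True}
cond = {"terms": {"supports_fp8": True}, "ranges": {"system_gpu_count": {"gte": 4}}}
assert condition_matches(cond, h200_node)
```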