Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-01-13 22:18:36 +08:00
[TRTLLM-8830][test] Overlap scheduler enhancement perf test: Add qwen3_0.6b and llama3.1 test cases (#10114)
Signed-off-by: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com>
parent cb0444b1b5
commit 52cee573ad
@@ -111,6 +111,7 @@ MODEL_PATH_DICT = {
    "deepseek_v3_lite_nvfp4": "DeepSeek-V3-Lite/nvfp4_moe_only",
    "qwen2_7b_instruct": "Qwen2-7B-Instruct",
    "qwen_14b_chat": "Qwen-14B-Chat",
    "qwen3_0.6b": "Qwen3/Qwen3-0.6B",
    "qwen3_4b_eagle3": "Qwen3/Qwen3-4B",
    "qwen3_235b_a22b_fp8": "Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf",
    "qwen3_235b_a22b_fp4": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
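For context, MODEL_PATH_DICT maps the short model labels used in perf-test IDs to checkpoint directories under a shared model root. A minimal sketch of the lookup, assuming a hypothetical resolve_model_path helper and an LLM_MODELS_ROOT environment variable (both illustrative, not the repository's actual code):

```python
# Illustrative sketch only: resolve a perf-test model label to a local path.
# resolve_model_path and the LLM_MODELS_ROOT default are assumptions.
import os

MODEL_PATH_DICT = {
    "qwen3_0.6b": "Qwen3/Qwen3-0.6B",  # entry added by this commit
    "qwen3_4b_eagle3": "Qwen3/Qwen3-4B",
}

def resolve_model_path(label: str) -> str:
    """Join the shared model root with the relative path for a test label."""
    root = os.environ.get("LLM_MODELS_ROOT", "/models")
    if label not in MODEL_PATH_DICT:
        raise ValueError(f"unknown model label: {label}")
    return os.path.join(root, MODEL_PATH_DICT[label])

print(resolve_model_path("qwen3_0.6b"))  # -> /models/Qwen3/Qwen3-0.6B
```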
@@ -94,7 +94,6 @@ llm_perf_core:
  - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:48-maxnt:256-input_output_len:1000,2000-reqs:500-con:200]
  - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:128-maxnt:512-input_output_len:1000,2000-reqs:500-con:200]
  #llama_v3.1_8b
  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-mp-maxbs:256-input_output_len:128,128-pp:2]
  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:128,128-gpus:2]
  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-maxbs:256-input_output_len:128,128-gpus:2]
  #mixtral_8x7b_v0.1
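Each test ID above packs the benchmark configuration into one dash-separated string: the model label, then bare flags (bench, pytorch, streaming, the dtype) and key:value knobs such as maxbs (max batch size), maxnt (max token count), input_output_len, reqs (request count), con (concurrency), and parallelism (gpus, tp, pp, ep). A rough illustrative parser, not the one test_perf.py actually uses:

```python
# Rough sketch of splitting a perf-test ID into its knobs; illustrative only.
def parse_test_label(label: str) -> dict:
    parts = label.split("-")
    cfg = {"model": parts[0], "flags": [], "options": {}}
    for part in parts[1:]:
        if ":" in part:  # key:value knob, e.g. maxbs:256 or input_output_len:128,128
            key, value = part.split(":", 1)
            cfg["options"][key] = value
        else:  # bare flag, e.g. bench, pytorch, streaming, bfloat16
            cfg["flags"].append(part)
    return cfg

cfg = parse_test_label(
    "llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:128,128-gpus:2")
assert cfg["model"] == "llama_v3.1_8b"
assert "streaming" in cfg["flags"]
assert cfg["options"]["gpus"] == "2"
```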
@@ -1,6 +1,18 @@
version: 0.0.1
llm_perf_sanity:
# A100, L40S, L20, H20, H100, H200, Blackwell
# ===============================================================================
# Test Conditions Index
# ===============================================================================
# 1: All GPUs
# 2: L40S, L20, H20, H100, H200, B200, B300, GB200, GB300, RTX6000-Server
# 3: A100, H20, H100, H200, B200, B300, RTX6000-Server
# 4: A100, L40S, H20, H100, H200, B200, B300, GB200, GB300, RTX6000-Server
# 5: L40S, H20, H100, H200, B200, B300, RTX6000-Server
# 6: H20, H100, H200, B200, B300, RTX6000-Server
# 7: H20, H100, H200, B200, B300
# ===============================================================================

# 1: All GPUs
- condition:
    ranges:
      system_gpu_count:
@@ -28,10 +40,13 @@ llm_perf_sanity:
  - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-pytorch-bfloat16-input_output_len:500,2000]
  - perf/test_perf.py::test_perf[nemotron_nano_9b_v2-bench-pytorch-bfloat16-input_output_len:512,512]
  - perf/test_perf.py::test_perf[qwen3_4b_eagle3-bench-pytorch-streaming-bfloat16-maxbs:4-kv_frac:0.6-input_output_len:500,100-reqs:200-con:4]
  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-maxbs:256-input_output_len:128,128-gpus:2]
  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:128,128-gpus:2]
  - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-maxbs:1-maxnt:256-input_output_len:128,128-reqs:10-gpus:2]
  - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-pytorch-bfloat16-input_output_len:128,128-gpus:2]

# FP8 specific tests
# A100, L40S, L20, H20, H100, H200, Blackwell
# 2: L40S, L20, H20, H100, H200, B200, B300, GB200, GB300, RTX6000-Server
- condition:
    terms:
      supports_fp8: true
@@ -45,94 +60,40 @@ llm_perf_sanity:
  - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:1000,1000-reqs:8-con:1]
  - perf/test_perf.py::test_perf[bielik_11b_v2.2_instruct_fp8-bench-pytorch-float8-input_output_len:1000,1000-con:250]
  - perf/test_perf.py::test_perf[ministral_8b_fp8-bench-pytorch-float8-input_output_len:500,2000-reqs:500-con:250]

# Tests for ALL systems with 2+ GPUs
# A100, L40S, L20, H20, H100, H200, Blackwell
- condition:
    ranges:
      system_gpu_count:
        gte: 2
  tests:
  #llama_v3.1_8b_instruct
  #pytorch backend
  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-maxbs:256-input_output_len:128,128-gpus:2]
  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:128,128-gpus:2]
  - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-maxbs:1-maxnt:256-input_output_len:128,128-reqs:10-gpus:2]
  - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-pytorch-bfloat16-input_output_len:128,128-gpus:2]

# FP8 tests for systems with 2+ GPUs
# A100, L40S, L20, H20, H100, H200, Blackwell
- condition:
    terms:
      supports_fp8: true
    ranges:
      system_gpu_count:
        gte: 2

  tests:
  #mixtral_8x7b_v0.1_fp8 pytorch backend
  - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:2]

# Tests for systems with 2+ GPUs and high memory
# A100, L40S, H20, H100, H200, Blackwell

# 3: A100, H20, H100, H200, B200, B300, RTX6000-Server
- condition:
    ranges:
      system_gpu_count:
        gte: 2
      gpu_memory:
        gt: 80000

  tests:
  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:2]
  - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-pytorch-float16-input_output_len:128,128-gpus:2]

# Tests for systems with 4+ GPUs
# A100, L40S, H20, H100, H200, Blackwell
- condition:
    ranges:
      system_gpu_count:
        gte: 4

  tests:
  #llama_v3.1_70b
  #trt backend
  #pytorch backend
  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-input_output_len:128,128-gpus:4]
  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:4]

# FP8 specific tests
# L40S, H20, H100, H200, Blackwell
- condition:
    terms:
      supports_fp8: true
    ranges:
      system_gpu_count:
        gte: 4

  tests:
  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500-gpus:4]
  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4]
  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4]

# Tests for systems with 8+ GPUs
# A100, L40S, H20, H100, H200, Blackwell
- condition:
    ranges:
      system_gpu_count:
        gte: 8
      gpu_memory:
        gt: 46000

  tests:
  #llama_v3.1_70b
  #pytorch backend
  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:2000,200-reqs:10-gpus:8]
  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:200,2000-reqs:10-gpus:8]
  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-input_output_len:500,2000-gpus:8]
  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-input_output_len:2000,500-gpus:8]

# FP8 tests for systems with 8+ GPUs
# L40S, H20, H100, H200, Blackwell
# 4: A100, L40S, H20, H100, H200, B200, B300, GB200, GB300, RTX6000-Server
- condition:
    ranges:
      system_gpu_count:
        gte: 4

  tests:
  # llama_v3.1_70b
  # trt backend
  # pytorch backend
  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-input_output_len:128,128-gpus:4]
  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:4]
  # test overlap scheduler
  - perf/test_perf.py::test_perf[qwen3_0.6b-bench-pytorch-bfloat16-maxnt:2048-input_output_len:8000,1000-reqs:256-con:1-pp:4-gpus:4]
  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:1000,1000-reqs:1000-con:200]

# 5: L40S, H20, H100, H200, B200, B300, RTX6000-Server
- condition:
    terms:
      supports_fp8: true
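The two entries under `# test overlap scheduler` are the point of this commit: a long-context pipeline-parallel case (qwen3_0.6b, 8000/1000 tokens at concurrency 1 with pp:4) and a high-concurrency streaming case (llama_v3.1_8b, 1000 requests at con:200). In the PyTorch backend the overlap scheduler is enabled by default; a minimal sketch of toggling it for an A/B comparison, assuming the LLM API's disable_overlap_scheduler knob (present in recent TensorRT-LLM releases, but verify against your installed version):

```python
# Minimal sketch: run the same prompt with the overlap scheduler on and off.
# disable_overlap_scheduler is assumed to exist in your tensorrt_llm version.
from tensorrt_llm import LLM, SamplingParams

prompts = ["The capital of France is"]
params = SamplingParams(max_tokens=64)

for disable in (False, True):
    llm = LLM(model="Qwen/Qwen3-0.6B",
              disable_overlap_scheduler=disable)
    outputs = llm.generate(prompts, params)
    print(f"overlap scheduler {'off' if disable else 'on'}:",
          outputs[0].outputs[0].text[:40])
```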
@@ -141,17 +102,15 @@ llm_perf_sanity:
        gte: 8

  tests:
  #llama_v3.1_70b
  #trt backend
  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500-gpus:4]
  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4]
  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4]
  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-maxbs:1-maxnt:544-input_output_len:512,32-quant:fp8-gpus:8]
  #llama_v3.3_70b_instruct_fp8
  #pytorch backend
  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:8]
  - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-pytorch-float16-input_output_len:128,128-reqs:64-gpus:8]

# FP4, FP8 tests for systems with 8+ GPUs
# H20, H100, H200, Blackwell
# 6: H20, H100, H200, B200, B300, RTX6000-Server
- condition:
    ranges:
      system_gpu_count:
@@ -171,10 +130,11 @@ llm_perf_sanity:
  - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp8-bench-pytorch-float8-input_output_len:1000,2000-con:256-ep:8-gpus:8] TIMEOUT(60)
  - perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-disagg_server-ctx_dp:4-gen_tp:4]
  - perf/test_perf.py::test_perf[llama_v3.1_8b-disagg_server-ctx_dp:4-gen_tp:4]
  # gpt_oss_20b_fp4
  - perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-float4-input_output_len:512,512]

# gpu_arch > Hopper, exclude GB20X, RTX 6000 for not supported

# 7: H20, H100, H200, B200, B300
- condition:
    ranges:
      system_gpu_count:
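For reference, each `- condition:` block gates its `tests:` list on properties of the machine running the suite: `terms` are exact matches on capability flags such as supports_fp8, while `ranges` compare numeric facts such as system_gpu_count and gpu_memory against gt/gte/lt/lte bounds. An illustrative matcher, not the repository's actual test-database engine:

```python
# Illustrative matcher for the condition blocks above; assumptions throughout.
OPS = {"gt": lambda v, b: v > b, "gte": lambda v, b: v >= b,
       "lt": lambda v, b: v < b, "lte": lambda v, b: v <= b}

def condition_matches(condition: dict, system: dict) -> bool:
    # Boolean capability terms, e.g. supports_fp8: true
    for key, expected in condition.get("terms", {}).items():
        if system.get(key) != expected:
            return False
    # Numeric range terms, e.g. system_gpu_count: {gte: 4}
    for key, bounds in condition.get("ranges", {}).items():
        for op, bound in bounds.items():
            if not OPS[op](system.get(key, 0), bound):
                return False
    return True

h200_node = {"system_gpu_count": 8, "gpu_memory": 141000, "supports_fp8": True}
cond = {"terms": {"supports_fp8": True}, "ranges": {"system_gpu_count": {"gte": 4}}}
assert condition_matches(cond, h200_node)
```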