[None][test] Add DGX-Spark multinode perf cases including eagle3 (#11184)
Signed-off-by: Jenny Liu <JennyLiu-nv+JennyLiu@users.noreply.github.com>
Co-authored-by: Jenny Liu <JennyLiu-nv+JennyLiu@users.noreply.github.com>
parent f33086914f
commit b5508ed75b
@@ -234,6 +234,29 @@ def get_model_yaml_config(model_label: str,
                 'enable_chunked_prefill': False,
             }
         },
+        # Qwen3-235B-A22B-FP4 with Eagle3 speculative decoding
+        {
+            'patterns': [
+                'qwen3_235b_a22b_fp4_eagle3-bench-pytorch',
+            ],
+            'config': {
+                'enable_attention_dp': False,
+                'disable_overlap_scheduler': False,
+                'enable_autotuner': False,
+                'enable_chunked_prefill': False,
+                'speculative_config': {
+                    'decoding_type':
+                    'Eagle',
+                    'max_draft_len':
+                    3,
+                    'speculative_model_dir':
+                    f"{llm_models_root()}/Qwen3/qwen3-235B-eagle3",
+                },
+                'kv_cache_config': {
+                    'enable_block_reuse': False,
+                },
+            }
+        },
         # Llama-v3.3 models with fp8 quantization
         {
             'patterns': [
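For context, these entries appear to feed a pattern-based lookup: a test's model label is matched against each entry's 'patterns' list and the matching 'config' dict is layered onto the base LLM-API options, which is how the Eagle3 block above attaches to the new test. A minimal sketch of that selection mechanism, assuming a shallow merge; select_model_config and MODEL_CONFIGS are illustrative names, not the real test_perf.py internals:

from typing import Any, Dict, List

MODEL_CONFIGS: List[Dict[str, Any]] = [
    {
        'patterns': ['qwen3_235b_a22b_fp4_eagle3-bench-pytorch'],
        'config': {
            'speculative_config': {
                'decoding_type': 'Eagle',
                'max_draft_len': 3,
            },
        },
    },
]


def select_model_config(model_label: str) -> Dict[str, Any]:
    # Merge the 'config' of every entry whose pattern occurs in the label.
    merged: Dict[str, Any] = {}
    for entry in MODEL_CONFIGS:
        if any(p in model_label for p in entry['patterns']):
            merged.update(entry['config'])
    return merged


# The full perf test name contains the pattern as a substring, so it matches.
print(select_model_config(
    'qwen3_235b_a22b_fp4_eagle3-bench-pytorch-streaming-float4-maxbs:1'))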
@@ -128,6 +128,7 @@ MODEL_PATH_DICT = {
     "qwen3_32b_fp4": "Qwen3/nvidia-Qwen3-32B-NVFP4",
     "qwen3_235b_a22b_fp8": "Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf",
     "qwen3_235b_a22b_fp4": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
+    "qwen3_235b_a22b_fp4_eagle3": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
     "qwen2_5_vl_7b_instruct": "Qwen2.5-VL-7B-Instruct",
     "qwen2_5_vl_7b_instruct_fp8": "multimodals/Qwen2.5-VL-7B-Instruct-FP8",
     "qwen2_5_vl_7b_instruct_fp4": "multimodals/Qwen2.5-VL-7B-Instruct-FP4",
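Worth noting: the new qwen3_235b_a22b_fp4_eagle3 key maps to the same NVFP4 base checkpoint as qwen3_235b_a22b_fp4; the Eagle3 draft model enters only through the speculative_model_dir set in the first hunk. A hedged sketch of how such a label-to-path dict is typically resolved against the shared model root (resolve_model_path is an illustrative name):

import os

MODEL_PATH_DICT = {
    "qwen3_235b_a22b_fp4": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
    # Deliberately the same checkpoint: Eagle3 reuses the base model, and the
    # draft model is supplied separately via 'speculative_model_dir'.
    "qwen3_235b_a22b_fp4_eagle3": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
}


def resolve_model_path(model_name: str, models_root: str) -> str:
    # Join the shared model root (llm_models_root() in the test code) with
    # the relative path registered for this label.
    return os.path.join(models_root, MODEL_PATH_DICT[model_name])


print(resolve_model_path("qwen3_235b_a22b_fp4_eagle3", "/models"))
# -> /models/Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf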
@@ -1507,14 +1508,32 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
 
         # Construct MPI command.
         mpi_cmd = []
-        if num_gpus > 1 and num_gpus <= 8 and not self._config.runtime == "bench":
-            if cpu_socket_count_gt_1():
-                mpi_cmd = [
-                    "mpirun", "--map-by", "socket", "-n", f"{num_gpus}",
-                    "--allow-run-as-root"
-                ]
-            else:
-                mpi_cmd = ["mpirun", "-n", f"{num_gpus}", "--allow-run-as-root"]
+        if num_gpus > 1 and num_gpus <= 8:
+            # For bench runtime: optionally use mpirun to propagate environment variables.
+            # Set TRTLLM_BENCH_USE_MPIRUN=1 to enable (needed for newer GPUs like GB10
+            # where Triton's bundled ptxas doesn't support the architecture).
+            if self._config.runtime == "bench" and os.getenv(
+                    "TRTLLM_BENCH_USE_MPIRUN"):
+                mpi_cmd = ["mpirun", "-n", f"{num_gpus}"]
+
+                # Pass environment variables that are set
+                for var in ["CPATH", "TRITON_PTXAS_PATH", "TRTLLM_LOG_LEVEL"]:
+                    if os.getenv(var):
+                        mpi_cmd.extend(["-x", var])
+
+                mpi_cmd.append("trtllm-llmapi-launch")
+            elif self._config.runtime != "bench":
+                # Non-bench runtimes (original behavior)
+                if cpu_socket_count_gt_1():
+                    mpi_cmd = [
+                        "mpirun", "--map-by", "socket", "-n", f"{num_gpus}",
+                        "--allow-run-as-root"
+                    ]
+                else:
+                    mpi_cmd = [
+                        "mpirun", "-n", f"{num_gpus}", "--allow-run-as-root"
+                    ]
 
         if self._build_script == "trtllm-bench":
             return PerfBenchScriptTestCmds(data_cmds, build_cmd, benchmark_cmds,
                                            mpi_cmd, is_python)
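The net effect for the bench runtime: with TRTLLM_BENCH_USE_MPIRUN=1 set, the benchmark command gains an mpirun prefix that forwards whichever of CPATH, TRITON_PTXAS_PATH, and TRTLLM_LOG_LEVEL are set, then wraps execution in trtllm-llmapi-launch. A self-contained sketch of that branch, with the command it produces on a 2-GPU box (build_mpi_prefix is an illustrative stand-in for the method logic above):

import os
from typing import List


def build_mpi_prefix(num_gpus: int) -> List[str]:
    # Env-gated: only build a prefix for 2..8 GPUs when explicitly opted in.
    if not (1 < num_gpus <= 8 and os.getenv("TRTLLM_BENCH_USE_MPIRUN")):
        return []
    cmd = ["mpirun", "-n", f"{num_gpus}"]
    # Forward only the variables that are actually set in the environment.
    for var in ["CPATH", "TRITON_PTXAS_PATH", "TRTLLM_LOG_LEVEL"]:
        if os.getenv(var):
            cmd.extend(["-x", var])
    cmd.append("trtllm-llmapi-launch")
    return cmd


os.environ["TRTLLM_BENCH_USE_MPIRUN"] = "1"
os.environ["TRTLLM_LOG_LEVEL"] = "INFO"
print(build_mpi_prefix(2))
# e.g. ['mpirun', '-n', '2', '-x', 'TRTLLM_LOG_LEVEL', 'trtllm-llmapi-launch']
# when only TRTLLM_LOG_LEVEL is set; this list is then prepended to the
# trtllm-bench invocation.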
@@ -48,3 +48,21 @@ llm_spark_perf:
   - perf/test_perf.py::test_perf[gemma_3_12b_it-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
   - perf/test_perf.py::test_perf[gemma_3_12b_it_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
   - perf/test_perf.py::test_perf[gemma_3_12b_it_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+# ===============================================================================
+# 2: Multi-GPU (2 GPUs) Spark perf cases with multinode support
+# ===============================================================================
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 2
+        lte: 2
+  tests:
+  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-kv_cache_dtype:fp8-reqs:1-con:1-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[deepseek_r1_distill_llama_70b-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-kv_cache_dtype:fp8-reqs:1-con:1-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-kv_cache_dtype:fp8-reqs:1-con:1-tp:2-gpus:2]
+  # Qwen3-235B-A22B-FP4 with Eagle3 speculative decoding
+  - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp4_eagle3-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-kv_cache_dtype:fp8-reqs:1-con:1-tp:2-gpus:2]
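Each bracketed test name encodes the run, roughly: model label, runtime (bench-pytorch), streaming mode, dtype, max batch size (maxbs), input/output sequence lengths (input_output_len:2048,128), request count (reqs), concurrency (con), tensor parallelism (tp), and GPU count (gpus). The condition block gates the whole group on the machine's GPU count: gte: 2 with lte: 2 restricts these cases to exactly-two-GPU Spark systems. A minimal sketch of evaluating such a range condition (range_matches is an illustrative name, not the real test-list machinery):

from typing import Dict


def range_matches(value: int, bounds: Dict[str, int]) -> bool:
    # A value passes when it satisfies whichever gte/lte bounds are present.
    if "gte" in bounds and value < bounds["gte"]:
        return False
    if "lte" in bounds and value > bounds["lte"]:
        return False
    return True


condition = {"ranges": {"system_gpu_count": {"gte": 2, "lte": 2}}}
system = {"system_gpu_count": 2}

runnable = all(
    range_matches(system[key], bounds)
    for key, bounds in condition["ranges"].items())
print(runnable)  # True on a 2-GPU system, False otherwise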