diff --git a/tests/integration/defs/perf/pytorch_model_config.py b/tests/integration/defs/perf/pytorch_model_config.py
index 74aa68f170..db2d6a12b5 100644
--- a/tests/integration/defs/perf/pytorch_model_config.py
+++ b/tests/integration/defs/perf/pytorch_model_config.py
@@ -234,6 +234,29 @@ def get_model_yaml_config(model_label: str,
                 'enable_chunked_prefill': False,
             }
         },
+        # Qwen3-235B-A22B-FP4 with Eagle3 speculative decoding
+        {
+            'patterns': [
+                'qwen3_235b_a22b_fp4_eagle3-bench-pytorch',
+            ],
+            'config': {
+                'enable_attention_dp': False,
+                'disable_overlap_scheduler': False,
+                'enable_autotuner': False,
+                'enable_chunked_prefill': False,
+                'speculative_config': {
+                    'decoding_type':
+                    'Eagle',
+                    'max_draft_len':
+                    3,
+                    'speculative_model_dir':
+                    f"{llm_models_root()}/Qwen3/qwen3-235B-eagle3",
+                },
+                'kv_cache_config': {
+                    'enable_block_reuse': False,
+                },
+            }
+        },
         # Llama-v3.3 models with fp8 quantization
         {
             'patterns': [
diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py
index f971984cb7..3695cf7e29 100644
--- a/tests/integration/defs/perf/test_perf.py
+++ b/tests/integration/defs/perf/test_perf.py
@@ -128,6 +128,7 @@ MODEL_PATH_DICT = {
     "qwen3_32b_fp4": "Qwen3/nvidia-Qwen3-32B-NVFP4",
     "qwen3_235b_a22b_fp8": "Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf",
     "qwen3_235b_a22b_fp4": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
+    "qwen3_235b_a22b_fp4_eagle3": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
     "qwen2_5_vl_7b_instruct": "Qwen2.5-VL-7B-Instruct",
     "qwen2_5_vl_7b_instruct_fp8": "multimodals/Qwen2.5-VL-7B-Instruct-FP8",
     "qwen2_5_vl_7b_instruct_fp4": "multimodals/Qwen2.5-VL-7B-Instruct-FP4",
@@ -1507,14 +1508,32 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
 
         # Construct MPI command.
         mpi_cmd = []
-        if num_gpus > 1 and num_gpus <= 8 and not self._config.runtime == "bench":
-            if cpu_socket_count_gt_1():
-                mpi_cmd = [
-                    "mpirun", "--map-by", "socket", "-n", f"{num_gpus}",
-                    "--allow-run-as-root"
-                ]
-            else:
-                mpi_cmd = ["mpirun", "-n", f"{num_gpus}", "--allow-run-as-root"]
+        if num_gpus > 1 and num_gpus <= 8:
+            # For bench runtime: optionally use mpirun to propagate environment variables.
+            # Set TRTLLM_BENCH_USE_MPIRUN=1 to enable (needed for newer GPUs like GB10
+            # where Triton's bundled ptxas doesn't support the architecture).
+            if self._config.runtime == "bench" and os.getenv(
+                    "TRTLLM_BENCH_USE_MPIRUN"):
+                mpi_cmd = ["mpirun", "-n", f"{num_gpus}"]
+
+                # Pass environment variables that are set
+                for var in ["CPATH", "TRITON_PTXAS_PATH", "TRTLLM_LOG_LEVEL"]:
+                    if os.getenv(var):
+                        mpi_cmd.extend(["-x", var])
+
+                mpi_cmd.append("trtllm-llmapi-launch")
+            elif self._config.runtime != "bench":
+                # Non-bench runtimes (original behavior)
+                if cpu_socket_count_gt_1():
+                    mpi_cmd = [
+                        "mpirun", "--map-by", "socket", "-n", f"{num_gpus}",
+                        "--allow-run-as-root"
+                    ]
+                else:
+                    mpi_cmd = [
+                        "mpirun", "-n", f"{num_gpus}", "--allow-run-as-root"
+                    ]
+
         if self._build_script == "trtllm-bench":
             return PerfBenchScriptTestCmds(data_cmds, build_cmd, benchmark_cmds,
                                            mpi_cmd, is_python)
diff --git a/tests/integration/test_lists/qa/llm_spark_perf.yml b/tests/integration/test_lists/qa/llm_spark_perf.yml
index 5c4368e84e..713192e93c 100644
--- a/tests/integration/test_lists/qa/llm_spark_perf.yml
+++ b/tests/integration/test_lists/qa/llm_spark_perf.yml
@@ -48,3 +48,21 @@ llm_spark_perf:
   - perf/test_perf.py::test_perf[gemma_3_12b_it-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
   - perf/test_perf.py::test_perf[gemma_3_12b_it_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
   - perf/test_perf.py::test_perf[gemma_3_12b_it_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+# ===============================================================================
+# 2: Multi-GPU (2 GPUs) Spark perf cases with multinode support
+# ===============================================================================
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 2
+        lte: 2
+  tests:
+  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-kv_cache_dtype:fp8-reqs:1-con:1-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[deepseek_r1_distill_llama_70b-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-kv_cache_dtype:fp8-reqs:1-con:1-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-kv_cache_dtype:fp8-reqs:1-con:1-tp:2-gpus:2]
+  # Qwen3-235B-A22B-FP4 with Eagle3 speculative decoding
+  - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp4_eagle3-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-kv_cache_dtype:fp8-reqs:1-con:1-tp:2-gpus:2]
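
The new `qwen3_235b_a22b_fp4_eagle3` entry in `pytorch_model_config.py` is picked up by matching its `patterns` entry against the perf test label. The sketch below illustrates that selection under the assumption that patterns are matched as substrings of the lower-cased label; `select_config`, the placeholder model path, and the hard-coded dict are illustrative only, not the repo's `get_model_yaml_config`.

```python
# Illustrative sketch: choose the extra per-model config for a perf test
# label by substring-matching the registered patterns (assumed mechanism).
EAGLE3_PATTERN_CONFIG = {
    'patterns': ['qwen3_235b_a22b_fp4_eagle3-bench-pytorch'],
    'config': {
        'enable_attention_dp': False,
        'disable_overlap_scheduler': False,
        'enable_autotuner': False,
        'enable_chunked_prefill': False,
        'speculative_config': {
            'decoding_type': 'Eagle',
            'max_draft_len': 3,
            # Placeholder for f"{llm_models_root()}/Qwen3/qwen3-235B-eagle3"
            'speculative_model_dir': '<llm_models_root>/Qwen3/qwen3-235B-eagle3',
        },
        'kv_cache_config': {
            'enable_block_reuse': False,
        },
    },
}


def select_config(model_label: str, pattern_configs: list) -> dict:
    """Merge every registered config whose pattern appears in the label."""
    extra = {}
    for entry in pattern_configs:
        if any(p in model_label.lower() for p in entry['patterns']):
            extra.update(entry['config'])
    return extra


label = ("qwen3_235b_a22b_fp4_eagle3-bench-pytorch-streaming-float4-maxbs:1-"
         "input_output_len:2048,128-kv_cache_dtype:fp8-reqs:1-con:1-tp:2-gpus:2")
print(select_config(label, [EAGLE3_PATTERN_CONFIG]))
```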
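The bench-runtime branch added to `test_perf.py` builds an `mpirun` prefix that forwards selected environment variables with `-x` and wraps `trtllm-llmapi-launch`, then hands the prefix to `PerfBenchScriptTestCmds` together with the benchmark commands. A standalone sketch of that prefix construction, with hypothetical environment values for illustration (it mirrors the new branch but is not the repo's code):

```python
import os


def build_bench_mpi_prefix(num_gpus: int) -> list:
    """Build the bench-runtime mpirun prefix: forward selected environment
    variables with -x and wrap trtllm-llmapi-launch."""
    mpi_cmd = []
    if 1 < num_gpus <= 8 and os.getenv("TRTLLM_BENCH_USE_MPIRUN"):
        mpi_cmd = ["mpirun", "-n", f"{num_gpus}"]
        for var in ["CPATH", "TRITON_PTXAS_PATH", "TRTLLM_LOG_LEVEL"]:
            if os.getenv(var):  # only forward variables that are actually set
                mpi_cmd.extend(["-x", var])
        mpi_cmd.append("trtllm-llmapi-launch")
    return mpi_cmd


if __name__ == "__main__":
    # Hypothetical values, for illustration only.
    os.environ["TRTLLM_BENCH_USE_MPIRUN"] = "1"
    os.environ["TRITON_PTXAS_PATH"] = "/usr/local/cuda/bin/ptxas"
    print(" ".join(build_bench_mpi_prefix(num_gpus=2)))
    # Expected output: mpirun -n 2 -x TRITON_PTXAS_PATH trtllm-llmapi-launch
```

Note that, unlike the retained non-bench path, this prefix omits `--allow-run-as-root` and the `--map-by socket` mapping; those remain specific to the original runtimes.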