[None][test] Add DGX-Spark multinode perf cases including eagle3 (#11184)
Signed-off-by: Jenny Liu <JennyLiu-nv+JennyLiu@users.noreply.github.com>
Co-authored-by: Jenny Liu <JennyLiu-nv+JennyLiu@users.noreply.github.com>
parent f33086914f
commit b5508ed75b
@@ -234,6 +234,29 @@ def get_model_yaml_config(model_label: str,
                 'enable_chunked_prefill': False,
             }
         },
+        # Qwen3-235B-A22B-FP4 with Eagle3 speculative decoding
+        {
+            'patterns': [
+                'qwen3_235b_a22b_fp4_eagle3-bench-pytorch',
+            ],
+            'config': {
+                'enable_attention_dp': False,
+                'disable_overlap_scheduler': False,
+                'enable_autotuner': False,
+                'enable_chunked_prefill': False,
+                'speculative_config': {
+                    'decoding_type':
+                    'Eagle',
+                    'max_draft_len':
+                    3,
+                    'speculative_model_dir':
+                    f"{llm_models_root()}/Qwen3/qwen3-235B-eagle3",
+                },
+                'kv_cache_config': {
+                    'enable_block_reuse': False,
+                },
+            }
+        },
         # Llama-v3.3 models with fp8 quantization
         {
             'patterns': [
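For context, these entries appear to feed a pattern-based lookup: a test's model label is matched against each entry's 'patterns' list and the matching 'config' dict is layered onto the base LLM-API options, which is how the Eagle3 block above attaches to the new test. A minimal sketch of that selection mechanism, assuming a shallow merge; select_model_config and MODEL_CONFIGS are illustrative names, not the real test_perf.py internals:

from typing import Any, Dict, List

MODEL_CONFIGS: List[Dict[str, Any]] = [
    {
        'patterns': ['qwen3_235b_a22b_fp4_eagle3-bench-pytorch'],
        'config': {
            'speculative_config': {
                'decoding_type': 'Eagle',
                'max_draft_len': 3,
            },
        },
    },
]


def select_model_config(model_label: str) -> Dict[str, Any]:
    # Merge the 'config' of every entry whose pattern occurs in the label.
    merged: Dict[str, Any] = {}
    for entry in MODEL_CONFIGS:
        if any(p in model_label for p in entry['patterns']):
            merged.update(entry['config'])
    return merged


# The full perf test name contains the pattern as a substring, so it matches.
print(select_model_config(
    'qwen3_235b_a22b_fp4_eagle3-bench-pytorch-streaming-float4-maxbs:1'))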
@@ -128,6 +128,7 @@ MODEL_PATH_DICT = {
     "qwen3_32b_fp4": "Qwen3/nvidia-Qwen3-32B-NVFP4",
     "qwen3_235b_a22b_fp8": "Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf",
     "qwen3_235b_a22b_fp4": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
+    "qwen3_235b_a22b_fp4_eagle3": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
     "qwen2_5_vl_7b_instruct": "Qwen2.5-VL-7B-Instruct",
     "qwen2_5_vl_7b_instruct_fp8": "multimodals/Qwen2.5-VL-7B-Instruct-FP8",
     "qwen2_5_vl_7b_instruct_fp4": "multimodals/Qwen2.5-VL-7B-Instruct-FP4",
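Worth noting: the new qwen3_235b_a22b_fp4_eagle3 key maps to the same NVFP4 base checkpoint as qwen3_235b_a22b_fp4; the Eagle3 draft model enters only through the speculative_model_dir set in the first hunk. A hedged sketch of how such a label-to-path dict is typically resolved against the shared model root (resolve_model_path is an illustrative name):

import os

MODEL_PATH_DICT = {
    "qwen3_235b_a22b_fp4": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
    # Deliberately the same checkpoint: Eagle3 reuses the base model, and the
    # draft model is supplied separately via 'speculative_model_dir'.
    "qwen3_235b_a22b_fp4_eagle3": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
}


def resolve_model_path(model_name: str, models_root: str) -> str:
    # Join the shared model root (llm_models_root() in the test code) with
    # the relative path registered for this label.
    return os.path.join(models_root, MODEL_PATH_DICT[model_name])


print(resolve_model_path("qwen3_235b_a22b_fp4_eagle3", "/models"))
# -> /models/Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf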
@@ -1507,14 +1508,32 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
 
         # Construct MPI command.
         mpi_cmd = []
-        if num_gpus > 1 and num_gpus <= 8 and not self._config.runtime == "bench":
-            if cpu_socket_count_gt_1():
-                mpi_cmd = [
-                    "mpirun", "--map-by", "socket", "-n", f"{num_gpus}",
-                    "--allow-run-as-root"
-                ]
-            else:
-                mpi_cmd = ["mpirun", "-n", f"{num_gpus}", "--allow-run-as-root"]
+        if num_gpus > 1 and num_gpus <= 8:
+            # For bench runtime: optionally use mpirun to propagate environment variables.
+            # Set TRTLLM_BENCH_USE_MPIRUN=1 to enable (needed for newer GPUs like GB10
+            # where Triton's bundled ptxas doesn't support the architecture).
+            if self._config.runtime == "bench" and os.getenv(
+                    "TRTLLM_BENCH_USE_MPIRUN"):
+                mpi_cmd = ["mpirun", "-n", f"{num_gpus}"]
+
+                # Pass environment variables that are set
+                for var in ["CPATH", "TRITON_PTXAS_PATH", "TRTLLM_LOG_LEVEL"]:
+                    if os.getenv(var):
+                        mpi_cmd.extend(["-x", var])
+
+                mpi_cmd.append("trtllm-llmapi-launch")
+            elif self._config.runtime != "bench":
+                # Non-bench runtimes (original behavior)
+                if cpu_socket_count_gt_1():
+                    mpi_cmd = [
+                        "mpirun", "--map-by", "socket", "-n", f"{num_gpus}",
+                        "--allow-run-as-root"
+                    ]
+                else:
+                    mpi_cmd = [
+                        "mpirun", "-n", f"{num_gpus}", "--allow-run-as-root"
+                    ]
 
         if self._build_script == "trtllm-bench":
             return PerfBenchScriptTestCmds(data_cmds, build_cmd, benchmark_cmds,
                                            mpi_cmd, is_python)
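The net effect for the bench runtime: with TRTLLM_BENCH_USE_MPIRUN=1 set, the benchmark command gains an mpirun prefix that forwards whichever of CPATH, TRITON_PTXAS_PATH, and TRTLLM_LOG_LEVEL are set, then wraps execution in trtllm-llmapi-launch. A self-contained sketch of that branch, with the command it produces on a 2-GPU box (build_mpi_prefix is an illustrative stand-in for the method logic above):

import os
from typing import List


def build_mpi_prefix(num_gpus: int) -> List[str]:
    # Env-gated: only build a prefix for 2..8 GPUs when explicitly opted in.
    if not (1 < num_gpus <= 8 and os.getenv("TRTLLM_BENCH_USE_MPIRUN")):
        return []
    cmd = ["mpirun", "-n", f"{num_gpus}"]
    # Forward only the variables that are actually set in the environment.
    for var in ["CPATH", "TRITON_PTXAS_PATH", "TRTLLM_LOG_LEVEL"]:
        if os.getenv(var):
            cmd.extend(["-x", var])
    cmd.append("trtllm-llmapi-launch")
    return cmd


os.environ["TRTLLM_BENCH_USE_MPIRUN"] = "1"
os.environ["TRTLLM_LOG_LEVEL"] = "INFO"
print(build_mpi_prefix(2))
# e.g. ['mpirun', '-n', '2', '-x', 'TRTLLM_LOG_LEVEL', 'trtllm-llmapi-launch']
# when only TRTLLM_LOG_LEVEL is set; this list is then prepended to the
# trtllm-bench invocation.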
@@ -48,3 +48,21 @@ llm_spark_perf:
   - perf/test_perf.py::test_perf[gemma_3_12b_it-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
   - perf/test_perf.py::test_perf[gemma_3_12b_it_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
   - perf/test_perf.py::test_perf[gemma_3_12b_it_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+# ===============================================================================
+# 2: Multi-GPU (2 GPUs) Spark perf cases with multinode support
+# ===============================================================================
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 2
+        lte: 2
+  tests:
+  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-kv_cache_dtype:fp8-reqs:1-con:1-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[deepseek_r1_distill_llama_70b-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-kv_cache_dtype:fp8-reqs:1-con:1-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-kv_cache_dtype:fp8-reqs:1-con:1-tp:2-gpus:2]
+  # Qwen3-235B-A22B-FP4 with Eagle3 speculative decoding
+  - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp4_eagle3-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-kv_cache_dtype:fp8-reqs:1-con:1-tp:2-gpus:2]
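Each bracketed test name encodes the run, roughly: model label, runtime (bench-pytorch), streaming mode, dtype, max batch size (maxbs), input/output sequence lengths (input_output_len:2048,128), request count (reqs), concurrency (con), tensor parallelism (tp), and GPU count (gpus). The condition block gates the whole group on the machine's GPU count: gte: 2 with lte: 2 restricts these cases to exactly-two-GPU Spark systems. A minimal sketch of evaluating such a range condition (range_matches is an illustrative name, not the real test-list machinery):

from typing import Dict


def range_matches(value: int, bounds: Dict[str, int]) -> bool:
    # A value passes when it satisfies whichever gte/lte bounds are present.
    if "gte" in bounds and value < bounds["gte"]:
        return False
    if "lte" in bounds and value > bounds["lte"]:
        return False
    return True


condition = {"ranges": {"system_gpu_count": {"gte": 2, "lte": 2}}}
system = {"system_gpu_count": 2}

runnable = all(
    range_matches(system[key], bounds)
    for key, bounds in condition["ranges"].items())
print(runnable)  # True on a 2-GPU system, False otherwise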