mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-13 22:18:36 +08:00
[None][fix] Minor updates on Perf Test System (#10375)
Signed-off-by: Chenfei Zhang <chenfeiz@nvidia.com>
This commit is contained in:
parent
098251648d
commit
5e0e48144f
@ -939,8 +939,6 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
|
||||
def scriptExecPathLocal = Utils.createTempLocation(pipeline, "./slurm_exec.sh")
|
||||
def scriptExecPathNode = "${jobWorkspace}/${jobUID}-slurm_exec.sh"
|
||||
def coverageConfigFile = "${jobWorkspace}/.coveragerc"
|
||||
def perfCheckScriptLocal = "${llmSrcLocal}/tests/integration/defs/perf/perf_regression_check.py"
|
||||
def perfCheckScriptNode = "${jobWorkspace}/${jobUID}-perf_regression_check.py"
|
||||
|
||||
stage("[${stageName}] Initializing Test") {
|
||||
// Create Job Workspace folder in Frontend Node
|
||||
@ -1023,16 +1021,6 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
|
||||
coverageConfigFile
|
||||
)
|
||||
|
||||
if (perfSanityMode) {
|
||||
Utils.copyFileToRemoteHost(
|
||||
pipeline,
|
||||
remote,
|
||||
perfCheckScriptLocal,
|
||||
perfCheckScriptNode,
|
||||
true
|
||||
)
|
||||
}
|
||||
|
||||
// Generate Pytest command
|
||||
String pytestUtil = ""
|
||||
if (nodeCount > 1) {
|
||||
@ -1314,22 +1302,6 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
|
||||
),
|
||||
numRetries: 3
|
||||
)
|
||||
|
||||
if (perfSanityMode) {
|
||||
stage("[${stageName}] Check perf result") {
|
||||
def perfCheckResult = Utils.exec(
|
||||
pipeline,
|
||||
script: Utils.sshUserCmd(
|
||||
remote,
|
||||
"python3 ${perfCheckScriptNode} ${jobWorkspace}"
|
||||
),
|
||||
returnStatus: true
|
||||
)
|
||||
if (perfCheckResult != 0) {
|
||||
error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
echo "Finished test stage execution."
|
||||
@ -3297,15 +3269,10 @@ def launchTestJobs(pipeline, testFilter)
|
||||
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 14, 4],
|
||||
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 2, 14, 4],
|
||||
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 3, 14, 4],
|
||||
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-4": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 4, 14, 4],
|
||||
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-5": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 5, 14, 4],
|
||||
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-6": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 6, 14, 4],
|
||||
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-7": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 7, 14, 4],
|
||||
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-8": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 8, 14, 4],
|
||||
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-9": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 9, 14, 4],
|
||||
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-10": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 10, 14, 4],
|
||||
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-11": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 11, 14, 4],
|
||||
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-12": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 12, 14, 4],
|
||||
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-13": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 13, 14, 4],
|
||||
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-14": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 14, 14, 4],
|
||||
]
|
||||
|
||||
@ -135,6 +135,14 @@ if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ]; then
|
||||
echo "Rank${SLURM_PROCID} Perf check finished execution with exit code $perf_check_exit_code"
|
||||
fi
|
||||
|
||||
if [ $SLURM_PROCID -eq 0 ] && [[ "$stageName" == *PerfSanity* ]]; then
|
||||
echo "Check Perf-Sanity Result"
|
||||
python3 $llmSrcNode/tests/integration/defs/perf/perf_regression_check.py \
|
||||
$jobWorkspace
|
||||
perf_sanity_check_exit_code=$?
|
||||
echo "Rank${SLURM_PROCID} Perf-Sanity check finished execution with exit code $perf_sanity_check_exit_code"
|
||||
fi
|
||||
|
||||
if [ "$pytest_exit_code" -ne 0 ]; then
|
||||
final_exit_code=$pytest_exit_code
|
||||
elif [ "$perf_check_exit_code" -ne 0 ]; then
|
||||
|
||||
@ -62,6 +62,10 @@ MINIMIZE_METRICS = [
|
||||
"d_p99_e2el",
|
||||
]
|
||||
|
||||
# Default threshold values for performance regression detection
|
||||
POST_MERGE_THRESHOLD = 0.05
|
||||
PRE_MERGE_THRESHOLD = 0.1
|
||||
|
||||
# Fields for scenario-only matching for recipe tests.
|
||||
# Unlike regular tests that match on all config fields, recipes match only on the benchmark
|
||||
# scenario, allowing the underlying config to change while still comparing against baselines
|
||||
|
||||
@ -463,6 +463,7 @@ class DisaggConfig:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
name: str,
|
||||
disagg_serving_type: str,
|
||||
hostname: str,
|
||||
numa_bind: bool,
|
||||
@ -472,6 +473,7 @@ class DisaggConfig:
|
||||
hardware: dict,
|
||||
server_env_var: str,
|
||||
):
|
||||
self.name = name
|
||||
self.disagg_serving_type = disagg_serving_type
|
||||
self.hostname = hostname
|
||||
self.numa_bind = numa_bind
|
||||
@ -971,7 +973,7 @@ class PerfSanityTestConfig:
|
||||
# Create ctx server config
|
||||
ctx_server_config_data = {
|
||||
"concurrency": max(concurrency_values),
|
||||
"name": f"ctx_{config_file_base_name}",
|
||||
"name": config_file_base_name,
|
||||
"model_name": model_name,
|
||||
"gpus_per_node": gpus_per_node,
|
||||
**worker_config.get("ctx", {}),
|
||||
@ -980,7 +982,7 @@ class PerfSanityTestConfig:
|
||||
# Create gen server config
|
||||
gen_server_config_data = {
|
||||
"concurrency": max(concurrency_values),
|
||||
"name": f"gen_{config_file_base_name}",
|
||||
"name": config_file_base_name,
|
||||
"model_name": model_name,
|
||||
"gpus_per_node": gpus_per_node,
|
||||
**worker_config.get("gen", {}),
|
||||
@ -991,6 +993,7 @@ class PerfSanityTestConfig:
|
||||
|
||||
# Create disagg config
|
||||
disagg_config = DisaggConfig(
|
||||
name=config_file_base_name,
|
||||
disagg_serving_type=disagg_serving_type,
|
||||
hostname=socket.gethostname(),
|
||||
numa_bind=numa_bind,
|
||||
@ -1249,6 +1252,8 @@ class PerfSanityTestConfig:
|
||||
new_data.update(job_config)
|
||||
new_data.update(server_config_dict)
|
||||
new_data.update(client_config_dict)
|
||||
# Add test_case_name for convenient filtering on OpenSearch
|
||||
new_data["s_test_case_name"] = f"{server_config.name}-{client_config.name}"
|
||||
|
||||
for metric_name in PERF_METRIC_LOG_QUERIES:
|
||||
if metric_name in self._test_results[cmd_idx]:
|
||||
@ -1308,12 +1313,13 @@ class PerfSanityTestConfig:
|
||||
"l_num_gen_servers": num_gen_servers,
|
||||
}
|
||||
new_data.update(job_config)
|
||||
|
||||
if num_ctx_servers > 0:
|
||||
new_data.update(ctx_server_config_dict)
|
||||
if num_gen_servers > 0:
|
||||
new_data.update(gen_server_config_dict)
|
||||
new_data.update(client_config_dict)
|
||||
# Add test_case_name for convenient filtering on OpenSearch
|
||||
new_data["s_test_case_name"] = f"{disagg_config.name}-{client_config.name}"
|
||||
|
||||
for metric_name in PERF_METRIC_LOG_QUERIES:
|
||||
if metric_name in self._test_results[cmd_idx]:
|
||||
|
||||
@ -14,20 +14,20 @@ l0_gb200_multi_gpus_perf_sanity:
|
||||
stage: pre_merge
|
||||
backend: pytorch
|
||||
tests:
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (120)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (120)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (120)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_8k1k] TIMEOUT (120)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_8k1k] TIMEOUT (120)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_8k1k] TIMEOUT (120)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k8k] TIMEOUT (180)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k8k] TIMEOUT (120)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k8k] TIMEOUT (120)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep4_1k8k] TIMEOUT (120)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep2_1k1k] TIMEOUT (120)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tep2_1k8k] TIMEOUT (120)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp2_1k8k] TIMEOUT (120)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp4_eagle3_1k1k] TIMEOUT (120)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (90)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (60)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (60)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_8k1k] TIMEOUT (90)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_8k1k] TIMEOUT (60)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_8k1k] TIMEOUT (60)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k8k] TIMEOUT (120)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k8k] TIMEOUT (60)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k8k] TIMEOUT (60)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep4_1k8k] TIMEOUT (90)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep2_1k1k] TIMEOUT (90)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tep2_1k8k] TIMEOUT (90)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp2_1k8k] TIMEOUT (90)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp4_eagle3_1k1k] TIMEOUT (90)
|
||||
- condition:
|
||||
ranges:
|
||||
system_gpu_count:
|
||||
@ -42,17 +42,17 @@ l0_gb200_multi_gpus_perf_sanity:
|
||||
stage: post_merge
|
||||
backend: pytorch
|
||||
tests:
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (120)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (120)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (120)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_8k1k] TIMEOUT (120)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_8k1k] TIMEOUT (120)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_8k1k] TIMEOUT (120)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k8k] TIMEOUT (180)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k8k] TIMEOUT (120)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k8k] TIMEOUT (120)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep4_1k8k] TIMEOUT (120)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep2_1k1k] TIMEOUT (120)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tep2_1k8k] TIMEOUT (120)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp2_1k8k] TIMEOUT (120)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp4_eagle3_1k1k] TIMEOUT (120)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (90)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (60)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (60)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_8k1k] TIMEOUT (90)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_8k1k] TIMEOUT (60)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_8k1k] TIMEOUT (60)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k8k] TIMEOUT (120)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k8k] TIMEOUT (60)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k8k] TIMEOUT (60)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep4_1k8k] TIMEOUT (90)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep2_1k1k] TIMEOUT (90)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tep2_1k8k] TIMEOUT (90)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp2_1k8k] TIMEOUT (90)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp4_eagle3_1k1k] TIMEOUT (90)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user