mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-13 22:18:36 +08:00
[None][fix] Decrease Pre Merge Perf Tests (#10390)
Signed-off-by: Chenfei Zhang <chenfeiz@nvidia.com>
Signed-off-by: Yanchao Lu <yanchaol@nvidia.com>
Co-authored-by: Yanchao Lu <yanchaol@nvidia.com>
This commit is contained in:
parent c4f27fa4c0
commit a65b0d4efa
@@ -2872,21 +2872,6 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
                """
            }
        }

        if (stageName.contains("PerfSanity")) {
            stage ("Check PerfSanity Result") {
                def perfCheckResult = sh(
                    script: """
                        python3 ${llmSrc}/tests/integration/defs/perf/perf_regression_check.py \
                            ${WORKSPACE}/${stageName}
                    """,
                    returnStatus: true
                )
                if (perfCheckResult != 0) {
                    error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
                }
            }
        }
    }
}

@@ -3319,21 +3304,12 @@ def launchTestJobs(pipeline, testFilter)
        // "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
        // "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-x4", "l0_gb300_multi_gpus", 1, 1, 4],
        // PerfSanity pre-merge tests
        "GB200-4_GPUs-PyTorch-PerfSanity-4": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 4, 14, 4],
        "GB200-4_GPUs-PyTorch-PerfSanity-5": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 5, 14, 4],
        "GB200-4_GPUs-PyTorch-PerfSanity-6": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 6, 14, 4],
        "GB200-4_GPUs-PyTorch-PerfSanity-11": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 11, 14, 4],
        "GB200-4_GPUs-PyTorch-PerfSanity-12": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 12, 14, 4],
        "GB200-4_GPUs-PyTorch-PerfSanity-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 2, 4],
        "GB200-4_GPUs-PyTorch-PerfSanity-2": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 2, 2, 4],
        // PerfSanity post-merge tests
        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 14, 4],
        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 2, 14, 4],
        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 3, 14, 4],
        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-7": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 7, 14, 4],
        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-8": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 8, 14, 4],
        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-9": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 9, 14, 4],
        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-10": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 10, 14, 4],
        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-13": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 13, 14, 4],
        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-14": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 14, 14, 4],
        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 3, 4],
        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 2, 3, 4],
        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 3, 3, 4],
    ]
    fullSet += SBSASlurmTestConfigs.keySet()

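Reading note on the stage map above: the bracketed values appear to follow the pattern [agent queue, test-list name, shard index, shard count, GPU count]; that reading is an inference from the entries themselves, not something this diff states. Under that assumption, a minimal Python sketch of how one shard's slice of a test list could be selected (names are illustrative only, not the pipeline's actual helper):

def select_shard(tests, shard_index, shard_count):
    """Return the 1-based shard_index'th contiguous chunk of tests."""
    chunk = (len(tests) + shard_count - 1) // shard_count  # ceiling division
    start = (shard_index - 1) * chunk
    return tests[start:start + chunk]

all_tests = [f"case_{i}" for i in range(1, 8)]                  # 7 hypothetical test cases
print(select_shard(all_tests, shard_index=1, shard_count=2))    # first 4 cases
print(select_shard(all_tests, shard_index=2, shard_count=2))    # remaining 3 cases

On that reading, an entry such as ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 2, 4] would run the first of two shards of that test list on a 4-GPU GB200 node.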
@@ -102,7 +102,6 @@ set +e
pytest_exit_code=0
perf_check_exit_code=0
perf_report_exit_code=0
perf_sanity_check_exit_code=0

eval $pytestCommand
pytest_exit_code=$?
@@ -154,20 +153,10 @@ if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ]; then
    echo "Rank${SLURM_PROCID} Perf check finished execution with exit code $perf_check_exit_code"
fi

if [ $SLURM_PROCID -eq 0 ] && [[ "$stageName" == *PerfSanity* ]]; then
    echo "Check PerfSanity Result"
    python3 $llmSrcNode/tests/integration/defs/perf/perf_regression_check.py \
        $jobWorkspace
    perf_sanity_check_exit_code=$?
    echo "Rank${SLURM_PROCID} PerfSanity check finished execution with exit code $perf_sanity_check_exit_code"
fi

if [ "$pytest_exit_code" -ne 0 ]; then
    final_exit_code=$pytest_exit_code
elif [ "$perf_check_exit_code" -ne 0 ]; then
    final_exit_code=$perf_check_exit_code
elif [ "$perf_sanity_check_exit_code" -ne 0 ]; then
    final_exit_code=$perf_sanity_check_exit_code
else
    final_exit_code=0
fi

@@ -23,7 +23,7 @@ import time
from datetime import datetime

import yaml
from defs.trt_test_alternative import print_info, print_warning
from defs.trt_test_alternative import print_error, print_info, print_warning

_project_root = os.path.abspath(
    os.path.join(os.path.dirname(__file__), '../../../..'))
@@ -660,3 +660,123 @@ def write_regressive_test_cases(regressive_data_list, new_data_dict,
    if len(regressive_data_list) > 0:
        print_warning(
            f"Found {len(regressive_data_list)} regressive test cases")


def _get_metric_keys():
    """Get all metric-related keys for filtering config keys."""
    metric_keys = set()
    for metric in MAXIMIZE_METRICS + MINIMIZE_METRICS:
        metric_suffix = metric[2:]  # Strip "d_" prefix
        metric_keys.add(metric)
        metric_keys.add(f"d_baseline_{metric_suffix}")
        metric_keys.add(f"d_threshold_post_merge_{metric_suffix}")
        metric_keys.add(f"d_threshold_pre_merge_{metric_suffix}")
    return metric_keys


def _print_perf_data(data):
    """Print performance metrics and config for a single data entry."""
    print_info("=== Metrics ===")
    for metric in MAXIMIZE_METRICS + MINIMIZE_METRICS:
        if metric in data:
            value = data.get(metric, "N/A")
            print_info(f'"{metric}": {value}')

    metric_keys = _get_metric_keys()
    print_info("\n=== Config ===")
    config_keys = sorted([key for key in data.keys() if key not in metric_keys])
    for key in config_keys:
        value = data[key]
        print_info(f'"{key}": {value}')


def _print_regression_data(data, print_func=None):
    """
    Print regression info, metrics with baselines/thresholds, and config.
    """
    if print_func is None:
        print_func = print_info

    if "s_regression_info" in data:
        print_func("=== Regression Info ===")
        print_func(f"{data['s_regression_info']}")

    metric_keys = _get_metric_keys()
    is_post_merge = data.get("b_is_post_merge", False)

    print_func("=== Metrics ===")
    for metric in MAXIMIZE_METRICS + MINIMIZE_METRICS:
        metric_suffix = metric[2:]  # Strip "d_" prefix
        baseline_key = f"d_baseline_{metric_suffix}"
        if is_post_merge:
            threshold_key = f"d_threshold_post_merge_{metric_suffix}"
        else:
            threshold_key = f"d_threshold_pre_merge_{metric_suffix}"
        # Only print if at least one of the keys exists
        if metric in data or baseline_key in data or threshold_key in data:
            value = data.get(metric, "N/A")
            baseline = data.get(baseline_key, "N/A")
            threshold = data.get(threshold_key, "N/A")
            # Calculate percentage difference between value and baseline
            # Positive percentage means better perf, negative means regression
            if (isinstance(value, (int, float))
                    and isinstance(baseline, (int, float)) and baseline != 0):
                if metric in MAXIMIZE_METRICS:
                    # Larger is better: value > baseline is positive (better)
                    percentage = (value - baseline) / baseline * 100
                else:
                    # Smaller is better: value < baseline is positive (better)
                    percentage = (baseline - value) / baseline * 100
                percentage_str = f"{percentage:+.2f}%"
            else:
                percentage_str = "N/A"
            print_func(
                f'"{metric}": {value}, "{baseline_key}": {baseline}, '
                f'"{threshold_key}": {threshold}, "diff": {percentage_str}')

    print_func("\n=== Config ===")
    config_keys = sorted([key for key in data.keys() if key not in metric_keys])
    for key in config_keys:
        if key == "s_regression_info":
            continue
        value = data[key]
        print_func(f'"{key}": {value}')


def check_perf_regression(regressive_data_list, new_data_dict):
    """
    Check performance regression by printing regression data.
    """
    # Split regression data into post-merge and pre-merge
    post_merge_regressions = [
        data for data in regressive_data_list
        if data.get("b_is_post_merge", False)
    ]
    pre_merge_regressions = [
        data for data in regressive_data_list
        if not data.get("b_is_post_merge", False)
    ]

    # Print pre-merge regression data with print_warning
    if len(pre_merge_regressions) > 0:
        print_warning(
            f"Found {len(pre_merge_regressions)} pre-merge regression data")
        for i, data in enumerate(pre_merge_regressions):
            print_warning(f"\n{'=' * 60}")
            print_warning(f"Pre-merge Regression Data #{i + 1}")
            print_warning("=" * 60)
            _print_regression_data(data, print_func=print_warning)

    # Print post-merge regression data with print_warning for content
    if len(post_merge_regressions) > 0:
        for i, data in enumerate(post_merge_regressions):
            print_warning(f"\n{'=' * 60}")
            print_warning(f"Post-merge Regression Data #{i + 1}")
            print_warning("=" * 60)
            _print_regression_data(data, print_func=print_warning)
        print_error(
            f"Found {len(post_merge_regressions)} post-merge regression data")

    # Print summary if no regressions
    if len(regressive_data_list) == 0:
        print_info("No regression data found.")

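A quick, self-contained illustration of the signed-diff convention implemented above (positive means better than baseline for both maximize and minimize metrics). The metric values here are invented, and the names are hypothetical stand-ins for entries of MAXIMIZE_METRICS and MINIMIZE_METRICS, which are defined elsewhere in this module:

# Illustrative only: made-up values, not real perf data.
maximize_example = {"value": 105.0, "baseline": 100.0}  # throughput-style metric
minimize_example = {"value": 95.0, "baseline": 100.0}   # latency-style metric

maximize_diff = (maximize_example["value"] - maximize_example["baseline"]) / maximize_example["baseline"] * 100
minimize_diff = (minimize_example["baseline"] - minimize_example["value"]) / minimize_example["baseline"] * 100

print(f"{maximize_diff:+.2f}%")  # +5.00% -> 5% more throughput than baseline
print(f"{minimize_diff:+.2f}%")  # +5.00% -> 5% less latency than baseline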
@@ -1,203 +0,0 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys

import yaml

METRICS = [
    "seq_throughput",
    "token_throughput",
    "total_token_throughput",
    "user_throughput",
    "mean_tpot",
    "median_tpot",
    "p99_tpot",
    "mean_ttft",
    "median_ttft",
    "p99_ttft",
    "mean_itl",
    "median_itl",
    "p99_itl",
    "mean_e2el",
    "median_e2el",
    "p99_e2el",
]


def should_skip_execution():
    disagg_type = os.getenv("DISAGG_SERVING_TYPE", "")
    if (
        disagg_type.startswith("GEN")
        or disagg_type.startswith("CTX")
        or disagg_type == "DISAGG_SERVER"
    ):
        return True
    return False


def find_yaml_files(job_workspace, filename):
    yaml_files = []
    for root, dirs, files in os.walk(job_workspace):
        for file in files:
            if file == filename:
                yaml_files.append(os.path.join(root, file))
    return yaml_files


def read_yaml_data(yaml_files):
    all_data = []
    for file_path in yaml_files:
        try:
            with open(file_path, "r") as f:
                data = yaml.safe_load(f)
                if data:
                    if isinstance(data, list):
                        all_data.extend(data)
                    else:
                        all_data.append(data)
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
    return all_data


def get_metric_keys():
    metric_keys = set()
    for metric in METRICS:
        metric_keys.add(f"d_{metric}")
        metric_keys.add(f"d_baseline_{metric}")
        metric_keys.add(f"d_threshold_{metric}")
    return metric_keys


def print_perf_data(data):
    print("=== Metrics ===")
    for metric in METRICS:
        value_key = f"d_{metric}"
        if value_key in data:
            value = data.get(value_key, "N/A")
            print(f'"{value_key}": {value}')

    metric_keys = get_metric_keys()
    print("\n=== Config ===")
    config_keys = sorted([key for key in data.keys() if key not in metric_keys])
    for key in config_keys:
        value = data[key]
        print(f'"{key}": {value}')


def print_regression_data(data):
    if "s_regression_info" in data:
        print("=== Regression Info ===")
        print(f"{data['s_regression_info']}")

    metric_keys = get_metric_keys()

    print("=== Metrics ===")
    for metric in METRICS:
        value_key = f"d_{metric}"
        baseline_key = f"d_baseline_{metric}"
        threshold_key = f"d_threshold_{metric}"
        # Only print if at least one of the keys exists
        if value_key in data or baseline_key in data or threshold_key in data:
            value = data.get(value_key, "N/A")
            baseline = data.get(baseline_key, "N/A")
            threshold = data.get(threshold_key, "N/A")
            # Calculate percentage difference between value and baseline
            if (
                isinstance(value, (int, float))
                and isinstance(baseline, (int, float))
                and baseline != 0
            ):
                percentage = (value - baseline) / baseline * 100
                percentage_str = f"{percentage:+.2f}%"
            else:
                percentage_str = "N/A"
            print(
                f'"{value_key}": {value}, "{baseline_key}": {baseline}, '
                f'"{threshold_key}": {threshold}, "diff": {percentage_str}'
            )

    print("\n=== Config ===")
    config_keys = sorted([key for key in data.keys() if key not in metric_keys])
    for key in config_keys:
        if key == "s_regression_info":
            continue
        value = data[key]
        print(f'"{key}": {value}')


def main():
    if should_skip_execution():
        print("Skipping check_perf_regression.py due to DISAGG_SERVING_TYPE")
        return 0

    job_workspace = sys.argv[1]

    if not os.path.isdir(job_workspace):
        print(f"Skipping perf regression check since {job_workspace} is not a valid directory.")
        return 0

    perf_data_files = find_yaml_files(job_workspace, "perf_data.yaml")
    all_perf_data = read_yaml_data(perf_data_files)
    print(f"Found {len(all_perf_data)} perf data")
    for i, data in enumerate(all_perf_data):
        print(f"\n{'=' * 60}")
        print(f"Perf Data #{i + 1}")
        print("=" * 60)
        print_perf_data(data)

    print(f"\n{'=' * 60}\n")

    regression_files = find_yaml_files(job_workspace, "regression.yaml")
    all_regression_data = read_yaml_data(regression_files)
    print(f"Found {len(all_regression_data)} regression data")
    for i, data in enumerate(all_regression_data):
        print(f"\n{'=' * 60}")
        print(f"Regression Data #{i + 1}")
        print("=" * 60)
        print_regression_data(data)

    # Split regression data into post-merge and pre-merge categories
    post_merge_regressions = [
        data for data in all_regression_data if data.get("b_is_post_merge", False)
    ]
    pre_merge_regressions = [
        data for data in all_regression_data if not data.get("b_is_post_merge", False)
    ]

    if len(all_regression_data) == 0:
        print("\n No regression data found. Perf check is successful.")
        return 0

    if len(pre_merge_regressions) > 0:
        print(
            f"\n Warning: Found {len(pre_merge_regressions)} pre-merge regression data. "
            "But we don't fail the check temporarily."
        )

    if len(post_merge_regressions) > 0:
        print(
            f"\n Error: Found {len(post_merge_regressions)} post-merge regression data. Perf check is failed."
        )
        return 1

    print("\n No post-merge regression data found. Perf check is successful.")
    return 0


if __name__ == "__main__":
    sys.exit(main())
@@ -37,13 +37,13 @@ from ..conftest import get_llm_root, llm_models_root
from .open_search_db_utils import (
    SCENARIO_MATCH_FIELDS,
    add_id,
    check_perf_regression,
    get_common_values,
    get_history_data,
    get_job_info,
    post_new_perf_data,
    prepare_baseline_data,
    prepare_regressive_test_cases,
    write_regressive_test_cases,
)
from .utils import collect_and_clean_myelin_time

@@ -781,16 +781,17 @@ class DisaggTestCmds(NamedTuple):
        return ["multi-node disaggregated server tests, please check config files"]


def parse_select_pattern(select_pattern: str) -> str:
    """Parse select pattern (server config name).
def parse_select_pattern(select_pattern: str) -> list:
    """Parse select pattern (server config names).

    Args:
        select_pattern: Server config name (e.g., "r1_fp8_dep8_mtp1_1k1k").
        select_pattern: Server config names separated by comma
            (e.g., "r1_fp4_v2_dep4_mtp1_1k1k,r1_fp4_v2_tep4_mtp3_1k1k,r1_fp4_v2_tp4_mtp3_1k1k").

    Returns:
        Server config name string.
        List of server config name strings.
    """
    return select_pattern
    return [name.strip() for name in select_pattern.split(",")]


class PerfSanityTestConfig:
@@ -873,11 +874,11 @@ class PerfSanityTestConfig:

    def _parse_aggr_config_file(self, config_file_path: str):
        """Parse YAML config file for aggregated server."""
        # Parse selection pattern (server config name)
        # Parse selection pattern (server config names)
        if self.select_pattern:
            selected_server_name = parse_select_pattern(self.select_pattern)
            selected_server_names = parse_select_pattern(self.select_pattern)
        else:
            selected_server_name = None
            selected_server_names = None

        with open(config_file_path, "r") as f:
            config = yaml.safe_load(f)
@@ -895,10 +896,10 @@ class PerfSanityTestConfig:
        server_client_configs = {}

        for server_idx, server_config_data in enumerate(config["server_configs"]):
            # Check if this server should be included based on selected_server_name
            # Check if this server should be included based on selected_server_names
            if (
                selected_server_name is not None
                and server_config_data.get("name") != selected_server_name
                selected_server_names is not None
                and server_config_data.get("name") not in selected_server_names
            ):
                continue

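To make the comma-separated selection behavior above concrete, here is a minimal sketch under stated assumptions: the YAML shape mirrors the `server_configs:` list the parser iterates over, but the config names and contents are invented for illustration and the splitting helper is re-declared locally rather than imported from the test module.

import yaml

# Hypothetical config shaped like the server_configs list the parser walks.
example_yaml = """
server_configs:
  - name: r1_fp4_v2_dep4_mtp1_1k1k
  - name: r1_fp4_v2_tep4_mtp3_1k1k
  - name: r1_fp4_v2_tp4_mtp3_8k1k
"""

def parse_select_pattern(select_pattern: str) -> list:
    # Same splitting rule as the updated helper in this diff.
    return [name.strip() for name in select_pattern.split(",")]

config = yaml.safe_load(example_yaml)
selected = parse_select_pattern("r1_fp4_v2_dep4_mtp1_1k1k, r1_fp4_v2_tep4_mtp3_1k1k")
kept = [
    server for server in config["server_configs"]
    if server.get("name") in selected
]
print([s["name"] for s in kept])  # only the two selected configs survive the filter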
@@ -1375,8 +1376,7 @@ class PerfSanityTestConfig:
        # Upload the new perf data and baseline data to database
        post_new_perf_data(new_baseline_data_dict, new_data_dict, regressive_data_list)

        perf_result_output_dir = os.path.join(self._output_dir, self._test_param_labels)
        write_regressive_test_cases(regressive_data_list, new_data_dict, perf_result_output_dir)
        check_perf_regression(regressive_data_list, new_data_dict)


# Perf sanity test case parameters
@@ -1444,10 +1444,13 @@ def get_disagg_test_cases() -> List[str]:
    return test_cases


# Hardcoded multi-test test cases from test db.
MULTI_TEST_TEST_CASES = []

# Generate all test case combinations
# For aggr: {test_type}-{config_yml}, {test_type}-{config_yml}-{server_config_name}
# For disagg: {test_type}-{config_yml}
PERF_SANITY_TEST_CASES = get_aggr_test_cases() + get_disagg_test_cases()
PERF_SANITY_TEST_CASES = get_aggr_test_cases() + get_disagg_test_cases() + MULTI_TEST_TEST_CASES


@pytest.mark.parametrize("perf_sanity_test_case", PERF_SANITY_TEST_CASES)

@@ -14,20 +14,12 @@ l0_gb200_multi_gpus_perf_sanity:
      stage: pre_merge
      backend: pytorch
  tests:
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (90)
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (60)
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (60)
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_8k1k] TIMEOUT (90)
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_8k1k] TIMEOUT (60)
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_8k1k] TIMEOUT (60)
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k8k] TIMEOUT (120)
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k8k] TIMEOUT (60)
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k8k] TIMEOUT (60)
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep4_1k8k] TIMEOUT (90)
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep2_1k1k] TIMEOUT (90)
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tep2_1k8k] TIMEOUT (90)
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp2_1k8k] TIMEOUT (90)
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp4_eagle3_1k1k] TIMEOUT (90)
- condition:
    ranges:
      system_gpu_count:
@@ -45,14 +37,8 @@ l0_gb200_multi_gpus_perf_sanity:
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (90)
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (60)
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (60)
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_8k1k] TIMEOUT (90)
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_8k1k] TIMEOUT (60)
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_8k1k] TIMEOUT (60)
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k8k] TIMEOUT (120)
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k8k] TIMEOUT (60)
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k8k] TIMEOUT (60)
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep4_1k8k] TIMEOUT (90)
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep2_1k1k] TIMEOUT (90)
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tep2_1k8k] TIMEOUT (90)
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp2_1k8k] TIMEOUT (90)
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp4_eagle3_1k1k] TIMEOUT (90)