TensorRT-LLMs/tests/integration/defs/perf/perf_regression_check.py
chenfeiz0326 d70aeddc7f
[TRTLLM-8952][feat] Support Multi-Node Disagg Perf Test in CI (#9138)
Signed-off-by: Chenfei Zhang <chenfeiz@nvidia.com>
2025-12-26 22:50:53 +08:00

186 lines
5.5 KiB
Python

# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import yaml
# Base names of the perf metrics this script reports, in print order.
# Each name is looked up in the YAML records under the keys ``d_<name>``,
# ``d_baseline_<name>`` and ``d_threshold_<name>``.
METRICS = [
    # Throughput
    "seq_throughput",
    "token_throughput",
    "total_token_throughput",
    "user_throughput",
    # TPOT
    "mean_tpot",
    "median_tpot",
    "p99_tpot",
    # TTFT
    "mean_ttft",
    "median_ttft",
    "p99_ttft",
    # ITL
    "mean_itl",
    "median_itl",
    "p99_itl",
    # E2EL
    "mean_e2el",
    "median_e2el",
    "p99_e2el",
]
def should_skip_execution():
    """Return True when the regression check should not run in this process.

    Reads the ``DISAGG_SERVING_TYPE`` environment variable (treated as empty
    when unset). The check is skipped when the value starts with ``GEN`` or
    ``CTX``, or is exactly ``DISAGG_SERVER``.

    Returns:
        bool: True to skip the check, False to run it.
    """
    disagg_type = os.getenv("DISAGG_SERVING_TYPE", "")
    # str.startswith accepts a tuple of prefixes, so one call covers both.
    return disagg_type.startswith(("GEN", "CTX")) or disagg_type == "DISAGG_SERVER"
def find_yaml_files(job_workspace, filename):
    """Recursively collect every file named ``filename`` under ``job_workspace``.

    Args:
        job_workspace: Root directory to walk.
        filename: Exact file name to match (no globbing).

    Returns:
        list[str]: Matching paths (each ``os.path.join(root, filename)``),
        sorted so that the order of the printed reports does not depend on
        the directory enumeration order of the underlying filesystem. Empty
        when nothing matches or the directory does not exist.
    """
    # A directory holds at most one entry with a given name, so a membership
    # test per directory is equivalent to comparing every file name.
    matches = [
        os.path.join(root, filename)
        for root, _dirs, files in os.walk(job_workspace)
        if filename in files
    ]
    return sorted(matches)
def read_yaml_data(yaml_files):
    """Load and flatten the documents stored in the given YAML files.

    Each file is parsed with ``yaml.safe_load``. A top-level list contributes
    its items individually; any other non-empty document is appended as a
    single entry. Empty documents are ignored.

    Reading is best-effort: a file that cannot be opened or parsed is
    reported on stdout and skipped instead of aborting the run.
    NOTE(review): a skipped ``regression.yaml`` contributes no entries, and
    ``main`` treats "no regression entries" as success - confirm that this
    leniency is intended.

    Args:
        yaml_files: Iterable of file paths to read.

    Returns:
        list: Collected entries, in the order the files were given.
    """
    all_data = []
    for file_path in yaml_files:
        # Keep only the I/O and parsing inside the try block.
        try:
            # Explicit encoding: do not depend on the locale default.
            with open(file_path, "r", encoding="utf-8") as f:
                data = yaml.safe_load(f)
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
            continue

        if not data:
            continue
        if isinstance(data, list):
            all_data.extend(data)
        else:
            all_data.append(data)
    return all_data
def get_metric_keys():
    """Return the set of every record key that belongs to a metric.

    For each name in ``METRICS`` this yields three keys: the measured value
    (``d_<name>``), its baseline (``d_baseline_<name>``) and its threshold
    (``d_threshold_<name>``).
    """
    prefixes = ("d_", "d_baseline_", "d_threshold_")
    return {f"{prefix}{metric}" for metric in METRICS for prefix in prefixes}
def print_perf_data(data):
    """Print one perf record: its metric values first, then everything else.

    Args:
        data: Mapping loaded from a perf data YAML entry. Metric values are
            read from ``d_<metric>`` keys; every key that is not a metric,
            baseline or threshold key is printed under "Config".
    """
    print("=== Metrics ===")
    for metric in METRICS:
        value_key = f"d_{metric}"
        # Metrics absent from this record are simply not listed.
        if value_key not in data:
            continue
        print(f'"{value_key}": {data[value_key]}')

    print("\n=== Config ===")
    excluded = get_metric_keys()
    for key in sorted(k for k in data.keys() if k not in excluded):
        print(f'"{key}": {data[key]}')
def print_regression_data(data):
    """Print one regression record with value/baseline/threshold per metric.

    Output sections, in order:
      * "Regression Info" - only when the record has ``s_regression_info``.
      * "Metrics" - one line per metric for which at least one of the value,
        baseline or threshold keys is present; missing parts show as "N/A".
        The relative difference to the baseline is printed as a signed
        percentage when both numbers are available and the baseline is
        non-zero, otherwise "N/A".
      * "Config" - all remaining keys in sorted order.

    Args:
        data: Mapping loaded from a regression YAML entry.
    """
    if "s_regression_info" in data:
        print("=== Regression Info ===")
        print(f"{data['s_regression_info']}")

    print("=== Metrics ===")
    for metric in METRICS:
        value_key = f"d_{metric}"
        baseline_key = f"d_baseline_{metric}"
        threshold_key = f"d_threshold_{metric}"

        # Skip metrics this record says nothing about.
        if not any(k in data for k in (value_key, baseline_key, threshold_key)):
            continue

        value = data.get(value_key, "N/A")
        baseline = data.get(baseline_key, "N/A")
        threshold = data.get(threshold_key, "N/A")

        # A relative diff is only meaningful for two numbers and a non-zero
        # baseline; anything else (including the "N/A" placeholder) is N/A.
        both_numeric = isinstance(value, (int, float)) and isinstance(
            baseline, (int, float)
        )
        if both_numeric and baseline != 0:
            percentage = (value - baseline) / baseline * 100
            percentage_str = f"{percentage:+.2f}%"
        else:
            percentage_str = "N/A"

        print(
            f'"{value_key}": {value}, "{baseline_key}": {baseline}, '
            f'"{threshold_key}": {threshold}, "diff": {percentage_str}'
        )

    print("\n=== Config ===")
    # The regression info already has its own section above.
    excluded = get_metric_keys() | {"s_regression_info"}
    for key in sorted(k for k in data.keys() if k not in excluded):
        print(f'"{key}": {data[key]}')
def main():
    """Print all perf and regression records found under a job workspace.

    The workspace directory is taken from the first command-line argument.
    Every ``perf_data.yaml`` and ``regression.yaml`` below it is loaded and
    printed. The presence of any regression entry marks the check as failed.

    Returns:
        int: 0 when the check is skipped (see ``should_skip_execution``) or
        no regression entries were found, 1 when at least one was found.

    Raises:
        SystemExit: with code 1 when the workspace argument is missing or is
            not an existing directory.
    """
    if should_skip_execution():
        print("Skipping check_perf_regression.py due to DISAGG_SERVING_TYPE")
        return 0

    # Fail with a usage message instead of an IndexError traceback when the
    # script is started without its argument.
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <job_workspace>")
        sys.exit(1)

    job_workspace = sys.argv[1]
    if not os.path.isdir(job_workspace):
        print(f"Error: {job_workspace} is not a valid directory")
        sys.exit(1)

    perf_data_files = find_yaml_files(job_workspace, "perf_data.yaml")
    all_perf_data = read_yaml_data(perf_data_files)
    print(f"Found {len(all_perf_data)} perf data")
    for i, data in enumerate(all_perf_data):
        print(f"\n{'=' * 60}")
        print(f"Perf Data #{i + 1}")
        print("=" * 60)
        print_perf_data(data)

    # Separator between the perf data section and the regression section.
    # NOTE(review): assumed to sit after the loop, not inside it - confirm
    # against the original formatting.
    print(f"\n{'=' * 60}\n")

    regression_files = find_yaml_files(job_workspace, "regression.yaml")
    all_regression_data = read_yaml_data(regression_files)
    print(f"Found {len(all_regression_data)} regression data")
    for i, data in enumerate(all_regression_data):
        print(f"\n{'=' * 60}")
        print(f"Regression Data #{i + 1}")
        print("=" * 60)
        print_regression_data(data)

    if not all_regression_data:
        print("\n No regression data found. Perf check is successful.")
        return 0

    print(
        f"\n Warning: Found {len(all_regression_data)} regression data. Perf check is failed."
    )
    return 1
# Script entry point: the return value of main() becomes the process exit code.
if __name__ == "__main__":
    sys.exit(main())