TensorRT-LLMs/tests/integration/defs/perf/disagg/compare_backends.py
#!/usr/bin/env python3
"""Compare performance test results between different backends (UCX vs NIXL)."""
import argparse
import os
import re
import sys
import pandas as pd
def extract_backend(test_name):
"""Extract backend type from test_name.
New format: ccb-NIXL or ccb-UCX or ccb-DEFAULT
Example: disagg_perf_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL
Note: "DEFAULT" is a special marker that represents the default backend
"""
match = re.search(r"ccb-(\w+)", test_name)
return match.group(1) if match else None
def extract_base_case_name(test_name):
"""Extract standardized case name (remove backend information).
Replace ccb-XXX with ccb-BACKEND to create a common base name for grouping.
Example: disagg_perf_deepseek-r1-fp4_1k1k_..._ccb-NIXL -> disagg_perf_deepseek-r1-fp4_1k1k_..._ccb-BACKEND
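    Illustrative doctest (hypothetical shortened test name):
    >>> extract_base_case_name("disagg_perf_model_1k1k_ccb-UCX")
    'disagg_perf_model_1k1k_ccb-BACKEND'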
"""
# Replace ccb-XXX with ccb-BACKEND to normalize
pattern = r"ccb-\w+"
base_case = re.sub(pattern, "ccb-BACKEND", test_name)
return base_case
def compare_backends(csv_path, threshold=5.0, default_backend="NIXL"):
"""Compare performance metrics between DEFAULT backend and UCX.
Only focus on cases where DEFAULT is slower than UCX.
Args:
csv_path: CSV file path
threshold: Performance difference threshold (percentage)
default_backend: DEFAULT backend name (currently NIXL, may switch in the future)
Cases marked as "ccb-DEFAULT" will be treated as this backend
Returns:
DataFrame: Comparison results
"""
if not os.path.exists(csv_path):
print(f"CSV file not found: {csv_path}")
sys.exit(0)
    # Read CSV file
df = pd.read_csv(csv_path)
if len(df) == 0:
print(f"No data found in CSV file: {csv_path}")
sys.exit(0)
    # Filter: keep only tests related to disagg_perf
# Determine from test_name field (new format: disagg_perf_{model_name}_...)
df = df[df["test_name"].str.contains("disagg_perf_", na=False)]
if len(df) == 0:
print(f"No disagg_perf tests found in CSV file: {csv_path}")
sys.exit(0)
# Extract backend and standardized case name
df["backend"] = df["test_name"].apply(extract_backend)
df["base_case_name"] = df["test_name"].apply(extract_base_case_name)
# Normalize "DEFAULT" backend to the actual default_backend value
# This allows cases marked as "ccb-DEFAULT" to be treated as the default backend
df["backend"] = df["backend"].apply(
lambda x: default_backend if x and x.upper() == "DEFAULT" else x
)
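    # e.g. with default_backend="NIXL", a test tagged ccb-DEFAULT is grouped
    # and compared as if it were tagged ccb-NIXL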
# Group by base_case_name and metric_type
grouped = df.groupby(["base_case_name", "metric_type"])
results = []
comparison_pairs = 0
single_backend_skipped = 0
for (base_case, metric_type), group in grouped:
# Get DEFAULT backend and UCX data
default_data = group[group["backend"] == default_backend]
ucx_data = group[group["backend"] == "UCX"]
        # Skip if neither backend has data (should not normally happen)
if len(default_data) == 0 and len(ucx_data) == 0:
continue
# Skip single-backend cases (only has one backend, not a comparison pair)
# This happens when a test case only runs on one backend
if len(default_data) == 0 or len(ucx_data) == 0:
single_backend_skipped += 1
continue
# This is a valid comparison pair
comparison_pairs += 1
# Extract values and original test names
        # Both backends are present here (single-backend cases were skipped above)
        default_value = default_data["perf_metric"].values[0]
        default_original_name = default_data["network_name"].values[0]
        ucx_value = ucx_data["perf_metric"].values[0]
        ucx_original_name = ucx_data["network_name"].values[0]
# Determine status
status = "Pass"
diff_pct = None
regression_pct = None
        # If either value is missing (NaN in the CSV), mark as Fail (test run failed);
        # pd.isna is used because pandas yields NaN, not None, for missing cells
        if pd.isna(default_value) or pd.isna(ucx_value):
            status = "Fail"
elif ucx_value != 0:
# Calculate performance difference percentage
# For TTFT and E2EL metrics, smaller is better
# regression_pct > 0 means DEFAULT is slower than UCX (performance degradation)
# regression_pct < 0 means DEFAULT is faster than UCX (performance improvement)
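            # e.g. default_value=105.0, ucx_value=100.0 -> regression_pct = +5.0 (DEFAULT 5% slower)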
regression_pct = ((default_value - ucx_value) / ucx_value) * 100
diff_pct = abs(regression_pct)
# Only fail if DEFAULT is slower than UCX and exceeds threshold
if regression_pct > threshold:
status = "Fail"
else:
status = "Pass"
else:
            # A UCX value of 0 is abnormal
if default_value != 0:
status = "Fail"
# Use original network names, or "N/A" if data doesn't exist
test_case_name_default = default_original_name if default_original_name else "N/A"
test_case_name_ucx = ucx_original_name if ucx_original_name else "N/A"
results.append(
{
"test_case_name_default": test_case_name_default,
"test_case_name_ucx": test_case_name_ucx,
"metric_type": metric_type,
"default_value": default_value,
"ucx_value": ucx_value,
"diff_pct": diff_pct,
"regression_pct": regression_pct,
"status": status,
}
)
# Print statistics
print("\n=== Backend Comparison Statistics ===")
print(f"Default backend: {default_backend}")
print(f"Comparison pairs: {comparison_pairs}")
print(f"Single-backend cases (skipped): {single_backend_skipped}")
print("=" * 37)
# If no comparison pairs found, exit with success
if comparison_pairs == 0:
print("\nInfo: No backend comparison pairs found in disagg_perf tests")
print("All cases are single-backend only, no comparison needed")
sys.exit(0)
# Convert to DataFrame
result_df = pd.DataFrame(results)
return result_df
def generate_html_report(result_df, threshold, default_backend, output_path):
"""Generate HTML format comparison report."""
# Statistics
total = len(result_df)
failed = len(result_df[result_df["status"] == "Fail"])
passed = total - failed
# HTML template
html_template = """
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Backend Comparison Report - DEFAULT vs UCX</title>
<style>
body {{
font-family: Arial, sans-serif;
margin: 20px;
background-color: #f5f5f5;
}}
.container {{
max-width: 1600px;
margin: 0 auto;
background-color: white;
padding: 30px;
border-radius: 8px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}}
h1 {{
color: #333;
border-bottom: 3px solid #4CAF50;
padding-bottom: 10px;
}}
.summary {{
display: flex;
justify-content: space-around;
margin: 30px 0;
}}
.summary-box {{
flex: 1;
margin: 0 10px;
padding: 20px;
border-radius: 8px;
text-align: center;
color: white;
}}
.summary-box.total {{
background-color: #2196F3;
}}
.summary-box.pass {{
background-color: #4CAF50;
}}
.summary-box.fail {{
background-color: #f44336;
}}
.summary-box h2 {{
margin: 0;
font-size: 36px;
}}
.summary-box p {{
margin: 5px 0 0 0;
font-size: 14px;
}}
.info {{
background-color: #e3f2fd;
padding: 15px;
border-radius: 5px;
margin-bottom: 20px;
}}
.warning-box {{
background-color: #fff3e0;
border-left: 4px solid #ff9800;
padding: 15px;
margin-bottom: 20px;
}}
table {{
width: 100%;
border-collapse: collapse;
margin-top: 20px;
font-size: 14px;
}}
th {{
background-color: #4CAF50;
color: white;
padding: 12px;
text-align: left;
position: sticky;
top: 0;
}}
td {{
padding: 10px;
border-bottom: 1px solid #ddd;
}}
tr:hover {{
background-color: #f5f5f5;
}}
.status-pass {{
background-color: #4CAF50;
color: white;
padding: 5px 10px;
border-radius: 4px;
font-weight: bold;
}}
.status-fail {{
background-color: #f44336;
color: white;
padding: 5px 10px;
border-radius: 4px;
font-weight: bold;
}}
.metric-type {{
background-color: #2196F3;
color: white;
padding: 3px 8px;
border-radius: 3px;
font-size: 12px;
}}
.regression {{
color: #f44336;
font-weight: bold;
}}
.improvement {{
color: #4CAF50;
font-weight: bold;
}}
.neutral {{
color: #666;
}}
.test-name {{
font-family: monospace;
font-size: 12px;
word-break: break-all;
}}
.footer {{
margin-top: 30px;
text-align: center;
color: #666;
font-size: 12px;
}}
</style>
</head>
<body>
<div class="container">
<h1>🔍 Backend Comparison Report: DEFAULT ({default_backend}) vs UCX</h1>
<div class="info">
<strong>DEFAULT Backend:</strong> {default_backend}
<br>
<strong>Comparison Backend:</strong> UCX
<br>
<strong>Threshold:</strong> {threshold}%
<br>
            <strong>Description:</strong> Only cases where DEFAULT is slower than UCX are checked;
            a case is marked as Fail if the performance degradation exceeds the threshold.
</div>
<div class="warning-box">
<strong>⚠️ Attention:</strong>
<ul style="margin: 5px 0;">
<li>✅ <strong>Pass</strong>: DEFAULT is similar to or better than UCX</li>
                <li>❌ <strong>Fail</strong>: DEFAULT is slower than UCX by more than {threshold}% (performance degradation)</li>
<li>📊 Positive value means DEFAULT is slower than UCX, negative value means
DEFAULT is faster than UCX</li>
</ul>
</div>
<div class="summary">
<div class="summary-box total">
<h2>{total}</h2>
<p>Total tests</p>
</div>
<div class="summary-box pass">
<h2>{passed}</h2>
<p>Pass</p>
</div>
<div class="summary-box fail">
<h2>{failed}</h2>
<p>Performance degradation</p>
</div>
</div>
<table>
<thead>
<tr>
<th style="width: 22%;">DEFAULT ({default_backend})</th>
<th style="width: 22%;">UCX</th>
<th style="width: 10%;">Metric type</th>
<th style="width: 10%;">DEFAULT value</th>
<th style="width: 10%;">UCX value</th>
<th style="width: 8%;">Difference (%)</th>
<th style="width: 10%;">Regression/Improvement (%)</th>
<th style="width: 8%;">Status</th>
</tr>
</thead>
<tbody>
{table_rows}
</tbody>
</table>
<div class="footer">
<p>Generated time: {timestamp}</p>
</div>
</div>
</body>
</html>
"""
# Generate table rows
table_rows = []
for _, row in result_df.iterrows():
status_class = "status-pass" if row["status"] == "Pass" else "status-fail"
# Format difference percentage
if pd.notna(row["diff_pct"]):
diff_str = f"{row['diff_pct']:.2f}%"
else:
diff_str = "N/A"
# Format regression/improvement percentage
if pd.notna(row["regression_pct"]):
if row["regression_pct"] > 0:
# Positive value: DEFAULT is slower than UCX (regression)
regression_str = f"+{row['regression_pct']:.2f}%"
regression_class = "regression"
else:
# Negative value: DEFAULT is faster than UCX (improvement)
regression_str = f"{row['regression_pct']:.2f}%"
regression_class = "improvement"
else:
regression_str = "N/A"
regression_class = "neutral"
# Format values
default_val = f"{row['default_value']:.2f}" if pd.notna(row["default_value"]) else "N/A"
ucx_val = f"{row['ucx_value']:.2f}" if pd.notna(row["ucx_value"]) else "N/A"
row_html = f"""
<tr>
<td class="test-name">{row["test_case_name_default"]}</td>
<td class="test-name">{row["test_case_name_ucx"]}</td>
<td><span class="metric-type">{row["metric_type"]}</span></td>
<td>{default_val}</td>
<td>{ucx_val}</td>
<td>{diff_str}</td>
<td class="{regression_class}">{regression_str}</td>
<td><span class="{status_class}">{row["status"]}</span></td>
</tr>
"""
table_rows.append(row_html)
# Fill template
from datetime import datetime
html_content = html_template.format(
default_backend=default_backend,
threshold=threshold,
total=total,
passed=passed,
failed=failed,
table_rows="".join(table_rows),
timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
)
# Write to file
with open(output_path, "w", encoding="utf-8") as f:
f.write(html_content)
def main():
parser = argparse.ArgumentParser(
description=(
"Compare performance test results between DEFAULT backend and UCX, "
"only focus on cases where DEFAULT is slower than UCX"
)
)
parser.add_argument(
"--csv-path", type=str, required=True, help="Performance test results CSV file path"
)
parser.add_argument(
"--threshold",
type=float,
default=5.0,
help=(
"Performance difference threshold (percentage), default 5.0%. "
"Only mark as Fail if DEFAULT is slower than UCX exceeds this threshold"
),
)
parser.add_argument(
"--default-backend",
type=str,
default="NIXL",
help="DEFAULT backend name (default NIXL, may switch to other backend in the future)",
)
parser.add_argument(
"--output", type=str, help="Output CSV file path (optional, default print to stdout)"
)
parser.add_argument("--html", type=str, help="Output HTML report file path (optional)")
args = parser.parse_args()
# Execute comparison
result_df = compare_backends(args.csv_path, args.threshold, args.default_backend)
# Output CSV results
if args.output:
result_df.to_csv(args.output, index=False)
print(f"CSV results saved to: {args.output}")
else:
print(result_df.to_string(index=False))
# Output HTML report
if args.html:
generate_html_report(result_df, args.threshold, args.default_backend, args.html)
print(f"HTML report saved to: {args.html}")
# Statistics
total = len(result_df)
failed = len(result_df[result_df["status"] == "Fail"])
passed = total - failed
print("\n============= Statistics =============")
print(f"DEFAULT Backend: {args.default_backend}")
print("Comparison Backend: UCX")
print(f"Threshold: {args.threshold}%")
print("-----------------------------------")
print(f"Total: {total}")
print(f"Pass: {passed} (DEFAULT performance normal)")
print(f"Fail: {failed} (DEFAULT is slower than UCX exceeds {args.threshold}%)")
print("===================================\n")
sys.exit(1 if failed > 0 else 0)
if __name__ == "__main__":
main()