#!/usr/bin/env python3
"""Compare performance test results between different backends (UCX vs NIXL)."""

import argparse
import os
import re
import sys
from datetime import datetime

import pandas as pd


def extract_backend(test_name):
    """Extract the backend type from a test name.

    New format: ccb-NIXL, ccb-UCX, or ccb-DEFAULT.
    Example: disagg_perf_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL
    Note: "DEFAULT" is a special marker that represents the default backend.
    """
    match = re.search(r"ccb-(\w+)", test_name)
    return match.group(1) if match else None


def extract_base_case_name(test_name):
    """Extract a standardized case name with the backend information removed.

    Replace ccb-XXX with ccb-BACKEND to create a common base name for grouping.
    Example: disagg_perf_deepseek-r1-fp4_1k1k_..._ccb-NIXL -> disagg_perf_deepseek-r1-fp4_1k1k_..._ccb-BACKEND
    """
    # Replace ccb-XXX with ccb-BACKEND to normalize
    pattern = r"ccb-\w+"
    base_case = re.sub(pattern, "ccb-BACKEND", test_name)
    return base_case


def compare_backends(csv_path, threshold=5.0, default_backend="NIXL"):
    """Compare performance metrics between the DEFAULT backend and UCX.

    Only cases where DEFAULT is slower than UCX are flagged.

    Args:
        csv_path: CSV file path
        threshold: Performance difference threshold (percentage)
        default_backend: DEFAULT backend name (currently NIXL, may switch in the future).
            Cases marked as "ccb-DEFAULT" are treated as this backend.

    Returns:
        DataFrame: Comparison results
    """
    if not os.path.exists(csv_path):
        print(f"CSV file not found: {csv_path}")
        sys.exit(0)

    # Read the CSV file
    df = pd.read_csv(csv_path)
    if len(df) == 0:
        print(f"No data found in CSV file: {csv_path}")
        sys.exit(0)
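
    # Illustrative input rows (assumed column layout; only these columns are used below,
    # the numbers are made up for illustration):
    #   test_name                                      metric_type  perf_metric  network_name
    #   disagg_perf_deepseek-r1-fp4_1k1k_..._ccb-NIXL  TTFT         123.4        <full test id>
    #   disagg_perf_deepseek-r1-fp4_1k1k_..._ccb-UCX   TTFT         118.9        <full test id>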
    # Keep only disagg_perf tests, determined from the test_name field
    # (new format: disagg_perf_{model_name}_...)
    df = df[df["test_name"].str.contains("disagg_perf_", na=False)]
    if len(df) == 0:
        print(f"No disagg_perf tests found in CSV file: {csv_path}")
        sys.exit(0)

    # Extract the backend and the standardized case name
    df["backend"] = df["test_name"].apply(extract_backend)
    df["base_case_name"] = df["test_name"].apply(extract_base_case_name)

    # Normalize "DEFAULT" to the actual default_backend value so that cases
    # marked as "ccb-DEFAULT" are treated as the default backend
    df["backend"] = df["backend"].apply(
        lambda x: default_backend if x and x.upper() == "DEFAULT" else x
    )

    # Group by base_case_name and metric_type
    grouped = df.groupby(["base_case_name", "metric_type"])

    results = []
    comparison_pairs = 0
    single_backend_skipped = 0

    for (base_case, metric_type), group in grouped:
        # Get DEFAULT backend and UCX data
        default_data = group[group["backend"] == default_backend]
        ucx_data = group[group["backend"] == "UCX"]

        # Skip if neither backend has data (this case may not exist)
        if len(default_data) == 0 and len(ucx_data) == 0:
            continue

        # Skip single-backend cases (only one backend present, so not a comparison pair);
        # this happens when a test case only runs on one backend
        if len(default_data) == 0 or len(ucx_data) == 0:
            single_backend_skipped += 1
            continue

        # This is a valid comparison pair
        comparison_pairs += 1

        # Extract values and original test names
        default_value = default_data["perf_metric"].values[0] if len(default_data) > 0 else None
        default_original_name = (
            default_data["network_name"].values[0] if len(default_data) > 0 else None
        )
        ucx_value = ucx_data["perf_metric"].values[0] if len(ucx_data) > 0 else None
        ucx_original_name = ucx_data["network_name"].values[0] if len(ucx_data) > 0 else None

        # Determine status
        status = "Pass"
        diff_pct = None
        regression_pct = None

        # If one side has a value and the other does not, mark as Fail (test run failed)
        if default_value is None or ucx_value is None:
            status = "Fail"
        elif ucx_value != 0:
            # Calculate the performance difference percentage.
            # For TTFT and E2EL metrics, smaller is better:
            #   regression_pct > 0 means DEFAULT is slower than UCX (performance degradation)
            #   regression_pct < 0 means DEFAULT is faster than UCX (performance improvement)
            regression_pct = ((default_value - ucx_value) / ucx_value) * 100
            diff_pct = abs(regression_pct)

            # Only fail if DEFAULT is slower than UCX by more than the threshold
            if regression_pct > threshold:
                status = "Fail"
            else:
                status = "Pass"
        else:
            # A UCX value of 0 is an abnormal case
            if default_value != 0:
                status = "Fail"

        # Use the original network names, or "N/A" if the data does not exist
        test_case_name_default = default_original_name if default_original_name else "N/A"
        test_case_name_ucx = ucx_original_name if ucx_original_name else "N/A"

        results.append(
            {
                "test_case_name_default": test_case_name_default,
                "test_case_name_ucx": test_case_name_ucx,
                "metric_type": metric_type,
                "default_value": default_value,
                "ucx_value": ucx_value,
                "diff_pct": diff_pct,
                "regression_pct": regression_pct,
                "status": status,
            }
        )

    # Print statistics
    print("\n=== Backend Comparison Statistics ===")
    print(f"Default backend: {default_backend}")
    print(f"Comparison pairs: {comparison_pairs}")
    print(f"Single-backend cases (skipped): {single_backend_skipped}")
    print("=" * 37)

    # If no comparison pairs were found, exit with success
    if comparison_pairs == 0:
        print("\nInfo: No backend comparison pairs found in disagg_perf tests")
        print("All cases are single-backend only, no comparison needed")
        sys.exit(0)

    # Convert to DataFrame
    result_df = pd.DataFrame(results)
    return result_df

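
# Worked example (illustrative numbers, not taken from any real run): with the default
# threshold of 5.0, default_value=105.0 and ucx_value=100.0 give
# regression_pct = ((105.0 - 100.0) / 100.0) * 100 = +5.0, which does not exceed the
# threshold, so the pair is reported as "Pass"; default_value=106.0 would give +6.0 and
# be reported as "Fail". A negative regression_pct (DEFAULT faster than UCX) never fails.
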
def generate_html_report(result_df, threshold, default_backend, output_path):
    """Generate the HTML comparison report."""
    # Statistics
    total = len(result_df)
    failed = len(result_df[result_df["status"] == "Fail"])
    passed = total - failed

    # HTML template; literal CSS braces are doubled so that str.format() keeps them
    html_template = """<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>Backend Comparison Report - DEFAULT vs UCX</title>
    <style>
        body {{ font-family: sans-serif; margin: 20px; }}
        table {{ border-collapse: collapse; width: 100%; }}
        th, td {{ border: 1px solid #ccc; padding: 6px 10px; text-align: left; }}
        .status-pass {{ color: green; }}
        .status-fail {{ color: red; font-weight: bold; }}
        .regression {{ color: red; }}
        .improvement {{ color: green; }}
        .neutral {{ color: gray; }}
    </style>
</head>
<body>
    <h1>🔍 Backend Comparison Report: DEFAULT ({default_backend}) vs UCX</h1>

    <p>DEFAULT Backend: {default_backend}</p>
    <p>Comparison Backend: UCX</p>
    <p>Threshold: {threshold}%</p>
    <p>Description: Only cases where DEFAULT is slower than UCX are considered;
       a case is marked Fail if the performance degradation exceeds the threshold.</p>
    <p>⚠️ Attention:</p>

    <div class="summary">
        <div><span>{total}</span> Total tests</div>
        <div><span>{passed}</span> Pass</div>
        <div><span>{failed}</span> Performance degradation</div>
    </div>

    <table>
        <thead>
            <tr>
                <th>DEFAULT ({default_backend})</th>
                <th>UCX</th>
                <th>Metric type</th>
                <th>DEFAULT value</th>
                <th>UCX value</th>
                <th>Difference (%)</th>
                <th>Regression/Improvement (%)</th>
                <th>Status</th>
            </tr>
        </thead>
        <tbody>
            {table_rows}
        </tbody>
    </table>

    <p>Generated at: {timestamp}</p>
</body>
</html>
"""

    # Generate table rows
    table_rows = []
    for _, row in result_df.iterrows():
        status_class = "status-pass" if row["status"] == "Pass" else "status-fail"

        # Format the difference percentage
        if pd.notna(row["diff_pct"]):
            diff_str = f"{row['diff_pct']:.2f}%"
        else:
            diff_str = "N/A"

        # Format the regression/improvement percentage
        if pd.notna(row["regression_pct"]):
            if row["regression_pct"] > 0:
                # Positive value: DEFAULT is slower than UCX (regression)
                regression_str = f"+{row['regression_pct']:.2f}%"
                regression_class = "regression"
            else:
                # Negative value: DEFAULT is faster than UCX (improvement)
                regression_str = f"{row['regression_pct']:.2f}%"
                regression_class = "improvement"
        else:
            regression_str = "N/A"
            regression_class = "neutral"

        # Format the metric values
        default_val = f"{row['default_value']:.2f}" if pd.notna(row["default_value"]) else "N/A"
        ucx_val = f"{row['ucx_value']:.2f}" if pd.notna(row["ucx_value"]) else "N/A"

        row_html = f"""
            <tr>
                <td>{row["test_case_name_default"]}</td>
                <td>{row["test_case_name_ucx"]}</td>
                <td>{row["metric_type"]}</td>
                <td>{default_val}</td>
                <td>{ucx_val}</td>
                <td>{diff_str}</td>
                <td class="{regression_class}">{regression_str}</td>
                <td class="{status_class}">{row["status"]}</td>
            </tr>
        """
        table_rows.append(row_html)

    # Fill the template
    html_content = html_template.format(
        default_backend=default_backend,
        threshold=threshold,
        total=total,
        passed=passed,
        failed=failed,
        table_rows="".join(table_rows),
        timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    )

    # Write the report to file
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(html_content)


def main():
    parser = argparse.ArgumentParser(
        description=(
            "Compare performance test results between the DEFAULT backend and UCX; "
            "only cases where DEFAULT is slower than UCX are flagged"
        )
    )
    parser.add_argument(
        "--csv-path", type=str, required=True, help="Performance test results CSV file path"
    )
    parser.add_argument(
        "--threshold",
        type=float,
        default=5.0,
        help=(
            "Performance difference threshold (percentage), default 5.0%%. "
            "Only mark as Fail if DEFAULT is slower than UCX by more than this threshold"
        ),
    )
    parser.add_argument(
        "--default-backend",
        type=str,
        default="NIXL",
        help="DEFAULT backend name (default NIXL, may switch to another backend in the future)",
    )
    parser.add_argument(
        "--output", type=str, help="Output CSV file path (optional, default print to stdout)"
    )
    parser.add_argument("--html", type=str, help="Output HTML report file path (optional)")

    args = parser.parse_args()

    # Run the comparison
    result_df = compare_backends(args.csv_path, args.threshold, args.default_backend)

    # Output CSV results
    if args.output:
        result_df.to_csv(args.output, index=False)
        print(f"CSV results saved to: {args.output}")
    else:
        print(result_df.to_string(index=False))

    # Output the HTML report
    if args.html:
        generate_html_report(result_df, args.threshold, args.default_backend, args.html)
        print(f"HTML report saved to: {args.html}")

    # Statistics
    total = len(result_df)
    failed = len(result_df[result_df["status"] == "Fail"])
    passed = total - failed

    print("\n============= Statistics =============")
    print(f"DEFAULT Backend: {args.default_backend}")
    print("Comparison Backend: UCX")
    print(f"Threshold: {args.threshold}%")
    print("-----------------------------------")
    print(f"Total: {total}")
    print(f"Pass: {passed} (DEFAULT performance normal)")
    print(f"Fail: {failed} (DEFAULT slower than UCX by more than {args.threshold}%)")
    print("===================================\n")

    sys.exit(1 if failed > 0 else 0)


if __name__ == "__main__":
    main()