#!/usr/bin/env python3
"""Compare performance test results between different backends (UCX vs NIXL)."""
import argparse
import os
import re
import sys

import pandas as pd


def extract_backend(test_name):
    """Extract backend type from test_name.

    New format: ccb-NIXL or ccb-UCX or ccb-DEFAULT
    Example: disagg_perf_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL

    Note: "DEFAULT" is a special marker that represents the default backend.
    """
    match = re.search(r"ccb-(\w+)", test_name)
    return match.group(1) if match else None


def extract_base_case_name(test_name):
    """Extract a standardized case name (remove backend information).

    Replace ccb-XXX with ccb-BACKEND to create a common base name for grouping.
    Example: disagg_perf_deepseek-r1-fp4_1k1k_..._ccb-NIXL
             -> disagg_perf_deepseek-r1-fp4_1k1k_..._ccb-BACKEND
    """
    # Replace ccb-XXX with ccb-BACKEND to normalize
    pattern = r"ccb-\w+"
    base_case = re.sub(pattern, "ccb-BACKEND", test_name)
    return base_case


def compare_backends(csv_path, threshold=5.0, default_backend="NIXL"):
    """Compare performance metrics between the DEFAULT backend and UCX.

    Only cases where DEFAULT is slower than UCX are flagged.

    Args:
        csv_path: CSV file path
        threshold: Performance difference threshold (percentage)
        default_backend: DEFAULT backend name (currently NIXL, may switch in the future).
            Cases marked as "ccb-DEFAULT" will be treated as this backend.

    Returns:
        DataFrame: Comparison results
    """
    # Read CSV file
    if not os.path.exists(csv_path):
        print(f"CSV file not found: {csv_path}")
        sys.exit(0)

    df = pd.read_csv(csv_path)
    if len(df) == 0:
        print(f"No data found in CSV file: {csv_path}")
        sys.exit(0)
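
    # Note: the column names used below (test_name, metric_type, perf_metric,
    # network_name) are the ones this script relies on; they are inferred from
    # how the DataFrame is accessed later in this function.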

    # Filter: only keep tests related to disagg_perf.
    # Determined from the test_name field (new format: disagg_perf_{model_name}_...).
    df = df[df["test_name"].str.contains("disagg_perf_", na=False)]
    if len(df) == 0:
        print(f"No disagg_perf tests found in CSV file: {csv_path}")
        sys.exit(0)

    # Extract backend and standardized case name
    df["backend"] = df["test_name"].apply(extract_backend)
    df["base_case_name"] = df["test_name"].apply(extract_base_case_name)

    # Normalize "DEFAULT" backend to the actual default_backend value.
    # This allows cases marked as "ccb-DEFAULT" to be treated as the default backend.
    df["backend"] = df["backend"].apply(
        lambda x: default_backend if x and x.upper() == "DEFAULT" else x
    )

    # Group by base_case_name and metric_type
    grouped = df.groupby(["base_case_name", "metric_type"])

    results = []
    comparison_pairs = 0
    single_backend_skipped = 0

    for (base_case, metric_type), group in grouped:
        # Get DEFAULT backend and UCX data
        default_data = group[group["backend"] == default_backend]
        ucx_data = group[group["backend"] == "UCX"]

        # Skip if neither backend has data (this case should not normally exist)
        if len(default_data) == 0 and len(ucx_data) == 0:
            continue

        # Skip single-backend cases (only one backend present, not a comparison pair).
        # This happens when a test case only runs on one backend.
        if len(default_data) == 0 or len(ucx_data) == 0:
            single_backend_skipped += 1
            continue

        # This is a valid comparison pair
        comparison_pairs += 1

        # Extract values and original test names
        default_value = default_data["perf_metric"].values[0] if len(default_data) > 0 else None
        default_original_name = (
            default_data["network_name"].values[0] if len(default_data) > 0 else None
        )
        ucx_value = ucx_data["perf_metric"].values[0] if len(ucx_data) > 0 else None
        ucx_original_name = ucx_data["network_name"].values[0] if len(ucx_data) > 0 else None

        # Determine status
        status = "Pass"
        diff_pct = None
        regression_pct = None

        # If one backend has a value and the other does not, mark as Fail (a test run failed)
        if default_value is None or ucx_value is None:
            status = "Fail"
        elif ucx_value != 0:
            # Calculate the performance difference percentage.
            # For TTFT and E2EL metrics, smaller is better:
            #   regression_pct > 0 means DEFAULT is slower than UCX (performance degradation)
            #   regression_pct < 0 means DEFAULT is faster than UCX (performance improvement)
            regression_pct = ((default_value - ucx_value) / ucx_value) * 100
            diff_pct = abs(regression_pct)

            # Only fail if DEFAULT is slower than UCX and the difference exceeds the threshold
            if regression_pct > threshold:
                status = "Fail"
            else:
                status = "Pass"
        else:
            # A UCX value of 0 is an abnormal case
            if default_value != 0:
                status = "Fail"

        # Use original network names, or "N/A" if the data doesn't exist
        test_case_name_default = default_original_name if default_original_name else "N/A"
        test_case_name_ucx = ucx_original_name if ucx_original_name else "N/A"

        results.append(
            {
                "test_case_name_default": test_case_name_default,
                "test_case_name_ucx": test_case_name_ucx,
                "metric_type": metric_type,
                "default_value": default_value,
                "ucx_value": ucx_value,
                "diff_pct": diff_pct,
                "regression_pct": regression_pct,
                "status": status,
            }
        )

    # Print statistics
    print("\n=== Backend Comparison Statistics ===")
    print(f"Default backend: {default_backend}")
    print(f"Comparison pairs: {comparison_pairs}")
    print(f"Single-backend cases (skipped): {single_backend_skipped}")
    print("=" * 37)

    # If no comparison pairs were found, exit with success
    if comparison_pairs == 0:
        print("\nInfo: No backend comparison pairs found in disagg_perf tests")
        print("All cases are single-backend only, no comparison needed")
        sys.exit(0)

    # Convert to DataFrame
    result_df = pd.DataFrame(results)
    return result_df
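
# Illustrative sketch (not part of the comparison flow above): how the
# regression check behaves for a latency metric such as TTFT, where smaller is
# better. The numbers are made up for illustration only.
#
#   default_value = 106.0   # hypothetical DEFAULT (e.g. NIXL) latency in ms
#   ucx_value = 100.0        # hypothetical UCX latency in ms
#   regression_pct = ((106.0 - 100.0) / 100.0) * 100   # -> 6.0
#   # With threshold=5.0, regression_pct (6.0) > 5.0, so the case is marked "Fail".
#   # A negative regression_pct (DEFAULT faster than UCX) always passes.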


def generate_html_report(result_df, threshold, default_backend, output_path):
    """Generate an HTML-format comparison report."""
    # Statistics
    total = len(result_df)
    failed = len(result_df[result_df["status"] == "Fail"])
    passed = total - failed

    # HTML template: summary counters followed by the comparison table
    html_template = """
    <html>
    <body>
    <ul>
        <li>Total tests: {total}</li>
        <li>Pass: {passed}</li>
        <li>Performance degradation: {failed}</li>
    </ul>
    <table border="1">
        <thead>
            <tr>
                <th>DEFAULT ({default_backend})</th>
                <th>UCX</th>
                <th>Metric type</th>
                <th>DEFAULT value</th>
                <th>UCX value</th>
                <th>Difference (%)</th>
                <th>Regression/Improvement (%)</th>
                <th>Status</th>
            </tr>
        </thead>
        <tbody>