TensorRT-LLMs/tests/scripts/perf-sanity/parse_benchmark_results.py


#!/usr/bin/env python3
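"""Parse trtllm-benchmark.*.log files produced by the perf-sanity server-client
benchmark and print the key metrics grouped by server and client name.

Example invocation (matching the argparse epilog below):

    python parse_benchmark_results.py --log_folder ./benchmark_logs
"""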
import argparse
import re
import sys
from pathlib import Path

class PerfMetrics:
    """Class to store and parse performance metrics from benchmark logs."""

    def __init__(self):
        # Basic metrics
        self.total_requests = 0
        self.successful_requests = 0
        self.failed_requests = 0
        self.benchmark_duration = 0.0
        self.total_input_tokens = 0
        self.total_generated_tokens = 0
        self.request_throughput = 0.0
        self.output_token_throughput = 0.0
        self.total_token_throughput = 0.0
        self.user_throughput = 0.0
        self.avg_decoded_tokens_per_iter = 0.0
        # Time to First Token (TTFT)
        self.mean_ttft_ms = 0.0
        self.median_ttft_ms = 0.0
        self.p99_ttft_ms = 0.0
        # Time per Output Token (TPOT)
        self.mean_tpot_ms = 0.0
        self.median_tpot_ms = 0.0
        self.p99_tpot_ms = 0.0
        # Inter-token Latency (ITL)
        self.mean_itl_ms = 0.0
        self.median_itl_ms = 0.0
        self.p99_itl_ms = 0.0
        # End-to-end Latency (E2EL)
        self.mean_e2el_ms = 0.0
        self.median_e2el_ms = 0.0
        self.p99_e2el_ms = 0.0

    def to_str(self) -> str:
        return (
            f"Total Requests: {self.total_requests}, "
            f"Successful Requests: {self.successful_requests}, "
            f"Failed Requests: {self.failed_requests}, "
            f"Benchmark Duration (s): {self.benchmark_duration}, "
            f"Total Input Tokens: {self.total_input_tokens}, "
            f"Total Generated Tokens: {self.total_generated_tokens}, "
            f"Request Throughput (req/s): {self.request_throughput}, "
            f"Output Token Throughput (tok/s): {self.output_token_throughput}, "
            f"Total Token Throughput (tok/s): {self.total_token_throughput}, "
            f"User Throughput (tok/s): {self.user_throughput}, "
            f"Avg Decoded Tokens per Iter: {self.avg_decoded_tokens_per_iter}, "
            f"Mean TTFT (ms): {self.mean_ttft_ms}, "
            f"Median TTFT (ms): {self.median_ttft_ms}, "
            f"P99 TTFT (ms): {self.p99_ttft_ms}, "
            f"Mean TPOT (ms): {self.mean_tpot_ms}, "
            f"Median TPOT (ms): {self.median_tpot_ms}, "
            f"P99 TPOT (ms): {self.p99_tpot_ms}, "
            f"Mean ITL (ms): {self.mean_itl_ms}, "
            f"Median ITL (ms): {self.median_itl_ms}, "
            f"P99 ITL (ms): {self.p99_itl_ms}, "
            f"Mean E2EL (ms): {self.mean_e2el_ms}, "
            f"Median E2EL (ms): {self.median_e2el_ms}, "
            f"P99 E2EL (ms): {self.p99_e2el_ms}")
    @classmethod
    def from_log_content(cls, log_content: str) -> 'PerfMetrics':
        """Parse performance metrics from log content."""
        metrics = cls()
        # Regex pattern for each metric, keyed by the attribute it populates
        patterns = {
            'total_requests': r'Total requests:\s+(\d+)',
            'successful_requests': r'Successful requests:\s+(\d+)',
            'failed_requests': r'Failed requests:\s+(\d+)',
            'benchmark_duration': r'Benchmark duration \(s\):\s+([\d.]+)',
            'total_input_tokens': r'Total input tokens:\s+(\d+)',
            'total_generated_tokens': r'Total generated tokens:\s+(\d+)',
            'request_throughput': r'Request throughput \(req/s\):\s+([\d.]+)',
            'output_token_throughput':
            r'Output token throughput \(tok/s\):\s+([\d.]+)',
            'total_token_throughput':
            r'Total Token throughput \(tok/s\):\s+([\d.]+)',
            'user_throughput': r'User throughput \(tok/s\):\s+([\d.]+)',
            'avg_decoded_tokens_per_iter':
            r'Avg Decoded Tokens per Iter:\s+([\d.]+)',
            'mean_ttft_ms': r'Mean TTFT \(ms\):\s+([\d.]+)',
            'median_ttft_ms': r'Median TTFT \(ms\):\s+([\d.]+)',
            'p99_ttft_ms': r'P99 TTFT \(ms\):\s+([\d.]+)',
            'mean_tpot_ms': r'Mean TPOT \(ms\):\s+([\d.]+)',
            'median_tpot_ms': r'Median TPOT \(ms\):\s+([\d.]+)',
            'p99_tpot_ms': r'P99 TPOT \(ms\):\s+([\d.]+)',
            'mean_itl_ms': r'Mean ITL \(ms\):\s+([\d.]+)',
            'median_itl_ms': r'Median ITL \(ms\):\s+([\d.]+)',
            'p99_itl_ms': r'P99 ITL \(ms\):\s+([\d.]+)',
            'mean_e2el_ms': r'Mean E2EL \(ms\):\s+([\d.]+)',
            'median_e2el_ms': r'Median E2EL \(ms\):\s+([\d.]+)',
            'p99_e2el_ms': r'P99 E2EL \(ms\):\s+([\d.]+)',
        }
        # Parse each metric; values containing a decimal point become floats,
        # everything else becomes an int
        for attr_name, pattern in patterns.items():
            match = re.search(pattern, log_content)
            if match:
                value = match.group(1)
                try:
                    if '.' in value:
                        setattr(metrics, attr_name, float(value))
                    else:
                        setattr(metrics, attr_name, int(value))
                except ValueError:
                    # Keep the default value if parsing fails
                    pass
        return metrics
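
# A minimal sketch of the log lines from_log_content expects, inferred from the
# regex patterns above (the values and alignment below are illustrative only):
#
#     Successful requests:                     512
#     Benchmark duration (s):                  73.52
#     Request throughput (req/s):              6.96
#     Mean TTFT (ms):                          184.31
#
# Parsing such a snippet would set successful_requests=512 (int),
# benchmark_duration=73.52 (float), request_throughput=6.96 and
# mean_ttft_ms=184.31; any metric without a matching line keeps its
# default of 0 or 0.0.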

def extract_server_and_client_name_from_log(log_file):
    """
    Extract server name, client name, and performance metrics from a log file.

    Looks for pattern: Server-Config: <server_name>-<client_name>
    """
    try:
        with open(log_file, 'r') as f:
            content = f.read()

        # Look for the Server-Config pattern
        server_config_match = re.search(r'Server-Config:\s*(\S+)', content)
        if not server_config_match:
            print(
                f"Warning: Could not find 'Server-Config:' pattern in {log_file}"
            )
            return None, None, None

        # Extract the full config name
        config_name = server_config_match.group(1)

        # Split on the last '-' to separate server and client names
        # Format: <server_name>-<client_name>
        parts = config_name.rsplit('-', 1)
        if len(parts) != 2:
            print(
                f"Warning: Invalid Server-Config format in {log_file}: {config_name}"
            )
            return None, None, None

        server_name = parts[0]
        client_name = parts[1]

        # Extract PerfMetrics
        perf_metrics = PerfMetrics.from_log_content(content)

        return server_name, client_name, perf_metrics
    except Exception as e:
        print(f"Warning: Could not read {log_file}: {e}")
        return None, None, None
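
# Illustration of the expected Server-Config line (the config names below are
# hypothetical; only the <server_name>-<client_name> shape matters):
#
#     Server-Config: deepseek-r1_tp8-con64
#
# rsplit('-', 1) only splits on the last '-', so dashes inside the server name
# are preserved: server_name='deepseek-r1_tp8', client_name='con64'.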

def parse_benchmark_results(log_folder):
    """
    Parse benchmark results from log files and print them grouped by server
    and client names.
    """
    log_folder = Path(log_folder)

    # Validate inputs
    if not log_folder.exists():
        print(f"Error: Input folder '{log_folder}' does not exist")
        return
    if not log_folder.is_dir():
        print(f"Error: '{log_folder}' is not a directory")
        return

    # Find all trtllm-benchmark.*.log files
    log_files = list(log_folder.glob("trtllm-benchmark.*.log"))
    print(f"Found {len(log_files)} log files to process")

    # Dictionary to group results by server name and client name
    # Structure: {server_name: {client_name: perf_metrics}}
    results_by_server = {}

    # Process each log file
    parsed_count = 0
    for log_file in log_files:
        # Extract server name, client name, and PerfMetrics from the log
        server_name, client_name, perf_metrics = extract_server_and_client_name_from_log(
            log_file)
        if not server_name or not client_name or not perf_metrics:
            continue
        parsed_count += 1

        # Group results by server name and client name
        if server_name not in results_by_server:
            results_by_server[server_name] = {}
        results_by_server[server_name][client_name] = perf_metrics

    print(f"Successfully parsed {parsed_count} log files\n")

    # Print grouped results
    print_grouped_results(results_by_server)

def print_grouped_results(results_by_server):
    """
    Print benchmark results grouped by server name and client name.
    """
    print("=" * 100)
    # Sort server names for consistent output
    for server_name in sorted(results_by_server.keys()):
        print(f"Server Name: {server_name}")
        # Sort client names for consistent output
        for client_name in sorted(results_by_server[server_name].keys()):
            perf_metrics = results_by_server[server_name][client_name]
            print(f"Client Name: {client_name}")
            print(
                f"Benchmark duration (s): {perf_metrics.benchmark_duration:.2f} "
                f"Request throughput (req/s): {perf_metrics.request_throughput:.2f} "
                f"Output token throughput (tok/s): {perf_metrics.output_token_throughput:.2f} "
                f"Total Token throughput (tok/s): {perf_metrics.total_token_throughput:.2f} "
                f"User throughput (tok/s): {perf_metrics.user_throughput:.2f} "
                f"Mean TTFT (ms): {perf_metrics.mean_ttft_ms:.2f} "
                f"Median TTFT (ms): {perf_metrics.median_ttft_ms:.2f} "
                f"P99 TTFT (ms): {perf_metrics.p99_ttft_ms:.2f}")
        print("=" * 100)

def main():
    parser = argparse.ArgumentParser(
        description=
        "Script to parse benchmark metrics from log files and print grouped by server and client names",
        epilog=
        "Example: python parse_benchmark_results.py --log_folder ./benchmark_logs"
    )
    parser.add_argument(
        "--log_folder",
        required=True,
        help="Folder containing benchmark log files (trtllm-benchmark.*.log)")
    args = parser.parse_args()

    # Validate inputs
    log_folder_path = Path(args.log_folder)
    if not log_folder_path.exists():
        print(f"Error: Input folder '{args.log_folder}' not found.")
        sys.exit(1)
    if not log_folder_path.is_dir():
        print(f"Error: '{args.log_folder}' is not a directory.")
        sys.exit(1)

    parse_benchmark_results(args.log_folder)

if __name__ == "__main__":
    main()