#!/usr/bin/env python3 import argparse import ast import os import subprocess import sys from pathlib import Path from typing import Dict, List, NamedTuple import yaml from test_common.http_utils import wait_for_endpoint_ready def get_node_name() -> str: """Get the current node name""" try: result = subprocess.run("hostname", shell=True, capture_output=True, text=True, check=True) return result.stdout.strip() except (subprocess.CalledProcessError, FileNotFoundError): return "unknown" def get_gpu_info() -> str: """Get GPU information from nvidia-smi""" try: result = subprocess.run("nvidia-smi", shell=True, capture_output=True, text=True, check=True) return result.stdout except subprocess.CalledProcessError as e: return f"nvidia-smi failed with error code {e.returncode}\nError output: {e.stderr}" except FileNotFoundError: return "nvidia-smi not found" except Exception as e: return f"Failed to get GPU information: {e}" # Model PATH of local dir synced from internal LLM models repo MODEL_PATH_DICT = { "llama_v2_7b": "llama-models-v2/llama-v2-7b-hf", # not safetensors repo "llama_v2_13b": "llama-models-v2/llama-v2-13b-hf", # not safetensors repo "llama_v2_70b": "llama-models-v2/llama-v2-70b-hf", # not safetensors repo "llama_v3.1_8b": "llama-3.1-model/Meta-Llama-3.1-8B", "llama_v3.1_8b_instruct": "llama-3.1-model/Llama-3.1-8B-Instruct", "llama_v3.1_8b_instruct_fp8": "llama-3.1-model/Llama-3.1-8B-Instruct-FP8", "llama_v3.1_8b_instruct_fp4": "modelopt-hf-model-hub/Llama-3.1-8B-Instruct-fp4", "llama_v3.1_70b": "llama-3.1-model/Meta-Llama-3.1-70B", "llama_v3.3_70b_instruct": "llama-3.3-models/Llama-3.3-70B-Instruct", "llama_v3.1_70b_instruct_fp8": "llama-3.1-model/Llama-3.1-70B-Instruct-FP8", "llama_v3.3_70b_instruct_fp8": "modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8", "llama_v3.3_70b_instruct_fp4": "modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp4", "llama_v3.3_70b_instruct": "llama-3.3-models/Llama-3.3-70B-Instruct", "llama_v3.1_405b_instruct_fp8": "llama-3.1-model/Llama-3.1-405B-Instruct-FP8", "llama_v3.1_405b_instruct_fp4": "modelopt-hf-model-hub/Llama-3.1-405B-Instruct-fp4", "llama_v3.1_70b_instruct": "llama-3.1-model/Meta-Llama-3.1-70B-Instruct", "llama_v3.2_1b": "llama-3.2-models/Llama-3.2-1B", "llama_v3.1_nemotron_nano_8b": "Llama-3.1-Nemotron-Nano-8B-v1", "llama_v3.1_nemotron_nano_8b_fp8": "Llama-3.1-Nemotron-Nano-8B-v1-FP8", "llama_v3.3_nemotron_super_49b": "nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1", "llama_v3.3_nemotron_super_49b_fp8": "nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-FP8", "llama_v3.1_nemotron_ultra_253b": "nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1", "llama_v3.1_nemotron_ultra_253b_fp8": "nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1-FP8", "llama_v4_scout_17b_16e_instruct": "llama4-models/Llama-4-Scout-17B-16E-Instruct", "llama_v4_scout_17b_16e_instruct_fp8": "llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8", "llama_v4_scout_17b_16e_instruct_fp4": "llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4", "llama_v4_maverick_17b_128e_instruct": "llama4-models/Llama-4-Maverick-17B-128E-Instruct", "llama_v4_maverick_17b_128e_instruct_fp8": "llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8", "mixtral_8x7b_v0.1": "Mixtral-8x7B-v0.1", "mixtral_8x7b_v0.1_instruct": "Mixtral-8x7B-Instruct-v0.1", "mixtral_8x7b_v0.1_instruct_fp8": "Mixtral-8x7B-Instruct-v0.1-fp8", "mixtral_8x7b_v0.1_instruct_fp4": "modelopt-hf-model-hub/Mixtral-8x7B-Instruct-v0.1-fp4", "mistral_nemo_12b_base": "Mistral-Nemo-Base-2407", "deepseek_r1_distill_qwen_32b": "DeepSeek-R1/DeepSeek-R1-Distill-Qwen-32B", "mixtral_8x22b_v0.1": "Mixtral-8x22B-v0.1", "mistral_7b_v0.1": "mistral-7b-v0.1", "ministral_8b": "Ministral-8B-Instruct-2410", "ministral_8b_fp8": "Ministral-8B-Instruct-2410-FP8", "gemma_3_1b_it": "gemma/gemma-3-1b-it", "deepseek_r1_fp8": "DeepSeek-R1/DeepSeek-R1", "deepseek_r1_nvfp4": "DeepSeek-R1/DeepSeek-R1-FP4", "deepseek_r1_0528_fp8": "DeepSeek-R1/DeepSeek-R1-0528/", "deepseek_r1_0528_fp4": "DeepSeek-R1/DeepSeek-R1-0528-FP4/", "deepseek_v3_lite_fp8": "DeepSeek-V3-Lite/fp8", "deepseek_v3_lite_nvfp4": "DeepSeek-V3-Lite/nvfp4_moe_only", "qwen2_7b_instruct": "Qwen2-7B-Instruct", "qwen_14b_chat": "Qwen-14B-Chat", "qwen3_235b_a22b_fp8": "Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf", "qwen3_235b_a22b_fp4": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf", "starcoder2_3b": "starcoder2-3b", "starcoder_15b": "starcoder2-15b", "t5": "t5-small", # not supported for trtllm-bench build config "flan_t5_base": "flan-t5-small", # not supported for trtllm-bench build config "flan_t5_large": "flan-t5-xl", # not supported for trtllm-bench build config "whisper_large_v3": "whisper-models/large-v3", # not supported for trtllm-bench tokenizer "bart_large_cnn": "bart-large-cnn", # not safetensors repo "mbart_large_50_many_to_one_mmt": "mbart-large-50-many-to-one-mmt", "mamba_130m": "mamba/mamba-130m-hf", "mamba_370m": "mamba/mamba-370m-hf", "mamba_2.8b": "mamba/mamba-2.8b-hf", "gpt_20b": "gpt-neox-20b", "gpt_350m_moe": "gpt2-medium", "phi_3_mini_4k_instruct": "Phi-3/Phi-3-mini-4k-instruct", "phi_3_mini_128k_instruct": "Phi-3/Phi-3-mini-128k-instruct", "phi_4_mini_instruct": "Phi-4-mini-instruct", "phi_4_multimodal_instruct": "multimodals/Phi-4-multimodal-instruct", "phi_4_multimodal_instruct_image": "multimodals/Phi-4-multimodal-instruct", "phi_4_multimodal_instruct_audio": "multimodals/Phi-4-multimodal-instruct", "bielik_11b_v2.2_instruct": "Bielik-11B-v2.2-Instruct", "bielik_11b_v2.2_instruct_fp8": "Bielik-11B-v2.2-Instruct-FP8", "mistral_small_v3.1_24b": "Mistral-Small-3.1-24B-Instruct-2503", "gpt_oss_120b_fp4": "gpt_oss/gpt-oss-120b", } # Model PATH of HuggingFace HF_MODEL_PATH = { "llama_v2_7b_hf": "meta-llama/Llama-2-7b-hf", "llama_v2_70b_hf": "meta-llama/Llama-2-70b-hf", "falcon_180b_hf": "tiiuae/falcon-180B", "gptj_6b_hf": "EleutherAI/gpt-j-6b", "llama_v3_8b_hf": "meta-llama/Meta-Llama-3-8B", "llama_v3.1_8b_hf": "meta-llama/Llama-3.1-8B", "llama_v3.1_8b_instruct_hf": "nvidia/Llama-3.1-8B-Instruct-FP8", "llama_v3.1_70b_instruct_hf": "meta-llama/Meta-Llama-3.1-70B-Instruct", "llama_v3_70b_hf": "meta-llama/Meta-Llama-3-70B", "llama_v3.1_70b_hf": "meta-llama/Llama-3.1-70B", "llama_v3.1_405b_hf": "meta-llama/Llama-3.1-405B", "llama_v3.1_nemotron_nano_8b_hf": "nvidia/Llama-3.1-Nemotron-Nano-8B-v1", "llama_v3.1_nemotron_nano_8b_fp8_hf": "nvidia/Llama-3.1-Nemotron-Nano-8B-v1-FP8", "llama_v3.3_nemotron_super_49b_hf": "nvidia/Llama-3_3-Nemotron-Super-49B-v1", "llama_v3.3_nemotron_super_49b_fp8_hf": "nvidia/Llama-3_3-Nemotron-Super-49B-v1-FP8", "llama_v3.1_nemotron_ultra_253b_fp8_hf": "nvidia/Llama-3_1-Nemotron-Ultra-253B-v1-FP8", "mixtral_8x7b_v0.1_hf": "mistralai/Mixtral-8x7B-v0.1", "mixtral_8x7b_v0.1_instruct_hf": "mistralai/Mixtral-8x7B-Instruct-v0.1", "mistral_7b_v0.1_hf": "mistralai/Mistral-7B-v0.1", "ministral_8b_hf": "mistralai/Ministral-8B-Instruct-2410", "flan_t5_base_hf": "google/flan-t5-small", "phi_4_mini_instruct_hf": "microsoft/Phi-4-mini-instruct", "gemma_3_1b_it_hf": "google/gemma-3-1b-it", } LLM_MODELS_ROOT = os.environ.get('LLM_MODELS_ROOT', '/home/scratch.trt_llm_data/llm-models') # Model path mapping def llm_models_root(): return LLM_MODELS_ROOT def get_model_dir(model_name: str) -> str: model_dir = "" if model_name in MODEL_PATH_DICT.keys(): model_dir = os.path.join(llm_models_root(), MODEL_PATH_DICT[model_name]) elif model_name in HF_MODEL_PATH.keys(): model_dir = os.path.join(llm_models_root(), MODEL_PATH_DICT[model_name.split('_hf')[0]]) return model_dir def str_to_bool(value: str) -> bool: return ast.literal_eval(value) # {metric_name: (is_optional, type)} SERVER_CONFIG_METRICS = { "model_name": (False, str), "tp": (False, int), "ep": (False, int), "pp": (True, int), "isl": (False, int), "osl": (False, int), "max_num_tokens": (False, int), "enable_chunked_prefill": (True, str_to_bool), "disable_overlap_scheduler": (True, str_to_bool), "attention_backend": (False, str), "moe_backend": (True, str), "moe_max_num_tokens": (True, str), "stream_interval": (True, int), "enable_attention_dp": (True, str_to_bool), "attention_dp_balance": (True, str_to_bool), "batching_wait_iters": (True, int), "timeout_iters": (True, int), "kv_cache_dtype": (True, str), "enable_block_reuse": (True, str_to_bool), "free_gpu_memory_fraction": (True, float), "max_batch_size": (False, int), "enable_padding": (True, str_to_bool), } # Nested under 'speculative_config' in YAML SPECULATIVE_CONFIG_METRICS = { "decoding_type": (True, str), "max_draft_len": (True, int), "speculative_model_dir": (True, str), "eagle3_one_model": (True, str_to_bool), } CLIENT_CONFIG_METRICS = { "concurrency": (False, int), "iterations": (False, int), "random_range_ratio": (True, float), } class ServerConfig: """ Configurations of trtllm-server. """ def __init__( self, name: str, model_name: str, tp: int, ep: int, max_num_tokens: int, attention_backend: str, max_batch_size: int, pp: int = 1, enable_chunked_prefill: bool = False, disable_overlap_scheduler: bool = False, moe_backend: str = "", moe_max_num_tokens: str = "", stream_interval: int = 10, enable_attention_dp: bool = False, attention_dp_balance: bool = False, batching_wait_iters: int = 10, timeout_iters: int = 50, kv_cache_dtype: str = "fp8", enable_block_reuse: bool = False, free_gpu_memory_fraction: float = 0.8, enable_padding: bool = True, decoding_type: str = "", max_draft_len: int = 0, speculative_model_dir: str = "", eagle3_one_model: bool = False, ): self.name = name self.model_name = model_name self.tp = tp self.ep = ep self.pp = pp self.max_num_tokens = max_num_tokens self.enable_chunked_prefill = enable_chunked_prefill self.disable_overlap_scheduler = disable_overlap_scheduler self.attention_backend = attention_backend self.moe_backend = moe_backend self.moe_max_num_tokens = moe_max_num_tokens self.stream_interval = stream_interval self.enable_attention_dp = enable_attention_dp self.attention_dp_balance = attention_dp_balance self.batching_wait_iters = batching_wait_iters self.timeout_iters = timeout_iters self.kv_cache_dtype = kv_cache_dtype self.enable_block_reuse = enable_block_reuse self.free_gpu_memory_fraction = free_gpu_memory_fraction self.max_batch_size = max_batch_size self.enable_padding = enable_padding self.decoding_type = decoding_type self.max_draft_len = max_draft_len self.speculative_model_dir = speculative_model_dir self.eagle3_one_model = eagle3_one_model model_dir = get_model_dir(self.model_name) self.model_path = model_dir if os.path.exists( model_dir) else self.model_name def to_cmd(self, working_dir: str) -> List[str]: config_path = os.path.join(working_dir, f"extra-llm-api-config.{self.name}.yml") return [ "trtllm-serve", self.model_path, "--host", "localhost", "--port", "8000", "--backend", "pytorch", "--extra_llm_api_options", config_path ] def generate_extra_llm_api_config(self) -> str: """Generate extra-llm-api-config.yml content""" config_lines = [ f"tensor_parallel_size: {self.tp}", f"moe_expert_parallel_size: {self.ep}", f"pipeline_parallel_size: {self.pp}", f"max_num_tokens: {self.max_num_tokens}", f"max_batch_size: {self.max_batch_size}", f"enable_attention_dp: {str(self.enable_attention_dp).lower()}", f"disable_overlap_scheduler: {str(self.disable_overlap_scheduler).lower()}", f"stream_interval: {self.stream_interval}", f"attn_backend: {self.attention_backend}", f"enable_chunked_prefill: {str(self.enable_chunked_prefill).lower()}", "cuda_graph_config:", f" enable_padding: {str(self.enable_padding).lower()}", f" max_batch_size: {self.max_batch_size}", "kv_cache_config:", f" dtype: {self.kv_cache_dtype}", f" free_gpu_memory_fraction: {self.free_gpu_memory_fraction}", f" enable_block_reuse: {str(self.enable_block_reuse).lower()}", "print_iter_log: false", ] # Add moe_config if moe_backend is specified if self.moe_backend: config_lines.append("moe_config:") config_lines.append(f" backend: {self.moe_backend}") if self.moe_max_num_tokens: config_lines.append( f" max_num_tokens: {self.moe_max_num_tokens}") if self.attention_dp_balance: config_lines.append("attention_dp_balance:") config_lines.append(" enable_balance: true") config_lines.append( f" batching_wait_iters: {self.batching_wait_iters}") config_lines.append(f" timeout_iters: {self.timeout_iters}") # Add speculative_config if decoding_type is specified if self.decoding_type: config_lines.append("speculative_config:") config_lines.append(f" decoding_type: {self.decoding_type}") if self.max_draft_len > 0: config_lines.append(f" max_draft_len: {self.max_draft_len}") if self.speculative_model_dir: config_lines.append( f" speculative_model_dir: {self.speculative_model_dir}") if self.eagle3_one_model: config_lines.append( f" eagle3_one_model: {str(self.eagle3_one_model).lower()}") return "\n".join(config_lines) class ClientConfig: """ Configurations of benchmark client. """ def __init__(self, name: str, model_name: str, concurrency: int, iterations: int, isl: int, osl: int, random_range_ratio: float = 0.0): self.name = name self.model_name = model_name self.concurrency = concurrency self.iterations = iterations self.isl = isl self.osl = osl self.random_range_ratio = random_range_ratio self.model_path = "" def to_cmd(self, working_dir: str) -> List[str]: model_dir = get_model_dir(self.model_name) self.model_path = model_dir if os.path.exists( model_dir) else self.model_name return [ "python", "-m", "tensorrt_llm.serve.scripts.benchmark_serving", "--model", self.model_path, "--dataset-name", "random", "--random-ids", "--num-prompts", str(self.concurrency * self.iterations), "--random-input-len", str(self.isl), "--random-output-len", str(self.osl), "--random-range-ratio", str(self.random_range_ratio), "--ignore-eos", "--percentile-metrics", "ttft,tpot,itl,e2el", "--max-concurrency", str(self.concurrency) ] def parse_select_pattern(select_pattern: str): """Parse select pattern like 'r1_fp4_dep4,r1_fp4_tep4:con1_iter1_1024_1024,r1_fp4_tep4:con8_iter1_1024_1024' Format: - ',' splits different server configs - ':' means for this server, we choose specific clients - If no ':', all clients are chosen for that server Returns: - Dict with server name as key and either None (all clients) or set of client names as value """ execution_plan = {} parts = select_pattern.split(',') for part in parts: part = part.strip() if not part: # Skip empty parts continue if ':' in part: # Format: "server_name:client_name" server_name, client_name = part.split(':', 1) server_name = server_name.strip() client_name = client_name.strip() # Only add if not already set to None (all clients) if server_name not in execution_plan: execution_plan[server_name] = set() if execution_plan[server_name] is not None: execution_plan[server_name].add(client_name) else: # Format: "server_name" - select all clients for this server server_name = part.strip() execution_plan[server_name] = None return execution_plan def parse_config_file(config_file_path: str, select_pattern: str = None): """Parse YAML configuration file and create ServerConfig and ClientConfig objects Args: config_file_path: Path to YAML configuration file select_pattern: Selection pattern string (e.g., "r1_fp4_dep4,r1_fp4_tep4:con1_iter1_1024_1024") Returns: execution_plan: None (all servers/clients) or dict with server names as keys server_configs: List of ServerConfig objects server_client_configs: Dict with server id as key and list of ClientConfig as value """ # Parse selection pattern if select_pattern: execution_plan = parse_select_pattern(select_pattern) else: execution_plan = None # Read YAML config file with open(config_file_path, 'r') as f: config = yaml.safe_load(f) server_configs = [] server_client_configs = {} for server_config_data in config['server_configs']: server_name = server_config_data['name'] # Check if this server should be included based on execution_plan if execution_plan is not None and server_name not in execution_plan: continue # Create ServerConfig object server_config = ServerConfig( name=server_config_data['name'], model_name=server_config_data['model_name'], tp=server_config_data['tp'], ep=server_config_data['ep'], pp=server_config_data.get('pp', 1), attention_backend=server_config_data.get('attention_backend', 'TRTLLM'), moe_backend=server_config_data.get('moe_backend', ''), moe_max_num_tokens=server_config_data.get('moe_max_num_tokens', ''), enable_attention_dp=server_config_data.get('enable_attention_dp', False), attention_dp_balance=server_config_data.get('attention_dp_balance', False), batching_wait_iters=server_config_data.get('batching_wait_iters', 10), timeout_iters=server_config_data.get('timeout_iters', 50), enable_chunked_prefill=server_config_data.get( 'enable_chunked_prefill', False), max_num_tokens=server_config_data.get('max_num_tokens', 2048), disable_overlap_scheduler=server_config_data.get( 'disable_overlap_scheduler', False), kv_cache_dtype=server_config_data.get('kv_cache_dtype', 'fp8'), enable_block_reuse=server_config_data.get('enable_block_reuse', False), free_gpu_memory_fraction=server_config_data.get( 'free_gpu_memory_fraction', 0.8), max_batch_size=server_config_data.get('max_batch_size', 256), enable_padding=server_config_data.get('enable_padding', True), decoding_type=server_config_data.get('speculative_config', {}).get('decoding_type', ''), max_draft_len=server_config_data.get('speculative_config', {}).get('max_draft_len', 0), speculative_model_dir=server_config_data.get( 'speculative_config', {}).get('speculative_model_dir', ''), eagle3_one_model=server_config_data.get( 'speculative_config', {}).get('eagle3_one_model', False)) server_id = len(server_configs) server_configs.append(server_config) # Create ClientConfig objects client_configs = [] selected_client_names = execution_plan.get( server_name) if execution_plan else None for client_config_data in server_config_data['client_configs']: client_name = client_config_data['name'] # Check if this client should be included # Include if: execution_plan is None OR selected_client_names is None OR client_name in selected_client_names if execution_plan is not None and selected_client_names is not None: if client_name not in selected_client_names: continue client_config = ClientConfig( name=client_config_data['name'], model_name=server_config_data['model_name'], concurrency=client_config_data['concurrency'], iterations=client_config_data.get('iterations', 1), isl=client_config_data.get('isl', 1024), osl=client_config_data.get('osl', 1024), random_range_ratio=client_config_data.get( 'random_range_ratio', 0.0)) client_configs.append(client_config) server_client_configs[server_id] = client_configs return execution_plan, server_configs, server_client_configs def get_trtllm_server_client_commands( server_configs: List[ServerConfig], server_client_configs: Dict[int, List[ClientConfig]], working_dir: str): server_cmds = [] client_cmds = [] names = [] for server_idx, client_configs in server_client_configs.items(): server_config = server_configs[server_idx] server_cmd = server_config.to_cmd(working_dir) # Generate extra-llm-api-config.yml config_content = server_config.generate_extra_llm_api_config() config_filename = f"extra-llm-api-config.{server_config.name}.yml" config_path = os.path.join(working_dir, config_filename) with open(config_path, 'w') as f: f.write(config_content) for client_config in client_configs: server_cmds.append(server_cmd) client_cmd = client_config.to_cmd(working_dir) client_cmds.append(client_cmd) names.append(f"{server_config.name}-{client_config.name}") return server_cmds, client_cmds, names class PerfServerBenchmarkCmds(NamedTuple): server_cmds: List[List[str]] client_cmds: List[List[str]] names: List[str] working_dir: str def run_cmd(self, cmd_idx: int, node_name: str, gpu_info: str, max_timeout: int = 5400) -> str: output = "" server_file_path = os.path.join( self.working_dir, f"trtllm-serve.{self.names[cmd_idx]}.log") client_file_path = os.path.join( self.working_dir, f"trtllm-benchmark.{self.names[cmd_idx]}.log") server_proc = None try: # Run server command with open(server_file_path, 'w') as server_ctx: server_proc = subprocess.Popen(self.server_cmds[cmd_idx], stdout=server_ctx, stderr=subprocess.STDOUT) # Wait for server to be ready wait_for_endpoint_ready("http://localhost:8000/v1/models", timeout=max_timeout) # Save node name, gpu info, server config, client config output to server file path with open(client_file_path, 'w') as client_ctx: client_ctx.write(f"Node: {node_name}\n") client_ctx.write(f"GPU Info: {gpu_info}\n") client_ctx.write(f"Server-Config: {self.names[cmd_idx]}\n") # Run client command subprocess.run(self.client_cmds[cmd_idx], stdout=client_ctx, stderr=subprocess.STDOUT, check=True) finally: server_proc.terminate() server_proc.wait() return output def get_cmd_str(self, cmd_idx) -> List[str]: return ["server-benchmark tests, please check config files"] def run_perf_tests(server_configs: List[ServerConfig], server_client_configs: Dict[int, List[ClientConfig]], max_timeout: int, working_dir: str, node_name: str, gpu_info: str) -> None: """Main function to run all benchmarks from config file""" server_cmds, client_cmds, names = get_trtllm_server_client_commands( server_configs, server_client_configs, working_dir) commands = PerfServerBenchmarkCmds(server_cmds=server_cmds, client_cmds=client_cmds, names=names, working_dir=working_dir) # Run each server config based on execution plan for cmd_idx in range(len(client_cmds)): print(f"Server cmd: {server_cmds[cmd_idx]}") print(f"Client cmd: {client_cmds[cmd_idx]}") commands.run_cmd(cmd_idx, node_name, gpu_info, max_timeout) def generate_repro_scripts(server_configs: List[ServerConfig], server_client_configs: Dict[int, List[ClientConfig]], node_name: str, max_timeout: int): """Generate reproduction scripts for all server configs""" for server_id, server_config in enumerate(server_configs): script_content = generate_repro_script(server_config, server_client_configs[server_id], node_name, max_timeout) script_filename = f"reproduce.server-{server_config.name}.sh" with open(script_filename, 'w') as f: f.write(script_content) # Make script executable os.chmod(script_filename, 0o755) def generate_repro_script(server_config: ServerConfig, client_configs: List[ClientConfig], node_name: str, max_timeout: int) -> str: """Generate a shell script to reproduce a server config""" model_path = server_config.model_path script_content = f"""#!/bin/bash # Reproduction script for server: {server_config.name}) # Node: {node_name} set -e # Function to wait for server to be ready wait_for_server() {{ local timeout={max_timeout} local attempt=0 echo "Waiting for trtllm-serve to be ready..." while [ $((attempt * 60)) -le $timeout ]; do if ! kill -0 $SERVER_PID 2>/dev/null; then echo "Error: Server process has died" return 1 fi # Check for runtime errors in server log if grep -q "RuntimeError\\|runtime error\\|CUDA error\\|illegal memory access\\|terminate called" "$SERVER_LOG" 2>/dev/null; then echo "RuntimeError detected in server log: $SERVER_LOG" echo "Killing server process due to runtime error" kill -9 $SERVER_PID 2>/dev/null || true wait $SERVER_PID 2>/dev/null || true return 1 fi # Try to connect to server if curl -s "http://localhost:8000/v1/models" > /dev/null 2>&1; then echo "Server is ready! HTTP status: 200" return 0 fi echo "Elapsed time: $((attempt * 60)) / $timeout seconds: Server not ready yet, waiting..." sleep 60 attempt=$((attempt + 1)) done echo "Error: Server did not become ready after $timeout seconds" return 1 }} # Function to cleanup server process cleanup_server() {{ if [ -n "$SERVER_PID" ]; then echo "Stopping server" kill -9 $SERVER_PID 2>/dev/null || true wait $SERVER_PID 2>/dev/null || true sleep 5 # Give it time to clean up resources echo "Server cleanup completed" fi }} # Set trap to cleanup server on script exit trap cleanup_server EXIT # Generate extra-llm-api-config.yml CONFIG_FILENAME="extra-llm-api-config.{server_config.name}.yml" cat > "$CONFIG_FILENAME" << 'EOF' {server_config.generate_extra_llm_api_config()} EOF # Start trtllm-serve in background SERVER_LOG="trtllm-serve.{server_config.name}.log" echo "Starting trtllm-serve with command:" echo "trtllm-serve {model_path} --host localhost --port 8000 --backend pytorch --extra_llm_api_options $CONFIG_FILENAME" trtllm-serve {model_path} --host localhost --port 8000 --backend pytorch --extra_llm_api_options "$CONFIG_FILENAME" > "$SERVER_LOG" 2>&1 & SERVER_PID=$! echo "Server started with PID: $SERVER_PID" # Wait for server to be ready if ! wait_for_server; then echo "Failed to start server, exiting" exit 1 fi echo "Server is ready, starting benchmarks..." # Run benchmarks for each concurrency level """ # Add benchmark commands for each client config for client_config in client_configs: num_prompts = client_config.concurrency * client_config.iterations script_content += f""" echo "Running benchmark with concurrency: {client_config.concurrency}, iterations: {client_config.iterations}, num-prompts: {num_prompts}" BENCHMARK_LOG="trtllm-benchmark.{server_config.name}.{client_config.name}.log" echo "Running benchmark with command:" echo "python -m tensorrt_llm.serve.scripts.benchmark_serving --model {model_path} --dataset-name random --random-ids --num-prompts {num_prompts} --random-input-len {client_config.isl} --random-output-len {client_config.osl} --random-range-ratio {client_config.random_range_ratio} --ignore-eos --percentile-metrics ttft,tpot,itl,e2el --max-concurrency {client_config.concurrency}" python -m tensorrt_llm.serve.scripts.benchmark_serving \\ --model {model_path} \\ --dataset-name random \\ --random-ids \\ --num-prompts {num_prompts} \\ --random-input-len {client_config.isl} \\ --random-output-len {client_config.osl} \\ --random-range-ratio {client_config.random_range_ratio} \\ --ignore-eos \\ --percentile-metrics ttft,tpot,itl,e2el \\ --max-concurrency {client_config.concurrency} > "$BENCHMARK_LOG" 2>&1 if [ $? -eq 0 ]; then echo "Benchmark completed successfully" else echo "Benchmark failed with error code $?" fi echo "-----------------------------------------" """ script_content += f""" echo "All benchmarks completed successfully!" echo "Server will be automatically cleaned up on script exit" """ return script_content def main(): parser = argparse.ArgumentParser( description='Run benchmarks from YAML configuration file') parser.add_argument('--log_folder', required=True, help='Output folder for benchmark results') parser.add_argument('--config_file', required=True, help='Path to YAML configuration file') parser.add_argument( '--select', help='Select pattern: "r1_fp4_dep4:con1_iter1_1024_1024"') parser.add_argument('--timeout', help='Timeout in seconds', default=5400) args = parser.parse_args() try: subprocess.run(f'echo "TRT-LLM GIT COMMIT": $TRT_LLM_GIT_COMMIT', shell=True, check=True) except subprocess.CalledProcessError: print("Warning: Could not echo TRT-LLM GIT COMMIT") if not os.path.exists(args.config_file): print(f"Error: Config file '{args.config_file}' does not exist") sys.exit(1) if not os.path.exists(args.log_folder): print(f"Error: Output folder '{args.log_folder}' does not exist") sys.exit(1) # Capture system information node_name = get_node_name() gpu_info = get_gpu_info() log_folder = Path(args.log_folder) config_file = Path(args.config_file) default_max_timeout = 5400 max_timeout = int(args.timeout) if args.timeout else default_max_timeout # Change to output directory os.chdir(log_folder) # Treat empty or "default" values as None (default behavior) select_pattern = None if not args.select or args.select.lower( ) == "default" else args.select execution_plan, server_configs, server_client_configs = parse_config_file( config_file, select_pattern) generate_repro_scripts(server_configs, server_client_configs, node_name, max_timeout) run_perf_tests(server_configs, server_client_configs, max_timeout, log_folder, node_name, gpu_info) if __name__ == "__main__": main()