# SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
TensorRT LLM perf tests
"""
import os
import re
import shutil
import sys
from typing import Dict, List, NamedTuple

import pytest
from defs.common import convert_weights, get_cpp_benchmark, quantize_data
from defs.trt_test_alternative import (is_linux, is_windows, print_info,
                                       print_warning)

from ..conftest import get_llm_root, llm_models_root, trt_environment
from .pytorch_model_config import get_model_yaml_config
from .utils import (AbstractPerfScriptTestClass, PerfBenchScriptTestCmds,
                    PerfMetricType, PerfScriptTestCmds, generate_test_nodes)

if not hasattr(re, "Pattern"):
    re.Pattern = type(re.compile(""))

ALLOWED_CONFIGS_CACHE = None  # Cache to avoid modifying sys.path many times.
MAP_BY_SOCKET = None

# Model PATH of local dir synced from internal LLM models repo
MODEL_PATH_DICT = {
    "llama_v2_7b": "llama-models-v2/llama-v2-7b-hf",  # not safetensors repo
    "llama_v2_13b": "llama-models-v2/llama-v2-13b-hf",  # not safetensors repo
    "llama_v2_70b": "llama-models-v2/llama-v2-70b-hf",  # not safetensors repo
    "llama_v3.1_8b": "llama-3.1-model/Meta-Llama-3.1-8B",
    "llama_v3.1_8b_instruct": "llama-3.1-model/Llama-3.1-8B-Instruct",
    "llama_v3.1_8b_instruct_fp8": "llama-3.1-model/Llama-3.1-8B-Instruct-FP8",
    "llama_v3.1_8b_instruct_fp4": "modelopt-hf-model-hub/Llama-3.1-8B-Instruct-fp4",
    "llama_v3.1_70b": "llama-3.1-model/Meta-Llama-3.1-70B",
    "llama_v3.3_70b_instruct": "llama-3.3-models/Llama-3.3-70B-Instruct",
    "llama_v3.1_70b_instruct_fp8": "llama-3.1-model/Llama-3.1-70B-Instruct-FP8",
    "llama_v3.3_70b_instruct_fp8": "modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8",
    "llama_v3.3_70b_instruct_fp4": "modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp4",
    "llama_v3.1_405b_instruct_fp8": "llama-3.1-model/Llama-3.1-405B-Instruct-FP8",
    "llama_v3.1_405b_instruct_fp4": "modelopt-hf-model-hub/Llama-3.1-405B-Instruct-fp4",
    "llama_v3.1_70b_instruct": "llama-3.1-model/Meta-Llama-3.1-70B-Instruct",
    "llama_v3.2_1b": "llama-3.2-models/Llama-3.2-1B",
    "llama_v3.1_nemotron_nano_8b": "Llama-3.1-Nemotron-Nano-8B-v1",
    "llama_v3.1_nemotron_nano_8b_fp8": "Llama-3.1-Nemotron-Nano-8B-v1-FP8",
    "llama_v3.3_nemotron_super_49b": "nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1",
    "llama_v3.3_nemotron_super_49b_fp8": "nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-FP8",
    "llama_v3.1_nemotron_ultra_253b": "nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1",
    "llama_v3.1_nemotron_ultra_253b_fp8": "nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1-FP8",
    "llama_v4_scout_17b_16e_instruct": "llama4-models/Llama-4-Scout-17B-16E-Instruct",
    "llama_v4_scout_17b_16e_instruct_fp8": "llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8",
    "llama_v4_scout_17b_16e_instruct_fp4": "llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4",
    "llama_v4_maverick_17b_128e_instruct": "llama4-models/Llama-4-Maverick-17B-128E-Instruct",
"llama_v4_maverick_17b_128e_instruct_fp8": "llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8", "mixtral_8x7b_v0.1": "Mixtral-8x7B-v0.1", "mixtral_8x7b_v0.1_instruct": "Mixtral-8x7B-Instruct-v0.1", "mixtral_8x7b_v0.1_instruct_fp8": "Mixtral-8x7B-Instruct-v0.1-fp8", "mixtral_8x7b_v0.1_instruct_fp4": "modelopt-hf-model-hub/Mixtral-8x7B-Instruct-v0.1-fp4", "mistral_nemo_12b_base": "Mistral-Nemo-Base-2407", "deepseek_r1_distill_qwen_32b": "DeepSeek-R1/DeepSeek-R1-Distill-Qwen-32B", "mixtral_8x22b_v0.1": "Mixtral-8x22B-v0.1", "mistral_7b_v0.1": "mistral-7b-v0.1", "ministral_8b": "Ministral-8B-Instruct-2410", "ministral_8b_fp8": "Ministral-8B-Instruct-2410-FP8", "gemma_3_1b_it": "gemma/gemma-3-1b-it", "deepseek_r1_fp8": "DeepSeek-R1/DeepSeek-R1", "deepseek_r1_nvfp4": "DeepSeek-R1/DeepSeek-R1-FP4", "deepseek_r1_0528_fp8": "DeepSeek-R1/DeepSeek-R1-0528/", "deepseek_r1_0528_fp4": "DeepSeek-R1/DeepSeek-R1-0528-FP4/", "deepseek_v3_lite_fp8": "DeepSeek-V3-Lite/fp8", "deepseek_v3_lite_nvfp4": "DeepSeek-V3-Lite/nvfp4_moe_only", "qwen2_7b_instruct": "Qwen2-7B-Instruct", "qwen_14b_chat": "Qwen-14B-Chat", "qwen3_235b_a22b_fp8": "Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf", "qwen3_235b_a22b_fp4": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf", "starcoder2_3b": "starcoder2-3b", "starcoder_15b": "starcoder2-15b", "t5": "t5-small", # not supported for trtllm-bench build config "flan_t5_base": "flan-t5-small", # not supported for trtllm-bench build config "flan_t5_large": "flan-t5-xl", # not supported for trtllm-bench build config "whisper_large_v3": "whisper-models/large-v3", # not supported for trtllm-bench tokenizer "bart_large_cnn": "bart-large-cnn", # not safetensors repo "mbart_large_50_many_to_one_mmt": "mbart-large-50-many-to-one-mmt", "mamba_130m": "mamba/mamba-130m-hf", "mamba_370m": "mamba/mamba-370m-hf", "mamba_2.8b": "mamba/mamba-2.8b-hf", "gpt_20b": "gpt-neox-20b", "gpt_350m_moe": "gpt2-medium", "phi_3_mini_4k_instruct": "Phi-3/Phi-3-mini-4k-instruct", "phi_3_mini_128k_instruct": "Phi-3/Phi-3-mini-128k-instruct", "phi_4_mini_instruct": "Phi-4-mini-instruct", "phi_4_multimodal_instruct": "multimodals/Phi-4-multimodal-instruct", "phi_4_multimodal_instruct_image": "multimodals/Phi-4-multimodal-instruct", "phi_4_multimodal_instruct_audio": "multimodals/Phi-4-multimodal-instruct", "bielik_11b_v2.2_instruct": "Bielik-11B-v2.2-Instruct", "bielik_11b_v2.2_instruct_fp8": "Bielik-11B-v2.2-Instruct-FP8", "mistral_small_v3.1_24b": "Mistral-Small-3.1-24B-Instruct-2503", } # Model PATH of HuggingFace HF_MODEL_PATH = { "llama_v2_7b_hf": "meta-llama/Llama-2-7b-hf", "llama_v2_70b_hf": "meta-llama/Llama-2-70b-hf", "falcon_180b_hf": "tiiuae/falcon-180B", "gptj_6b_hf": "EleutherAI/gpt-j-6b", "llama_v3_8b_hf": "meta-llama/Meta-Llama-3-8B", "llama_v3.1_8b_hf": "meta-llama/Llama-3.1-8B", "llama_v3.1_8b_instruct_hf": "nvidia/Llama-3.1-8B-Instruct-FP8", "llama_v3.1_70b_instruct_hf": "meta-llama/Meta-Llama-3.1-70B-Instruct", "llama_v3_70b_hf": "meta-llama/Meta-Llama-3-70B", "llama_v3.1_70b_hf": "meta-llama/Llama-3.1-70B", "llama_v3.1_405b_hf": "meta-llama/Llama-3.1-405B", "llama_v3.1_nemotron_nano_8b_hf": "nvidia/Llama-3.1-Nemotron-Nano-8B-v1", "llama_v3.1_nemotron_nano_8b_fp8_hf": "nvidia/Llama-3.1-Nemotron-Nano-8B-v1-FP8", "llama_v3.3_nemotron_super_49b_hf": "nvidia/Llama-3_3-Nemotron-Super-49B-v1", "llama_v3.3_nemotron_super_49b_fp8_hf": "nvidia/Llama-3_3-Nemotron-Super-49B-v1-FP8", "llama_v3.1_nemotron_ultra_253b_fp8_hf": "nvidia/Llama-3_1-Nemotron-Ultra-253B-v1-FP8", "mixtral_8x7b_v0.1_hf": 
"mistralai/Mixtral-8x7B-v0.1", "mixtral_8x7b_v0.1_instruct_hf": "mistralai/Mixtral-8x7B-Instruct-v0.1", "mistral_7b_v0.1_hf": "mistralai/Mistral-7B-v0.1", "ministral_8b_hf": "mistralai/Ministral-8B-Instruct-2410", "flan_t5_base_hf": "google/flan-t5-small", "phi_4_mini_instruct_hf": "microsoft/Phi-4-mini-instruct", "gemma_3_1b_it_hf": "google/gemma-3-1b-it", } LORA_MODEL_PATH = { "llama_v2_13b": "llama-models-v2/chinese-llama-2-lora-13b", "mixtral_8x7b_0.1": "chinese-mixtral-lora", "llama_v3.1_8b_instruct_fp8": "lora/llama-3-chinese-8b-instruct-v2-lora/", "ministral_8b": "lora/ministral/Ministral-8B-Instruct-2410-Loras-Dummy", # Dummy LoRA for Ministral "gemma_3_1b_it": "lora/gemma/gemma-3-1b-it-dummy-lora", # Dummy LoRA for Gemma-3-1B-Instruct "phi_4_multimodal_instruct_image": "multimodals/Phi-4-multimodal-instruct/vision-lora", "phi_4_multimodal_instruct_audio": "multimodals/Phi-4-multimodal-instruct/speech-lora", } TIMING_CACHE_DIR = os.environ.get("TIMING_CACHE_DIR", "") TRUST_REMOTE_CODE_MODELS = { # these models require explicit trust_remote_code=True "llama_v3.3_nemotron_super_49b", "llama_v3.3_nemotron_super_49b_fp8", "llama_v3.1_nemotron_ultra_253b", "llama_v3.1_nemotron_ultra_253b_fp8", } def cpu_socket_count_gt_1(): global MAP_BY_SOCKET if MAP_BY_SOCKET is not None: return MAP_BY_SOCKET if is_linux(): with open('/proc/cpuinfo') as f: cpuinfo = f.read() physical_id_set = set() for line in cpuinfo.splitlines(): if line.startswith('physical id'): _, id_ = line.split(':') physical_id_set.add(id_.strip()) MAP_BY_SOCKET = len(physical_id_set) > 1 else: MAP_BY_SOCKET = False return MAP_BY_SOCKET # A helper function to import allowed_configs.py. def import_allowed_perf_config(): if trt_environment: from llm import allowed_configs else: global ALLOWED_CONFIGS_CACHE if ALLOWED_CONFIGS_CACHE is None: sys.path.append((os.path.join(get_llm_root(), "tests/integration/defs/perf"))) import allowed_configs ALLOWED_CONFIGS_CACHE = allowed_configs else: allowed_configs = ALLOWED_CONFIGS_CACHE return allowed_configs # Regex commands used to parse the metric result for the metric type. 
PERF_METRIC_LOG_QUERIES = { PerfMetricType.BUILD_TIME: re.compile(r"Engine generation completed in ([\d\.]+) seconds"), PerfMetricType.INFERENCE_TIME: re.compile(r"\[BENCHMARK\].* (?:total_latency|latency)\(ms\) ([\d\.]+)"), PerfMetricType.FIRST_TOKEN_TIME: re.compile(r"\[BENCHMARK\].* avg_time_to_first_token\(ms\) ([\d\.]+)"), PerfMetricType.SEQ_LATENCY: re.compile(r"\[BENCHMARK\].* avg_sequence_latency\(ms\) ([\d\.]+)"), PerfMetricType.SEQ_THROUGHPUT: re.compile(r"\[BENCHMARK\].* seq_throughput\(seq\/sec\) ([\d\.]+)"), PerfMetricType.TOKEN_THROUGHPUT: re.compile( r"\[BENCHMARK\].* (?:token_throughput\(token\/sec\)|tokensPerSec|tokens_per_sec) ([\d\.]+)" ), PerfMetricType.INFERENCE_PEAK_GPU_MEMORY: re.compile(r"\[BENCHMARK\].* gpu_peak_mem\(gb\) ([\d\.]+)"), PerfMetricType.BUILD_PEAK_CPU_MEMORY: re.compile( r"Peak memory usage during Engine building and serialization: CPU: ([\d\.]+) .*" ), PerfMetricType.BUILD_PEAK_GPU_MEMORY: re.compile( r"Peak memory usage of TRT CPU/GPU memory allocators: CPU .*, GPU ([\d\.]+) .*" ), PerfMetricType.ENGINE_SIZE: re.compile(r".*Total engine size per GPU is ([\d\.]+) MiB.*"), PerfMetricType.CONTEXT_GPU_MEMORY: re.compile(r".*Allocated ([\d\.]+) MiB for execution context memory.*"), PerfMetricType.KV_CACHE_SIZE: re.compile(r".*Allocated ([\d\.]+) GiB for max tokens in paged KV cache.*"), } BENCH_PERF_METRIC_LOG_QUERIES = { PerfMetricType.BUILD_TIME: re.compile(r"Engine generation completed in ([\d\.]+) seconds"), PerfMetricType.INFERENCE_TIME: re.compile(r"Total Latency \(ms\):\s+([\d\.]+)"), PerfMetricType.TOKEN_THROUGHPUT: re.compile(r"GPU Output Throughput \(tps\/gpu\):\s+([\d\.]+)"), PerfMetricType.SEQ_THROUGHPUT: re.compile(r"Request Throughput \(req\/sec\):\s+([\d\.]+)"), PerfMetricType.FIRST_TOKEN_TIME: re.compile(r"Average time-to-first-token \[TTFT\] \(ms\):\s+([\d\.]+)"), PerfMetricType.OUTPUT_TOKEN_TIME: re.compile(r"Average time-per-output-token \[TPOT\] \(ms\):\s+([\d\.]+)"), PerfMetricType.KV_CACHE_SIZE: re.compile(r".*Allocated ([\d\.]+) GiB for max tokens in paged KV cache.*"), } # (Relative threshold, Absolute threshold) for all metric types PERF_METRIC_THRESHOLD = { PerfMetricType.BUILD_TIME: (0.1, 30), # Ignore build time regression < 30ms PerfMetricType.INFERENCE_TIME: (0.1, 50), # Ignore inference time regression < 50ms PerfMetricType.FIRST_TOKEN_TIME: (0.1, 50), # Ignore first token time regression < 50ms PerfMetricType.OUTPUT_TOKEN_TIME: (0.1, 50), # Ignore per output token time regression < 50ms PerfMetricType.SEQ_LATENCY: (0.1, 50), # Ignore latency regression < 50ms PerfMetricType.TOKEN_THROUGHPUT: ( -0.1, 10 ), # Ignore throughput regression < 10 tokens/s. Negative rel threshold is to indicate that larger is better. PerfMetricType.SEQ_THROUGHPUT: ( -0.1, 10 ), # Ignore throughput regression < 10 tokens/s. Negative rel threshold is to indicate that larger is better. 
PerfMetricType.INFERENCE_PEAK_GPU_MEMORY: (0.1, 0.1), # Ignore inference peak gpu memory regression < 0.1GiB PerfMetricType.BUILD_PEAK_CPU_MEMORY: (0.1, 100), # Ignore build peak cpu memory regression < 100MiB PerfMetricType.BUILD_PEAK_GPU_MEMORY: (0.1, 100), # Ignore build peak gpu memory regression < 100MiB PerfMetricType.ENGINE_SIZE: (0.3, 100), # Ignore engine size regression < 100MiB PerfMetricType.CONTEXT_GPU_MEMORY: (0.1, 50), # Ignore context GPU memory < 50MiB PerfMetricType.KV_CACHE_SIZE: (-0.1, 50), # Ignore value < 50MiB } BUILDER_METRICS = [ PerfMetricType.BUILD_TIME, PerfMetricType.BUILD_PEAK_CPU_MEMORY, PerfMetricType.BUILD_PEAK_GPU_MEMORY, PerfMetricType.ENGINE_SIZE ] INFERENCE_METRICS = [ PerfMetricType.INFERENCE_TIME, PerfMetricType.INFERENCE_PEAK_GPU_MEMORY, PerfMetricType.CONTEXT_GPU_MEMORY, ] BERT_CPP_INFERENCE_METRICS = [ PerfMetricType.INFERENCE_TIME, PerfMetricType.CONTEXT_GPU_MEMORY, ] MANAGER_INFERENCE_METRICS = [ PerfMetricType.INFERENCE_TIME, PerfMetricType.TOKEN_THROUGHPUT, PerfMetricType.CONTEXT_GPU_MEMORY, PerfMetricType.SEQ_THROUGHPUT, PerfMetricType.SEQ_LATENCY, PerfMetricType.KV_CACHE_SIZE, ] BENCH_INFERENCE_METRICS = [ PerfMetricType.INFERENCE_TIME, PerfMetricType.TOKEN_THROUGHPUT, PerfMetricType.SEQ_THROUGHPUT, PerfMetricType.KV_CACHE_SIZE, ] class PerfTestMetric(NamedTuple): """ Configurations of a test metric. """ # The original test name used to run the oraginal perf test. original_test_name: str # The name for this particular metric. metric_name: str # The type of this metric. metric_type: PerfMetricType # The regex used to parse this metric. metric_regex: re.Pattern # The relative threshold to allow for regressions. metric_threshold: float # The absolute threshold to allow for regressions. metric_abs_threshold: float # The index of the command of this metric. # Currently, we run 1 build command plus N benchmark commands. cmd_idx: int class PerfTestConfig: """ Configurations defining the LLM perf test. This should hold only the attributes that distinguish different tests. """ def __init__( self, *, model_name: str = "", runtime: str = "python", static_batching: str = "", api: str = "", streaming: str = "", backend: str = "", mode: str = "plugin", data_type: str = "float16", max_batch_size: int = 512, max_num_tokens: int = 2048, gpu_weights_percent: float = -1, batch_sizes: List[int] = [0], input_lens: List[int] = [8], output_lens: List[int] = [1], num_beams: int = 1, num_loras: int = 0, num_reqs: int = 512, concurrency: int = -1, quantization: str = "", kv_cache_free_gpu_mem_fraction: float = 0.9, kv_cache_dtype: str = "auto", ep_size: int = None, tp_size: int = 1, pp_size: int = 1, num_gpus: int = 1, ): # The model name. self.model_name = model_name # Python or cpp/cppmanager runtime. self.runtime = runtime # static batching for gptManagerBenchmark self.static_batching = static_batching # API Type: only executor is allowed self.api = api # Backend Type: pytorch or cpp self.backend = backend # Streaming responses self.streaming = streaming # Plugin or OOTB mode. self.mode = mode # Activation dtype. self.data_type = data_type # Percentage of weights that resides on GPU. self.gpu_weights_percent = gpu_weights_percent # Max Batch Size to build TRT engine with. self.max_batch_size = max_batch_size # Max number of tokens to build TRT engine with. self.max_num_tokens = max_num_tokens # List of batch sizes to run benchmark with. self.batch_sizes = batch_sizes # List of input lens to run benchmark with. 
self.input_lens = input_lens # List of output lens to run benchmark with. self.output_lens = output_lens # Number of beams. self.num_beams = num_beams # Number of loras. self.num_loras = num_loras # Number of requests. self.num_reqs = num_reqs # Number of concurrency self.concurrency = concurrency # Quantization type. self.quantization = quantization # KV cache free gpu mem fraction self.kv_cache_free_gpu_mem_fraction = kv_cache_free_gpu_mem_fraction # KV Cache dtype self.kv_cache_dtype = kv_cache_dtype # Multiple Profiles self.multiple_profiles = False # EP Size self.ep_size = ep_size # TP Size self.tp_size = tp_size # PP Size self.pp_size = pp_size # Number of GPUs. self.num_gpus = num_gpus # Just build engines self.build_only = False def to_string(self, custom_bs: int = None, custom_input_len: int = None, custom_output_len: int = None) -> str: # First, add the model name. entries = [self.model_name] if self.runtime == "cpp": # bertBenchmark runtime entries.append(f"cpp") elif self.runtime == "cppmanager": # gptManagerBenchmark runtime entries.append(f"cppmanager") if self.api == "exe": # executor entries.append(f"exe") if self.streaming == "streaming": entries.append(f"streaming") if self.static_batching == "static_batching": entries.append(f"static_batching") elif self.runtime == "bench": # trtllm-bench entries.append(f"bench") if self.backend == 'pytorch': entries.append(f"pytorch") if self.streaming == "streaming": entries.append(f"streaming") # Add mode and dtype. if self.runtime != "bench": entries.append(self.mode) entries.append(self.data_type) if self.gpu_weights_percent != -1: entries.append(f"gwp:{self.gpu_weights_percent}") if self.multiple_profiles: entries.append(f"mp") # Add Max batch size. entries.append(f"maxbs:{self.max_batch_size}") # Add Max number of tokens. entries.append(f"maxnt:{self.max_num_tokens}") # Add kv cache free gpu mem fraction. if self.kv_cache_free_gpu_mem_fraction != 0.9: entries.append(f"kv_frac:{self.kv_cache_free_gpu_mem_fraction}") if self.build_only: entries.append(f"build_only") if self.batch_sizes[0] > 0: # Add batch size(s). if custom_bs is None: bs_label = "+".join([str(x) for x in self.batch_sizes]) else: bs_label = str(custom_bs) entries.append(f"bs:{bs_label}") # Add input/output lens. if len(self.output_lens) > 0: if custom_input_len is None: io_lens = [] for in_len, out_len in zip(self.input_lens, self.output_lens): io_lens.append(f"{in_len},{out_len}") io_len_label = "+".join(io_lens) else: assert custom_output_len is not None, \ "custom_output_len must be provided if custom_input_len is specified!" io_len_label = f"{custom_input_len},{custom_output_len}" entries.append(f"input_output_len:{io_len_label}") else: if custom_input_len is None: len_label = "+".join([str(x) for x in self.input_lens]) else: len_label = custom_input_len entries.append(f"input_len:{len_label}") # Add number of beams. if self.num_beams > 1: entries.append(f"beams:{self.num_beams}") # Add number of loras. if self.num_loras > 0: entries.append(f"loras:{self.num_loras}") # Add quantization type. if self.quantization != "": entries.append(f"quant:{self.quantization}") # Add kv cache dtype. if self.kv_cache_dtype != "auto": entries.append(f"kv_cache_dtype:{self.kv_cache_dtype}") # Add number of requests. if self.num_reqs != 512: entries.append(f"reqs:{self.num_reqs}") #Add number of concurrency if self.concurrency != -1: entries.append(f"con:{self.concurrency}") #Add EP Size. if self.ep_size != None: entries.append(f"ep:{self.ep_size}") # Add TP Size. 
if self.tp_size > 1 and self.tp_size != self.num_gpus: entries.append(f"tp:{self.tp_size}") # Add PP Size. if self.pp_size > 1: entries.append(f"pp:{self.pp_size}") # Add number of GPUs. if self.num_gpus > 1: entries.append(f"gpus:{self.num_gpus}") # Concatenate labels with "-". return "-".join(entries) def __str__(self) -> str: return self.to_string() def load_from_str(self, test_param_labels) -> None: """ Populate the config properties given the test param string. """ # Extract configs from test param labels. labels = test_param_labels.split("-") self.model_name = labels.pop(0) assert labels[0] in ["cpp", "cppmanager", "bench"], \ f"Invalid runtime {labels[0]}!" self.runtime = labels.pop(0) self.api = labels.pop(0) if labels[0] == "exe" else "" self.backend = labels.pop(0) if labels[0] == "pytorch" else "" self.streaming = labels.pop(0) if labels[0] == "streaming" else "" self.static_batching = labels.pop( 0) if labels[0] == "static_batching" else "" if self.runtime != "bench": self.mode = labels.pop(0) self.data_type = labels.pop(0) if labels[0].startswith("gwp"): self.gpu_weights_percent = float(labels.pop(0).replace("gwp:", "")) if labels[0] == "mp": self.multiple_profiles = True labels.pop(0) if labels[0].startswith("maxbs"): self.max_batch_size = int(labels.pop(0).replace("maxbs:", "")) if labels[0].startswith("maxnt"): self.max_num_tokens = int(labels.pop(0).replace("maxnt:", "")) if labels[0].startswith("kv_frac"): self.kv_cache_free_gpu_mem_fraction = float( labels.pop(0).replace("kv_frac:", "")) if labels[0] == "build_only": self.build_only = True labels.pop(0) if not self.build_only: if labels[0].startswith("bs:"): self.batch_sizes = [ int(x) for x in labels.pop(0).replace("bs:", "").split("+") ] else: self.batch_sizes = [0] if labels[0].startswith("input_output_len"): io_lens = labels.pop(0).replace("input_output_len:", "").split("+") self.input_lens = [int(x.split(",")[0]) for x in io_lens] self.output_lens = [int(x.split(",")[1]) for x in io_lens] elif labels[0].startswith("input_len"): self.input_lens = [ int(x) for x in labels.pop(0).replace("input_len:", "").split("+") ] self.output_lens = [] else: raise RuntimeError( f"Unexpected test name label for seq lens: {labels[0]}!") if len(labels) > 0: self.num_beams = 1 if not labels[0].startswith("beams:") else int( labels.pop(0).replace("beams:", "")) if len(labels) > 0: self.num_loras = 0 if not labels[0].startswith("loras:") else int( labels.pop(0).replace("loras:", "")) if len(labels) > 0: self.quantization = "" if not labels[0].startswith( "quant:") else labels.pop(0).replace("quant:", "") if len(labels) > 0: self.kv_cache_dtype = "auto" if not labels[0].startswith( "kv_cache_dtype:") else labels.pop(0).replace( "kv_cache_dtype:", "") if len(labels) > 0: self.num_reqs = 512 if not labels[0].startswith("reqs:") else int( labels.pop(0).replace("reqs:", "")) if len(labels) > 0: self.concurrency = -1 if not labels[0].startswith("con:") else int( labels.pop(0).replace("con:", "")) if len(labels) > 0: self.ep_size = None if not labels[0].startswith("ep:") else int( labels.pop(0).replace("ep:", "")) if len(labels) > 0: self.tp_size = 1 if not labels[0].startswith("tp:") else int( labels.pop(0).replace("tp:", "")) if len(labels) > 0: self.pp_size = 1 if not labels[0].startswith("pp:") else int( labels.pop(0).replace("pp:", "")) if len(labels) > 0: self.num_gpus = 1 if not labels[0].startswith("gpus:") else int( labels.pop(0).replace("gpus:", "")) assert len( labels ) == 0, f"Invalid test name! 
Some labels cannot be parsed: {labels}" # Validate the parsed config. self.validate() def validate(self): """ Validate if the config makes sense. """ # Validate model name. assert len(self.model_name) > 0, "model_name must not be empty!" assert "-" not in self.model_name, "model_name must not contain '-' character!" if self.model_name not in MODEL_PATH_DICT.keys( ) and self.model_name not in HF_MODEL_PATH.keys(): allowed_configs = import_allowed_perf_config() allowed_models = allowed_configs.get_allowed_models() assert self.model_name in allowed_models, f"model_name {self.model_name} is not in allowed_models!" # Validate runtime type. VALID_RUNTIMES = ["cpp", "cppmanager", "bench"] assert self.runtime in VALID_RUNTIMES, f"Invalid runtime {self.runtime}!" # Validate plugin mode. VALID_MODES = ["plugin", "ootb", "ootb_except_mha"] if self.runtime == "cppmanager": VALID_MODES += ["plugin_ifb"] assert self.mode in VALID_MODES, f"Invalid mode {self.mode}!" # Validate dtype. VALID_DTYPES = ["float32", "float16", "bfloat16", "float8", "float4"] assert self.data_type in VALID_DTYPES, f"Invalid data_type {self.data_type}!" VALID_KV_CACHE_DTYPES = ["auto", "fp8"] assert self.kv_cache_dtype in VALID_KV_CACHE_DTYPES, f"Invalid kv_cache_dtype {self.kv_cache_dtype}!" # Validate quantization mode. if self.model_name in MODEL_PATH_DICT.keys(): VALID_QUANTS = [ "", "nvfp4", "fp8", "int8", "int4_awq", "w4a8_awq", "w4a16_awq", "int4_wo", "full_prec" ] else: VALID_QUANTS = [ "", "fp8", "fp8_gemm", "fp8_kv_cache", "int8_sq_per_tensor", "int8_sq_per_token_channel", "int8_weight_only", "int4_weight_only", "int4_weight_only_awq", "int4_weight_only_gptq", ] assert self.quantization in VALID_QUANTS, f"Invalid quantization {self.quantization}!" if self.backend == "pytorch": assert self.quantization == "", f"Not support passing quantization {self.quantization} for pytorch backend!" assert self.num_beams >= 1, f"Invalid num_beams: {self.num_beams}!" assert self.num_loras >= 0, f"Invalid num_loras: {self.num_loras}!" assert self.num_reqs >= 1, f"Invalid num_reqs: {self.num_reqs}!" if self.pp_size > 1: assert self.model_name in MODEL_PATH_DICT.keys( ), f"Invalid model name for pp size {self.pp_size} test" if self.num_gpus > 1 and self.tp_size == 1 and self.pp_size == 1: self.tp_size = self.num_gpus if self.tp_size > 1 or self.pp_size > 1 and self.num_gpus == 1: self.num_gpus = self.tp_size * self.pp_size assert self.num_gpus == self.tp_size * self.pp_size, f"Num of GPU shall be equal to TP*PP: {self.num_gpus}, {self.tp_size}, {self.pp_size}" if self.gpu_weights_percent != -1: assert 0 <= self.gpu_weights_percent <= 1, f"Invalid gpu_weights_percent: {self.gpu_weights_percent}!" if not self.build_only: if self.runtime != "cppmanager" and self.runtime != "bench": print(f"runtime: {self.runtime}") # Validate max batch size. if self.max_batch_size > 0: assert max( self.batch_sizes ) <= self.max_batch_size, f"Batch Size larger than Max Batch Size!" # Validate bs, seq lens, and num_beams. assert len( self.batch_sizes ) > 0 and self.batch_sizes[0] > 0, f"Empty batch sizes!" assert self.static_batching == "", f"Static Batching only valid for gptManagerBenchmark!" assert self.api == "", f"API Type only valid for gptManagerBenchmark!" assert self.streaming == "", f"Streaming only valid for gptManagerBenchmark and trtllm-bench!" assert len(self.input_lens) > 0, f"Empty input_lens!" if self.is_bert_like(): assert len( self.output_lens ) == 0, f"BERT-like models must not have output_lens!" 
else: assert len( self.output_lens ) > 0, f"GPT-like models and enc-dec models must have output_lens!" # BERT with small BS is very unstable. Try to avoid it. if self.is_bert_like(): if self.runtime == "trtllm-bench": self.batch_sizes[ 0] = self.max_batch_size if self.max_batch_size > 0 else 1 print(f"batch_sizes: {self.batch_sizes}") assert all( [b >= 32 for b in self.batch_sizes] ), f"BERT with small BS is very unstable! Please increase to at least 32." # GPT-350m and Bloom-560m with small BS are very unstable. Only run these small models with larger BS. if self.model_name in ["gpt_350m", "bloom_560m"]: assert all( [b >= 32 for b in self.batch_sizes] ), f"gpt_350m and bloom_560m with small BS are very unstable! Please increase to at least 32." def get_model_family(self) -> str: """ Get the model family of the current model. """ allowed_configs = import_allowed_perf_config() allowed_models = allowed_configs.get_allowed_models() if self.model_name in allowed_models: return allowed_configs.get_model_family(self.model_name) else: return "" def is_mamba_family(self) -> bool: """ Check if the current model family is Mamba. """ return self.get_model_family() == 'mamba' def is_moe_family(self) -> bool: """ Check if the current model family is MoE. """ allowed_configs = import_allowed_perf_config() allowed_models = allowed_configs.get_allowed_models() if self.model_name in allowed_models: model_config = allowed_configs.get_model_config(self.model_name) return model_config['moe_num_experts'] > 0 and model_config[ 'moe_top_k'] > 0 else: return False def get_benchmark_type(self) -> str: """ Get the benchmark type of the current model. """ allowed_configs = import_allowed_perf_config() allowed_models = allowed_configs.get_allowed_models() if self.model_name in allowed_models: return allowed_configs.get_benchmark_type(self.model_name) else: return "" def is_bert_like(self) -> bool: """ Check if the current benchmark is a BERT benchmark. """ return self.get_benchmark_type() == "bert" def is_enc_dec(self) -> bool: """ Check if the current benchmark is a EncDec benchmark. """ return self.get_benchmark_type() == "enc_dec" class MultiMetricPerfTest(AbstractPerfScriptTestClass): """ Base class for perf tests with multiple metrics. """ def __init__(self, full_test_name: str): # full_test_name is the full test name appearing in test output. self._full_test_name = full_test_name # test_domain_name is the part before "::". self._test_domain_name = "::".join(full_test_name.split("::")[:-1]) # short_test_name is the part after "::". self._short_test_name = full_test_name.split("::")[-1] # short_test_name_body is the part before "[" in short_test_name. self._short_test_name_body = self._short_test_name.split("[")[0] # test_param_labels is the part inside "[...]". self._test_param_labels = full_test_name.split("[")[-1][:-1] # Load test config from test name. self._config = PerfTestConfig() self._config.load_from_str(self._test_param_labels) # This will store the currently running metric. self._current_metric = None self.lora_dirs = [] def get_test_name(self) -> str: return str(self._config) def set_runtime_configs(self, llm_root, working_dir, perf_cache_fpath) -> None: if self._config.runtime == "cpp": if not self._config.is_bert_like(): raise ValueError( f"Invalid config: '{self._config.runtime}' is only supported for bert-like models!" 
) benchmark_script = get_cpp_benchmark("bertBenchmark", llm_root) elif self._config.runtime == "cppmanager": benchmark_script = get_cpp_benchmark("gptManagerBenchmark", llm_root) elif self._config.runtime == "bench": benchmark_script = "trtllm-bench" else: raise RuntimeError(f"Invalid runtime {self._config.runtime}.") allowed_configs = import_allowed_perf_config() allowed_models = allowed_configs.get_allowed_models() if self._config.runtime == "bench": build_script = "trtllm-bench" elif self._config.pp_size > 1 or self._config.model_name not in allowed_models: build_script = "trtllm-build" else: # build.py is used to build engines for both python and cpp runtime build_script = os.path.join(llm_root, "tests/integration/defs/perf/build.py") self._build_script = build_script self._benchmark_script = benchmark_script self._working_dir = working_dir self._perf_cache_fpath = perf_cache_fpath self._llm_root = llm_root def get_convert_weights_command(self, model_dir, engine_dir) -> str: """ Get the convert checkpoint command. """ if "phi" in self._config.model_name: example_name = "phi" else: example_name = "llama" if self._config.quantization != "": command, checkpoint_dir = quantize_data( llm_venv=None, example_root=os.path.join(get_llm_root(), "examples", "models", "core", example_name), model_dir=model_dir, calib_dataset=os.path.join(llm_models_root(), "datasets", "cnn_dailymail"), dtype=self._config.data_type, qformat=self._config.quantization, tp_size=self._config.tp_size, pp_size=self._config.pp_size, quantize_dir=engine_dir) else: command, checkpoint_dir = convert_weights( llm_venv=None, example_root=os.path.join(get_llm_root(), "examples", "models", "core", example_name), cmodel_dir=engine_dir, model=self._config.model_name, model_path=model_dir, tp_size=self._config.tp_size, pp_size=self._config.pp_size, data_type=self._config.data_type) command = [f"python3"] + command return command, checkpoint_dir def get_convert_lora_weights_command(self, model_dir, engine_dir) -> str: script = os.path.join(self._llm_root, "examples", "hf_lora_convert.py") checkpoint_dir = os.path.join(engine_dir, "lora_cpp") command = [ script, f"-i={model_dir}", "--storage-type=float16", f"-o={checkpoint_dir}" ] command = [f"python3"] + command return command, checkpoint_dir def get_trtllm_build_command(self, engine_dir, checkpoint_dir) -> list: build_cmd = [ self._build_script, f"--output_dir={engine_dir}", f"--checkpoint_dir={checkpoint_dir}", f"--workers={self._config.tp_size}", f"--use_paged_context_fmha=enable", f"--monitor_memory", f"--max_batch_size={self._config.max_batch_size}" ] # For Multiple Profiles if self._config.multiple_profiles: build_cmd.append(f"--multiple_profiles=enable") else: build_cmd.append(f"--multiple_profiles=disable") num_beams = self._config.num_beams if num_beams > 1: build_cmd.append(f"--max_beam_width={num_beams}") gpu_percent = self._config.gpu_weights_percent if gpu_percent != -1: build_cmd += [f"--weight_streaming"] # For engine inspector build_cmd.append("--profiling_verbosity=layer_names_only") if self._config.num_loras > 0: if "mixtral" in self._config.model_name: build_cmd.append(f"--lora_plugin=auto") build_cmd.append(f"--moe_plugin=auto") build_cmd.append(f"--lora_target_modules") build_cmd.append(f"attn_q") build_cmd.append(f"attn_k") build_cmd.append(f"attn_v") build_cmd.append(f"attn_dense") build_cmd.append(f"moe_h_to_4h") build_cmd.append(f"moe_4h_to_h") build_cmd.append(f"moe_gate") build_cmd.append(f"moe_router") elif "llama" in self._config.model_name: 
build_cmd.append(f"--lora_plugin=float16") build_cmd.append(f"--lora_target_modules") build_cmd.append(f"attn_q") build_cmd.append(f"attn_k") build_cmd.append(f"attn_v") build_cmd.append(f"attn_dense") build_cmd.append(f"mlp_h_to_4h") build_cmd.append(f"mlp_4h_to_h") build_cmd.append(f"mlp_gate") if TIMING_CACHE_DIR and not self._config.build_only: timing_cache = os.path.join(TIMING_CACHE_DIR, "model.cache") build_cmd.append(f"--input_timing_cache={timing_cache}") build_cmd.append(f"--output_timing_cache={timing_cache}") return build_cmd def get_trtllm_bench_model(self): model_dir = "" if self._config.model_name in MODEL_PATH_DICT.keys(): model_dir = os.path.join(llm_models_root(), MODEL_PATH_DICT[self._config.model_name]) elif self._config.model_name in HF_MODEL_PATH.keys(): model_dir = os.path.join( llm_models_root(), MODEL_PATH_DICT[self._config.model_name.split('_hf')[0]]) return model_dir def get_trtllm_bench_build_command(self, engine_dir) -> list: model_dir = self.get_trtllm_bench_model() if model_dir == "": pytest.skip("Model Name is not supported by trtllm-bench") model_name = self._config.model_name if not model_name.endswith("_hf"): model_name = model_name + "_hf" hf_model_name = HF_MODEL_PATH.get(model_name, "") build_cmd = [ self._build_script, f"--log_level=info", f"--workspace={engine_dir}", f"--model={hf_model_name}", f"--model_path={model_dir}", "build", f"--tp_size={self._config.tp_size}", f"--pp_size={self._config.pp_size}" ] max_seq_len = max(self._config.input_lens) + max( self._config.output_lens) build_cmd.append(f"--max_seq_len={max_seq_len}") # Add max_batch_size and max_num_tokens to ensure build matches runtime configuration # Note: trtllm-bench requires both to be specified together (option group constraint) assert self._config.max_batch_size > 0, f"max_batch_size must be > 0, got {self._config.max_batch_size}" assert self._config.max_num_tokens > 0, f"max_num_tokens must be > 0, got {self._config.max_num_tokens}" build_cmd.append(f"--max_batch_size={self._config.max_batch_size}") build_cmd.append(f"--max_num_tokens={self._config.max_num_tokens}") if self._config.quantization: build_cmd.append( f"--quantization={self._config.quantization.upper()}") if self._config.model_name in TRUST_REMOTE_CODE_MODELS: build_cmd.append(f"--trust_remote_code=True") return build_cmd def get_benchmark_build_command(self, engine_dir) -> list: mode_flag = self._config.mode.replace("_", "-") build_cmd = [ self._build_script, f"--model={self._config.model_name}", "--log_level=info", f"--mode={mode_flag}", f"--dtype={self._config.data_type}", f"--output_dir={engine_dir}", "--monitor_memory" ] if self._config.quantization != "": build_cmd.append(f"--quantization={self._config.quantization}") num_beams = self._config.num_beams if num_beams > 1: build_cmd.append(f"--max_beam_width={num_beams}") gpu_percent = self._config.gpu_weights_percent if gpu_percent != -1: build_cmd += [f"--weight_streaming"] if self._config.max_batch_size > 0: build_cmd.append(f"--max_batch_size={self._config.max_batch_size}") # For performance data stability, set opt_num_token/opt_batch_size to 8 when max batch size is greater than 8. # The script will use the settings from allow_configs.py if max_batch_size is set to 0, # opt_num_token/opt_batch_size is also necessary for stability. 
if self._config.max_batch_size > 8 or self._config.max_batch_size == 0: if self._config.mode in ["plugin_ifb", "plugin", 'ootb_except_mha']: build_cmd.append("--opt_num_tokens=8") else: build_cmd.append("--opt_batch_size=8") # For Multiple Profiles if self._config.multiple_profiles: build_cmd.append("--multiple_profiles") # For engine inspector build_cmd.append("--profiling_verbosity=layer_names_only") if TIMING_CACHE_DIR and not self._config.build_only: timing_cache = os.path.join(TIMING_CACHE_DIR, "model.cache") build_cmd.append(f"--input_timing_cache={timing_cache}") build_cmd.append(f"--output_timing_cache={timing_cache}") return build_cmd def get_prepare_data_command(self, engine_dir, input_len, output_len) -> list: data_cmd = [] prepare_data_script = os.path.join(self._llm_root, "benchmarks", "cpp", "prepare_dataset.py") if self._config.model_name in MODEL_PATH_DICT.keys(): tokenizer_dir = os.path.join( llm_models_root(), MODEL_PATH_DICT[self._config.model_name]) elif self._config.model_name in HF_MODEL_PATH.keys(): tokenizer_dir = HF_MODEL_PATH[self._config.model_name] else: tokenizer_dir = os.path.join(llm_models_root(), "llama-models", "llama-7b-hf") if not os.path.exists(engine_dir): os.makedirs(engine_dir, exist_ok=True) if self._config.num_loras > 0: istdev = 16 ostdev = 24 nloras = self._config.num_loras dataset_path = os.path.join(engine_dir, "synthetic_data.json") if self._config.model_name in LORA_MODEL_PATH.keys( ) and self._config.backend == "pytorch" and self._config.runtime == "bench": actual_lora_paths = LORA_MODEL_PATH[self._config.model_name] if not isinstance(actual_lora_paths, list): actual_lora_paths = [actual_lora_paths] for i, actual_lora_path in enumerate(actual_lora_paths): if not actual_lora_path.startswith("/"): actual_lora_paths[i] = os.path.join( llm_models_root(), actual_lora_path) lora_dir = os.path.join(engine_dir, "loras") data_cmd += [f"mkdir -p {lora_dir}", ";"] if len(actual_lora_paths) != nloras: raise ValueError( f"Number of LoRA paths ({len(actual_lora_paths)}) does not match requested number of LoRAs ({nloras})" ) for i, lora_path in enumerate(actual_lora_paths): self.lora_dirs.append(f"{lora_dir}/{i}") data_cmd += [f"ln -sf {lora_path} {lora_dir}/{i}", ";"] data_cmd += [ "python3", prepare_data_script, f"--stdout", f"--rand-task-id 0 {nloras-1}", f"--tokenizer={tokenizer_dir}", f"--lora-dir={lora_dir}", f"token-norm-dist", f"--num-requests={self._config.num_reqs}", f"--input-mean={input_len}", f"--output-mean={output_len}", f"--input-stdev={istdev}", f"--output-stdev={ostdev}", f" > {dataset_path}" ] elif self._config.backend == "cppmanager": data_cmd += [ "python3", prepare_data_script, f"--stdout", f"--rand-task-id 0 {nloras-1}", f"--tokenizer={tokenizer_dir}", f"token-norm-dist", f"--num-requests={self._config.num_reqs}", f"--input-mean={input_len}", f"--output-mean={output_len}", f"--input-stdev={istdev}", f"--output-stdev={ostdev}", f" > {dataset_path}" ] # generate LoRA weights for C++ runtime # the lora_dir is $engine_dir/loras. This is populated by the convert_lora_cmd executed before this. # The generate_rand_loras.py will create random lora weights to $engine_dir/lora_cpp. 
generate_rand_lora_script = os.path.join( self._llm_root, "benchmarks", "cpp", "utils", "generate_rand_loras.py") checkpoint_dir = os.path.join(engine_dir, "lora_cpp") data_cmd += [ "python3", generate_rand_lora_script, checkpoint_dir, lora_dir, str(nloras) ] else: pytest.skip( f"LoRA config not supported for {self._config.model_name} with the current backend and runtime." ) else: istdev = 0 ostdev = 0 dataset_path = os.path.join(engine_dir, "synthetic_data.json") if self._build_script == 'trtllm-bench': data_cmd += [ "python3", prepare_data_script, "--stdout", f"--tokenizer={tokenizer_dir}", f"token-norm-dist", f"--num-requests={self._config.num_reqs}", f"--input-mean={input_len}", f"--output-mean={output_len}", f"--input-stdev={istdev}", f"--output-stdev={ostdev}", f" > {dataset_path}" ] else: data_cmd += [ "python3", prepare_data_script, f"--output={dataset_path}", f"--tokenizer={tokenizer_dir}", f"token-norm-dist", f"--num-requests={self._config.num_reqs}", f"--input-mean={input_len}", f"--output-mean={output_len}", f"--input-stdev={istdev}", f"--output-stdev={ostdev}" ] return data_cmd def get_python_runtime_benchmark_command(self, engine_dir, bs, input_len, output_len): benchmark_cmd = [ self._benchmark_script, ] if self._config.is_bert_like(): model = "enc" benchmark_cmd.append(f"--engine_dir={engine_dir}") elif self._config.is_enc_dec(): model = "enc-dec" benchmark_cmd.append( f"--encoder_engine_dir={os.path.join(engine_dir, 'encoder')}") benchmark_cmd.append( f"--decoder_engine_dir={os.path.join(engine_dir, 'decoder')}") else: model = "dec" benchmark_cmd.append(f"--engine_dir={engine_dir}") benchmark_cmd.append(f"--model={model}") benchmark_cmd += [f"--batch_size={bs}"] # Use 3 warm-up runs and minimum of 10 actual runs and minimum of 10 seconds for now. benchmark_cmd += [f"--warm_up=3", f"--num_runs=10", f"--duration=10"] benchmark_cmd += [f"--dtype={self._config.data_type}"] if self._config.is_bert_like(): benchmark_cmd.append(f"--input_len={input_len}") else: benchmark_cmd.append(f"--input_output_len={input_len},{output_len}") # Weight streaming don't support CUDA Graph for now. gpu_percent = self._config.gpu_weights_percent if gpu_percent == -1: benchmark_cmd.append(f"--enable_cuda_graph") return benchmark_cmd def get_gpt_session_runtime_benchmark_command(self, engine_dir, bs, input_len, output_len): benchmark_cmd = [ self._benchmark_script, # This is required to get context GPU info f"--log_level=info", ] benchmark_cmd.append(f"--engine_dir={engine_dir}") if self._config.is_bert_like(): benchmark_cmd.append(f"--model={self._config.model_name}") num_beams = self._config.num_beams if num_beams > 1: benchmark_cmd.append(f"--beam_width={num_beams}") gpu_percent = self._config.gpu_weights_percent if gpu_percent != -1: benchmark_cmd.append(f"--gpu_weights_percent={gpu_percent}") benchmark_cmd += [f"--batch_size={bs}"] # Use 3 warm-up runs and minimum of 10 actual runs and minimum of 10 seconds for now. benchmark_cmd += [f"--warm_up=3", f"--num_runs=10", f"--duration=10"] if not self._config.is_bert_like() and not self._config.is_enc_dec( ) and not self._config.is_mamba_family() and self._config.num_gpus < 8: # Dump layer information and per-layer profile benchmark_cmd += ["--dump_layer_info", "--dump_profile"] # For GPT Models and enc-dec Models if not self._config.is_bert_like(): benchmark_cmd.append(f"--input_output_len={input_len},{output_len}") # Weight streaming don't support CUDA Graph for now. 
# MoE OOTB doesn't support CUDA Graph gpu_percent = self._config.gpu_weights_percent if gpu_percent == -1 and not (self._config.is_moe_family() and self._config.mode in ['ootb', 'ootb_except_mha']): benchmark_cmd.append(f"--enable_cuda_graph") # For BERT Models: else: benchmark_cmd.append(f"--input_len={input_len}") return benchmark_cmd def get_trtllm_bench_command(self, engine_dir): model_dir = self.get_trtllm_bench_model() model_name = self._config.model_name dataset_path = os.path.join(engine_dir, "synthetic_data.json") report_path = os.path.join(engine_dir, "report.json") if not model_name.endswith("_hf"): model_name = model_name + "_hf" hf_model_name = HF_MODEL_PATH.get(model_name, "") tp_pp_str = f"tp_{self._config.tp_size}_pp_{self._config.pp_size}" engine_dir = os.path.join(engine_dir, hf_model_name, tp_pp_str) benchmark_cmd = [ self._benchmark_script, f"--model={model_name}", f"--model_path={model_dir}", "throughput", f"--dataset={dataset_path}", f"--max_batch_size={self._config.max_batch_size}", f"--max_num_tokens={self._config.max_num_tokens}", f"--report_json={report_path}", f"--kv_cache_free_gpu_mem_fraction={self._config.kv_cache_free_gpu_mem_fraction}", ] if self._config.backend != "pytorch": benchmark_cmd += [ f"--backend=tensorrt", f"--engine_dir={engine_dir}" ] else: benchmark_cmd += ["--backend=pytorch"] if self._config.num_reqs > 0: benchmark_cmd += [f"--num_requests={self._config.num_reqs}"] if self._config.concurrency != -1: benchmark_cmd += [f"--concurrency={self._config.concurrency}"] if self._config.ep_size != None: benchmark_cmd += [f"--ep={self._config.ep_size}"] if self._config.tp_size > 1: benchmark_cmd += [f"--tp={self._config.tp_size}"] if self._config.pp_size > 1: benchmark_cmd += [f"--pp={self._config.pp_size}"] if self._config.streaming == "streaming": benchmark_cmd += [f"--streaming"] #use default yaml config if self._config.backend == "pytorch": import yaml pytorch_config_path = os.path.join(engine_dir, "extra-llm-api-config.yml") if not os.path.exists(pytorch_config_path): os.makedirs(os.path.dirname(pytorch_config_path), exist_ok=True) config = get_model_yaml_config(self._config.to_string(), lora_dirs=self.lora_dirs) print_info(f"pytorch model config: {config}") with open(pytorch_config_path, 'w') as f: yaml.dump(config, f, default_flow_style=False) benchmark_cmd += [f"--extra_llm_api_options={pytorch_config_path}"] return benchmark_cmd def get_gpt_manager_runtime_benchmark_command(self, engine_dir, bs, input_len): benchmark_cmd = [ self._benchmark_script, # This is required to get context GPU info f"--log_level=info", ] if self._config.is_enc_dec(): benchmark_cmd.append( f"--encoder_engine_dir={os.path.join(engine_dir, 'encoder')}") benchmark_cmd.append( f"--decoder_engine_dir={os.path.join(engine_dir, 'decoder')}") else: benchmark_cmd.append(f"--engine_dir={engine_dir}") num_beams = self._config.num_beams if num_beams > 1: benchmark_cmd.append(f"--beam_width={num_beams}") gpu_percent = self._config.gpu_weights_percent if gpu_percent != -1: benchmark_cmd.append(f"--gpu_weights_percent={gpu_percent}") if self._config.num_loras > 0: nloras = self._config.num_loras dataset_path = os.path.join(engine_dir, f"token-norm-dist-lora-{nloras}.json") lora_dir = os.path.join(engine_dir, f"loras") eos_id = 2 num_layers = 32 if "mixtral" in self._config.model_name else 40 num_lora_mods = 8 if "mixtral" in self._config.model_name else 7 max_lora_rank = 64 benchmark_cmd += [f"--lora_host_cache_bytes=8589934592"] benchmark_cmd += [ f"--lora_num_device_mod_layers={32 * 
num_layers * num_lora_mods * max_lora_rank}" ] benchmark_cmd += [f"--eos_id={eos_id}"] benchmark_cmd += [f"--lora_dir={lora_dir}"] else: dataset_path = os.path.join(engine_dir, "synthetic_data.json") benchmark_cmd += [f"--dataset={dataset_path}"] # API Type is executor if self._config.api == "exe": benchmark_cmd += [f"--api=executor"] if self._config.mode == "plugin_ifb": benchmark_cmd += [ f"--type=UIFB" ] if self._config.is_mamba_family() else ["--type=IFB"] else: benchmark_cmd += [f"--type=V1"] if self._config.streaming == "streaming": benchmark_cmd += [f"--streaming"] benchmark_cmd += [f"--scheduler_policy=max_utilization"] if self._config.static_batching == "static_batching": benchmark_cmd += [f"--static_emulated_batch_size={bs}"] if self._config.concurrency != -1: benchmark_cmd += [f"--concurrency={self._config.concurrency}"] return benchmark_cmd def get_commands(self): # Whether this is python or cpp runtime perf test. is_python = self._config.runtime == "python" num_gpus = self._config.num_gpus if is_python and num_gpus > 1: # TODO: Fix https://nvbugs/4449875 pytest.skip( "multi-gpu tests with python runtime is skipped because of hanging issue. See https://nvbugs/4449875" ) if is_windows() and num_gpus > 1: pytest.skip( "multi-gpu not supported on Windows yet, skipped for now") # Construct engine build command. engine_dir = self._get_engine_dir() convert_cmd = [] build_cmd = [] if self._build_script == "trtllm-build" and self._config.model_name in MODEL_PATH_DICT.keys( ): model_path = MODEL_PATH_DICT[self._config.model_name] model_dir = os.path.join(llm_models_root(), model_path) if not os.path.exists(engine_dir): os.makedirs(engine_dir, exist_ok=True) convert_cmd, checkpoint_dir = self.get_convert_weights_command( model_dir, engine_dir) if self._config.num_loras > 0: if self._config.model_name in LORA_MODEL_PATH.keys(): model_dir = os.path.join( llm_models_root(), LORA_MODEL_PATH[self._config.model_name]) convert_lora_cmd, lora_checkpoint_dir = self.get_convert_lora_weights_command( model_dir, engine_dir) convert_cmd += [";"] convert_cmd += convert_lora_cmd else: pytest.skip( f"There is no LoRA weights model for {self._config.model_name}" ) build_cmd = self.get_trtllm_build_command(engine_dir, checkpoint_dir) elif self._config.runtime == "bench": if self._config.backend == "pytorch": # Skip building process as it is pytorch backend") pass else: build_cmd = self.get_trtllm_bench_build_command(engine_dir) else: build_cmd = self.get_benchmark_build_command(engine_dir) # Construct prepare synthetic data command data_cmds = [] # Construct benchmark commands for each bs and seq len combination. benchmark_cmds = [] for bs in self._config.batch_sizes: for len_idx, input_len in enumerate(self._config.input_lens): output_len = None if self._config.is_bert_like( ) else self._config.output_lens[len_idx] if is_python: benchmark_cmd = self.get_python_runtime_benchmark_command( engine_dir, bs, input_len, output_len) elif self._config.runtime == "bench": benchmark_cmd = self.get_trtllm_bench_command(engine_dir) elif self._config.runtime == "cpp": benchmark_cmd = self.get_gpt_session_runtime_benchmark_command( engine_dir, bs, input_len, output_len) else: benchmark_cmd = self.get_gpt_manager_runtime_benchmark_command( engine_dir, bs, input_len) benchmark_cmds.append(benchmark_cmd) if not self._config.runtime == "cpp" and not is_python: data_cmd = self.get_prepare_data_command( engine_dir, input_len, output_len) data_cmds.append(data_cmd) # Construct MPI command. 
mpi_cmd = [] if num_gpus > 1 and num_gpus <= 8 and not self._config.runtime == "bench": if cpu_socket_count_gt_1(): mpi_cmd = [ "mpirun", "--map-by", "socket", "-n", f"{num_gpus}", "--allow-run-as-root" ] else: mpi_cmd = ["mpirun", "-n", f"{num_gpus}", "--allow-run-as-root"] if self._build_script == "trtllm-bench": return PerfBenchScriptTestCmds(data_cmds, build_cmd, benchmark_cmds, mpi_cmd, is_python) else: return PerfScriptTestCmds(convert_cmd, build_cmd, data_cmds, benchmark_cmds, mpi_cmd, is_python) def get_perf_result(self, outputs: Dict[int, str]) -> float: """ Get perf metric result from test output logs. """ metric = self._current_metric cmd_idx = metric.cmd_idx metric_name = metric.metric_name num_gpus = self._config.num_gpus # Make sure we have outputs. assert cmd_idx in outputs, f"Output log for command {cmd_idx} does not exist!" # Use the regex to go through the log from the N-th command, where N = cmd_idx. print_info( f"Searching for metric {metric_name} from output log of command {cmd_idx} ..." ) regex_matches = [ metric.metric_regex.search(line) for line in outputs[cmd_idx].split("\n") ] metric_values = [ float(match.group(1)) for match in regex_matches if match ] if len(metric_values) == 0: if self._build_script == "trtllm-build" and metric.metric_type == PerfMetricType.ENGINE_SIZE: metric_values = [0.0] elif self._build_script == "trtllm-bench" and self._config.num_gpus > 1 and metric.metric_type == PerfMetricType.BUILD_TIME: print_info("skip building process for multi-gpu test" ) #https://nvbugspro.nvidia.com/bug/5210111 metric_values = [0.0] else: raise RuntimeError( f"Cannot find perf result for {metric_name} from perf script logs!" ) if metric.metric_type in BUILDER_METRICS and metric.metric_type != PerfMetricType.ENGINE_SIZE: # For enc-dec models, there are 2 builder perf metrics, we add them up. if self._config.is_enc_dec(): assert len( metric_values ) == 2 * num_gpus, f"Enc-Dec models must have num of metrics 2*{num_gpus} but got {len(metric_values)}!" enc_metrics = metric_values[:num_gpus] dec_metrics = metric_values[num_gpus:] gather_function = sum # Measure BUILD_PEAK_CPU_MEMORY, BUILD_PEAK_GPU_MEMORY by max function if metric.metric_type in [ PerfMetricType.BUILD_PEAK_CPU_MEMORY, PerfMetricType.BUILD_PEAK_GPU_MEMORY ]: gather_function = max metric_values = [ gather_function([x, y]) for x, y in zip(enc_metrics, dec_metrics) ] print_info( f"Combining up enc builder_perf {enc_metrics} and dec builder_perf {dec_metrics} to {metric_values}." ) # For other models, builder metric should equal # gpus. elif self._build_script != "trtllm-build" and self._build_script != "trtllm-bench": assert len( metric_values ) == num_gpus, f"num of metrics: {len(metric_values)} should match num_gpus: {num_gpus}" # Use max perf metrics across GPUS if len(metric_values) > 1: metric_value = max(metric_values) print_info( f"Use max value {metric_value} out of {metric_values} for perf metric {metric_name}." ) else: metric_value = metric_values[0] print_info( f"Use value {metric_value} for perf metric {metric_name}.") return metric_value def get_threshold(self) -> float: return self._current_metric.metric_threshold def get_absolute_threshold(self) -> float: return self._current_metric.metric_abs_threshold def get_metric_type(self) -> PerfMetricType: return self._current_metric.metric_type def run_metrics(self, llm_venv, gpu_clock_lock, session_data_writer, output_dir): """ Run through the commands and parse multiple perf metrics from the logs. 
""" #print info to separate cases print_info(f"Running perf test for case: {self._short_test_name}") self._current_cmd_idx = 0 metrics = self._get_metrics() outputs = {} result_states = {} errors = [] def add_myelin_time_pass_to(input_env): time_pass_flag = r" -time_pass=on" old_myelin_env = input_env.get("__LUNOWUD", "") if time_pass_flag not in old_myelin_env: input_env["__LUNOWUD"] = old_myelin_env + time_pass_flag return old_myelin_env old_llm_venv = add_myelin_time_pass_to(llm_venv._new_env) if self._config.runtime == 'bench': #prepare dataset first for trtllm-bench print_info(f"Running command for generating dataset") outputs = self.run_ex("prepare_dataset", llm_venv, gpu_clock_lock, session_data_writer, output_dir, outputs=outputs, original_test_name="prepare_dataset", cmd_idx=self._current_cmd_idx) # Save the result state. result_state = self.get_result_state() result_states[self._current_cmd_idx] = result_state if result_state != "valid": errors.append(self.get_error()) try: for metric in metrics: # Make sure that cmd_idx is in ascending order. assert metric.cmd_idx >= self._current_cmd_idx, "Command indices must be in ascending order!" self._current_cmd_idx = metric.cmd_idx self._current_metric = metric # If the same command has previously failed, do not run it again. if self._current_cmd_idx in result_states and result_states[ self._current_cmd_idx] == "failed": print_warning( f"Skipped running command for {metric.metric_name} since the previous run failed." ) continue # If engine build command already failed, do not run benchmark commands. if 0 in result_states and result_states[0] == "failed": print_warning( f"Skipped running command for {metric.metric_name} since the engine building command failed." ) continue # Run the command or reuse the existing output logs. print_info(f"Running command for {metric.metric_name}") outputs = self.run_ex( metric.metric_name, llm_venv, gpu_clock_lock, session_data_writer, output_dir, outputs=outputs, original_test_name=metric.original_test_name, cmd_idx=self._current_cmd_idx) # Save the result state. result_state = self.get_result_state() result_states[self._current_cmd_idx] = result_state if result_state != "valid": errors.append(self.get_error()) finally: # Clean up engine dir after use. shutil.rmtree(self._get_engine_dir(), ignore_errors=True) llm_venv._new_env["__LUNOWUD"] = old_llm_venv # Check if any commands failed. if not all([result_states[idx] == "valid" for idx in result_states]): # If there is only one error, throw it directly. if len(errors) == 1: raise errors[0] # Otherwise, combine all the error messages and re-raise a generic RuntimeError. msg = "Multiple Errors happened:\n" for error_idx, e in enumerate(errors): msg += f"> Error {error_idx+1}/{len(errors)}: {type(e).__name__}: {e}\n" raise RuntimeError(msg) def _get_engine_dir(self) -> str: """ Get the engine directory to store the engine. """ escaped_label = self._test_param_labels.replace("+", "_").replace( ":", "_").replace(",", "_") return os.path.join(self._working_dir, "perf_engines", escaped_label) def _get_metrics(self) -> List[PerfTestMetric]: """ Generate all the metric configs for the current test. """ metrics = [] # Build command is the first command. 
cmd_idx = 0 if self._config.runtime != "bench" else 1 if self._config.runtime == "bench": if self._config.backend == "pytorch": print_info( f"Skip building process for {self._config.model_name} as it is pytorch backend" ) builder_metrics = [] else: builder_metrics = [PerfMetricType.BUILD_TIME] else: builder_metrics = BUILDER_METRICS.copy() # Add all builder_perf metrics for metric_type in builder_metrics: metrics.append( PerfTestMetric( original_test_name=self._full_test_name, metric_name=self._get_metric_name(metric_type), metric_type=metric_type, metric_regex=self._get_metric_regex(metric_type), metric_threshold=self._get_metric_threshold(metric_type), metric_abs_threshold=self._get_metric_abs_threshold( metric_type), cmd_idx=cmd_idx, )) if self._config.build_only: return metrics # Then, construct inference latency and gpu mem usage metrics, for each # bs and each seq len. for bs in self._config.batch_sizes: for len_idx, input_len in enumerate(self._config.input_lens): cmd_idx += 1 output_len = None if self._config.is_bert_like( ) else self._config.output_lens[len_idx] # Get list of metrics depending on config. if self._config.runtime == "bench": metric_types = BENCH_INFERENCE_METRICS.copy() if self._config.streaming == "streaming": metric_types.append(PerfMetricType.FIRST_TOKEN_TIME) metric_types.append(PerfMetricType.OUTPUT_TOKEN_TIME) else: metric_types = INFERENCE_METRICS.copy() if self._config.runtime == "cpp": metric_types.append(PerfMetricType.TOKEN_THROUGHPUT) if self._config.runtime == "cppmanager": metric_types = MANAGER_INFERENCE_METRICS.copy() if self._config.streaming == "streaming": metric_types.append(PerfMetricType.FIRST_TOKEN_TIME) if self._config.mode != "plugin_ifb" or self._config.is_mamba_family( ): metric_types.remove(PerfMetricType.KV_CACHE_SIZE) if self._config.is_bert_like( ) and self._config.runtime == "cpp": # TODO: bertBenchmark does not report peak GPU memory yet. metric_types = BERT_CPP_INFERENCE_METRICS for metric_type in metric_types: metrics.append( PerfTestMetric( original_test_name=self._full_test_name, metric_name=self._get_metric_name( metric_type, bs, input_len, output_len), metric_type=metric_type, metric_regex=self._get_metric_regex(metric_type), metric_threshold=self._get_metric_threshold( metric_type), metric_abs_threshold=self._get_metric_abs_threshold( metric_type), cmd_idx=cmd_idx, )) return metrics def _get_metric_name(self, metric_type: PerfMetricType, bs: int = None, input_len: int = None, output_len: int = None) -> str: """ Construct the metric name for given metric_type, bs, input_len, and output_len. """ if metric_type in BUILDER_METRICS: # We build one engine for all benchmark runs, so add all bs and seq lens to the metric name. metric_label = self._config.to_string() else: # Otherwise, generate per-bs and per-seqlen label. metric_label = self._config.to_string( custom_bs=bs, custom_input_len=input_len, custom_output_len=output_len, ) metric_name = f"test_perf_metric_{metric_type.lower()}" return self._test_domain_name + "::" + metric_name + "[" + metric_label + "]" def _get_metric_regex(self, metric_type: PerfMetricType) -> re.Pattern: """ Get the regex used to parse the metric result for the metric type. 
""" if self._config.runtime == "bench": if metric_type not in BENCH_PERF_METRIC_LOG_QUERIES: raise ValueError(f"Unexpected metric_type: {metric_type}") return BENCH_PERF_METRIC_LOG_QUERIES[metric_type] else: if metric_type not in PERF_METRIC_LOG_QUERIES: raise ValueError(f"Unexpected metric_type: {metric_type}") return PERF_METRIC_LOG_QUERIES[metric_type] def _get_metric_threshold(self, metric_type: PerfMetricType) -> float: """ Get the threshold for the metric type. """ if metric_type not in PERF_METRIC_THRESHOLD: raise ValueError(f"Unexpected metric_type: {metric_type}") return PERF_METRIC_THRESHOLD[metric_type][0] def _get_metric_abs_threshold(self, metric_type: PerfMetricType) -> float: """ Get the absolute threshold for the metric type. """ if metric_type not in PERF_METRIC_THRESHOLD: raise ValueError(f"Unexpected metric_type: {metric_type}") return PERF_METRIC_THRESHOLD[metric_type][1] def run_perf_test(perf_case_name, trt_performance_cache_fpath, trt_gpu_clock_lock, llm_session_data_writer, output_dir, llm_venv, llm_root): """ The actual test definition for TensorRT LLM perf test. """ working_dir = llm_venv.get_working_directory() test_runner = MultiMetricPerfTest(perf_case_name) test_runner.set_runtime_configs(llm_root, working_dir, trt_performance_cache_fpath) test_runner.run_metrics(llm_venv, trt_gpu_clock_lock, llm_session_data_writer, output_dir) def generate_perf_tests(session, config, items): """ Generate all the perf tests based on test lists to speed up the test collection time. """ print_info(f"Dynamically generating perf tests...") valid_prefixes = [ "perf/test_perf.py::test_perf[", # TRT pipeline adds "llm/" prefix, so include it so that TRT-LLM perf tests can run in TRT pipelines. "llm/perf/test_perf.py::test_perf[", ] items = generate_test_nodes(session, config, items, valid_prefixes, run_perf_test) print_info(f"Completed generating perf tests.") return items