[TRTLLM-9091] [feat] Replace GenAI-Perf with AIPerf (#9310)

Signed-off-by: lkomali <lkomali@nvidia.com>
Signed-off-by: Harshini Komali <157742537+lkomali@users.noreply.github.com>
Co-authored-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
Harshini Komali 2025-12-22 21:25:55 -08:00 committed by GitHub
parent 5bc7ffe379
commit d691371eaf
9 changed files with 147 additions and 66 deletions

View File

@ -338,3 +338,85 @@ P99 ITL (ms): 6.14
- Control the number of images per request with `--random-num-images`
- Use `--random-image-width` and `--random-image-height` to specify image dimensions or `--random-image-size` for squared image dimensions.
- The `random_image` dataset generates synthetic images for benchmarking
## Benchmark using AIPerf
TensorRT-LLM also supports benchmarking `trtllm-serve` using [**AIPerf**](https://github.com/ai-dynamo/aiperf), NVIDIA's
comprehensive benchmarking tool for LLMs.
AIPerf provides throughput, latency, time-to-first-token (TTFT), and concurrency measurements for both
text and multimodal workloads.
AIPerf integrates directly with the OpenAI-compatible endpoints exposed by
`trtllm-serve`.
### Installation
AIPerf is installed with TensorRT-LLM by default.
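AIPerf is pinned in the TensorRT-LLM requirements as `aiperf==0.3.0`. If you need it in a separate client environment, a minimal sketch (assuming the same version pin) is:
```bash
# Install AIPerf standalone; the pin matches the version in requirements.txt.
pip install aiperf==0.3.0
# Sanity-check that the CLI is on PATH.
aiperf --help
```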
### Running AIPerf with trtllm-serve
TensorRT-LLM provides example scripts under:
- `examples/serve/aiperf_client.sh`
- `examples/serve/aiperf_client_for_multimodal.sh`
These scripts demonstrate how to benchmark a running `trtllm-serve` instance using
AIPerf's `profile` command.
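The scripts assume a `trtllm-serve` instance is already listening on `localhost:8000`. As a minimal sketch (the model name matches the text example below; adjust it and any serving options to your setup), the server can be started with:
```bash
# Start an OpenAI-compatible trtllm-serve endpoint for the examples below.
trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8000
```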
### Example: Benchmark a text model
Once `trtllm-serve` is running on `localhost:8000`, run:
```bash
bash examples/serve/aiperf_client.sh
```
The script issues a profiling run:
```bash
aiperf profile \
-m TinyLlama-1.1B-Chat-v1.0 \
--tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
--endpoint-type chat \
--random-seed 123 \
--synthetic-input-tokens-mean 128 \
--synthetic-input-tokens-stddev 0 \
--output-tokens-mean 128 \
--output-tokens-stddev 0 \
--request-count 100 \
--request-rate 10 \
--profile-export-file my_profile_export.json \
--url localhost:8000 \
--streaming
```
### Example: Benchmark a multimodal model
Benchmark multimodal inference using:
```bash
bash examples/serve/aiperf_client_for_multimodal.sh
```
This runs:
```bash
aiperf profile \
-m Qwen2.5-VL-3B-Instruct \
--tokenizer Qwen/Qwen2.5-VL-3B-Instruct \
--endpoint-type chat \
--random-seed 123 \
--image-width-mean 64 \
--image-height-mean 64 \
--image-format png \
--synthetic-input-tokens-mean 128 \
--synthetic-input-tokens-stddev 0 \
--output-tokens-mean 128 \
--output-tokens-stddev 0 \
--request-count 5 \
--request-rate 1 \
--profile-export-file my_profile_export.json \
--url localhost:8000 \
--streaming
```
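Each run writes its results to the file given by `--profile-export-file` inside AIPerf's artifacts directory. As a rough post-processing sketch (the output layout and JSON schema are defined by AIPerf, not by TensorRT-LLM), the exports can be located and pretty-printed with:
```bash
# Find any profile export JSON produced by the runs above and pretty-print it.
find . -name "*profile_export*.json" -print -exec python3 -m json.tool {} \;
```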

View File

@ -1,6 +1,6 @@
#! /usr/bin/env bash
genai-perf profile \
aiperf profile \
-m TinyLlama-1.1B-Chat-v1.0 \
--tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
--endpoint-type chat \

View File

@ -1,9 +1,9 @@
#! /usr/bin/env bash
genai-perf profile \
aiperf profile \
-m Qwen2.5-VL-3B-Instruct \
--tokenizer Qwen/Qwen2.5-VL-3B-Instruct \
--endpoint-type multimodal \
--endpoint-type chat \
--random-seed 123 \
--image-width-mean 64 \
--image-height-mean 64 \

View File

@ -1 +1 @@
genai-perf
aiperf

View File

@ -35,3 +35,4 @@ opentelemetry-sdk>=1.26.0
opentelemetry-api>=1.26.0
opentelemetry-exporter-otlp>=1.26.0
opentelemetry-semantic-conventions-ai>=0.4.1
aiperf==0.3.0

View File

@ -54,7 +54,7 @@ class WatchEventQueue:
loop = asyncio.get_event_loop()
for event in events:
self.events.put_nowait(event)
loop._write_to_self()
loop.call_soon_threadsafe(lambda: None)
class ClusterStorage(abc.ABC):
@ -268,7 +268,7 @@ class HttpClusterStorageServer(ClusterStorage):
logger.info(
f"Notifying watch event for watch key {watch_key} with type {event_type}"
)
loop._write_to_self()
loop.call_soon_threadsafe(lambda: None)
logger.info(
f"Notified watch event for key {key} with type {event_type}")
@ -426,7 +426,7 @@ class Etcd3WatchEventQueue(WatchEventQueue):
event_type=event_type,
))
if self.events._loop:
self.events._loop._write_to_self()
self.events._loop.call_soon_threadsafe(lambda: None)
except Exception as e:
logger.error(f"Error adding event: {e}")
self.cancel_event()

View File

@ -14,7 +14,7 @@
# limitations under the License.
"""
Stress test script for inference of model using TensorRT LLM with PyTorch/TRT backend.
This script is used for stress testing inference performance using trtllm-serve and genai-perf.
This script is used for stress testing inference performance using trtllm-serve and aiperf.
The script supports three test modes:
1. "stress-test": Runs performance test followed by stress test
@ -48,9 +48,9 @@ from defs.conftest import get_device_count, get_device_memory, llm_models_root
from defs.trt_test_alternative import (Popen, cleanup_process_tree, print_info,
print_warning)
# Install genai-perf in requirements-dev.txt will affect triton and pytorch version mismatch
# def genai_perf_install():
# """Ensures genai-perf is installed without affecting the global environment"""
# Installing aiperf via requirements-dev.txt would cause a triton and pytorch version mismatch
# def aiperf_install():
# """Ensures aiperf is installed without affecting the global environment"""
# import os
# import subprocess
@ -62,7 +62,7 @@ from defs.trt_test_alternative import (Popen, cleanup_process_tree, print_info,
# if not os.path.exists(requirements_file):
# with open(requirements_file, "w") as f:
# f.write("genai-perf\n")
# f.write("aiperf\n")
# subprocess.check_call(
# [sys.executable, "-m", "pip", "install", "-r", requirements_file])
@ -108,7 +108,7 @@ class ModelConfig:
@property
def model_name(self) -> str:
"""Extract model name from model_dir for genai-perf"""
"""Extract model name from model_dir for aiperf"""
return os.path.basename(self.model_dir)
@ -149,14 +149,14 @@ class StressTestConfig:
@property
def request_count_stress_test(self) -> int:
"""Calculate request count for stress test"""
# Cannot set exact stress time in genai-perf test, WR is set the stress_time as customized value to get request count
# aiperf cannot be given an exact stress time; as a workaround, stress_time is set to a customized value and multiplied by the request rate to derive the request count
stress_request_count = self.customized_stress_request_rate * self.customized_stress_time
return stress_request_count
@dataclass(frozen=True)
class PerformanceParams:
"""Dataclass to store test parameters for genai-perf"""
"""Dataclass to store test parameters for aiperf"""
input_len_mean: int = 64 # customized for tinyllama and llama-v3-8b-instruct-hf
input_len_std: int = 16
output_len_mean: int = 128 # customized for tinyllama and llama-v3-8b-instruct-hf
@ -409,7 +409,7 @@ def stress_test(config,
server_config=None,
stress_time=None,
stress_timeout=None):
"""Test LLM model performance using trtllm-serve and genai-perf.
"""Test LLM model performance using trtllm-serve and aiperf.
This function supports multiple testing modes controlled by the --test-mode option:
- "stress-test": Runs the measure capacity stage first, then the stress stage,
@ -426,10 +426,10 @@ def stress_test(config,
stress_time: Optional stress time in seconds, overrides the default in StressTestConfig
stress_timeout: Optional stress timeout in seconds, overrides the default in StressTestConfig
"""
# Ensure genai-perf is installed
# genai_perf_install()
# Import genai-perf - needed after installation to make sure it's available
# import genai_perf # noqa: F401
# Ensure aiperf is installed
# aiperf_install()
# Import aiperf - needed after installation to make sure it's available
# import aiperf # noqa: F401
# Test mode handling - determine which tests to run
if test_mode == "stress-test":
@ -754,17 +754,17 @@ def stress_test(config,
os.unlink(extra_llm_options_path)
def create_genai_perf_command(model_name,
model_path,
request_count,
concurrency,
input_len_mean=PerformanceParams.input_len_mean,
input_len_std=PerformanceParams.input_len_std,
output_len_mean=PerformanceParams.output_len_mean,
output_len_std=PerformanceParams.output_len_std,
warmup_request_count=10):
def create_aiperf_command(model_name,
model_path,
request_count,
concurrency,
input_len_mean=PerformanceParams.input_len_mean,
input_len_std=PerformanceParams.input_len_std,
output_len_mean=PerformanceParams.output_len_mean,
output_len_std=PerformanceParams.output_len_std,
warmup_request_count=10):
"""
Create a command list for genai-perf with standardized parameters.
Create a command list for aiperf with standardized parameters.
Args:
model_name: Name of the model
@ -778,10 +778,10 @@ def create_genai_perf_command(model_name,
warmup_request_count: Number of warmup requests
Returns:
List of command-line arguments for genai-perf
List of command-line arguments for aiperf
"""
return [
"genai-perf",
"aiperf",
"profile",
"-m",
model_name,
@ -809,16 +809,16 @@ def create_genai_perf_command(model_name,
]
def run_genai_perf_process(cmd,
test_start_time,
test_timeout,
server_config,
request_counter=None):
def run_aiperf_process(cmd,
test_start_time,
test_timeout,
server_config,
request_counter=None):
"""
Run a genai-perf process and monitor both the process and server health.
Run an aiperf process and monitor both the process and server health.
Args:
cmd: Command list to execute genai-perf
cmd: Command list to execute aiperf
test_start_time: Start time of the test
test_timeout: Timeout for the test in seconds
server_config: Server configuration object
@ -827,7 +827,7 @@ def run_genai_perf_process(cmd,
Returns:
Boolean indicating whether the process completed successfully
"""
# Start genai-perf process with our context manager
# Start aiperf process with our context manager
with launch_process(cmd,
start_new_session=True,
filter_pattern=None,
@ -836,16 +836,16 @@ def run_genai_perf_process(cmd,
last_health_check = time.time()
process_completed = False
# Monitor both the server and genai-perf process
# Monitor both the server and aiperf process
while process.poll() is None:
current_time = time.time()
# Check if genai-perf is still running but exceeded timeout
# Check if aiperf is still running but exceeded timeout
elapsed_time = current_time - test_start_time
if elapsed_time > test_timeout:
cleanup_process_tree(process, has_session=True)
raise RuntimeError(
f"genai-perf test timed out after {test_timeout} seconds")
f"aiperf test timed out after {test_timeout} seconds")
# Check server health periodically
if current_time - last_health_check > server_config.health_check_timeout:
@ -869,20 +869,20 @@ def run_genai_perf_process(cmd,
time.sleep(0.5)
# Check final status of genai-perf process
# Check final status of aiperf process
retcode = process.poll()
if retcode is not None:
if retcode != 0:
cleanup_process_tree(process, has_session=True)
raise RuntimeError(
f"genai-perf exited with non-zero code: {retcode}")
f"aiperf exited with non-zero code: {retcode}")
else:
print_info("genai-perf completed successfully")
print_info("aiperf completed successfully")
process_completed = True
else:
cleanup_process_tree(process, has_session=True)
raise RuntimeError(
"genai-perf did not complete normally, will terminate")
"aiperf did not complete normally, will terminate")
return process_completed
@ -921,8 +921,8 @@ def measure_capacity_stage(model_name,
f"Running test {test_index+1}/{total_tests}: concurrency={concurrency}, request_count={request_count}"
)
# Prepare genai-perf command
cmd = create_genai_perf_command(
# Prepare aiperf command
cmd = create_aiperf_command(
model_name=model_name,
model_path=model_path,
request_count=request_count,
@ -933,10 +933,10 @@ def measure_capacity_stage(model_name,
output_len_std=performance_params.output_len_std,
warmup_request_count=10)
# Run genai-perf process
process_completed = run_genai_perf_process(
cmd, test_start_time, performance_params.test_timeout,
server_config, request_counter)
# Run aiperf process
process_completed = run_aiperf_process(cmd, test_start_time,
performance_params.test_timeout,
server_config, request_counter)
# Increment completed tests counter if the process completed successfully
if process_completed:
@ -1016,8 +1016,8 @@ def stress_stage(model_name,
if request_counter:
request_counter.reset()
# Prepare genai-perf command
cmd = create_genai_perf_command(
# Prepare aiperf command
cmd = create_aiperf_command(
model_name=model_name,
model_path=model_path,
request_count=request_count,
@ -1028,10 +1028,9 @@ def stress_stage(model_name,
output_len_std=PerformanceParams.output_len_std,
warmup_request_count=10)
# Start genai-perf process
process_completed = run_genai_perf_process(cmd, test_start_time,
test_timeout, server_config,
request_counter)
# Start aiperf process
process_completed = run_aiperf_process(cmd, test_start_time, test_timeout,
server_config, request_counter)
test_end_time = time.time()
duration = int(test_end_time - test_start_time)
@ -1183,14 +1182,14 @@ def extract_stress_test_metrics(artifacts_dir="./artifacts",
artifacts_dir (str): Path to the artifacts directory
current_model (str, optional): If provided, only analyze artifacts for this model
"""
# Find all profile_export_genai_perf.json files in the artifacts directory
# Find all profile_export_aiperf.json files in the artifacts directory
json_files = glob(os.path.join(artifacts_dir,
"**/profile_export_genai_perf.json"),
"**/profile_export_aiperf.json"),
recursive=True)
if not json_files:
raise RuntimeError(
"No profile_export_genai_perf.json files found in the artifacts directory"
"No profile_export_aiperf.json files found in the artifacts directory"
)
# Get a list of directory names in the artifacts directory
@ -1307,8 +1306,7 @@ def extract_stress_test_metrics(artifacts_dir="./artifacts",
range_val = max_val - min_val
if range_val == 0:
raise ValueError(
"Please check OutputTokenThroughput from genai-perf")
raise ValueError("Please check OutputTokenThroughput from aiperf")
else:
normalized_df.loc[
normalized_df["Model"] == model_name,

View File

@ -55,7 +55,7 @@ def example_root():
("python3", "openai_responses_client.py"),
("bash", "curl_chat_client.sh"),
("bash", "curl_completion_client.sh"),
("bash", "genai_perf_client.sh"),
("bash", "aiperf_client.sh"),
("bash", "curl_responses_client.sh")])
def test_trtllm_serve_examples(exe: str, script: str,
server: RemoteOpenAIServer, example_root: str):

View File

@ -60,7 +60,7 @@ def example_root():
@pytest.mark.parametrize("exe, script",
[("python3", "openai_chat_client_for_multimodal.py"),
("bash", "genai_perf_client_for_multimodal.sh")])
("bash", "aiperf_client_for_multimodal.sh")])
def test_trtllm_serve_examples(exe: str, script: str,
server: RemoteOpenAIServer, example_root: str):
client_script = os.path.join(example_root, script)