Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-13 22:18:36 +08:00)
[TRTLLM-9091] [feat] Replace GenAI-Perf with AIPerf (#9310)
Signed-off-by: lkomali <lkomali@nvidia.com>
Signed-off-by: Harshini Komali <157742537+lkomali@users.noreply.github.com>
Co-authored-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
This commit is contained in:
parent 5bc7ffe379
commit d691371eaf
@@ -338,3 +338,85 @@ P99 ITL (ms): 6.14

- Control the number of images per request with `--random-num-images`
- Use `--random-image-width` and `--random-image-height` to specify image dimensions, or `--random-image-size` for square image dimensions (see the sketch below).
- The `random_image` dataset generates synthetic images for benchmarking.
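
For illustration only, the sketch below shows how these flags might fit together. The `benchmark_serving` command and the `--dataset-name` flag are hypothetical placeholders for the benchmarking entry point and dataset selector described above; only the `--random-*` flags come from this guide:

```bash
# Hypothetical sketch: the --random-* flags are the ones documented above;
# "benchmark_serving" and --dataset-name are placeholders, not names
# confirmed by this guide.
benchmark_serving \
    --dataset-name random_image \
    --random-num-images 2 \
    --random-image-width 512 \
    --random-image-height 512
```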

## Benchmark using AIPerf

TensorRT-LLM also supports benchmarking `trtllm-serve` using [**AIPerf**](https://github.com/ai-dynamo/aiperf), NVIDIA’s
comprehensive benchmarking tool for LLMs.
AIPerf provides throughput, latency, TTFT, and concurrency measurements for both
text and multimodal workloads.

AIPerf integrates directly with the OpenAI-compatible endpoints exposed by
`trtllm-serve`.
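
A server must therefore be listening before AIPerf can profile it. A minimal sketch of starting one (model name, host, and port are illustrative; check `trtllm-serve --help` for the exact options in your installation):

```bash
# Launch an OpenAI-compatible endpoint for AIPerf to target.
# The model and port below are illustrative.
trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host 0.0.0.0 --port 8000
```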

### Installation

AIPerf is installed with TensorRT-LLM by default.
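
If you want to verify the installation, or install AIPerf manually in a separate environment, something along these lines should work (the version pin mirrors the `aiperf==0.3.0` requirement added in this change; adjust as needed):

```bash
# Confirm the aiperf CLI and package are present.
aiperf --help
pip show aiperf

# Manual install into another environment (pin taken from this change).
pip install aiperf==0.3.0
```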

### Running AIPerf with trtllm-serve

TensorRT-LLM provides example scripts under:

- `examples/serve/aiperf_client.sh`
- `examples/serve/aiperf_client_for_multimodal.sh`

These scripts demonstrate how to benchmark a running trtllm-serve instance using
the `profile` command in AIPerf.

### Example: Benchmark a text model

Once trtllm-serve is running on localhost:8000, run:

```bash
bash examples/serve/aiperf_client.sh
```

The script issues a profiling run:

```bash
aiperf profile \
    -m TinyLlama-1.1B-Chat-v1.0 \
    --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
    --endpoint-type chat \
    --random-seed 123 \
    --synthetic-input-tokens-mean 128 \
    --synthetic-input-tokens-stddev 0 \
    --output-tokens-mean 128 \
    --output-tokens-stddev 0 \
    --request-count 100 \
    --request-rate 10 \
    --profile-export-file my_profile_export.json \
    --url localhost:8000 \
    --streaming
```
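
The `--profile-export-file` flag writes the raw results to `my_profile_export.json`. As a quick sanity check (assuming only that the export is plain JSON; its exact schema is defined by AIPerf, not TensorRT-LLM), you can pretty-print it with standard tooling:

```bash
# Pretty-print the export written by --profile-export-file above;
# head keeps the output short for a first look.
python3 -m json.tool my_profile_export.json | head -n 40
```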

### Example: Benchmark a multimodal model

Benchmark multimodal inference using:

```bash
bash examples/serve/aiperf_client_for_multimodal.sh
```

This runs:

```bash
aiperf profile \
    -m Qwen2.5-VL-3B-Instruct \
    --tokenizer Qwen/Qwen2.5-VL-3B-Instruct \
    --endpoint-type chat \
    --random-seed 123 \
    --image-width-mean 64 \
    --image-height-mean 64 \
    --image-format png \
    --synthetic-input-tokens-mean 128 \
    --synthetic-input-tokens-stddev 0 \
    --output-tokens-mean 128 \
    --output-tokens-stddev 0 \
    --request-count 5 \
    --request-rate 1 \
    --profile-export-file my_profile_export.json \
    --url localhost:8000 \
    --streaming
```

@@ -1,6 +1,6 @@
 #! /usr/bin/env bash

-genai-perf profile \
+aiperf profile \
 -m TinyLlama-1.1B-Chat-v1.0 \
 --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
 --endpoint-type chat \
@@ -1,9 +1,9 @@
 #! /usr/bin/env bash

-genai-perf profile \
+aiperf profile \
 -m Qwen2.5-VL-3B-Instruct \
 --tokenizer Qwen/Qwen2.5-VL-3B-Instruct \
---endpoint-type multimodal \
+--endpoint-type chat \
 --random-seed 123 \
 --image-width-mean 64 \
 --image-height-mean 64 \
@@ -1 +1 @@
-genai-perf
+aiperf
@@ -35,3 +35,4 @@ opentelemetry-sdk>=1.26.0
 opentelemetry-api>=1.26.0
 opentelemetry-exporter-otlp>=1.26.0
 opentelemetry-semantic-conventions-ai>=0.4.1
+aiperf==0.3.0

@@ -54,7 +54,7 @@ class WatchEventQueue:
 loop = asyncio.get_event_loop()
 for event in events:
 self.events.put_nowait(event)
-loop._write_to_self()
+loop.call_soon_threadsafe(lambda: None)


 class ClusterStorage(abc.ABC):
@@ -268,7 +268,7 @@ class HttpClusterStorageServer(ClusterStorage):
 logger.info(
 f"Notifying watch event for watch key {watch_key} with type {event_type}"
 )
-loop._write_to_self()
+loop.call_soon_threadsafe(lambda: None)
 logger.info(
 f"Notified watch event for key {key} with type {event_type}")

@@ -426,7 +426,7 @@ class Etcd3WatchEventQueue(WatchEventQueue):
 event_type=event_type,
 ))
 if self.events._loop:
-self.events._loop._write_to_self()
+self.events._loop.call_soon_threadsafe(lambda: None)
 except Exception as e:
 logger.error(f"Error adding event: {e}")
 self.cancel_event()
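
The three hunks above swap the private `loop._write_to_self()` call for the public `loop.call_soon_threadsafe(...)` API, the supported way to schedule a callback on (and thereby wake) an asyncio event loop from another thread. A standalone sketch of that pattern, with illustrative names and independent of this file:

```python
# Minimal illustration of waking an asyncio event loop from a worker thread
# via call_soon_threadsafe, the public API the change above switches to.
import asyncio
import threading


async def main():
    loop = asyncio.get_running_loop()
    woken = asyncio.Event()

    def worker():
        # Runs on a plain thread: call_soon_threadsafe schedules the callback
        # on the loop and wakes it, without touching private loop internals.
        loop.call_soon_threadsafe(woken.set)

    threading.Thread(target=worker).start()
    await woken.wait()
    print("event loop woken from worker thread")


asyncio.run(main())
```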
@@ -14,7 +14,7 @@
 # limitations under the License.
 """
 Stress test script for inference of model using TensorRT LLM with PyTorch/TRT backend.
-This script is used for stress testing inference performance using trtllm-serve and genai-perf.
+This script is used for stress testing inference performance using trtllm-serve and aiperf.

 The script supports three test modes:
 1. "stress-test": Runs performance test followed by stress test
@@ -48,9 +48,9 @@ from defs.conftest import get_device_count, get_device_memory, llm_models_root
 from defs.trt_test_alternative import (Popen, cleanup_process_tree, print_info,
 print_warning)

-# Install genai-perf in requirements-dev.txt will affect triton and pytorch version mismatch
-# def genai_perf_install():
-# """Ensures genai-perf is installed without affecting the global environment"""
+# Install aiperf in requirements-dev.txt will affect triton and pytorch version mismatch
+# def aiperf_install():
+# """Ensures aiperf is installed without affecting the global environment"""

 # import os
 # import subprocess
@@ -62,7 +62,7 @@ from defs.trt_test_alternative import (Popen, cleanup_process_tree, print_info,

 # if not os.path.exists(requirements_file):
 # with open(requirements_file, "w") as f:
-# f.write("genai-perf\n")
+# f.write("aiperf\n")

 # subprocess.check_call(
 # [sys.executable, "-m", "pip", "install", "-r", requirements_file])
@@ -108,7 +108,7 @@ class ModelConfig:

 @property
 def model_name(self) -> str:
-"""Extract model name from model_dir for genai-perf"""
+"""Extract model name from model_dir for aiperf"""
 return os.path.basename(self.model_dir)

@@ -149,14 +149,14 @@ class StressTestConfig:
 @property
 def request_count_stress_test(self) -> int:
 """Calculate request count for stress test"""
-# Cannot set exact stress time in genai-perf test, WR is set the stress_time as customized value to get request count
+# Cannot set exact stress time in aiperf test, WR is set the stress_time as customized value to get request count
 stress_request_count = self.customized_stress_request_rate * self.customized_stress_time
 return stress_request_count


 @dataclass(frozen=True)
 class PerformanceParams:
-"""Dataclass to store test parameters for genai-perf"""
+"""Dataclass to store test parameters for aiperf"""
 input_len_mean: int = 64 # customized for tinyllama and llama-v3-8b-instruct-hf
 input_len_std: int = 16
 output_len_mean: int = 128 # customized for tinyllama and llama-v3-8b-instruct-hf
@@ -409,7 +409,7 @@ def stress_test(config,
 server_config=None,
 stress_time=None,
 stress_timeout=None):
-"""Test LLM model performance using trtllm-serve and genai-perf.
+"""Test LLM model performance using trtllm-serve and aiperf.

 This function supports multiple testing modes controlled by the --test-mode option:
 - "stress-test": Runs the measure capacity stage first, then the stress stage,
@@ -426,10 +426,10 @@ def stress_test(config,
 stress_time: Optional stress time in seconds, overrides the default in StressTestConfig
 stress_timeout: Optional stress timeout in seconds, overrides the default in StressTestConfig
 """
-# Ensure genai-perf is installed
-# genai_perf_install()
-# Import genai-perf - needed after installation to make sure it's available
-# import genai_perf # noqa: F401
+# Ensure aiperf is installed
+# aiperf_install()
+# Import aiperf - needed after installation to make sure it's available
+# import aiperf # noqa: F401

 # Test mode handling - determine which tests to run
 if test_mode == "stress-test":
@@ -754,17 +754,17 @@ def stress_test(config,
 os.unlink(extra_llm_options_path)


-def create_genai_perf_command(model_name,
-model_path,
-request_count,
-concurrency,
-input_len_mean=PerformanceParams.input_len_mean,
-input_len_std=PerformanceParams.input_len_std,
-output_len_mean=PerformanceParams.output_len_mean,
-output_len_std=PerformanceParams.output_len_std,
-warmup_request_count=10):
+def create_aiperf_command(model_name,
+model_path,
+request_count,
+concurrency,
+input_len_mean=PerformanceParams.input_len_mean,
+input_len_std=PerformanceParams.input_len_std,
+output_len_mean=PerformanceParams.output_len_mean,
+output_len_std=PerformanceParams.output_len_std,
+warmup_request_count=10):
 """
-Create a command list for genai-perf with standardized parameters.
+Create a command list for aiperf with standardized parameters.

 Args:
 model_name: Name of the model
@@ -778,10 +778,10 @@ def create_genai_perf_command(model_name,
 warmup_request_count: Number of warmup requests

 Returns:
-List of command-line arguments for genai-perf
+List of command-line arguments for aiperf
 """
 return [
-"genai-perf",
+"aiperf",
 "profile",
 "-m",
 model_name,
@@ -809,16 +809,16 @@ def create_genai_perf_command(model_name,
 ]


-def run_genai_perf_process(cmd,
-test_start_time,
-test_timeout,
-server_config,
-request_counter=None):
+def run_aiperf_process(cmd,
+test_start_time,
+test_timeout,
+server_config,
+request_counter=None):
 """
-Run a genai-perf process and monitor both the process and server health.
+Run an aiperf process and monitor both the process and server health.

 Args:
-cmd: Command list to execute genai-perf
+cmd: Command list to execute aiperf
 test_start_time: Start time of the test
 test_timeout: Timeout for the test in seconds
 server_config: Server configuration object
@@ -827,7 +827,7 @@ def run_genai_perf_process(cmd,
 Returns:
 Boolean indicating whether the process completed successfully
 """
-# Start genai-perf process with our context manager
+# Start aiperf process with our context manager
 with launch_process(cmd,
 start_new_session=True,
 filter_pattern=None,
@@ -836,16 +836,16 @@ def run_genai_perf_process(cmd,
 last_health_check = time.time()
 process_completed = False

-# Monitor both the server and genai-perf process
+# Monitor both the server and aiperf process
 while process.poll() is None:
 current_time = time.time()

-# Check if genai-perf is still running but exceeded timeout
+# Check if aiperf is still running but exceeded timeout
 elapsed_time = current_time - test_start_time
 if elapsed_time > test_timeout:
 cleanup_process_tree(process, has_session=True)
 raise RuntimeError(
-f"genai-perf test timed out after {test_timeout} seconds")
+f"aiperf test timed out after {test_timeout} seconds")

 # Check server health periodically
 if current_time - last_health_check > server_config.health_check_timeout:
@@ -869,20 +869,20 @@ def run_genai_perf_process(cmd,

 time.sleep(0.5)

-# Check final status of genai-perf process
+# Check final status of aiperf process
 retcode = process.poll()
 if retcode is not None:
 if retcode != 0:
 cleanup_process_tree(process, has_session=True)
 raise RuntimeError(
-f"genai-perf exited with non-zero code: {retcode}")
+f"aiperf exited with non-zero code: {retcode}")
 else:
-print_info("genai-perf completed successfully")
+print_info("aiperf completed successfully")
 process_completed = True
 else:
 cleanup_process_tree(process, has_session=True)
 raise RuntimeError(
-"genai-perf did not complete normally, will terminate")
+"aiperf did not complete normally, will terminate")

 return process_completed

@@ -921,8 +921,8 @@ def measure_capacity_stage(model_name,
 f"Running test {test_index+1}/{total_tests}: concurrency={concurrency}, request_count={request_count}"
 )

-# Prepare genai-perf command
-cmd = create_genai_perf_command(
+# Prepare aiperf command
+cmd = create_aiperf_command(
 model_name=model_name,
 model_path=model_path,
 request_count=request_count,
@@ -933,10 +933,10 @@ def measure_capacity_stage(model_name,
 output_len_std=performance_params.output_len_std,
 warmup_request_count=10)

-# Run genai-perf process
-process_completed = run_genai_perf_process(
-cmd, test_start_time, performance_params.test_timeout,
-server_config, request_counter)
+# Run aiperf process
+process_completed = run_aiperf_process(cmd, test_start_time,
+performance_params.test_timeout,
+server_config, request_counter)

 # Increment completed tests counter if the process completed successfully
 if process_completed:
@@ -1016,8 +1016,8 @@ def stress_stage(model_name,
 if request_counter:
 request_counter.reset()

-# Prepare genai-perf command
-cmd = create_genai_perf_command(
+# Prepare aiperf command
+cmd = create_aiperf_command(
 model_name=model_name,
 model_path=model_path,
 request_count=request_count,
@@ -1028,10 +1028,9 @@ def stress_stage(model_name,
 output_len_std=PerformanceParams.output_len_std,
 warmup_request_count=10)

-# Start genai-perf process
-process_completed = run_genai_perf_process(cmd, test_start_time,
-test_timeout, server_config,
-request_counter)
+# Start aiperf process
+process_completed = run_aiperf_process(cmd, test_start_time, test_timeout,
+server_config, request_counter)

 test_end_time = time.time()
 duration = int(test_end_time - test_start_time)
@@ -1183,14 +1182,14 @@ def extract_stress_test_metrics(artifacts_dir="./artifacts",
 artifacts_dir (str): Path to the artifacts directory
 current_model (str, optional): If provided, only analyze artifacts for this model
 """
-# Find all profile_export_genai_perf.json files in the artifacts directory
+# Find all profile_export_aiperf.json files in the artifacts directory
 json_files = glob(os.path.join(artifacts_dir,
-"**/profile_export_genai_perf.json"),
+"**/profile_export_aiperf.json"),
 recursive=True)

 if not json_files:
 raise RuntimeError(
-"No profile_export_genai_perf.json files found in the artifacts directory"
+"No profile_export_aiperf.json files found in the artifacts directory"
 )

 # Get a list of directory names in the artifacts directory
@@ -1307,8 +1306,7 @@ def extract_stress_test_metrics(artifacts_dir="./artifacts",

 range_val = max_val - min_val
 if range_val == 0:
-raise ValueError(
-"Please check OutputTokenThroughput from genai-perf")
+raise ValueError("Please check OutputTokenThroughput from aiperf")
 else:
 normalized_df.loc[
 normalized_df["Model"] == model_name,
@@ -55,7 +55,7 @@ def example_root():
 ("python3", "openai_responses_client.py"),
 ("bash", "curl_chat_client.sh"),
 ("bash", "curl_completion_client.sh"),
-("bash", "genai_perf_client.sh"),
+("bash", "aiperf_client.sh"),
 ("bash", "curl_responses_client.sh")])
 def test_trtllm_serve_examples(exe: str, script: str,
 server: RemoteOpenAIServer, example_root: str):
@@ -60,7 +60,7 @@ def example_root():

 @pytest.mark.parametrize("exe, script",
 [("python3", "openai_chat_client_for_multimodal.py"),
-("bash", "genai_perf_client_for_multimodal.sh")])
+("bash", "aiperf_client_for_multimodal.sh")])
 def test_trtllm_serve_examples(exe: str, script: str,
 server: RemoteOpenAIServer, example_root: str):
 client_script = os.path.join(example_root, script)