tests: [TRTQA-2906] add benchmark serving tests (#4901)
Signed-off-by: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com>
Co-authored-by: Larry <197874197+LarryXFly@users.noreply.github.com>
commit 1c3091c63b (parent ddbaa5ef80)
@@ -1433,6 +1433,20 @@ def test_openai_multinodes_chat_tp8pp2(llm_root, llm_venv):
     ])
 
 
+@pytest.mark.skip_less_device_memory(80000)
+def test_trtllm_benchmark_serving(llm_root, llm_venv):
+    example_root = Path(os.path.join(llm_root, "examples", "apps"))
+    test_root = unittest_path() / "llmapi" / "apps"
+    llm_venv.run_cmd([
+        "-m", "pip", "install", "-r",
+        os.path.join(example_root, "requirements.txt")
+    ])
+
+    llm_venv.run_cmd(
+        ["-m", "pytest",
+         str(test_root / "_test_trtllm_serve_benchmark.py")])
+
+
 def test_build_time_benchmark_sanity(llm_root, llm_venv):
     temp = tempfile.TemporaryDirectory()
     llm_venv.run_cmd([
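For reference, the wrapper test added above boils down to two commands run inside the test virtualenv: install the example-app requirements, then invoke the standalone benchmark test with pytest. Below is a minimal sketch of the same steps without the llm_venv fixture, assuming LLM_ROOT points at the repository checkout (the env-var name mirrors the one read by the benchmark fixtures in the new test file; the direct subprocess calls here are illustrative, not the fixture's actual implementation).

import os
import subprocess
import sys

# Assumption: LLM_ROOT points at the TensorRT-LLM checkout.
llm_root = os.environ.get("LLM_ROOT", ".")
requirements = os.path.join(llm_root, "examples", "apps", "requirements.txt")
benchmark_test = os.path.join(llm_root, "tests", "unittest", "llmapi", "apps",
                              "_test_trtllm_serve_benchmark.py")

# Install the example-app dependencies into the current interpreter's environment.
subprocess.run([sys.executable, "-m", "pip", "install", "-r", requirements],
               check=True)
# Run the standalone serve-benchmark test with pytest.
subprocess.run([sys.executable, "-m", "pytest", benchmark_test], check=True)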
@@ -490,6 +490,7 @@ test_e2e.py::test_mistral_e2e[use_py_session---]
 test_e2e.py::test_qwen_e2e_cpprunner_large_new_tokens[DeepSeek-R1-Distill-Qwen-1.5B-DeepSeek-R1-Distill-Qwen-1.5B]
 test_e2e.py::test_openai_multi_chat_example
 test_e2e.py::test_openai_consistent_chat
+test_e2e.py::test_trtllm_benchmark_serving
 llmapi/test_llm_examples.py::test_llmapi_server_example
 # Pivot to Pytorch test cases.
 test_e2e.py::test_ptp_quickstart
tests/unittest/llmapi/apps/_test_trtllm_serve_benchmark.py (new file, 60 lines)
@@ -0,0 +1,60 @@
+import os
+import subprocess
+import sys
+
+import pytest
+from utils.util import skip_gpu_memory_less_than_80gb
+
+from .openai_server import RemoteOpenAIServer
+
+sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
+from test_llm import get_model_path
+
+
+@pytest.fixture(scope="module")
+def model_name():
+    return "llama-3.1-model/Meta-Llama-3.1-8B"
+
+
+@pytest.fixture(scope="module")
+def model_path(model_name: str):
+    return get_model_path(model_name)
+
+
+@pytest.fixture(scope="module")
+def server(model_path: str):
+    # fix port to facilitate concise trtllm-serve examples
+    with RemoteOpenAIServer(model_path, port=8000) as remote_server:
+        yield remote_server
+
+
+@pytest.fixture(scope="module")
+def benchmark_root():
+    llm_root = os.getenv("LLM_ROOT")
+    return os.path.join(llm_root, "tensorrt_llm", "serve", "scripts")
+
+
+def dataset_path(dataset_name: str):
+    if dataset_name == "sharegpt":
+        return get_model_path(
+            "datasets/ShareGPT_V3_unfiltered_cleaned_split.json")
+    else:
+        raise ValueError(f"Invalid dataset name: {dataset_name}")
+
+
+@skip_gpu_memory_less_than_80gb
+def test_trtllm_serve_benchmark(server: RemoteOpenAIServer, benchmark_root: str,
+                                model_path: str):
+    client_script = os.path.join(benchmark_root, "benchmark_serving.py")
+    dataset = dataset_path("sharegpt")
+    benchmark_cmd = [
+        "python3", client_script, "--dataset-name", "sharegpt", "--model",
+        "llama", "--dataset-path", dataset, "--tokenizer", model_path
+    ]
+
+    # CalledProcessError will be raised if any errors occur
+    subprocess.run(benchmark_cmd,
+                   stdout=subprocess.PIPE,
+                   stderr=subprocess.PIPE,
+                   text=True,
+                   check=True)
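Taken together, the fixtures above start an OpenAI-compatible trtllm-serve instance on a fixed port (8000) and then drive it with the serving benchmark client. A rough standalone sketch of the same flow is shown below, assuming the helpers in tests/unittest/llmapi/apps are importable as plain modules and LLM_ROOT is set as in the benchmark_root fixture; running the flow outside pytest like this is an illustrative assumption, not how the test itself is executed.

import os
import subprocess

# Assumption: executed from tests/unittest/llmapi/apps with tests/unittest on
# sys.path, so the test helpers import as plain modules.
from openai_server import RemoteOpenAIServer
from test_llm import get_model_path

llm_root = os.environ["LLM_ROOT"]  # same env var the benchmark_root fixture reads
client_script = os.path.join(llm_root, "tensorrt_llm", "serve", "scripts",
                             "benchmark_serving.py")
model_path = get_model_path("llama-3.1-model/Meta-Llama-3.1-8B")
dataset = get_model_path("datasets/ShareGPT_V3_unfiltered_cleaned_split.json")

# Start the server on the pinned port, then run the benchmark client against it
# with the ShareGPT dataset, mirroring the test's benchmark_cmd.
with RemoteOpenAIServer(model_path, port=8000):
    subprocess.run([
        "python3", client_script, "--dataset-name", "sharegpt", "--model",
        "llama", "--dataset-path", dataset, "--tokenizer", model_path
    ], check=True)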