diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py
index 26bcdee330..c907ae0ac7 100644
--- a/tests/integration/defs/test_e2e.py
+++ b/tests/integration/defs/test_e2e.py
@@ -1433,6 +1433,20 @@ def test_openai_multinodes_chat_tp8pp2(llm_root, llm_venv):
     ])
 
 
+@pytest.mark.skip_less_device_memory(80000)
+def test_trtllm_benchmark_serving(llm_root, llm_venv):
+    example_root = Path(os.path.join(llm_root, "examples", "apps"))
+    test_root = unittest_path() / "llmapi" / "apps"
+    llm_venv.run_cmd([
+        "-m", "pip", "install", "-r",
+        os.path.join(example_root, "requirements.txt")
+    ])
+
+    llm_venv.run_cmd(
+        ["-m", "pytest",
+         str(test_root / "_test_trtllm_serve_benchmark.py")])
+
+
 def test_build_time_benchmark_sanity(llm_root, llm_venv):
     temp = tempfile.TemporaryDirectory()
     llm_venv.run_cmd([
diff --git a/tests/integration/test_lists/qa/examples_test_list.txt b/tests/integration/test_lists/qa/examples_test_list.txt
index 21a5f7d437..36aad365ce 100644
--- a/tests/integration/test_lists/qa/examples_test_list.txt
+++ b/tests/integration/test_lists/qa/examples_test_list.txt
@@ -490,6 +490,7 @@ test_e2e.py::test_mistral_e2e[use_py_session---]
 test_e2e.py::test_qwen_e2e_cpprunner_large_new_tokens[DeepSeek-R1-Distill-Qwen-1.5B-DeepSeek-R1-Distill-Qwen-1.5B]
 test_e2e.py::test_openai_multi_chat_example
 test_e2e.py::test_openai_consistent_chat
+test_e2e.py::test_trtllm_benchmark_serving
 llmapi/test_llm_examples.py::test_llmapi_server_example
 # Pivot to Pytorch test cases.
 test_e2e.py::test_ptp_quickstart
diff --git a/tests/unittest/llmapi/apps/_test_trtllm_serve_benchmark.py b/tests/unittest/llmapi/apps/_test_trtllm_serve_benchmark.py
new file mode 100644
index 0000000000..60be6c7db6
--- /dev/null
+++ b/tests/unittest/llmapi/apps/_test_trtllm_serve_benchmark.py
@@ -0,0 +1,60 @@
+import os
+import subprocess
+import sys
+
+import pytest
+from utils.util import skip_gpu_memory_less_than_80gb
+
+from .openai_server import RemoteOpenAIServer
+
+sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
+from test_llm import get_model_path
+
+
+@pytest.fixture(scope="module")
+def model_name():
+    return "llama-3.1-model/Meta-Llama-3.1-8B"
+
+
+@pytest.fixture(scope="module")
+def model_path(model_name: str):
+    return get_model_path(model_name)
+
+
+@pytest.fixture(scope="module")
+def server(model_path: str):
+    # fix port to facilitate concise trtllm-serve examples
+    with RemoteOpenAIServer(model_path, port=8000) as remote_server:
+        yield remote_server
+
+
+@pytest.fixture(scope="module")
+def benchmark_root():
+    llm_root = os.getenv("LLM_ROOT")
+    return os.path.join(llm_root, "tensorrt_llm", "serve", "scripts")
+
+
+def dataset_path(dataset_name: str):
+    if dataset_name == "sharegpt":
+        return get_model_path(
+            "datasets/ShareGPT_V3_unfiltered_cleaned_split.json")
+    else:
+        raise ValueError(f"Invalid dataset name: {dataset_name}")
+
+
+@skip_gpu_memory_less_than_80gb
+def test_trtllm_serve_benchmark(server: RemoteOpenAIServer, benchmark_root: str,
+                                model_path: str):
+    client_script = os.path.join(benchmark_root, "benchmark_serving.py")
+    dataset = dataset_path("sharegpt")
+    benchmark_cmd = [
+        "python3", client_script, "--dataset-name", "sharegpt", "--model",
+        "llama", "--dataset-path", dataset, "--tokenizer", model_path
+    ]
+
+    # CalledProcessError will be raised if any errors occur
+    subprocess.run(benchmark_cmd,
+                   stdout=subprocess.PIPE,
+                   stderr=subprocess.PIPE,
+                   text=True,
+                   check=True)