tests: [TRTQA-2906] add benchmark serving tests (#4901)
Signed-off-by: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com>
Co-authored-by: Larry <197874197+LarryXFly@users.noreply.github.com>
commit 1c3091c63b (parent ddbaa5ef80)
@@ -1433,6 +1433,20 @@ def test_openai_multinodes_chat_tp8pp2(llm_root, llm_venv):
     ])
 
 
+@pytest.mark.skip_less_device_memory(80000)
+def test_trtllm_benchmark_serving(llm_root, llm_venv):
+    example_root = Path(os.path.join(llm_root, "examples", "apps"))
+    test_root = unittest_path() / "llmapi" / "apps"
+    llm_venv.run_cmd([
+        "-m", "pip", "install", "-r",
+        os.path.join(example_root, "requirements.txt")
+    ])
+
+    llm_venv.run_cmd(
+        ["-m", "pytest",
+         str(test_root / "_test_trtllm_serve_benchmark.py")])
+
+
 def test_build_time_benchmark_sanity(llm_root, llm_venv):
     temp = tempfile.TemporaryDirectory()
     llm_venv.run_cmd([
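For reference, the wrapper test added above boils down to two commands run inside the test virtualenv: install the example-app requirements, then invoke the standalone benchmark test with pytest. Below is a minimal sketch of the same steps without the llm_venv fixture, assuming LLM_ROOT points at the repository checkout (the env-var name mirrors the one read by the benchmark fixtures in the new test file; the direct subprocess calls here are illustrative, not the fixture's actual implementation).

import os
import subprocess
import sys

# Assumption: LLM_ROOT points at the TensorRT-LLM checkout.
llm_root = os.environ.get("LLM_ROOT", ".")
requirements = os.path.join(llm_root, "examples", "apps", "requirements.txt")
benchmark_test = os.path.join(llm_root, "tests", "unittest", "llmapi", "apps",
                              "_test_trtllm_serve_benchmark.py")

# Install the example-app dependencies into the current interpreter's environment.
subprocess.run([sys.executable, "-m", "pip", "install", "-r", requirements],
               check=True)
# Run the standalone serve-benchmark test with pytest.
subprocess.run([sys.executable, "-m", "pytest", benchmark_test], check=True)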
@@ -490,6 +490,7 @@ test_e2e.py::test_mistral_e2e[use_py_session---]
 test_e2e.py::test_qwen_e2e_cpprunner_large_new_tokens[DeepSeek-R1-Distill-Qwen-1.5B-DeepSeek-R1-Distill-Qwen-1.5B]
 test_e2e.py::test_openai_multi_chat_example
 test_e2e.py::test_openai_consistent_chat
+test_e2e.py::test_trtllm_benchmark_serving
 llmapi/test_llm_examples.py::test_llmapi_server_example
 # Pivot to Pytorch test cases.
 test_e2e.py::test_ptp_quickstart
tests/unittest/llmapi/apps/_test_trtllm_serve_benchmark.py (new file, 60 lines)
@@ -0,0 +1,60 @@
+import os
+import subprocess
+import sys
+
+import pytest
+from utils.util import skip_gpu_memory_less_than_80gb
+
+from .openai_server import RemoteOpenAIServer
+
+sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
+from test_llm import get_model_path
+
+
+@pytest.fixture(scope="module")
+def model_name():
+    return "llama-3.1-model/Meta-Llama-3.1-8B"
+
+
+@pytest.fixture(scope="module")
+def model_path(model_name: str):
+    return get_model_path(model_name)
+
+
+@pytest.fixture(scope="module")
+def server(model_path: str):
+    # fix port to facilitate concise trtllm-serve examples
+    with RemoteOpenAIServer(model_path, port=8000) as remote_server:
+        yield remote_server
+
+
+@pytest.fixture(scope="module")
+def benchmark_root():
+    llm_root = os.getenv("LLM_ROOT")
+    return os.path.join(llm_root, "tensorrt_llm", "serve", "scripts")
+
+
+def dataset_path(dataset_name: str):
+    if dataset_name == "sharegpt":
+        return get_model_path(
+            "datasets/ShareGPT_V3_unfiltered_cleaned_split.json")
+    else:
+        raise ValueError(f"Invalid dataset name: {dataset_name}")
+
+
+@skip_gpu_memory_less_than_80gb
+def test_trtllm_serve_benchmark(server: RemoteOpenAIServer, benchmark_root: str,
+                                model_path: str):
+    client_script = os.path.join(benchmark_root, "benchmark_serving.py")
+    dataset = dataset_path("sharegpt")
+    benchmark_cmd = [
+        "python3", client_script, "--dataset-name", "sharegpt", "--model",
+        "llama", "--dataset-path", dataset, "--tokenizer", model_path
+    ]
+
+    # CalledProcessError will be raised if any errors occur
+    subprocess.run(benchmark_cmd,
+                   stdout=subprocess.PIPE,
+                   stderr=subprocess.PIPE,
+                   text=True,
+                   check=True)
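Taken together, the fixtures above start an OpenAI-compatible trtllm-serve instance on a fixed port (8000) and then drive it with the serving benchmark client. A rough standalone sketch of the same flow is shown below, assuming the helpers in tests/unittest/llmapi/apps are importable as plain modules and LLM_ROOT is set as in the benchmark_root fixture; running the flow outside pytest like this is an illustrative assumption, not how the test itself is executed.

import os
import subprocess

# Assumption: executed from tests/unittest/llmapi/apps with tests/unittest on
# sys.path, so the test helpers import as plain modules.
from openai_server import RemoteOpenAIServer
from test_llm import get_model_path

llm_root = os.environ["LLM_ROOT"]  # same env var the benchmark_root fixture reads
client_script = os.path.join(llm_root, "tensorrt_llm", "serve", "scripts",
                             "benchmark_serving.py")
model_path = get_model_path("llama-3.1-model/Meta-Llama-3.1-8B")
dataset = get_model_path("datasets/ShareGPT_V3_unfiltered_cleaned_split.json")

# Start the server on the pinned port, then run the benchmark client against it
# with the ShareGPT dataset, mirroring the test's benchmark_cmd.
with RemoteOpenAIServer(model_path, port=8000):
    subprocess.run([
        "python3", client_script, "--dataset-name", "sharegpt", "--model",
        "llama", "--dataset-path", dataset, "--tokenizer", model_path
    ], check=True)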