TensorRT-LLMs/tests/integration/defs/examples/serve/test_serve.py
Stanley Sun db8eb0a447
[TRTLLM-7876][test] Test trtllm-serve with --extra_llm_api_options (#7492)
Signed-off-by: Stanley Sun <190317771+StanleySun639@users.noreply.github.com>
2025-09-04 10:34:38 +08:00


import os
import time

import requests
from defs.conftest import llm_models_root, skip_pre_hopper
from defs.trt_test_alternative import popen, print_error, print_info
from openai import OpenAI
from requests.exceptions import RequestException
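
# High-level flow of this module: launch `trtllm-serve` as a subprocess,
# poll its /health endpoint until the server reports ready, then exercise
# the OpenAI-compatible /v1/chat/completions endpoint with the OpenAI client.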


def check_server_ready(http_port="8000", timeout_timer=600, sleep_interval=5):
    timer = 0
    while timer <= timeout_timer:
        try:
            url = f"http://0.0.0.0:{http_port}/health"
            r = requests.get(url, timeout=sleep_interval)
            if r.status_code == 200:
                # The trtllm-serve health endpoint just returns status 200; no JSON body required.
                break
        except RequestException:
            pass
        time.sleep(sleep_interval)
        timer += sleep_interval
    if timer > timeout_timer:
        raise TimeoutError(
            f"Error: launching trtllm-serve timed out after {timeout_timer} seconds."
        )
    print_info(
        f"trtllm-serve launched successfully! The server took {timer} seconds to become ready."
    )


def check_openai_chat_completion(http_port="8000",
                                 model_name="TinyLlama-1.1B-Chat-v1.0"):
    """
    Test the launched trtllm-serve server using the OpenAI client.

    Args:
        http_port: The port where the server is running.
        model_name: The model name to use for the chat completion.
    """
    print_info("Testing trtllm-serve server with OpenAI chat completion...")
    try:
        # Create an OpenAI client pointing to the local server
        client = OpenAI(
            base_url=f"http://localhost:{http_port}/v1",
            api_key="tensorrt_llm",
        )
        # Make a chat completion request
        response = client.chat.completions.create(
            model=model_name,
            messages=[{
                "role": "system",
                "content": "you are a helpful assistant"
            }, {
                "role": "user",
                "content": "What is the capital of China?"
            }],
            max_tokens=50,
        )
        # Validate the response
        assert response is not None, "Response should not be None"
        assert hasattr(response,
                       'choices'), "Response should have choices attribute"
        assert len(
            response.choices) > 0, "Response should have at least one choice"
        assert hasattr(response.choices[0],
                       'message'), "Choice should have message attribute"
        assert hasattr(response.choices[0].message,
                       'content'), "Message should have content attribute"
        content = response.choices[0].message.content
        assert content is not None, "Message content should not be None"
        assert len(content.strip()) > 0, "Message content should not be empty"
        print_info("Chat completion test passed!")
        print_info(
            f"Model: {response.model if hasattr(response, 'model') else 'Unknown'}"
        )
        print_info(f"Response: {content}")
        return response
    except Exception as e:
        print_error(f"Chat completion test failed: {str(e)}")
        raise
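

# The test below passes a YAML file to --extra_llm_api_options, whose keys
# map onto LLM API arguments. The actual contents of Qwen3-30B-A3B-FP8.yml
# are not reproduced here; as a purely illustrative sketch (hypothetical
# values, not taken from this repo), such a file might look like:
#
#   kv_cache_config:
#     free_gpu_memory_fraction: 0.8
#   enable_chunked_prefill: true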


@skip_pre_hopper
def test_extra_llm_api_options(serve_test_root):
    test_configs_root = f"{serve_test_root}/test_configs"
    config_file = f"{test_configs_root}/Qwen3-30B-A3B-FP8.yml"
    model_path = f"{llm_models_root()}/Qwen3/Qwen3-30B-A3B-FP8"

    # Assert that required files and directories exist
    assert os.path.exists(
        test_configs_root
    ), f"test_configs_root directory does not exist: {test_configs_root}"
    assert os.path.exists(
        config_file), f"config_file does not exist: {config_file}"
    assert os.path.exists(
        model_path), f"model_path does not exist: {model_path}"

    cmd = [
        "trtllm-serve",
        "serve",
        model_path,
        "--host",
        "0.0.0.0",
        "--port",
        "8000",
        "--backend",
        "pytorch",
        "--extra_llm_api_options",
        config_file,
    ]

    print_info("Launching trtllm-serve...")
    with popen(cmd):
        check_server_ready()
        # Extract model name from the model path for consistency
        model_name = model_path.split('/')[-1]  # "Qwen3-30B-A3B-FP8"
        # Test the server with OpenAI chat completion
        check_openai_chat_completion(model_name=model_name)
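

# Usage sketch: assuming pytest discovers this module from the repository
# root and that llm_models_root() resolves the model directory via the
# LLM_MODELS_ROOT environment variable (an assumption about the test
# harness, not something defined in this file), the test could be run as:
#
#   LLM_MODELS_ROOT=/path/to/models \
#       pytest tests/integration/defs/examples/serve/test_serve.py::test_extra_llm_api_options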