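"""Test launching trtllm-serve with --extra_llm_api_options and exercising the
OpenAI-compatible chat completions API against the running server."""
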
import os
import time

import requests
from defs.conftest import llm_models_root, skip_pre_hopper
from defs.trt_test_alternative import popen, print_error, print_info
from openai import OpenAI
from requests.exceptions import RequestException


def check_server_ready(http_port="8000", timeout_timer=600, sleep_interval=5):
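    """Poll the trtllm-serve /health endpoint until it returns HTTP 200 or
    `timeout_timer` seconds have elapsed."""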
    timer = 0
    while timer <= timeout_timer:
        try:
            url = f"http://0.0.0.0:{http_port}/health"
            r = requests.get(url, timeout=sleep_interval)
            if r.status_code == 200:
                # trtllm-serve health endpoint just returns status 200, no JSON body required
                break
        except RequestException:
            pass

        time.sleep(sleep_interval)
        timer += sleep_interval

    if timer > timeout_timer:
        raise TimeoutError(
            f"Error: launching trtllm-serve timed out after {timeout_timer} seconds."
        )

    print_info(
        f"trtllm-serve launched successfully! Took {timer} seconds to launch the server."
    )


def check_openai_chat_completion(http_port="8000",
                                 model_name="TinyLlama-1.1B-Chat-v1.0"):
    """
    Test the launched trtllm-serve server using the OpenAI client.

    Args:
        http_port: The port where the server is running
        model_name: The model name to use for the chat completion
    """
    print_info("Testing trtllm-serve server with OpenAI chat completion...")

    try:
        # Create OpenAI client pointing to the local server
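        # api_key is a placeholder: the OpenAI client requires a value, and the
        # local trtllm-serve endpoint is not expected to validate it.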
        client = OpenAI(
            base_url=f"http://localhost:{http_port}/v1",
            api_key="tensorrt_llm",
        )

        # Make a chat completion request
        response = client.chat.completions.create(
            model=model_name,
            messages=[{
                "role": "system",
                "content": "you are a helpful assistant"
            }, {
                "role": "user",
                "content": "What is the capital of China?"
            }],
            max_tokens=50,
        )

        # Validate the response
        assert response is not None, "Response should not be None"
        assert hasattr(response,
                       'choices'), "Response should have choices attribute"
        assert len(
            response.choices) > 0, "Response should have at least one choice"
        assert hasattr(response.choices[0],
                       'message'), "Choice should have message attribute"
        assert hasattr(response.choices[0].message,
                       'content'), "Message should have content attribute"

        content = response.choices[0].message.content
        assert content is not None, "Message content should not be None"
        assert len(content.strip()) > 0, "Message content should not be empty"

        print_info("Chat completion test passed!")
        print_info(
            f"Model: {response.model if hasattr(response, 'model') else 'Unknown'}"
        )
        print_info(f"Response: {content}")

        return response
    except Exception as e:
        print_error(f"Chat completion test failed: {str(e)}")
        raise


@skip_pre_hopper
def test_extra_llm_api_options(serve_test_root):
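    """Launch trtllm-serve with --extra_llm_api_options and verify that the
    OpenAI-compatible chat completions endpoint responds."""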
    test_configs_root = f"{serve_test_root}/test_configs"
    config_file = f"{test_configs_root}/Qwen3-30B-A3B-FP8.yml"
    model_path = f"{llm_models_root()}/Qwen3/Qwen3-30B-A3B-FP8"

    # Assert that required files and directories exist
    assert os.path.exists(
        test_configs_root
    ), f"test_configs_root directory does not exist: {test_configs_root}"
    assert os.path.exists(
        config_file), f"config_file does not exist: {config_file}"
    assert os.path.exists(
        model_path), f"model_path does not exist: {model_path}"
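
    # --extra_llm_api_options points trtllm-serve at a YAML file of additional
    # LLM API arguments; here it is the Qwen3-30B-A3B-FP8 config asserted above.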
    cmd = [
        "trtllm-serve",
        "serve",
        model_path,
        "--host",
        "0.0.0.0",
        "--port",
        "8000",
        "--backend",
        "pytorch",
        "--extra_llm_api_options",
        config_file,
    ]
print_info("Launching trtllm-serve...")
|
|
with popen(cmd):
|
|
check_server_ready()
|
|
# Extract model name from the model path for consistency
|
|
model_name = model_path.split('/')[-1] # "Qwen3-30B-A3B-FP8"
|
|
# Test the server with OpenAI chat completion
|
|
check_openai_chat_completion(model_name=model_name)
|