mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
79 lines
2.3 KiB
Python
# Adapted from
|
|
# https://github.com/vllm-project/vllm/blob/baaedfdb2d3f1d70b7dbcde08b083abfe6017a92/tests/utils.py
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
from typing import List
|
|
|
|
import openai
|
|
import requests
|
|
|
|
from tensorrt_llm.llmapi.mpi_session import find_free_port
|
|
|
|
|
|
class RemoteOpenAIServer:
    """Launch a ``trtllm-serve`` OpenAI-compatible server in a subprocess
    and wait until its health endpoint responds.

    Intended for use as a context manager in tests: the server is
    terminated (and killed if necessary) on exit.
    """

    # Dummy key: the local server does not authenticate, but the OpenAI
    # client requires a non-empty api_key.
    DUMMY_API_KEY = "tensorrt_llm"
    MAX_SERVER_START_WAIT_S = 600  # wait up to 600 seconds for server start

    def __init__(
        self,
        model: str,
        cli_args: List[str],
    ) -> None:
        """Start ``trtllm-serve`` serving *model* on a free local port.

        Args:
            model: Model name or path handed to ``trtllm-serve``.
            cli_args: Extra command-line arguments; ``--host`` and
                ``--port`` are appended automatically.

        Raises:
            RuntimeError: If the server exits or fails to become healthy
                within ``MAX_SERVER_START_WAIT_S`` seconds.
        """
        self.host = "localhost"
        self.port = find_free_port()

        # Build a new list instead of `+=` so the caller's list is not
        # mutated as a side effect.
        cli_args = cli_args + [
            "--host", f"{self.host}", "--port", f"{self.port}"
        ]
        self.proc = subprocess.Popen(["trtllm-serve"] + [model] + cli_args,
                                     stdout=sys.stdout,
                                     stderr=sys.stderr)
        self._wait_for_server(url=self.url_for("health"),
                              timeout=self.MAX_SERVER_START_WAIT_S)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Graceful shutdown first; escalate to SIGKILL if the process
        # does not exit within 30 seconds.
        self.proc.terminate()
        try:
            self.proc.wait(timeout=30)
        except subprocess.TimeoutExpired:
            self.proc.kill()
            self.proc.wait(timeout=30)

    def _wait_for_server(self, *, url: str, timeout: float):
        """Poll *url* until it returns HTTP 200 or *timeout* elapses.

        Raises:
            RuntimeError: If the server process exits with a non-zero
                return code, or the health check does not succeed within
                *timeout* seconds.
        """
        start = time.time()
        while True:
            try:
                if requests.get(url).status_code == 200:
                    break
            except Exception as err:
                # Connection refused / reset while the server is still
                # starting up: only fatal if the process itself died.
                result = self.proc.poll()
                if result is not None and result != 0:
                    raise RuntimeError("Server exited unexpectedly.") from err

            # Sleep and timeout check are OUTSIDE the except clause so a
            # server that replies with a non-200 status (without raising)
            # still sleeps between polls and still hits the timeout,
            # instead of busy-looping forever.
            time.sleep(0.5)
            if time.time() - start > timeout:
                raise RuntimeError("Server failed to start in time.")

    @property
    def url_root(self) -> str:
        return f"http://{self.host}:{self.port}"

    def url_for(self, *parts: str) -> str:
        return self.url_root + "/" + "/".join(parts)

    def get_client(self):
        return openai.OpenAI(
            base_url=self.url_for("v1"),
            api_key=self.DUMMY_API_KEY,
        )

    def get_async_client(self):
        return openai.AsyncOpenAI(
            base_url=self.url_for("v1"),
            api_key=self.DUMMY_API_KEY,
        )
|