TensorRT-LLM/tests/llmapi/apps/_test_openai_multi_gpu.py

import os
import sys

import openai
import pytest

from openai_server import RemoteOpenAIServer

# The shared LLM-API test helpers live one and two directories up from this
# file, so extend sys.path before importing them.
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from test_llm import get_model_path, prompts

sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
from utils.util import skip_single_gpu
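
# Module-scoped fixtures: a single OpenAI-compatible server (tensor-parallel
# across two GPUs) is shared by all tests below and parametrized over the
# default backend and the 'pytorch' backend.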

@pytest.fixture(scope="module")
def model_name():
    return "llama-models-v3/llama-v3-8b-instruct-hf"


@pytest.fixture(scope="module", params=[None, 'pytorch'])
def backend(request):
    return request.param


@pytest.fixture(scope="module")
def server(model_name: str, backend: str):
    model_path = get_model_path(model_name)
    args = ["--tp_size", "2", "--max_beam_width", "1"]
    # backend is None for the default backend; otherwise forward the choice.
    if backend is not None:
        args.append("--backend")
        args.append(backend)
    with RemoteOpenAIServer(model_path, args) as remote_server:
        yield remote_server
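
# Sketch of the roughly equivalent manual launch (assumption: RemoteOpenAIServer
# wraps the trtllm-serve CLI; see openai_server.py for the exact command line):
#   trtllm-serve <model_path> --tp_size 2 --max_beam_width 1 [--backend pytorch]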


@pytest.fixture(scope="module")
def client(server: RemoteOpenAIServer):
    return server.get_client()


@pytest.fixture(scope="module")
def async_client(server: RemoteOpenAIServer):
    return server.get_async_client()
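
# All tests below run with tensor parallelism 2 and are skipped on single-GPU
# machines.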


@skip_single_gpu
def test_chat_tp2(client: openai.OpenAI, model_name: str):
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
    }, {
        "role": "user",
        "content": "What is the result of 1+1? Answer in one word: "
    }]
    chat_completion = client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_tokens=1,
    )
    assert chat_completion.id is not None
    assert len(chat_completion.choices) == 1
    assert chat_completion.usage.completion_tokens == 1
    message = chat_completion.choices[0].message
    assert message.content == 'Two'


@skip_single_gpu
def test_completion_tp2(client: openai.OpenAI, model_name: str):
    completion = client.completions.create(
        model=model_name,
        prompt=prompts,
        max_tokens=5,
        temperature=0.0,
    )
    assert completion.choices[0].text == " D E F G H"


@skip_single_gpu
@pytest.mark.asyncio(loop_scope="module")
async def test_chat_streaming_tp2(async_client: openai.AsyncOpenAI,
                                  model_name: str):
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
    }, {
        "role": "user",
        "content": "What is the result of 1+1? Answer in one word: "
    }]
    stream = await async_client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_tokens=1,
        stream=True,
    )
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.role:
            assert delta.role == "assistant"
        if delta.content:
            assert delta.content == "Two"


@skip_single_gpu
@pytest.mark.asyncio(loop_scope="module")
async def test_completion_streaming_tp2(async_client: openai.AsyncOpenAI,
                                        model_name: str):
    completion = await async_client.completions.create(
        model=model_name,
        prompt=prompts,
        max_tokens=5,
        temperature=0.0,
        stream=True,
    )
    str_chunk = []
    async for chunk in completion:
        str_chunk.append(chunk.choices[0].text)
    assert "".join(str_chunk) == " D E F G H"