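"""End-to-end tests for the OpenAI-compatible server with tensor parallelism.

The server is launched with ``--tp_size 2``, so these tests need at least two
GPUs; ``skip_single_gpu`` skips them otherwise. All fixtures are module-scoped,
and the server is parametrized over the default backend and the PyTorch
backend.
"""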
import os
import sys

import openai
import pytest
from openai_server import RemoteOpenAIServer

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from test_llm import get_model_path, prompts

sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
from utils.util import skip_single_gpu


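# Module-scoped fixtures: a single server instance is shared by every test in
# this file.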
@pytest.fixture(scope="module")
def model_name():
    return "llama-models-v3/llama-v3-8b-instruct-hf"


# ``None`` exercises the default backend; ``'pytorch'`` exercises the PyTorch
# backend.
@pytest.fixture(scope="module", params=[None, 'pytorch'])
def backend(request):
    return request.param


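# Launch the server once per module: tensor parallelism across two GPUs
# (--tp_size 2) with beam search disabled (--max_beam_width 1).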
@pytest.fixture(scope="module")
def server(model_name: str, backend: str):
    model_path = get_model_path(model_name)
    args = ["--tp_size", "2", "--max_beam_width", "1"]
    if backend is not None:
        args.append("--backend")
        args.append(backend)
    with RemoteOpenAIServer(model_path, args) as remote_server:
        yield remote_server


@pytest.fixture(scope="module")
def client(server: RemoteOpenAIServer):
    return server.get_client()


@pytest.fixture(scope="module")
def async_client(server: RemoteOpenAIServer):
    return server.get_async_client()


@skip_single_gpu
def test_chat_tp2(client: openai.OpenAI, model_name: str):
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
    }, {
        "role": "user",
        "content": "What is the result of 1+1? Answer in one word: "
    }]
    # With max_tokens=1 the model should answer with the single word "Two".
    chat_completion = client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_tokens=1,
    )
    assert chat_completion.id is not None
    assert len(chat_completion.choices) == 1
    assert chat_completion.usage.completion_tokens == 1
    message = chat_completion.choices[0].message
    assert message.content == 'Two'


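# ``prompts`` is imported from test_llm; the expected continuation
# " D E F G H" for the first choice suggests the first prompt ends with an
# alphabet run such as "... A B C" (inferred from the assertion, not stated
# in this file).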
@skip_single_gpu
def test_completion_tp2(client: openai.OpenAI, model_name: str):
    completion = client.completions.create(
        model=model_name,
        prompt=prompts,
        max_tokens=5,
        temperature=0.0,  # greedy decoding keeps the output deterministic
    )
    assert completion.choices[0].text == " D E F G H"


@skip_single_gpu
@pytest.mark.asyncio(loop_scope="module")
async def test_chat_streaming_tp2(async_client: openai.AsyncOpenAI,
                                  model_name: str):
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
    }, {
        "role": "user",
        "content": "What is the result of 1+1? Answer in one word: "
    }]
    stream = await async_client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_tokens=1,
        stream=True,
    )
    # The role arrives in the first delta; with max_tokens=1 the only content
    # chunk should be "Two".
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.role:
            assert delta.role == "assistant"
        if delta.content:
            assert delta.content == "Two"


@skip_single_gpu
@pytest.mark.asyncio(loop_scope="module")
async def test_completion_streaming_tp2(async_client: openai.AsyncOpenAI,
                                        model_name: str):
    completion = await async_client.completions.create(
        model=model_name,
        prompt=prompts,
        max_tokens=5,
        temperature=0.0,
        stream=True,
    )
    # Reassemble the streamed chunks; the result should match the
    # non-streaming completion above.
    str_chunk = []
    async for chunk in completion:
        str_chunk.append(chunk.choices[0].text)
    assert "".join(str_chunk) == " D E F G H"
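
# To run just this module (requires >= 2 GPUs and the repository's test
# dependencies; the path below is a placeholder, not the file's actual
# location), something like:
#
#   pytest -v path/to/this_test_file.py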