# tests/unittest/llmapi/apps/_test_openai_completions.py
# Adapted from
# https://github.com/vllm-project/vllm/blob/aae6927be06dedbda39c6b0c30f6aa3242b84388/tests/entrypoints/openai/test_completion.py
from typing import List

import openai
import pytest

from ..test_llm import get_model_path
from .openai_server import RemoteOpenAIServer
from .utils import (invalid_logit_bias_helper, logit_bias_effect_helper,
                    make_server_with_custom_sampler_fixture)


@pytest.fixture(scope="module")
def model_name():
return "llama-models-v2/TinyLlama-1.1B-Chat-v1.0"
@pytest.fixture(scope="module", params=["trt", "pytorch"])
def backend(request):
return request.param
@pytest.fixture(scope="module",
params=[0, 2],
ids=["disable_processpool", "enable_processpool"])
def num_postprocess_workers(request):
return request.param


@pytest.fixture(scope="module")
def server(model_name: str, backend: str, num_postprocess_workers: int):
model_path = get_model_path(model_name)
args = ["--backend", f"{backend}"]
args.extend(["--kv_cache_free_gpu_memory_fraction",
"0.2"]) # for co-existence with other servers
args.extend(["--num_postprocess_workers", f"{num_postprocess_workers}"])
with RemoteOpenAIServer(model_path, args) as remote_server:
yield remote_server


@pytest.fixture(scope="module")
def server_with_beam_search(model_name: str, backend: str,
num_postprocess_workers: int):
model_path = get_model_path(model_name)
args = ["--backend", f"{backend}"]
args.extend(["--kv_cache_free_gpu_memory_fraction",
"0.2"]) # for co-existence with other servers
args.extend(["--max_beam_width", "2"])
args.extend(["--num_postprocess_workers", f"{num_postprocess_workers}"])
with RemoteOpenAIServer(model_path, args) as remote_server:
yield remote_server


@pytest.fixture(scope="module")
def client(server: RemoteOpenAIServer):
return server.get_client()


@pytest.fixture(scope="module")
def async_client(server: RemoteOpenAIServer):
return server.get_async_client()


@pytest.fixture(scope="module")
def async_client_with_beam_search(server_with_beam_search: RemoteOpenAIServer):
return server_with_beam_search.get_async_client()


def test_single_completion(client: openai.OpenAI, model_name):
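    """Non-streaming completion with a text prompt and with token IDs: check finish reason, choice count, and usage accounting."""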
completion = client.completions.create(
model=model_name,
prompt="Hello, my name is",
max_tokens=5,
temperature=0.0,
)
choice = completion.choices[0]
assert len(choice.text) >= 5
assert choice.finish_reason == "length"
assert completion.id is not None
assert completion.choices is not None and len(completion.choices) == 1
completion_tokens = 5
prompt_tokens = 6
assert completion.usage.completion_tokens == completion_tokens
assert completion.usage.prompt_tokens == prompt_tokens
assert completion.usage.total_tokens == prompt_tokens + completion_tokens
# test using token IDs
completion = client.completions.create(
model=model_name,
prompt=[0, 0, 0, 0, 0],
max_tokens=5,
temperature=0.0,
)
assert len(completion.choices[0].text) >= 1


def test_single_completion_with_too_long_prompt(client: openai.OpenAI,
model_name):
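    """A prompt repeated far beyond the usual length should still return a response without raising."""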
completion = client.completions.create(
model=model_name,
prompt="Hello, my name is" * 100,
max_tokens=5,
temperature=0.0,
)
print(completion)


@pytest.mark.asyncio(loop_scope="module")
@pytest.mark.parametrize("echo", [True, False])
async def test_completion_streaming(async_client: openai.AsyncOpenAI,
model_name: str, echo: bool):
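    """Concatenated streamed chunks should match the equivalent non-streamed output."""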
# place this function here to avoid OAI completion bug
prompt = "Hello, my name is"
single_completion = await async_client.completions.create(
model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
echo=echo,
)
single_output = single_completion.choices[0].text
stream = await async_client.completions.create(
model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=True,
echo=echo,
)
chunks: List[str] = []
finish_reason_count = 0
async for chunk in stream:
chunks.append(chunk.choices[0].text)
if chunk.choices[0].finish_reason is not None:
finish_reason_count += 1
assert finish_reason_count == 1
assert chunk.choices[0].finish_reason == "length"
assert chunk.choices[0].text
assert "".join(chunks) == single_output


@pytest.mark.asyncio(loop_scope="module")
@pytest.mark.parametrize("prompts",
[["Hello, my name is"] * 2, [[0, 0, 0, 0, 0]] * 2])
async def test_batch_completions(async_client: openai.AsyncOpenAI, model_name,
prompts):
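    """Two identical prompts in one batch should yield identical greedy outputs."""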
# test simple list
batch = await async_client.completions.create(
model=model_name,
prompt=prompts,
max_tokens=5,
temperature=0.0,
)
assert len(batch.choices) == 2
assert batch.choices[0].text == batch.choices[1].text


@pytest.mark.asyncio(loop_scope="module")
@pytest.mark.parametrize("prompts",
[["Hello, my name is"] * 2, [[0, 0, 0, 0, 0]] * 2])
async def test_batch_completions_beam_search(
async_client_with_beam_search: openai.AsyncOpenAI, model_name, prompts):
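    """With n=2 and beam search, the two beams for a prompt should differ, while identical prompts agree beam-for-beam."""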
# test beam search
batch = await async_client_with_beam_search.completions.create(
model=model_name,
prompt=prompts,
n=2,
max_tokens=5,
temperature=0.0,
extra_body=dict(use_beam_search=True),
)
assert len(batch.choices) == 4
assert batch.choices[0].text != batch.choices[
1].text, "beam search should be different"
assert batch.choices[0].text == batch.choices[
2].text, "two copies of the same prompt should be the same"
assert batch.choices[1].text == batch.choices[
3].text, "two copies of the same prompt should be the same"


@pytest.mark.asyncio(loop_scope="module")
@pytest.mark.parametrize("prompts",
[["Hello, my name is"] * 2, [[0, 0, 0, 0, 0]] * 2])
async def test_batch_completions_streaming(async_client: openai.AsyncOpenAI,
model_name, prompts):
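    """Streamed batch completions are reassembled per choice index and should match across the two identical prompts."""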
# test streaming
batch = await async_client.completions.create(
model=model_name,
prompt=prompts,
max_tokens=5,
temperature=0.0,
stream=True,
)
texts = [""] * 2
async for chunk in batch:
assert len(chunk.choices) == 1
choice = chunk.choices[0]
texts[choice.index] += choice.text
assert texts[0] == texts[1]


@pytest.mark.asyncio(loop_scope="module")
async def test_completion_stream_options(async_client: openai.AsyncOpenAI,
model_name: str):
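    """Exercise all stream_options combinations of include_usage and continuous_usage_stats, plus the invalid non-streaming cases."""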
prompt = "Hello, my name is"
# Test stream=True, stream_options=
# {"include_usage": False, "continuous_usage_stats": False}
stream = await async_client.completions.create(model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=True,
stream_options={
"include_usage":
False,
"continuous_usage_stats":
False,
})
async for chunk in stream:
assert chunk.usage is None
# Test stream=True, stream_options=
# {"include_usage": False, "continuous_usage_stats": True}
stream = await async_client.completions.create(model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=True,
stream_options={
"include_usage":
False,
"continuous_usage_stats":
True,
})
async for chunk in stream:
assert chunk.usage is None
# Test stream=True, stream_options=
# {"include_usage": True, "continuous_usage_stats": False}
stream = await async_client.completions.create(model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=True,
stream_options={
"include_usage":
True,
"continuous_usage_stats":
False,
})
async for chunk in stream:
if chunk.choices[0].finish_reason is None:
assert chunk.usage is None
else:
assert chunk.usage is None
final_chunk = await stream.__anext__()
assert final_chunk.usage is not None
assert final_chunk.usage.prompt_tokens > 0
assert final_chunk.usage.completion_tokens > 0
assert final_chunk.usage.total_tokens == (
final_chunk.usage.prompt_tokens +
final_chunk.usage.completion_tokens)
assert final_chunk.choices == []
# Test stream=True, stream_options=
# {"include_usage": True, "continuous_usage_stats": True}
stream = await async_client.completions.create(model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=True,
stream_options={
"include_usage":
True,
"continuous_usage_stats":
True,
})
async for chunk in stream:
assert chunk.usage is not None
assert chunk.usage.prompt_tokens > 0
assert chunk.usage.completion_tokens > 0
assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
chunk.usage.completion_tokens)
if chunk.choices[0].finish_reason is not None:
final_chunk = await stream.__anext__()
assert final_chunk.usage is not None
assert final_chunk.usage.prompt_tokens > 0
assert final_chunk.usage.completion_tokens > 0
assert final_chunk.usage.total_tokens == (
final_chunk.usage.prompt_tokens +
final_chunk.usage.completion_tokens)
assert final_chunk.choices == []
# Test stream=False, stream_options=
# {"include_usage": None}
with pytest.raises(openai.BadRequestError):
await async_client.completions.create(
model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=False,
stream_options={"include_usage": None})
# Test stream=False, stream_options=
# {"include_usage": True}
with pytest.raises(openai.BadRequestError):
await async_client.completions.create(
model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=False,
stream_options={"include_usage": True})
# Test stream=False, stream_options=
# {"continuous_usage_stats": None}
with pytest.raises(openai.BadRequestError):
await async_client.completions.create(
model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=False,
stream_options={"continuous_usage_stats": None})
# Test stream=False, stream_options=
# {"continuous_usage_stats": True}
with pytest.raises(openai.BadRequestError):
await async_client.completions.create(
model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=False,
stream_options={"continuous_usage_stats": True})


def test_detokenize_single(client: openai.OpenAI, model_name):
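    """detokenize=False returns token IDs with empty text; detokenize=True omits token IDs."""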
completion = client.completions.create(
model=model_name,
prompt="Hello, my name is",
max_tokens=5,
temperature=0.0,
extra_body=dict(detokenize=False),
)
choice = completion.choices[0]
assert choice.text == ""
assert isinstance(choice.token_ids, list)
assert len(choice.token_ids) > 0
# test using token IDs
completion = client.completions.create(
model=model_name,
prompt=[0, 0, 0, 0, 0],
max_tokens=5,
temperature=0.0,
extra_body=dict(detokenize=True),
)
assert completion.choices[0].token_ids is None


@pytest.mark.asyncio(loop_scope="module")
async def test_detokenize_streaming(async_client: openai.AsyncOpenAI,
                                    model_name: str):
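    """Streaming with detokenize=False should yield the same token IDs as the non-streaming request."""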
prompt = "Hello, my name is"
single_completion = await async_client.completions.create(
model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
extra_body=dict(detokenize=False),
)
single_output = single_completion.choices[0].token_ids
stream = await async_client.completions.create(
model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=True,
extra_body=dict(detokenize=False),
)
tokens: List[int] = []
async for chunk in stream:
assert chunk.choices[0].text == ""
assert isinstance(chunk.choices[0].token_ids, list)
tokens.extend(chunk.choices[0].token_ids)
assert tokens == single_output


# Use the shared fixture from utils.py
server_with_custom_sampler = make_server_with_custom_sampler_fixture(
'completions')


@pytest.mark.asyncio(loop_scope='function')
@pytest.mark.parametrize(
'server_with_custom_sampler',
[
{
'sampler_type': "TorchSampler"
}, # torch_sampler
{
'sampler_type': "TRTLLMSampler"
}, # trtllm_sampler
],
indirect=True,
ids=['torch_sampler', 'trtllm_sampler'])
async def test_completion_with_logit_bias_effect(
server_with_custom_sampler: RemoteOpenAIServer,
model_name: str) -> None:
'''Test that logit bias affects output as expected for both samplers (completions endpoint).'''
client = server_with_custom_sampler.get_async_client()
await logit_bias_effect_helper(client, model_name, 'completions')


@pytest.mark.asyncio(loop_scope="module")
async def test_completion_with_invalid_logit_bias(
async_client: openai.AsyncOpenAI, model_name: str):
"""Test with invalid token IDs (non-integer keys)"""
await invalid_logit_bias_helper(async_client, model_name, 'completions')


def test_completion_cached_tokens(client: openai.OpenAI, model_name: str,
backend: str):
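    """Repeating a prompt should report reused (cached) prompt tokens in the usage info."""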
if backend == "trt":
        pytest.skip("Cached tokens are not supported in the trt backend yet")
prompt = "This is a test prompt"
# Run the completion for the first time
single_completion = client.completions.create(
model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
)
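    # Expect every prompt token except the last one to be served from the KV
    # cache when the same prompt is sent again.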
expected_cached_tokens = single_completion.usage.prompt_tokens - 1
# Run the completion for the second time
single_completion = client.completions.create(
model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
)
assert single_completion.usage is not None
assert single_completion.usage.prompt_tokens_details is not None
assert single_completion.usage.prompt_tokens_details.cached_tokens == expected_cached_tokens


@pytest.mark.asyncio(loop_scope="module")
async def test_completion_cached_tokens_stream(async_client: openai.AsyncOpenAI,
model_name: str, backend: str):
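    """Cached prompt tokens should also appear in the usage reported for streamed responses."""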
if backend == "trt":
        pytest.skip("Cached tokens are not supported in the trt backend yet")
prompt = "This is a test prompt"
# Run the completion for the first time so that cached tokens are created
single_completion = await async_client.completions.create(
model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
)
expected_cached_tokens = single_completion.usage.prompt_tokens - 1
# Test stream=True, stream_options=
# {"include_usage": True, "continuous_usage_stats": False}
stream = await async_client.completions.create(model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=True,
stream_options={
"include_usage":
True,
"continuous_usage_stats":
False,
})
async for chunk in stream:
if chunk.choices[0].finish_reason is None:
assert chunk.usage is None
else:
assert chunk.usage is None
final_chunk = await stream.__anext__()
assert final_chunk.usage is not None
assert final_chunk.usage.prompt_tokens > 0
assert final_chunk.usage.completion_tokens > 0
assert final_chunk.usage.total_tokens == (
final_chunk.usage.prompt_tokens +
final_chunk.usage.completion_tokens)
assert final_chunk.usage.prompt_tokens_details is not None
assert final_chunk.usage.prompt_tokens_details.cached_tokens == expected_cached_tokens
assert final_chunk.choices == []
# Test stream=True, stream_options=
# {"include_usage": True, "continuous_usage_stats": True}
stream = await async_client.completions.create(model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=True,
stream_options={
"include_usage":
True,
"continuous_usage_stats":
True,
})
async for chunk in stream:
assert chunk.usage is not None
assert chunk.usage.prompt_tokens > 0
assert chunk.usage.completion_tokens > 0
assert chunk.usage.prompt_tokens_details is not None
assert chunk.usage.prompt_tokens_details.cached_tokens == expected_cached_tokens
assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
chunk.usage.completion_tokens)
if chunk.choices[0].finish_reason is not None:
final_chunk = await stream.__anext__()
assert final_chunk.usage is not None
assert final_chunk.usage.prompt_tokens > 0
assert final_chunk.usage.completion_tokens > 0
assert final_chunk.usage.total_tokens == (
final_chunk.usage.prompt_tokens +
final_chunk.usage.completion_tokens)
assert final_chunk.usage.prompt_tokens_details is not None
assert final_chunk.usage.prompt_tokens_details.cached_tokens == expected_cached_tokens
assert final_chunk.choices == []