Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-01-14 06:27:45 +08:00
[feat] Detokenize option in /v1/completions request (#5382)
Signed-off-by: Yegor <75512761+Wokzy@users.noreply.github.com>
Signed-off-by: Yegor Yershov <yegor6741@gmail.com>
This commit is contained in:
parent 8836990bde
commit 5b7007a69d
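For illustration, the new option can be exercised from the standard OpenAI Python client by passing it through extra_body, the same way the tests added below do. This is a sketch, not part of the commit; the base URL, API key, and model name are placeholders for whatever OpenAI-compatible trtllm-serve deployment is being targeted.

import openai

# Placeholder endpoint and credentials; any OpenAI-compatible trtllm-serve
# deployment works the same way.
client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")

completion = client.completions.create(
    model="TinyLlama-1.1B-Chat-v1.0",  # placeholder model name
    prompt="Hello, my name is",
    max_tokens=5,
    temperature=0.0,
    extra_body=dict(detokenize=False),  # the option added by this commit
)

choice = completion.choices[0]
print(choice.text)       # "" when detokenize=False
print(choice.token_ids)  # raw generated token IDs returned instead of text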
@@ -83,6 +83,7 @@ class CompletionLogProbs(OpenAIBaseModel):
 class CompletionResponseChoice(OpenAIBaseModel):
     index: int
     text: str
+    token_ids: Optional[List[int]] = None
     logprobs: Optional[CompletionLogProbs] = None
     context_logits: Optional[Union[List[float], List[List[
         float]]]] = None  # For reward models, the output is score logits instead of text.
@@ -112,6 +113,7 @@ class CompletionResponse(OpenAIBaseModel):
 class CompletionResponseStreamChoice(OpenAIBaseModel):
     index: int
     text: str
+    token_ids: Optional[List[int]] = None
     logprobs: Optional[CompletionLogProbs] = None
     finish_reason: Optional[str] = None
     stop_reason: Optional[Union[int, str]] = Field(
@@ -187,6 +189,7 @@ class CompletionRequest(OpenAIBaseModel):
     spaces_between_special_tokens: bool = True
     truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
     return_context_logits: bool = False
+    detokenize: bool = True
     # doc: end-completion-sampling-params

     # doc: begin-completion-extra-params
@@ -241,6 +244,7 @@ class CompletionRequest(OpenAIBaseModel):
             return_context_logits=self.return_context_logits,
             guided_decoding=_response_format_to_guided_decoding_params(
                 self.response_format),
+            detokenize=self.detokenize,

             # completion-extra-params
             add_special_tokens=self.add_special_tokens,
@@ -257,6 +257,7 @@ class CompletionPostprocArgs(PostprocArgs):
     model: str = None
     num_choices: int = 1
     prompt_idx: int = 0
+    detokenize: bool = True
     prompt: Optional[str] = None
     stream_options: Optional[StreamOptions] = None

@@ -267,6 +268,7 @@ class CompletionPostprocArgs(PostprocArgs):
             model=request.model,
             num_choices=request.n if request.n else 1,
             stream_options=request.stream_options,
+            detokenize=request.detokenize,
         )

@@ -287,7 +289,8 @@ def completion_stream_post_processor(rsp: DetokenizedGenerationResultBase, args:
             delta_text = args.prompt + delta_text
         choice = CompletionResponseStreamChoice(
             index=args.prompt_idx * args.num_choices + output.index,
-            text=delta_text,
+            text=delta_text if args.detokenize else "",
+            token_ids=None if args.detokenize else output.token_ids_diff,
             finish_reason=output.finish_reason,
             stop_reason=output.stop_reason,
         )
@@ -327,7 +330,8 @@ def completion_response_post_processor(rsp: GenerationResult, args: CompletionPo
             text = args.prompt + text
         disaggregated_params = to_disaggregated_params(output.disaggregated_params)
         choice = CompletionResponseChoice(
-            text=text,
+            text=text if args.detokenize else "",
+            token_ids=None if args.detokenize else output.token_ids,
             index=args.prompt_idx * args.num_choices + output.index,
             disaggregated_params=disaggregated_params,
             context_logits=None if rsp.context_logits is None else rsp.context_logits.tolist(),
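With these post-processing changes, a response to a request that sets detokenize to false carries the generated token IDs and an empty text field. The sketch below is an illustrative choice object, not captured output; the token ID values and finish reason are made up.

# Illustrative shape of one choice in a /v1/completions response when the
# request sets "detokenize": false; values are made up for illustration.
example_choice = {
    "index": 0,
    "text": "",                      # empty because server-side detokenization was skipped
    "token_ids": [3087, 338, 4509],  # generated token IDs (made-up values)
    "logprobs": None,
    "context_logits": None,
    "finish_reason": "length",       # standard OpenAI-style finish reason (assumed value)
}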
@@ -312,3 +312,60 @@ async def test_completion_stream_options(async_client: openai.AsyncOpenAI,
             temperature=0.0,
             stream=False,
             stream_options={"continuous_usage_stats": True})
+
+
+def test_detokenize_single(client: openai.OpenAI, model_name):
+    completion = client.completions.create(
+        model=model_name,
+        prompt="Hello, my name is",
+        max_tokens=5,
+        temperature=0.0,
+        extra_body=dict(detokenize=False),
+    )
+
+    choice = completion.choices[0]
+    assert choice.text == ""
+    assert isinstance(choice.token_ids, list)
+    assert len(choice.token_ids) > 0
+
+    # test using token IDs
+    completion = client.completions.create(
+        model=model_name,
+        prompt=[0, 0, 0, 0, 0],
+        max_tokens=5,
+        temperature=0.0,
+        extra_body=dict(detokenize=True),
+    )
+
+    assert completion.choices[0].token_ids is None
+
+
+@pytest.mark.asyncio(loop_scope="module")
+async def test_completion_streaming(async_client: openai.AsyncOpenAI,
+                                    model_name: str):
+    prompt = "Hello, my name is"
+
+    single_completion = await async_client.completions.create(
+        model=model_name,
+        prompt=prompt,
+        max_tokens=5,
+        temperature=0.0,
+        extra_body=dict(detokenize=False),
+    )
+    single_output = single_completion.choices[0].token_ids
+    stream = await async_client.completions.create(
+        model=model_name,
+        prompt=prompt,
+        max_tokens=5,
+        temperature=0.0,
+        stream=True,
+        extra_body=dict(detokenize=False),
+    )
+    tokens: List[int] = []
+
+    async for chunk in stream:
+        assert chunk.choices[0].text == ""
+        assert isinstance(chunk.choices[0].token_ids, list)
+        tokens.extend(chunk.choices[0].token_ids)
+
+    assert tokens == single_output
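A caller that requests detokenize=false can still recover text locally when needed. The function below is a minimal client-side sketch, not part of the commit; it assumes the Hugging Face tokenizer matching the served model is available, and the default tokenizer name is a placeholder.

from typing import List

from transformers import AutoTokenizer


def decode_token_ids(token_ids: List[int],
                     tokenizer_name: str = "TinyLlama/TinyLlama-1.1B-Chat-v1.0") -> str:
    """Client-side detokenization of IDs returned by a detokenize=False completion."""
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)  # placeholder tokenizer name
    return tokenizer.decode(token_ids, skip_special_tokens=True)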