From 5b7007a69d65aa43f643dc6843ed612d61293e2d Mon Sep 17 00:00:00 2001
From: Yegor <75512761+Wokzy@users.noreply.github.com>
Date: Tue, 8 Jul 2025 18:36:04 +0700
Subject: [PATCH] [feat] Detokenize option in /v1/completions request (#5382)

Signed-off-by: Yegor <75512761+Wokzy@users.noreply.github.com>
Signed-off-by: Yegor Yershov
---
 tensorrt_llm/serve/openai_protocol.py         |  4 ++
 tensorrt_llm/serve/postprocess_handlers.py    |  8 ++-
 .../llmapi/apps/_test_openai_completions.py   | 57 +++++++++++++++++++
 3 files changed, 67 insertions(+), 2 deletions(-)

diff --git a/tensorrt_llm/serve/openai_protocol.py b/tensorrt_llm/serve/openai_protocol.py
index 03ad7df27b..90bc9e54be 100644
--- a/tensorrt_llm/serve/openai_protocol.py
+++ b/tensorrt_llm/serve/openai_protocol.py
@@ -83,6 +83,7 @@ class CompletionLogProbs(OpenAIBaseModel):
 class CompletionResponseChoice(OpenAIBaseModel):
     index: int
     text: str
+    token_ids: Optional[List[int]] = None
     logprobs: Optional[CompletionLogProbs] = None
     context_logits: Optional[Union[List[float], List[List[
         float]]]] = None  # For reward models, the output is score logits instead of text.
@@ -112,6 +113,7 @@ class CompletionResponse(OpenAIBaseModel):
 class CompletionResponseStreamChoice(OpenAIBaseModel):
     index: int
     text: str
+    token_ids: Optional[List[int]] = None
     logprobs: Optional[CompletionLogProbs] = None
     finish_reason: Optional[str] = None
     stop_reason: Optional[Union[int, str]] = Field(
@@ -187,6 +189,7 @@ class CompletionRequest(OpenAIBaseModel):
     spaces_between_special_tokens: bool = True
     truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
     return_context_logits: bool = False
+    detokenize: bool = True
     # doc: end-completion-sampling-params

     # doc: begin-completion-extra-params
@@ -241,6 +244,7 @@ class CompletionRequest(OpenAIBaseModel):
             return_context_logits=self.return_context_logits,
             guided_decoding=_response_format_to_guided_decoding_params(
                 self.response_format),
+            detokenize=self.detokenize,

             # completion-extra-params
             add_special_tokens=self.add_special_tokens,
diff --git a/tensorrt_llm/serve/postprocess_handlers.py b/tensorrt_llm/serve/postprocess_handlers.py
index 321ff6cc90..ee199994c4 100644
--- a/tensorrt_llm/serve/postprocess_handlers.py
+++ b/tensorrt_llm/serve/postprocess_handlers.py
@@ -257,6 +257,7 @@ class CompletionPostprocArgs(PostprocArgs):
     model: str = None
     num_choices: int = 1
     prompt_idx: int = 0
+    detokenize: bool = True
     prompt: Optional[str] = None
     stream_options: Optional[StreamOptions] = None

@@ -267,6 +268,7 @@ class CompletionPostprocArgs(PostprocArgs):
             model=request.model,
             num_choices=request.n if request.n else 1,
             stream_options=request.stream_options,
+            detokenize=request.detokenize,
         )


@@ -287,7 +289,8 @@ def completion_stream_post_processor(rsp: DetokenizedGenerationResultBase, args:
             delta_text = args.prompt + delta_text
         choice = CompletionResponseStreamChoice(
             index=args.prompt_idx * args.num_choices + output.index,
-            text=delta_text,
+            text=delta_text if args.detokenize else "",
+            token_ids=None if args.detokenize else output.token_ids_diff,
             finish_reason = output.finish_reason,
             stop_reason = output.stop_reason,
         )
@@ -327,7 +330,8 @@ def completion_response_post_processor(rsp: GenerationResult, args: CompletionPo
             text = args.prompt + text
         disaggregated_params = to_disaggregated_params(output.disaggregated_params)
         choice = CompletionResponseChoice(
-            text=text,
+            text=text if args.detokenize else "",
+            token_ids=None if args.detokenize else output.token_ids,
             index=args.prompt_idx * args.num_choices + output.index,
             disaggregated_params=disaggregated_params,
             context_logits=None if rsp.context_logits is None else rsp.context_logits.tolist(),
diff --git a/tests/unittest/llmapi/apps/_test_openai_completions.py b/tests/unittest/llmapi/apps/_test_openai_completions.py
index 179996cef3..fded3a9f14 100644
--- a/tests/unittest/llmapi/apps/_test_openai_completions.py
+++ b/tests/unittest/llmapi/apps/_test_openai_completions.py
@@ -312,3 +312,60 @@ async def test_completion_stream_options(async_client: openai.AsyncOpenAI,
         temperature=0.0,
         stream=False,
         stream_options={"continuous_usage_stats": True})
+
+
+def test_detokenize_single(client: openai.OpenAI, model_name):
+    completion = client.completions.create(
+        model=model_name,
+        prompt="Hello, my name is",
+        max_tokens=5,
+        temperature=0.0,
+        extra_body=dict(detokenize=False),
+    )
+
+    choice = completion.choices[0]
+    assert choice.text == ""
+    assert isinstance(choice.token_ids, list)
+    assert len(choice.token_ids) > 0
+
+    # test a prompt given as token IDs, with detokenization enabled
+    completion = client.completions.create(
+        model=model_name,
+        prompt=[0, 0, 0, 0, 0],
+        max_tokens=5,
+        temperature=0.0,
+        extra_body=dict(detokenize=True),
+    )
+
+    assert completion.choices[0].token_ids is None
+
+
+@pytest.mark.asyncio(loop_scope="module")
+async def test_detokenize_streaming(async_client: openai.AsyncOpenAI,
+                                    model_name: str):
+    prompt = "Hello, my name is"
+
+    single_completion = await async_client.completions.create(
+        model=model_name,
+        prompt=prompt,
+        max_tokens=5,
+        temperature=0.0,
+        extra_body=dict(detokenize=False),
+    )
+    single_output = single_completion.choices[0].token_ids
+    stream = await async_client.completions.create(
+        model=model_name,
+        prompt=prompt,
+        max_tokens=5,
+        temperature=0.0,
+        stream=True,
+        extra_body=dict(detokenize=False),
+    )
+    tokens: List[int] = []
+
+    async for chunk in stream:
+        assert chunk.choices[0].text == ""
+        assert isinstance(chunk.choices[0].token_ids, list)
+        tokens.extend(chunk.choices[0].token_ids)
+
+    assert tokens == single_output
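
For reference, a minimal client-side sketch of how the new detokenize option can be exercised once the patch is applied. The endpoint URL and model name are placeholders for an OpenAI-compatible server (for example one started with trtllm-serve) running locally; only the detokenize flag and the token_ids response field come from this patch, everything else is an assumption for illustration:

    import openai

    # Point the stock OpenAI client at the local OpenAI-compatible server.
    client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")

    # detokenize is not part of the upstream OpenAI completions schema, so it is
    # passed through extra_body; with detokenize=False the server skips
    # detokenization and returns raw token IDs instead of text.
    completion = client.completions.create(
        model="my-model",
        prompt="Hello, my name is",
        max_tokens=5,
        extra_body={"detokenize": False},
    )

    choice = completion.choices[0]
    print(choice.text)       # empty string, since detokenization was skipped
    print(choice.token_ids)  # generated token IDs instead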