[feat] Add detokenize option to /v1/completions requests (#5382)

Signed-off-by: Yegor <75512761+Wokzy@users.noreply.github.com>
Signed-off-by: Yegor Yershov <yegor6741@gmail.com>
Authored by Yegor on 2025-07-08 18:36:04 +07:00; committed by Zhihan Jiang
parent 8836990bde
commit 5b7007a69d
3 changed files with 67 additions and 2 deletions
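
This commit adds a detokenize flag to the completions endpoint: when a request sets detokenize=False, the server skips detokenization, returns the raw token IDs in a new token_ids field on each choice, and leaves text empty. The flag defaults to True, so existing clients see no change. A minimal usage sketch (placeholder base URL and model name; detokenize travels through extra_body because it is a server-side extension, not part of the OpenAI schema):

import openai

# Placeholder endpoint and credentials; adjust for your deployment.
client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")

completion = client.completions.create(
    model="my-model",  # placeholder model name
    prompt="Hello, my name is",
    max_tokens=5,
    temperature=0.0,
    extra_body=dict(detokenize=False),  # opt out of detokenization
)

choice = completion.choices[0]
print(choice.text)       # "" : text is suppressed when detokenize=False
print(choice.token_ids)  # raw generated token IDs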


@@ -83,6 +83,7 @@ class CompletionLogProbs(OpenAIBaseModel):
class CompletionResponseChoice(OpenAIBaseModel):
    index: int
    text: str
    token_ids: Optional[List[int]] = None
    logprobs: Optional[CompletionLogProbs] = None
    context_logits: Optional[Union[List[float], List[List[float]]]] = None  # For reward models, the output is score logits instead of text.
@@ -112,6 +113,7 @@ class CompletionResponse(OpenAIBaseModel):
class CompletionResponseStreamChoice(OpenAIBaseModel):
    index: int
    text: str
    token_ids: Optional[List[int]] = None
    logprobs: Optional[CompletionLogProbs] = None
    finish_reason: Optional[str] = None
    stop_reason: Optional[Union[int, str]] = Field(
@@ -187,6 +189,7 @@ class CompletionRequest(OpenAIBaseModel):
    spaces_between_special_tokens: bool = True
    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
    return_context_logits: bool = False
    detokenize: bool = True
    # doc: end-completion-sampling-params
    # doc: begin-completion-extra-params
@@ -241,6 +244,7 @@ class CompletionRequest(OpenAIBaseModel):
            return_context_logits=self.return_context_logits,
            guided_decoding=_response_format_to_guided_decoding_params(
                self.response_format),
            detokenize=self.detokenize,
            # completion-extra-params
            add_special_tokens=self.add_special_tokens,
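
The schema changes above are purely additive: token_ids is a new optional field on both choice models, and detokenize is a new sampling parameter that the request forwards alongside the other sampling options. An isolated sketch of the defaults (illustrative Pydantic models, not the project's actual classes) to show that an untouched request keeps today's behavior:

from typing import List, Optional

from pydantic import BaseModel


class CompletionRequestSketch(BaseModel):
    prompt: str
    detokenize: bool = True  # new opt-out flag, on by default


class CompletionChoiceSketch(BaseModel):
    index: int
    text: str
    token_ids: Optional[List[int]] = None  # filled only when detokenize=False


req = CompletionRequestSketch(prompt="Hello")
assert req.detokenize  # default True: existing clients keep receiving text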


@@ -257,6 +257,7 @@ class CompletionPostprocArgs(PostprocArgs):
    model: str = None
    num_choices: int = 1
    prompt_idx: int = 0
    detokenize: bool = True
    prompt: Optional[str] = None
    stream_options: Optional[StreamOptions] = None
@@ -267,6 +268,7 @@ class CompletionPostprocArgs(PostprocArgs):
            model=request.model,
            num_choices=request.n if request.n else 1,
            stream_options=request.stream_options,
            detokenize=request.detokenize,
        )
@@ -287,7 +289,8 @@ def completion_stream_post_processor(rsp: DetokenizedGenerationResultBase, args: CompletionPostprocArgs)
        delta_text = args.prompt + delta_text
    choice = CompletionResponseStreamChoice(
        index=args.prompt_idx * args.num_choices + output.index,
        text=delta_text if args.detokenize else "",
        token_ids=None if args.detokenize else output.token_ids_diff,
        finish_reason=output.finish_reason,
        stop_reason=output.stop_reason,
    )
@@ -327,7 +330,8 @@ def completion_response_post_processor(rsp: GenerationResult, args: CompletionPostprocArgs)
        text = args.prompt + text
    disaggregated_params = to_disaggregated_params(output.disaggregated_params)
    choice = CompletionResponseChoice(
        text=text if args.detokenize else "",
        token_ids=None if args.detokenize else output.token_ids,
        index=args.prompt_idx * args.num_choices + output.index,
        disaggregated_params=disaggregated_params,
        context_logits=None if rsp.context_logits is None else rsp.context_logits.tolist(),
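
Both postprocessors apply the same gate: text when detokenize is on, token IDs when it is off. The streaming path carries the per-chunk token_ids_diff, while the non-streaming path carries the full token_ids sequence. Reduced to a hypothetical standalone helper (not a function in the project, just the selection logic in isolation):

from typing import List, Optional, Tuple


def select_choice_payload(
        detokenize: bool, text: str,
        token_ids: List[int]) -> Tuple[str, Optional[List[int]]]:
    """Return (text, token_ids) as a response choice would carry them."""
    if detokenize:
        return text, None  # normal path: detokenized text only
    return "", token_ids   # opt-out path: raw token IDs, empty text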


@@ -312,3 +312,60 @@ async def test_completion_stream_options(async_client: openai.AsyncOpenAI,
        temperature=0.0,
        stream=False,
        stream_options={"continuous_usage_stats": True})

def test_detokenize_single(client: openai.OpenAI, model_name):
    completion = client.completions.create(
        model=model_name,
        prompt="Hello, my name is",
        max_tokens=5,
        temperature=0.0,
        extra_body=dict(detokenize=False),
    )
    choice = completion.choices[0]
    assert choice.text == ""
    assert isinstance(choice.token_ids, list)
    assert len(choice.token_ids) > 0

    # test using token IDs
    completion = client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
        extra_body=dict(detokenize=True),
    )
    assert completion.choices[0].token_ids is None

@pytest.mark.asyncio(loop_scope="module")
async def test_completion_streaming(async_client: openai.AsyncOpenAI,
                                    model_name: str):
    prompt = "Hello, my name is"
    single_completion = await async_client.completions.create(
        model=model_name,
        prompt=prompt,
        max_tokens=5,
        temperature=0.0,
        extra_body=dict(detokenize=False),
    )
    single_output = single_completion.choices[0].token_ids
    stream = await async_client.completions.create(
        model=model_name,
        prompt=prompt,
        max_tokens=5,
        temperature=0.0,
        stream=True,
        extra_body=dict(detokenize=False),
    )
    tokens: List[int] = []
    async for chunk in stream:
        assert chunk.choices[0].text == ""
        assert isinstance(chunk.choices[0].token_ids, list)
        tokens.extend(chunk.choices[0].token_ids)
    assert tokens == single_output
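
The tests above exercise the flag through the openai client; the same behavior is reachable over raw HTTP, which makes the JSON shape explicit (placeholder endpoint and model name, assuming a locally running server):

import requests

resp = requests.post(
    "http://localhost:8000/v1/completions",  # placeholder endpoint
    json={
        "model": "my-model",  # placeholder model name
        "prompt": "Hello, my name is",
        "max_tokens": 5,
        "detokenize": False,  # the new flag from this commit
    },
)
choice = resp.json()["choices"][0]
assert choice["text"] == ""
assert isinstance(choice["token_ids"], list)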