mirror of
https://github.com/vllm-project/vllm.git
synced 2026-06-06 00:16:14 +00:00
[Bugfix] KimiK2ReasoningParser: guard against buffered end-token in streaming (#41068)
Signed-off-by: Keyi Li <likey6688@gmail.com> Co-authored-by: Keyi Li <likey6688@gmail.com> Co-authored-by: Claude <noreply@anthropic.com> Co-authored-by: Flora Feng <4florafeng@gmail.com>
This commit is contained in:
@@ -1,6 +1,8 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
|
||||
@@ -12,6 +14,20 @@ from vllm.tokenizers import get_tokenizer
|
||||
REASONING_MODEL_NAME = "moonshotai/Kimi-K2.5"
|
||||
|
||||
|
||||
@pytest.fixture
def mock_kimi_k2_tokenizer():
    """Return a mocked tokenizer whose vocabulary holds only special tokens.

    The fixture is a ``MagicMock`` whose ``get_vocab()`` returns a fixed
    mapping from the reasoning markers (``<think>`` / ``</think>``) and the
    tool-call markers to arbitrary, distinct token IDs. No real tokenizer is
    loaded.
    """
    tokenizer = MagicMock()
    tokenizer.get_vocab.return_value = {
        # Reasoning delimiters.
        "<think>": 100,
        "</think>": 101,
        # Tool-call delimiters.
        "<|tool_calls_section_begin|>": 200,
        "<|tool_calls_section_end|>": 201,
        "<|tool_call_begin|>": 202,
        "<|tool_call_end|>": 203,
    }
    return tokenizer
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def kimi_k2_tokenizer():
    """Return the tokenizer for ``REASONING_MODEL_NAME``.

    Module-scoped, so ``get_tokenizer`` is called once per test module rather
    than once per test.
    """
    return get_tokenizer(tokenizer_name=REASONING_MODEL_NAME, trust_remote_code=True)
|
||||
@@ -153,3 +169,50 @@ def test_streaming_tool_section_ends_reasoning(kimi_k2_tokenizer):
|
||||
)
|
||||
assert isinstance(result, DeltaMessage)
|
||||
assert result.content == "<|tool_calls_section_begin|>"
|
||||
|
||||
|
||||
def test_streaming_end_token_id_buffered(mock_kimi_k2_tokenizer):
    """When stop sequences buffer text, the </think> ID arrives before its text.

    The token ID is present in delta_token_ids but the actual string is not
    yet in delta_text (still buffered). The parser must return None and wait
    for the next delta; otherwise find() returns -1 and the reasoning/content
    split is silently corrupted.
    """
    parser = KimiK2ReasoningParser(mock_kimi_k2_tokenizer)
    think_id = parser._start_token_id
    end_think_id = parser._end_token_id

    # Simulate: </think> ID arrived but text not yet flushed.
    # Two token IDs in delta to bypass the single-special-token guard.
    # delta_text is exactly the suffix that turns previous_text into
    # current_text, so the simulated stream state is self-consistent.
    result = parser.extract_reasoning_streaming(
        previous_text="some reasoning",
        current_text="some reasoning extra",
        delta_text=" extra",  # </think> text not yet flushed
        previous_token_ids=[think_id],
        current_token_ids=[think_id, end_think_id, 999],
        delta_token_ids=[end_think_id, 999],
    )
    assert result is None
|
||||
|
||||
|
||||
def test_streaming_tool_section_id_buffered(mock_kimi_k2_tokenizer):
    """When stop sequences buffer text, tool section start ID arrives before its text.

    Same buffering scenario as for </think>, but for
    <|tool_calls_section_begin|>: the token ID is in delta_token_ids while the
    marker string is not yet in delta_text. Without the guard, find() returns
    -1 and delta_text[:tool_index] silently drops the last character of
    reasoning.
    """
    parser = KimiK2ReasoningParser(mock_kimi_k2_tokenizer)
    think_id = parser._start_token_id
    tool_begin_id = parser._tool_section_start_token_id

    # Two token IDs in the delta, with delta_text being exactly the suffix
    # that turns previous_text into current_text.
    result = parser.extract_reasoning_streaming(
        previous_text="some reasoning",
        current_text="some reasoning extra",
        delta_text=" extra",  # tool section text not yet flushed
        previous_token_ids=[think_id],
        current_token_ids=[think_id, tool_begin_id, 999],
        delta_token_ids=[tool_begin_id, 999],
    )
    assert result is None
|
||||
|
||||
@@ -221,6 +221,10 @@ class KimiK2ReasoningParser(ReasoningParser):
|
||||
return None
|
||||
|
||||
if self._end_token_id in delta_token_ids:
|
||||
if self._end_token not in delta_text:
|
||||
# Token ID arrived before text was flushed (stop-sequence buffering).
|
||||
# Wait for the next delta when the text becomes visible.
|
||||
return None
|
||||
end_index = delta_text.find(self._end_token)
|
||||
reasoning = delta_text[:end_index]
|
||||
content = delta_text[end_index + len(self._end_token) :]
|
||||
@@ -229,6 +233,9 @@ class KimiK2ReasoningParser(ReasoningParser):
|
||||
)
|
||||
|
||||
if self._tool_section_start_token_id in delta_token_ids:
|
||||
if self._tool_section_start_token not in delta_text:
|
||||
# Token ID arrived before text was flushed (stop-sequence buffering).
|
||||
return None
|
||||
tool_index = delta_text.find(self._tool_section_start_token)
|
||||
reasoning = delta_text[:tool_index]
|
||||
content = delta_text[tool_index:]
|
||||
|
||||
Reference in New Issue
Block a user