[Bugfix] KimiK2ReasoningParser: guard against buffered end-token in streaming (#41068)

Signed-off-by: Keyi Li <likey6688@gmail.com>
Co-authored-by: Keyi Li <likey6688@gmail.com>
Co-authored-by: Claude <noreply@anthropic.com>
Co-authored-by: Flora Feng <4florafeng@gmail.com>
This commit is contained in:
Keyi Li
2026-05-04 10:42:05 -07:00
committed by GitHub
parent 321fa2d6d1
commit 712ad0286c
2 changed files with 70 additions and 0 deletions
@@ -1,6 +1,8 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from unittest.mock import MagicMock
import pytest
from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
@@ -12,6 +14,20 @@ from vllm.tokenizers import get_tokenizer
REASONING_MODEL_NAME = "moonshotai/Kimi-K2.5"
@pytest.fixture
def mock_kimi_k2_tokenizer():
    """Provide a ``MagicMock`` tokenizer with a fixed special-token vocab.

    ``get_vocab()`` on the returned mock yields a mapping from the think
    markers and the tool-call section/call markers to hard-coded integer IDs.
    Every other attribute is an auto-created ``MagicMock`` member.
    """
    special_token_ids = {
        "<think>": 100,
        "</think>": 101,
        "<|tool_calls_section_begin|>": 200,
        "<|tool_calls_section_end|>": 201,
        "<|tool_call_begin|>": 202,
        "<|tool_call_end|>": 203,
    }
    fake_tokenizer = MagicMock()
    fake_tokenizer.get_vocab.return_value = special_token_ids
    return fake_tokenizer
@pytest.fixture(scope="module")
def kimi_k2_tokenizer():
    """Return the tokenizer for ``REASONING_MODEL_NAME``.

    The fixture is module-scoped, so the tokenizer is obtained once per test
    module. It is requested with ``trust_remote_code=True``.
    """
    real_tokenizer = get_tokenizer(
        tokenizer_name=REASONING_MODEL_NAME,
        trust_remote_code=True,
    )
    return real_tokenizer
@@ -153,3 +169,50 @@ def test_streaming_tool_section_ends_reasoning(kimi_k2_tokenizer):
)
assert isinstance(result, DeltaMessage)
assert result.content == "<|tool_calls_section_begin|>"
def test_streaming_end_token_id_buffered(mock_kimi_k2_tokenizer):
    """The ``</think>`` token ID can arrive before its text is flushed.

    With stop-sequence buffering, the end-token ID may already be present in
    ``delta_token_ids`` while the corresponding string is still withheld from
    ``delta_text``. In that situation the parser is expected to return
    ``None`` and wait for the next delta, rather than slicing ``delta_text``
    with the ``-1`` that ``find()`` yields for a missing substring.
    """
    reasoning_parser = KimiK2ReasoningParser(mock_kimi_k2_tokenizer)
    start_id = reasoning_parser._start_token_id
    end_id = reasoning_parser._end_token_id
    unrelated_id = 999

    # The delta carries the </think> ID, but its text has not been flushed:
    # delta_text holds only ordinary characters. Two IDs are sent in the delta
    # so the call is not treated as a lone special token (per the original
    # author's note about the single-special-token guard).
    streaming_args = dict(
        previous_text="some reasoning",
        current_text="some reasoning extra",
        delta_text="extra",
        previous_token_ids=[start_id],
        current_token_ids=[start_id, end_id, unrelated_id],
        delta_token_ids=[end_id, unrelated_id],
    )
    delta_message = reasoning_parser.extract_reasoning_streaming(**streaming_args)

    assert delta_message is None
def test_streaming_tool_section_id_buffered(mock_kimi_k2_tokenizer):
    """The tool-section start ID can arrive before its text is flushed.

    This is the stop-sequence buffering scenario applied to
    ``<|tool_calls_section_begin|>``: the token ID is in ``delta_token_ids``
    while the marker string is absent from ``delta_text``. The parser is
    expected to return ``None``. Without that guard, ``find()`` yields ``-1``
    and ``delta_text[:-1]`` would drop the final reasoning character.
    """
    reasoning_parser = KimiK2ReasoningParser(mock_kimi_k2_tokenizer)
    start_id = reasoning_parser._start_token_id
    tool_section_id = reasoning_parser._tool_section_start_token_id
    unrelated_id = 999

    # The tool-section marker ID is in the delta, but its text is still
    # buffered, so delta_text contains only ordinary characters.
    streaming_args = dict(
        previous_text="some reasoning",
        current_text="some reasoning extra",
        delta_text="extra",
        previous_token_ids=[start_id],
        current_token_ids=[start_id, tool_section_id, unrelated_id],
        delta_token_ids=[tool_section_id, unrelated_id],
    )
    delta_message = reasoning_parser.extract_reasoning_streaming(**streaming_args)

    assert delta_message is None
@@ -221,6 +221,10 @@ class KimiK2ReasoningParser(ReasoningParser):
return None
if self._end_token_id in delta_token_ids:
if self._end_token not in delta_text:
# Token ID arrived before text was flushed (stop-sequence buffering).
# Wait for the next delta when the text becomes visible.
return None
end_index = delta_text.find(self._end_token)
reasoning = delta_text[:end_index]
content = delta_text[end_index + len(self._end_token) :]
@@ -229,6 +233,9 @@ class KimiK2ReasoningParser(ReasoningParser):
)
if self._tool_section_start_token_id in delta_token_ids:
if self._tool_section_start_token not in delta_text:
# Token ID arrived before text was flushed (stop-sequence buffering).
return None
tool_index = delta_text.find(self._tool_section_start_token)
reasoning = delta_text[:tool_index]
content = delta_text[tool_index:]