[Bugfix] KimiK2ReasoningParser: guard against buffered end-token in streaming (#41068)

Signed-off-by: Keyi Li <likey6688@gmail.com> Co-authored-by: Keyi Li <likey6688@gmail.com> Co-authored-by: Claude <noreply@anthropic.com> Co-authored-by: Flora Feng <4florafeng@gmail.com>
2026-06-06 00:16:14 +00:00 · 2026-05-04 10:42:05 -07:00
parent 321fa2d6d1
commit 712ad0286c
2 changed files with 70 additions and 0 deletions
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+from unittest.mock import MagicMock
+
 import pytest

 from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
@@ -12,6 +14,20 @@ from vllm.tokenizers import get_tokenizer
 REASONING_MODEL_NAME = "moonshotai/Kimi-K2.5"


+@pytest.fixture
+def mock_kimi_k2_tokenizer():
+    tokenizer = MagicMock()
+    tokenizer.get_vocab.return_value = {
+        "<think>": 100,
+        "</think>": 101,
+        "<|tool_calls_section_begin|>": 200,
+        "<|tool_calls_section_end|>": 201,
+        "<|tool_call_begin|>": 202,
+        "<|tool_call_end|>": 203,
+    }
+    return tokenizer
+
+
@pytest.fixture(scope="module")
 def kimi_k2_tokenizer():
    return get_tokenizer(tokenizer_name=REASONING_MODEL_NAME, trust_remote_code=True)
@@ -153,3 +169,50 @@ def test_streaming_tool_section_ends_reasoning(kimi_k2_tokenizer):
    )
    assert isinstance(result, DeltaMessage)
    assert result.content == "<|tool_calls_section_begin|>"
+
+
+def test_streaming_end_token_id_buffered(mock_kimi_k2_tokenizer):
+    """When stop sequences buffer text, </think> ID arrives before its text.
+
+    The token ID is present in delta_token_ids but the actual string is not
+    yet in delta_text (still buffered). The parser must return None to wait
+    for the next delta, instead of calling find() which returns -1 and
+    silently corrupting the text split.
+    """
+    parser = KimiK2ReasoningParser(mock_kimi_k2_tokenizer)
+    think_id = parser._start_token_id
+    end_think_id = parser._end_token_id
+
+    # Simulate: </think> ID arrived but text not yet flushed.
+    # Two token IDs in delta to bypass the single-special-token guard.
+    result = parser.extract_reasoning_streaming(
+        previous_text="some reasoning",
+        current_text="some reasoning extra",
+        delta_text="extra",  # </think> text not yet flushed
+        previous_token_ids=[think_id],
+        current_token_ids=[think_id, end_think_id, 999],
+        delta_token_ids=[end_think_id, 999],
+    )
+    assert result is None
+
+
+def test_streaming_tool_section_id_buffered(mock_kimi_k2_tokenizer):
+    """When stop sequences buffer text, tool section start ID arrives before its text.
+
+    Same buffering scenario as above but for <|tool_calls_section_begin|>.
+    Without the guard, find() returns -1 and delta_text[:tool_index] silently
+    drops the last character of reasoning.
+    """
+    parser = KimiK2ReasoningParser(mock_kimi_k2_tokenizer)
+    think_id = parser._start_token_id
+    tool_begin_id = parser._tool_section_start_token_id
+
+    result = parser.extract_reasoning_streaming(
+        previous_text="some reasoning",
+        current_text="some reasoning extra",
+        delta_text="extra",  # tool section text not yet flushed
+        previous_token_ids=[think_id],
+        current_token_ids=[think_id, tool_begin_id, 999],
+        delta_token_ids=[tool_begin_id, 999],
+    )
+    assert result is None
@@ -221,6 +221,10 @@ class KimiK2ReasoningParser(ReasoningParser):
            return None

        if self._end_token_id in delta_token_ids:
+            if self._end_token not in delta_text:
+                # Token ID arrived before text was flushed (stop-sequence buffering).
+                # Wait for the next delta when the text becomes visible.
+                return None
            end_index = delta_text.find(self._end_token)
            reasoning = delta_text[:end_index]
            content = delta_text[end_index + len(self._end_token) :]
@@ -229,6 +233,9 @@ class KimiK2ReasoningParser(ReasoningParser):
            )

        if self._tool_section_start_token_id in delta_token_ids:
+            if self._tool_section_start_token not in delta_text:
+                # Token ID arrived before text was flushed (stop-sequence buffering).
+                return None
            tool_index = delta_text.find(self._tool_section_start_token)
            reasoning = delta_text[:tool_index]
            content = delta_text[tool_index:]