diff --git a/tests/reasoning/test_olmo3_reasoning_parser.py b/tests/reasoning/test_olmo3_reasoning_parser.py
index bc0e72e2a45..a74ca50d11a 100644
--- a/tests/reasoning/test_olmo3_reasoning_parser.py
+++ b/tests/reasoning/test_olmo3_reasoning_parser.py
@@ -41,6 +41,12 @@ SIMPLE_REASONING_WITH_MULTIPLE_NEWLINES = {
"content": "\n\n\nThis is the rest",
}
+SIMPLE_REASONING_WITH_TRAILING_SPACE = {
+ "output": f"{START_REASONING}\nLook!\nI'm thinking... {END_REASONING}\nThis is the rest", # noqa: E501
+ "reasoning": "\nLook!\nI'm thinking... ",
+ "content": "\nThis is the rest",
+}
+
NO_REASONING_ONLY_END_THINK = {
"output": f"{END_REASONING}\n\nNo thoughts, head empty!",
"reasoning": None,
@@ -114,6 +120,11 @@ TEST_CASES = [
SIMPLE_REASONING_WITH_MULTIPLE_NEWLINES,
id="simple_reasoning_with_multiple_newlines_streaming",
),
+ pytest.param(
+ True, # enable streaming
+ SIMPLE_REASONING_WITH_TRAILING_SPACE,
+ id="simple_reasoning_with_trailing_space_streaming",
+ ),
pytest.param(
True, # enable streaming
NO_REASONING_ONLY_END_THINK,
@@ -127,7 +138,7 @@ TEST_CASES = [
]
# Global tokenizer initialization to avoid repeated loading
-tokenizer = AutoTokenizer.from_pretrained("allenai/dolma2-tokenizer")
+tokenizer = AutoTokenizer.from_pretrained("allenai/Olmo-3-7B-Think")
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
diff --git a/vllm/reasoning/olmo3_reasoning_parser.py b/vllm/reasoning/olmo3_reasoning_parser.py
index b685aa23185..102508b9ac1 100644
--- a/vllm/reasoning/olmo3_reasoning_parser.py
+++ b/vllm/reasoning/olmo3_reasoning_parser.py
@@ -218,24 +218,32 @@ class Olmo3ReasoningParser(ReasoningParser):
token is missing from generation.
"""
+ think_start: str = r"<think>"
+ think_end: str = r"</think>"
+ # </think> is split in 3 by the pre-tokenizer, first split can be tokenized
+ # with an optional leading space, so there are 2 possible tokenizations
+ think_end_first_split: list[str] = [r"Ġ</", r"</"]
+ think_end_rest_split: list[str] = [r"think", r">"]
+ # notice that the first think is optional; this allows template to
+ # work in cases when we hardcode a <think> at the beginning of the
+ # reasoning template.
+ reasoning_regex: re.Pattern = re.compile(
+ rf"^(?:{think_start})?(?P<reasoning>.*?)"
+ rf"{think_end}(?P<content>.*)$",
+ re.DOTALL,
+ )
+
def __init__(self, tokenizer: "TokenizerLike", *args, **kwargs):
super().__init__(tokenizer, *args, **kwargs)
-
- self.think_start = r"<think>"
- self.think_end = r"</think>"
-
- # notice that the first think is optional; this allows template to
- # work in cases when we hardcode a <think> at the beginning of the
- # reasoning template.
- reasoning_expr = (
- rf"^(?:{self.think_start})?(?P<reasoning>.*?)"
- rf"{self.think_end}(?P<content>.*)$"
- )
- self.reasoning_regex = re.compile(reasoning_expr, re.DOTALL)
-
self.buffer = Olmo3ReasoningBuffer(
think_start=self.think_start, think_end=self.think_end
)
+ self.think_end_first_token_ids: list[int] = [
+ self.vocab[token] for token in self.think_end_first_split
+ ]
+ self.think_end_rest_token_ids: list[int] = [
+ self.vocab[token] for token in self.think_end_rest_split
+ ]
@property
def reasoning_start_str(self) -> str:
@@ -246,8 +254,15 @@ class Olmo3ReasoningParser(ReasoningParser):
return self.think_end
def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
- text = self.model_tokenizer.decode(input_ids)
- return self.think_end in text
+ rest_ids = self.think_end_rest_token_ids
+ rest_len = len(rest_ids)
+ for i in range(len(input_ids) - rest_len, -1, -1):
+ if (
+ list(input_ids[i + 1 : i + 1 + rest_len]) == rest_ids
+ and input_ids[i] in self.think_end_first_token_ids
+ ):
+ return True
+ return False
def extract_content_ids(self, input_ids: list[int]) -> list[int]:
# for Olmo 3 streaming reason parsing, the stream parse