From 52a31ccecca203effd490a5b496dc5f8d9496654 Mon Sep 17 00:00:00 2001 From: Ashwin Giridharan Date: Wed, 27 May 2026 05:39:49 -0700 Subject: [PATCH] [Bugfix] Map reasoning_effort to enable_thinking in chat template kwargs (#43401) Signed-off-by: Ashwin Giridharan Signed-off-by: Chauncey Co-authored-by: Chauncey --- docs/features/reasoning_outputs.md | 39 ++++++- .../openai/test_reasoning_enable_thinking.py | 107 ++++++++++++++++++ .../openai/chat_completion/protocol.py | 22 +++- vllm/entrypoints/openai/responses/protocol.py | 23 +++- 4 files changed, 178 insertions(+), 13 deletions(-) create mode 100644 tests/entrypoints/openai/test_reasoning_enable_thinking.py diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md index f1cc18a25cb..92563a8b4bb 100644 --- a/docs/features/reasoning_outputs.md +++ b/docs/features/reasoning_outputs.md @@ -15,6 +15,7 @@ vLLM currently supports the following reasoning models: | ------------ | ----------- | ---------------- | ----------- | | [Cohere Command A Reasoning](https://huggingface.co/CohereLabs/command-a-reasoning-08-2025) | `cohere_command3` | `json`, `regex` | ✅ | | [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `json`, `regex` | ❌ | +| [Gemma 4 series](https://huggingface.co/google/gemma-4-26B-A4B-it) | `gemma4` | `json`, `regex` | ✅ | | [DeepSeek-V3.1](https://huggingface.co/collections/deepseek-ai/deepseek-v31-68a491bed32bd77e7fca048f) | `deepseek_v3` | `json`, `regex` | ❌ | | [ERNIE-4.5-VL series](https://huggingface.co/baidu/ERNIE-4.5-VL-28B-A3B-PT) | `ernie45` | `json`, `regex` | ❌ | | [ERNIE-4.5-21B-A3B-Thinking](https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking) | `ernie45` | `json`, `regex` | ✅ | @@ -29,6 +30,7 @@ vLLM currently supports the following reasoning models: !!! 
note IBM Granite 3.2 and DeepSeek-V3.1 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`. The reasoning feature for the Qwen3 series is enabled by default. To disable it, you must pass `enable_thinking=False` in your `chat_template_kwargs`. + Gemma 4 reasoning is disabled by default; to enable it, pass `enable_thinking=True` in your `chat_template_kwargs` or set `reasoning_effort` to any value other than `"none"` (which enables it automatically). DeepSeek-V3.1 tool calling is supported in non-thinking mode. Holo2 reasoning is enabled by default. To disable it, you must also pass `thinking=False` in your `chat_template_kwargs`. @@ -314,9 +316,44 @@ for output in outputs: print("text:", output.outputs[0].text) ``` +## Automatic `enable_thinking` Activation + +Some models (such as Gemma 4 and DeepSeek-V4-Pro) require `enable_thinking: true` in their chat template kwargs to activate thinking mode — without it, reasoning tokens are never generated regardless of other settings. Only `enable_thinking` is injected: models whose templates use a differently named switch (for example, `thinking` for IBM Granite 3.2, as noted earlier on this page) still need that kwarg passed explicitly. + +When you set `reasoning_effort` in a Chat Completions request (or `reasoning.effort` in a Responses API request), vLLM automatically injects `enable_thinking` into the chat template kwargs: + +- `reasoning_effort` = any value other than `"none"` (e.g., `"low"`, `"medium"`, or `"high"`) → `enable_thinking = true` +- `reasoning_effort` = `"none"` → `enable_thinking = false` +- `reasoning_effort` not set → `enable_thinking` is not injected (preserves existing behavior) + +This means you no longer need to manually pass `chat_template_kwargs: {"enable_thinking": true}` when using `reasoning_effort` — it is handled automatically. + +!!! note + If you explicitly set `enable_thinking` in `chat_template_kwargs`, your value takes priority over the automatic injection. This allows you to override the behavior if needed. + + For models whose templates don't declare `enable_thinking` (e.g., DeepSeek R1), the injected kwarg is harmlessly filtered out by `resolve_chat_template_kwargs`. 
+ +### Example + +```python +from openai import OpenAI + +client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy") + +# reasoning_effort automatically enables thinking for models that need it +response = client.chat.completions.create( + model="google/gemma-4-26B-A4B-it", + messages=[{"role": "user", "content": "What is 15 * 37?"}], + reasoning_effort="high", # Automatically sets enable_thinking=true +) + +print(response.choices[0].message.reasoning) +print(response.choices[0].message.content) +``` + ## Limitations -- The reasoning content is only available for online serving's chat completion endpoint (`/v1/chat/completions`). +- The reasoning content is only available for online serving's chat completion endpoint (`/v1/chat/completions`), Anthropic Messages API (`/v1/messages`) and the Responses API (`/v1/responses`). ## How to support a new reasoning model diff --git a/tests/entrypoints/openai/test_reasoning_enable_thinking.py b/tests/entrypoints/openai/test_reasoning_enable_thinking.py new file mode 100644 index 00000000000..95606f626a0 --- /dev/null +++ b/tests/entrypoints/openai/test_reasoning_enable_thinking.py @@ -0,0 +1,107 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Unit tests for reasoning_effort -> enable_thinking mapping. + +Models like Gemma4 require enable_thinking=True in chat_template_kwargs to +activate thinking mode. This mapping ensures that when a user requests +reasoning (via reasoning_effort or reasoning.effort), the template kwarg +is injected automatically. 
+""" + +import pytest +from openai.types.shared import Reasoning + +from vllm.entrypoints.openai.chat_completion.protocol import ( + ChatCompletionRequest, +) +from vllm.entrypoints.openai.responses.protocol import ResponsesRequest + + +def _build_chat_request(**kwargs) -> ChatCompletionRequest: + defaults = dict( + model="test-model", + messages=[{"role": "user", "content": "Hello"}], + ) + defaults.update(kwargs) + return ChatCompletionRequest(**defaults) + + +def _build_responses_request(**kwargs) -> ResponsesRequest: + defaults = dict( + model="test-model", + input=[{"role": "user", "content": "Hello"}], + ) + defaults.update(kwargs) + return ResponsesRequest(**defaults) + + +class TestChatCompletionReasoningEffort: + """Chat Completions: reasoning_effort -> enable_thinking.""" + + @pytest.mark.parametrize("effort", ["low", "medium", "high"]) + def test_non_none_effort_injects_enable_thinking_true(self, effort): + request = _build_chat_request(reasoning_effort=effort) + params = request.build_chat_params(None, "auto") + assert params.chat_template_kwargs["enable_thinking"] is True + + def test_none_effort_injects_enable_thinking_false(self): + request = _build_chat_request(reasoning_effort="none") + params = request.build_chat_params(None, "auto") + assert params.chat_template_kwargs["enable_thinking"] is False + + def test_no_effort_does_not_inject(self): + request = _build_chat_request() + params = request.build_chat_params(None, "auto") + assert "enable_thinking" not in params.chat_template_kwargs + + def test_explicit_user_kwarg_not_overridden(self): + request = _build_chat_request( + reasoning_effort="high", + chat_template_kwargs={"enable_thinking": False}, + ) + params = request.build_chat_params(None, "auto") + assert params.chat_template_kwargs["enable_thinking"] is False + + def test_reasoning_effort_still_in_kwargs(self): + request = _build_chat_request(reasoning_effort="high") + params = request.build_chat_params(None, "auto") + assert 
params.chat_template_kwargs["reasoning_effort"] == "high" + + +class TestResponsesReasoningEffort: + """Responses API: reasoning.effort -> enable_thinking.""" + + @pytest.mark.parametrize("effort", ["low", "medium", "high"]) + def test_non_none_effort_injects_enable_thinking_true(self, effort): + request = _build_responses_request( + reasoning=Reasoning(effort=effort), + ) + params = request.build_chat_params(None, "auto") + assert params.chat_template_kwargs["enable_thinking"] is True + + def test_none_effort_injects_enable_thinking_false(self): + request = _build_responses_request( + reasoning=Reasoning(effort="none"), + ) + params = request.build_chat_params(None, "auto") + assert params.chat_template_kwargs["enable_thinking"] is False + + def test_no_reasoning_does_not_inject(self): + request = _build_responses_request() + params = request.build_chat_params(None, "auto") + assert "enable_thinking" not in params.chat_template_kwargs + + def test_explicit_user_kwarg_not_overridden(self): + request = _build_responses_request( + reasoning=Reasoning(effort="high"), + chat_template_kwargs={"enable_thinking": False}, + ) + params = request.build_chat_params(None, "auto") + assert params.chat_template_kwargs["enable_thinking"] is False + + def test_reasoning_effort_still_in_kwargs(self): + request = _build_responses_request( + reasoning=Reasoning(effort="high"), + ) + params = request.build_chat_params(None, "auto") + assert params.chat_template_kwargs["reasoning_effort"] == "high" diff --git a/vllm/entrypoints/openai/chat_completion/protocol.py b/vllm/entrypoints/openai/chat_completion/protocol.py index cada289e109..73ecb3f35a1 100644 --- a/vllm/entrypoints/openai/chat_completion/protocol.py +++ b/vllm/entrypoints/openai/chat_completion/protocol.py @@ -473,17 +473,27 @@ class ChatCompletionRequest(OpenAIBaseModel): default_template: str | None, default_template_content_format: ChatTemplateContentFormatOption, ) -> ChatParams: + extra_kwargs: dict[str, Any] = dict( + 
add_generation_prompt=self.add_generation_prompt, + continue_final_message=self.continue_final_message, + documents=self.documents, + reasoning_effort=self.reasoning_effort, + ) + + # When reasoning is requested, activate thinking for models whose + # chat templates require explicit opt-in (e.g., Gemma4 defaults + # enable_thinking to false). For templates that don't declare the + # variable, resolve_chat_template_kwargs filters it out harmlessly. + user_kwargs = self.chat_template_kwargs or {} + if self.reasoning_effort is not None and "enable_thinking" not in user_kwargs: + extra_kwargs["enable_thinking"] = self.reasoning_effort != "none" + return ChatParams( chat_template=self.chat_template or default_template, chat_template_content_format=default_template_content_format, chat_template_kwargs=merge_kwargs( self.chat_template_kwargs, - dict( - add_generation_prompt=self.add_generation_prompt, - continue_final_message=self.continue_final_message, - documents=self.documents, - reasoning_effort=self.reasoning_effort, - ), + extra_kwargs, ), media_io_kwargs=self.media_io_kwargs, ) diff --git a/vllm/entrypoints/openai/responses/protocol.py b/vllm/entrypoints/openai/responses/protocol.py index 10aa5bde392..370ed9a825d 100644 --- a/vllm/entrypoints/openai/responses/protocol.py +++ b/vllm/entrypoints/openai/responses/protocol.py @@ -298,17 +298,28 @@ class ResponsesRequest(OpenAIBaseModel): continue_final = should_continue_final_message(self.input) reasoning = self.reasoning + reasoning_effort = None if reasoning is None else reasoning.effort + + extra_kwargs: dict[str, Any] = dict( + add_generation_prompt=not continue_final, + continue_final_message=continue_final, + reasoning_effort=reasoning_effort, + ) + + # When reasoning is requested, activate thinking for models whose + # chat templates require explicit opt-in (e.g., Gemma4 defaults + # enable_thinking to false). 
For templates that don't declare the + # variable, resolve_chat_template_kwargs filters it out harmlessly. + user_kwargs = self.chat_template_kwargs or {} + if reasoning_effort is not None and "enable_thinking" not in user_kwargs: + extra_kwargs["enable_thinking"] = reasoning_effort != "none" return ChatParams( chat_template=default_template, chat_template_content_format=default_template_content_format, - chat_template_kwargs=merge_kwargs( # To remove unset values + chat_template_kwargs=merge_kwargs( self.chat_template_kwargs, - dict( - add_generation_prompt=not continue_final, - continue_final_message=continue_final, - reasoning_effort=None if reasoning is None else reasoning.effort, - ), + extra_kwargs, ), media_io_kwargs=self.media_io_kwargs, )