[Bugfix] Map reasoning_effort to enable_thinking in chat template kwargs (#43401)

Signed-off-by: Ashwin Giridharan <girida@amazon.com>
Signed-off-by: Chauncey <chaunceyjiang@gmail.com>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
This commit is contained in:
Ashwin Giridharan
2026-05-27 05:39:49 -07:00
committed by GitHub
parent 2272062471
commit 52a31ccecc
4 changed files with 178 additions and 13 deletions
+38 -1
View File
@@ -15,6 +15,7 @@ vLLM currently supports the following reasoning models:
| ------------ | ----------- | ---------------- | ----------- |
| [Cohere Command A Reasoning](https://huggingface.co/CohereLabs/command-a-reasoning-08-2025) | `cohere_command3` | `json`, `regex` | ✅ |
| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `json`, `regex` | ❌ |
| [Gemma 4 series](https://huggingface.co/google/gemma-4-26B-A4B-it) | `gemma4` | `json`, `regex` | ✅ |
| [DeepSeek-V3.1](https://huggingface.co/collections/deepseek-ai/deepseek-v31-68a491bed32bd77e7fca048f) | `deepseek_v3` | `json`, `regex` | ❌ |
| [ERNIE-4.5-VL series](https://huggingface.co/baidu/ERNIE-4.5-VL-28B-A3B-PT) | `ernie45` | `json`, `regex` | ❌ |
| [ERNIE-4.5-21B-A3B-Thinking](https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking) | `ernie45` | `json`, `regex` | ✅ |
@@ -29,6 +30,7 @@ vLLM currently supports the following reasoning models:
!!! note
IBM Granite 3.2 and DeepSeek-V3.1 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
The reasoning feature for the Qwen3 series is enabled by default. To disable it, you must pass `enable_thinking=False` in your `chat_template_kwargs`.
Gemma 4 reasoning is disabled by default; to enable it, pass `enable_thinking=True` in your `chat_template_kwargs` or set `reasoning_effort` to any value other than `"none"` (which enables it automatically).
DeepSeek-V3.1 tool calling is supported in non-thinking mode.
Holo2 reasoning is enabled by default. To disable it, you must also pass `thinking=False` in your `chat_template_kwargs`.
@@ -314,9 +316,44 @@ for output in outputs:
print("text:", output.outputs[0].text)
```
## Automatic `enable_thinking` Activation
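Some models (such as Gemma 4 and DeepSeek-V4-Pro) require `enable_thinking: true` in their chat template kwargs to activate thinking mode — without it, reasoning tokens are never generated regardless of other settings. Models whose chat templates use a differently named switch (for example, `thinking` for IBM Granite 3.2) are not affected by this mechanism and still need that kwarg passed explicitly.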
When you set `reasoning_effort` in a Chat Completions request (or `reasoning.effort` in a Responses API request), vLLM automatically injects `enable_thinking` into the chat template kwargs:
- `reasoning_effort` = `"low"`, `"medium"`, or `"high"` → `enable_thinking = true`
- `reasoning_effort` = `"none"` → `enable_thinking = false`
- `reasoning_effort` not set → `enable_thinking` is not injected (preserves existing behavior)
This means you no longer need to manually pass `chat_template_kwargs: {"enable_thinking": true}` when using `reasoning_effort` — it is handled automatically.
!!! note
If you explicitly set `enable_thinking` in `chat_template_kwargs`, your value takes priority over the automatic injection. This allows you to override the behavior if needed.
For models whose templates don't declare `enable_thinking` (e.g., DeepSeek R1), the injected kwarg is harmlessly filtered out by `resolve_chat_template_kwargs`.
### Example
```python
from openai import OpenAI
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
# reasoning_effort automatically enables thinking for models that need it
response = client.chat.completions.create(
model="google/gemma-4-26B-A4B-it",
messages=[{"role": "user", "content": "What is 15 * 37?"}],
reasoning_effort="high", # Automatically sets enable_thinking=true
)
print(response.choices[0].message.reasoning)
print(response.choices[0].message.content)
```
## Limitations
- The reasoning content is only available for online serving's chat completion endpoint (`/v1/chat/completions`).
- The reasoning content is only available through online serving: the Chat Completions endpoint (`/v1/chat/completions`), the Anthropic Messages API (`/v1/messages`), and the Responses API (`/v1/responses`).
## How to support a new reasoning model
@@ -0,0 +1,107 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Unit tests for reasoning_effort -> enable_thinking mapping.
Models like Gemma4 require enable_thinking=True in chat_template_kwargs to
activate thinking mode. This mapping ensures that when a user requests
reasoning (via reasoning_effort or reasoning.effort), the template kwarg
is injected automatically.
"""
import pytest
from openai.types.shared import Reasoning
from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest,
)
from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
def _build_chat_request(**kwargs) -> ChatCompletionRequest:
    """Build a minimal ChatCompletionRequest for the tests.

    The request defaults to model "test-model" and a single user message;
    any keyword argument overrides or extends those defaults.
    """
    request_fields = {
        "model": "test-model",
        "messages": [{"role": "user", "content": "Hello"}],
        # Caller-supplied fields go last so they win over the defaults.
        **kwargs,
    }
    return ChatCompletionRequest(**request_fields)
def _build_responses_request(**kwargs) -> ResponsesRequest:
    """Build a minimal ResponsesRequest for the tests.

    The request defaults to model "test-model" and a single user input
    message; any keyword argument overrides or extends those defaults.
    """
    request_fields = {
        "model": "test-model",
        "input": [{"role": "user", "content": "Hello"}],
        # Caller-supplied fields go last so they win over the defaults.
        **kwargs,
    }
    return ResponsesRequest(**request_fields)
class TestChatCompletionReasoningEffort:
    """Chat Completions: reasoning_effort -> enable_thinking."""

    @staticmethod
    def _template_kwargs(**request_fields):
        """Return the chat template kwargs produced for a chat request.

        The request is built from ``request_fields`` and resolved with no
        default template and the "auto" content format.
        """
        chat_request = _build_chat_request(**request_fields)
        chat_params = chat_request.build_chat_params(None, "auto")
        return chat_params.chat_template_kwargs

    @pytest.mark.parametrize("effort", ["low", "medium", "high"])
    def test_non_none_effort_injects_enable_thinking_true(self, effort):
        """Every effort level other than "none" turns thinking on."""
        template_kwargs = self._template_kwargs(reasoning_effort=effort)
        assert template_kwargs["enable_thinking"] is True

    def test_none_effort_injects_enable_thinking_false(self):
        """An effort of "none" explicitly turns thinking off."""
        template_kwargs = self._template_kwargs(reasoning_effort="none")
        assert template_kwargs["enable_thinking"] is False

    def test_no_effort_does_not_inject(self):
        """Without reasoning_effort, enable_thinking is left out entirely."""
        template_kwargs = self._template_kwargs()
        assert "enable_thinking" not in template_kwargs

    def test_explicit_user_kwarg_not_overridden(self):
        """A user-supplied enable_thinking wins over the derived value."""
        template_kwargs = self._template_kwargs(
            reasoning_effort="high",
            chat_template_kwargs={"enable_thinking": False},
        )
        assert template_kwargs["enable_thinking"] is False

    def test_reasoning_effort_still_in_kwargs(self):
        """reasoning_effort itself is still forwarded to the template."""
        template_kwargs = self._template_kwargs(reasoning_effort="high")
        assert template_kwargs["reasoning_effort"] == "high"
class TestResponsesReasoningEffort:
    """Responses API: reasoning.effort -> enable_thinking."""

    @staticmethod
    def _template_kwargs(**request_fields):
        """Return the chat template kwargs produced for a Responses request.

        The request is built from ``request_fields`` and resolved with no
        default template and the "auto" content format.
        """
        responses_request = _build_responses_request(**request_fields)
        chat_params = responses_request.build_chat_params(None, "auto")
        return chat_params.chat_template_kwargs

    @pytest.mark.parametrize("effort", ["low", "medium", "high"])
    def test_non_none_effort_injects_enable_thinking_true(self, effort):
        """Every effort level other than "none" turns thinking on."""
        template_kwargs = self._template_kwargs(
            reasoning=Reasoning(effort=effort),
        )
        assert template_kwargs["enable_thinking"] is True

    def test_none_effort_injects_enable_thinking_false(self):
        """An effort of "none" explicitly turns thinking off."""
        template_kwargs = self._template_kwargs(
            reasoning=Reasoning(effort="none"),
        )
        assert template_kwargs["enable_thinking"] is False

    def test_no_reasoning_does_not_inject(self):
        """Without a reasoning object, enable_thinking is left out entirely."""
        template_kwargs = self._template_kwargs()
        assert "enable_thinking" not in template_kwargs

    def test_explicit_user_kwarg_not_overridden(self):
        """A user-supplied enable_thinking wins over the derived value."""
        template_kwargs = self._template_kwargs(
            reasoning=Reasoning(effort="high"),
            chat_template_kwargs={"enable_thinking": False},
        )
        assert template_kwargs["enable_thinking"] is False

    def test_reasoning_effort_still_in_kwargs(self):
        """reasoning.effort is still forwarded as reasoning_effort."""
        template_kwargs = self._template_kwargs(
            reasoning=Reasoning(effort="high"),
        )
        assert template_kwargs["reasoning_effort"] == "high"
@@ -473,17 +473,27 @@ class ChatCompletionRequest(OpenAIBaseModel):
default_template: str | None,
default_template_content_format: ChatTemplateContentFormatOption,
) -> ChatParams:
extra_kwargs: dict[str, Any] = dict(
add_generation_prompt=self.add_generation_prompt,
continue_final_message=self.continue_final_message,
documents=self.documents,
reasoning_effort=self.reasoning_effort,
)
# When reasoning is requested, activate thinking for models whose
# chat templates require explicit opt-in (e.g., Gemma4 defaults
# enable_thinking to false). For templates that don't declare the
# variable, resolve_chat_template_kwargs filters it out harmlessly.
user_kwargs = self.chat_template_kwargs or {}
if self.reasoning_effort is not None and "enable_thinking" not in user_kwargs:
extra_kwargs["enable_thinking"] = self.reasoning_effort != "none"
return ChatParams(
chat_template=self.chat_template or default_template,
chat_template_content_format=default_template_content_format,
chat_template_kwargs=merge_kwargs(
self.chat_template_kwargs,
dict(
add_generation_prompt=self.add_generation_prompt,
continue_final_message=self.continue_final_message,
documents=self.documents,
reasoning_effort=self.reasoning_effort,
),
extra_kwargs,
),
media_io_kwargs=self.media_io_kwargs,
)
+17 -6
View File
@@ -298,17 +298,28 @@ class ResponsesRequest(OpenAIBaseModel):
continue_final = should_continue_final_message(self.input)
reasoning = self.reasoning
reasoning_effort = None if reasoning is None else reasoning.effort
extra_kwargs: dict[str, Any] = dict(
add_generation_prompt=not continue_final,
continue_final_message=continue_final,
reasoning_effort=reasoning_effort,
)
# When reasoning is requested, activate thinking for models whose
# chat templates require explicit opt-in (e.g., Gemma4 defaults
# enable_thinking to false). For templates that don't declare the
# variable, resolve_chat_template_kwargs filters it out harmlessly.
user_kwargs = self.chat_template_kwargs or {}
if reasoning_effort is not None and "enable_thinking" not in user_kwargs:
extra_kwargs["enable_thinking"] = reasoning_effort != "none"
return ChatParams(
chat_template=default_template,
chat_template_content_format=default_template_content_format,
chat_template_kwargs=merge_kwargs( # To remove unset values
chat_template_kwargs=merge_kwargs(
self.chat_template_kwargs,
dict(
add_generation_prompt=not continue_final,
continue_final_message=continue_final,
reasoning_effort=None if reasoning is None else reasoning.effort,
),
extra_kwargs,
),
media_io_kwargs=self.media_io_kwargs,
)