mirror of
https://github.com/vllm-project/vllm.git
synced 2026-06-06 00:16:14 +00:00
[Bugfix] Map reasoning_effort to enable_thinking in chat template kwargs (#43401)
Signed-off-by: Ashwin Giridharan <girida@amazon.com> Signed-off-by: Chauncey <chaunceyjiang@gmail.com> Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
This commit is contained in:
committed by
GitHub
parent
2272062471
commit
52a31ccecc
@@ -15,6 +15,7 @@ vLLM currently supports the following reasoning models:
|
||||
| ------------ | ----------- | ---------------- | ----------- |
|
||||
| [Cohere Command A Reasoning](https://huggingface.co/CohereLabs/command-a-reasoning-08-2025) | `cohere_command3` | `json`, `regex` | ✅ |
|
||||
| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `json`, `regex` | ❌ |
|
||||
| [Gemma 4 series](https://huggingface.co/google/gemma-4-26B-A4B-it) | `gemma4` | `json`, `regex` | ✅ |
|
||||
| [DeepSeek-V3.1](https://huggingface.co/collections/deepseek-ai/deepseek-v31-68a491bed32bd77e7fca048f) | `deepseek_v3` | `json`, `regex` | ❌ |
|
||||
| [ERNIE-4.5-VL series](https://huggingface.co/baidu/ERNIE-4.5-VL-28B-A3B-PT) | `ernie45` | `json`, `regex` | ❌ |
|
||||
| [ERNIE-4.5-21B-A3B-Thinking](https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking) | `ernie45` | `json`, `regex` | ✅ |
|
||||
@@ -29,6 +30,7 @@ vLLM currently supports the following reasoning models:
|
||||
!!! note
|
||||
IBM Granite 3.2 and DeepSeek-V3.1 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
|
||||
The reasoning feature for the Qwen3 series is enabled by default. To disable it, you must pass `enable_thinking=False` in your `chat_template_kwargs`.
|
||||
Gemma 4 reasoning is disabled by default; to enable it, pass `enable_thinking=True` in your `chat_template_kwargs` or set `reasoning_effort` to `"low"`, `"medium"`, or `"high"` (which enables it automatically).
|
||||
DeepSeek-V3.1 tool calling is supported in non-thinking mode.
|
||||
Holo2 reasoning is enabled by default. To disable it, you must also pass `thinking=False` in your `chat_template_kwargs`.
|
||||
|
||||
@@ -314,9 +316,44 @@ for output in outputs:
|
||||
print("text:", output.outputs[0].text)
|
||||
```
|
||||
|
||||
## Automatic `enable_thinking` Activation
|
||||
|
||||
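Some models (such as Gemma 4 and DeepSeek-V4-Pro) require `enable_thinking: true` in their chat template kwargs to activate thinking mode — without it, reasoning tokens are never generated regardless of other settings. Models whose templates use a differently named variable (for example IBM Granite 3.2, which expects `thinking=True`) are not affected by the automatic injection described below and still need that kwarg passed explicitly.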
|
||||
|
||||
When you set `reasoning_effort` in a Chat Completions request (or `reasoning.effort` in a Responses API request), vLLM automatically injects `enable_thinking` into the chat template kwargs:
|
||||
|
||||
- `reasoning_effort` = `"low"`, `"medium"`, or `"high"` → `enable_thinking = true`
|
||||
- `reasoning_effort` = `"none"` → `enable_thinking = false`
|
||||
- `reasoning_effort` not set → `enable_thinking` is not injected (preserves existing behavior)
|
||||
|
||||
This means you no longer need to manually pass `chat_template_kwargs: {"enable_thinking": true}` when using `reasoning_effort` — it is handled automatically.
|
||||
|
||||
!!! note
|
||||
If you explicitly set `enable_thinking` in `chat_template_kwargs`, your value takes priority over the automatic injection. This allows you to override the behavior if needed.
|
||||
|
||||
For models whose templates don't declare `enable_thinking` (e.g., DeepSeek R1), the injected kwarg is harmlessly filtered out by `resolve_chat_template_kwargs`.
|
||||
|
||||
### Example
|
||||
|
||||
```python
|
||||
from openai import OpenAI
|
||||
|
||||
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
|
||||
|
||||
# reasoning_effort automatically enables thinking for models that need it
|
||||
response = client.chat.completions.create(
|
||||
model="google/gemma-4-26B-A4B-it",
|
||||
messages=[{"role": "user", "content": "What is 15 * 37?"}],
|
||||
reasoning_effort="high", # Automatically sets enable_thinking=true
|
||||
)
|
||||
|
||||
print(response.choices[0].message.reasoning)
|
||||
print(response.choices[0].message.content)
|
||||
```
|
||||
|
||||
## Limitations
|
||||
|
||||
- The reasoning content is only available for online serving's chat completion endpoint (`/v1/chat/completions`).
|
||||
- The reasoning content is only available for online serving's chat completion endpoint (`/v1/chat/completions`), the Anthropic Messages API (`/v1/messages`), and the Responses API (`/v1/responses`).
|
||||
|
||||
## How to support a new reasoning model
|
||||
|
||||
|
||||
@@ -0,0 +1,107 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Unit tests for reasoning_effort -> enable_thinking mapping.
|
||||
|
||||
Models like Gemma4 require enable_thinking=True in chat_template_kwargs to
|
||||
activate thinking mode. This mapping ensures that when a user requests
|
||||
reasoning (via reasoning_effort or reasoning.effort), the template kwarg
|
||||
is injected automatically.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from openai.types.shared import Reasoning
|
||||
|
||||
from vllm.entrypoints.openai.chat_completion.protocol import (
|
||||
ChatCompletionRequest,
|
||||
)
|
||||
from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
|
||||
|
||||
|
||||
def _build_chat_request(**kwargs) -> ChatCompletionRequest:
    """Create a minimal ``ChatCompletionRequest`` for these tests.

    A placeholder model name and a single user message are supplied by
    default; any keyword argument overrides or extends those fields.
    """
    fields = {
        "model": "test-model",
        "messages": [{"role": "user", "content": "Hello"}],
        **kwargs,
    }
    return ChatCompletionRequest(**fields)
|
||||
|
||||
|
||||
def _build_responses_request(**kwargs) -> ResponsesRequest:
    """Create a minimal ``ResponsesRequest`` for these tests.

    A placeholder model name and a single user input item are supplied by
    default; any keyword argument overrides or extends those fields.
    """
    fields = {
        "model": "test-model",
        "input": [{"role": "user", "content": "Hello"}],
        **kwargs,
    }
    return ResponsesRequest(**fields)
|
||||
|
||||
|
||||
class TestChatCompletionReasoningEffort:
    """Chat Completions: reasoning_effort -> enable_thinking."""

    @staticmethod
    def _template_kwargs(**request_fields):
        """Build a chat request and return its chat template kwargs."""
        chat_request = _build_chat_request(**request_fields)
        chat_params = chat_request.build_chat_params(None, "auto")
        return chat_params.chat_template_kwargs

    @pytest.mark.parametrize("effort", ["low", "medium", "high"])
    def test_non_none_effort_injects_enable_thinking_true(self, effort):
        """Every effort other than "none" yields enable_thinking=True."""
        template_kwargs = self._template_kwargs(reasoning_effort=effort)
        assert template_kwargs["enable_thinking"] is True

    def test_none_effort_injects_enable_thinking_false(self):
        """An effort of "none" yields enable_thinking=False."""
        template_kwargs = self._template_kwargs(reasoning_effort="none")
        assert template_kwargs["enable_thinking"] is False

    def test_no_effort_does_not_inject(self):
        """Without reasoning_effort, enable_thinking is absent."""
        template_kwargs = self._template_kwargs()
        assert "enable_thinking" not in template_kwargs

    def test_explicit_user_kwarg_not_overridden(self):
        """A user-supplied enable_thinking is kept despite the effort."""
        template_kwargs = self._template_kwargs(
            reasoning_effort="high",
            chat_template_kwargs={"enable_thinking": False},
        )
        assert template_kwargs["enable_thinking"] is False

    def test_reasoning_effort_still_in_kwargs(self):
        """reasoning_effort itself is still passed to the template."""
        template_kwargs = self._template_kwargs(reasoning_effort="high")
        assert template_kwargs["reasoning_effort"] == "high"
|
||||
|
||||
|
||||
class TestResponsesReasoningEffort:
    """Responses API: reasoning.effort -> enable_thinking."""

    @staticmethod
    def _template_kwargs(**request_fields):
        """Build a responses request and return its chat template kwargs."""
        responses_request = _build_responses_request(**request_fields)
        chat_params = responses_request.build_chat_params(None, "auto")
        return chat_params.chat_template_kwargs

    @pytest.mark.parametrize("effort", ["low", "medium", "high"])
    def test_non_none_effort_injects_enable_thinking_true(self, effort):
        """Every effort other than "none" yields enable_thinking=True."""
        template_kwargs = self._template_kwargs(
            reasoning=Reasoning(effort=effort),
        )
        assert template_kwargs["enable_thinking"] is True

    def test_none_effort_injects_enable_thinking_false(self):
        """An effort of "none" yields enable_thinking=False."""
        template_kwargs = self._template_kwargs(
            reasoning=Reasoning(effort="none"),
        )
        assert template_kwargs["enable_thinking"] is False

    def test_no_reasoning_does_not_inject(self):
        """Without a reasoning field, enable_thinking is absent."""
        template_kwargs = self._template_kwargs()
        assert "enable_thinking" not in template_kwargs

    def test_explicit_user_kwarg_not_overridden(self):
        """A user-supplied enable_thinking is kept despite the effort."""
        template_kwargs = self._template_kwargs(
            reasoning=Reasoning(effort="high"),
            chat_template_kwargs={"enable_thinking": False},
        )
        assert template_kwargs["enable_thinking"] is False

    def test_reasoning_effort_still_in_kwargs(self):
        """reasoning_effort itself is still passed to the template."""
        template_kwargs = self._template_kwargs(
            reasoning=Reasoning(effort="high"),
        )
        assert template_kwargs["reasoning_effort"] == "high"
|
||||
@@ -473,17 +473,27 @@ class ChatCompletionRequest(OpenAIBaseModel):
|
||||
default_template: str | None,
|
||||
default_template_content_format: ChatTemplateContentFormatOption,
|
||||
) -> ChatParams:
|
||||
extra_kwargs: dict[str, Any] = dict(
|
||||
add_generation_prompt=self.add_generation_prompt,
|
||||
continue_final_message=self.continue_final_message,
|
||||
documents=self.documents,
|
||||
reasoning_effort=self.reasoning_effort,
|
||||
)
|
||||
|
||||
# When reasoning is requested, activate thinking for models whose
|
||||
# chat templates require explicit opt-in (e.g., Gemma4 defaults
|
||||
# enable_thinking to false). For templates that don't declare the
|
||||
# variable, resolve_chat_template_kwargs filters it out harmlessly.
|
||||
user_kwargs = self.chat_template_kwargs or {}
|
||||
if self.reasoning_effort is not None and "enable_thinking" not in user_kwargs:
|
||||
extra_kwargs["enable_thinking"] = self.reasoning_effort != "none"
|
||||
|
||||
return ChatParams(
|
||||
chat_template=self.chat_template or default_template,
|
||||
chat_template_content_format=default_template_content_format,
|
||||
chat_template_kwargs=merge_kwargs(
|
||||
self.chat_template_kwargs,
|
||||
dict(
|
||||
add_generation_prompt=self.add_generation_prompt,
|
||||
continue_final_message=self.continue_final_message,
|
||||
documents=self.documents,
|
||||
reasoning_effort=self.reasoning_effort,
|
||||
),
|
||||
extra_kwargs,
|
||||
),
|
||||
media_io_kwargs=self.media_io_kwargs,
|
||||
)
|
||||
|
||||
@@ -298,17 +298,28 @@ class ResponsesRequest(OpenAIBaseModel):
|
||||
continue_final = should_continue_final_message(self.input)
|
||||
|
||||
reasoning = self.reasoning
|
||||
reasoning_effort = None if reasoning is None else reasoning.effort
|
||||
|
||||
extra_kwargs: dict[str, Any] = dict(
|
||||
add_generation_prompt=not continue_final,
|
||||
continue_final_message=continue_final,
|
||||
reasoning_effort=reasoning_effort,
|
||||
)
|
||||
|
||||
# When reasoning is requested, activate thinking for models whose
|
||||
# chat templates require explicit opt-in (e.g., Gemma4 defaults
|
||||
# enable_thinking to false). For templates that don't declare the
|
||||
# variable, resolve_chat_template_kwargs filters it out harmlessly.
|
||||
user_kwargs = self.chat_template_kwargs or {}
|
||||
if reasoning_effort is not None and "enable_thinking" not in user_kwargs:
|
||||
extra_kwargs["enable_thinking"] = reasoning_effort != "none"
|
||||
|
||||
return ChatParams(
|
||||
chat_template=default_template,
|
||||
chat_template_content_format=default_template_content_format,
|
||||
chat_template_kwargs=merge_kwargs( # To remove unset values
|
||||
chat_template_kwargs=merge_kwargs(
|
||||
self.chat_template_kwargs,
|
||||
dict(
|
||||
add_generation_prompt=not continue_final,
|
||||
continue_final_message=continue_final,
|
||||
reasoning_effort=None if reasoning is None else reasoning.effort,
|
||||
),
|
||||
extra_kwargs,
|
||||
),
|
||||
media_io_kwargs=self.media_io_kwargs,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user