[Bugfix] GPT-OSS instruction rendering (#44330)

Signed-off-by: Yifan Zong <yzong@redhat.com>
This commit is contained in:
yzong-rh
2026-06-05 13:52:32 -04:00
committed by GitHub
parent b593396c7a
commit 703fb17b13
9 changed files with 276 additions and 112 deletions
@@ -1364,6 +1364,57 @@ class TestServingChatWithHarmony:
],
)
@pytest.mark.asyncio
async def test_system_message_without_tools(self, serving_chat, stream):
"""Leading system message produces a developer message with
DeveloperContent (# Instructions header)."""
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Hello"},
]
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req)
)
verify_harmony_messages(
input_messages,
[
{"role": "system"},
{
"role": "developer",
"instructions": "You are a helpful assistant.",
},
{"role": "user", "content": "Hello"},
],
)
@pytest.mark.asyncio
async def test_system_message_with_tools(self, serving_chat, stream, weather_tools):
"""Leading system message is folded into the developer message
alongside tool definitions."""
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "What's the weather?"},
]
req = ChatCompletionRequest(
model=MODEL_NAME, messages=messages, tools=weather_tools
)
input_messages, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req)
)
verify_harmony_messages(
input_messages,
[
{"role": "system"},
{
"role": "developer",
"instructions": "You are a helpful assistant.",
"tool_definitions": ["get_weather"],
},
{"role": "user", "content": "What's the weather?"},
],
)
@pytest.mark.asyncio
async def test_tool_call_response_with_content(
self, serving_chat, stream, weather_tools, weather_messages_start
@@ -45,6 +45,31 @@ class TestResponseInputToHarmonyRenderParity:
# Single-message cases
# -----------------------------------------------------------------------
def test_developer_message(self):
"""Both APIs must render developer messages identically using
DeveloperContent (with the '# Instructions' header)."""
chat_msgs = parse_chat_input_to_harmony_message(
{"role": "developer", "content": "Be concise."}
)
resp_msgs = [
response_input_to_harmony(
{
"type": "message",
"role": "developer",
"content": "Be concise.",
},
prev_responses=[],
)
]
expected = [{"role": "developer", "instructions": "Be concise."}]
verify_harmony_messages(chat_msgs, expected)
verify_harmony_messages(resp_msgs, expected)
assert render_for_completion([_system()] + chat_msgs) == render_for_completion(
[_system()] + resp_msgs
)
def test_user_message(self):
chat_msgs = parse_chat_input_to_harmony_message(
{"role": "user", "content": "What's the weather in Paris?"}
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from openai_harmony import Message, Role
from openai_harmony import DeveloperContent, Message, Role
from tests.entrypoints.openai.utils import verify_harmony_messages
from vllm.entrypoints.openai.parser.harmony_utils import (
@@ -269,7 +269,8 @@ class TestCommonParseInputToHarmonyMessage:
assert messages[0].recipient == "functions.get_current_time"
def test_system_message(self, parse_function):
"""Test parsing system message."""
"""Test parsing system messages, which are parsed into developer messages
with DeveloperContent."""
chat_msg = {
"role": "system",
"content": "You are a helpful assistant",
@@ -278,9 +279,9 @@ class TestCommonParseInputToHarmonyMessage:
messages = parse_function(chat_msg)
assert len(messages) == 1
# System messages are converted using Message.from_dict
# which should preserve the role
assert messages[0].author.role == Role.SYSTEM
assert messages[0].author.role == Role.DEVELOPER
assert isinstance(messages[0].content[0], DeveloperContent)
assert messages[0].content[0].instructions == "You are a helpful assistant"
def test_developer_message(self, parse_function):
"""Test parsing developer message."""
@@ -293,6 +294,8 @@ class TestCommonParseInputToHarmonyMessage:
assert len(messages) == 1
assert messages[0].author.role == Role.DEVELOPER
assert isinstance(messages[0].content[0], DeveloperContent)
assert messages[0].content[0].instructions == "Use concise language"
def test_user_message_with_string_content(self, parse_function):
"""Test parsing user message with string content."""
@@ -12,7 +12,7 @@ from openai.types.responses import ResponseFunctionToolCall, ResponseReasoningIt
from openai.types.responses.response_reasoning_item import (
Content as ReasoningTextContent,
)
from openai_harmony import Role
from openai_harmony import DeveloperContent, Role
from vllm.entrypoints.openai.responses.harmony import response_input_to_harmony
@@ -65,14 +65,16 @@ class TestResponseInputToHarmonyMessage:
assert msg.content[0].text == "Hello"
def test_system_message(self):
"""System messages carry developer instructions and must be rendered
as developer messages with DeveloperContent."""
msg = response_input_to_harmony(
{"type": "message", "role": "system", "content": "Be helpful."},
prev_responses=[],
)
assert msg.author.role == Role.SYSTEM
assert msg.content[0].text == "Be helpful."
assert msg.channel is None
assert msg.author.role == Role.DEVELOPER
assert isinstance(msg.content[0], DeveloperContent)
assert msg.content[0].instructions == "Be helpful."
def test_assistant_message_gets_final_channel(self):
msg = response_input_to_harmony(
@@ -85,14 +87,16 @@ class TestResponseInputToHarmonyMessage:
assert msg.content[0].text == "The answer is 42."
def test_developer_message_gets_instructions_prefix(self):
"""Developer messages must use DeveloperContent which adds the
'# Instructions' header the model was trained on."""
msg = response_input_to_harmony(
{"type": "message", "role": "developer", "content": "Be concise."},
prev_responses=[],
)
assert msg.author.role == Role.DEVELOPER
assert msg.content[0].text == "Instructions:\nBe concise."
assert msg.channel is None
assert isinstance(msg.content[0], DeveloperContent)
assert msg.content[0].instructions == "Be concise."
def test_message_with_array_content(self):
msg = response_input_to_harmony(
@@ -112,7 +116,9 @@ class TestResponseInputToHarmonyMessage:
assert msg.content[0].text == "Part one. "
assert msg.content[1].text == "Part two."
def test_developer_message_array_content_gets_prefix_on_each_part(self):
def test_developer_message_array_content_concatenated(self):
"""Array content in developer messages is flattened and rendered
via DeveloperContent with the '# Instructions' header."""
msg = response_input_to_harmony(
{
"type": "message",
@@ -125,8 +131,9 @@ class TestResponseInputToHarmonyMessage:
prev_responses=[],
)
assert msg.content[0].text == "Instructions:\nRule 1."
assert msg.content[1].text == "Instructions:\nRule 2."
assert msg.author.role == Role.DEVELOPER
assert isinstance(msg.content[0], DeveloperContent)
assert msg.content[0].instructions == "Rule 1.Rule 2."
# -----------------------------------------------------------------------
# type="reasoning"
+2
View File
@@ -155,6 +155,8 @@ def verify_harmony_messages(
assert msg.content[0].text == expected["content"]
if "content_type" in expected:
assert msg.content_type == expected["content_type"]
if "instructions" in expected:
assert msg.content[0].instructions == expected["instructions"]
if "tool_definitions" in expected:
# Check that the tool definitions match the expected list of tool names
actual_tools = [t.name for t in msg.content[0].tools["functions"].tools]
+102 -13
View File
@@ -3,6 +3,7 @@
import datetime
from collections.abc import Iterable, Sequence
from typing import Any
from openai.types.responses.tool import Tool
from openai_harmony import (
@@ -195,6 +196,12 @@ def get_user_message(content: str) -> Message:
return Message.from_role_and_content(Role.USER, content)
def get_system_or_developer_message(role: str, instructions: str) -> Message:
if role == "system" and envs.VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS:
return get_system_message(instructions=instructions)
return get_developer_message(instructions=instructions)
def parse_chat_inputs_to_harmony_messages(chat_msgs: list) -> list[Message]:
"""
Parse a list of messages from request.messages in the Chat Completion API to
@@ -249,18 +256,96 @@ def auto_drop_analysis_messages(msgs: list[Message]) -> list[Message]:
return cleaned_msgs
def flatten_chat_text_content(content: str | list | None) -> str | None:
def flatten_input_text_content(content: Any) -> str | None:
"""
Extract the text parts from a chat message content field and flatten them
into a single string.
Extract text parts from a Chat Completion or Responses API content field and
flatten them into a single string. Returns None if no text content is found.
"""
if isinstance(content, list):
return "".join(
item.get("text", "")
for item in content
if isinstance(item, dict) and item.get("type") == "text"
if content is None or isinstance(content, str):
return content
if not isinstance(content, list):
return None
texts: list[str] = []
for item in content:
if isinstance(item, str):
texts.append(item)
continue
if isinstance(item, dict):
text = item.get("text")
if text is not None:
texts.append(text)
return "".join(texts) if texts else None
def extract_instructions_from_messages(
messages: Sequence[Any],
) -> tuple[str | None, list[Any]]:
"""
Peel a leading system/developer Chat Completion or Responses message and
flatten its instruction text.
"""
remaining_messages = list(messages)
if not remaining_messages:
return None, remaining_messages
first_message = remaining_messages[0]
if not isinstance(first_message, dict):
if hasattr(first_message, "to_dict"):
# Handle OpenAI Harmony Message
first_message = first_message.to_dict()
elif hasattr(first_message, "model_dump"):
first_message = first_message.model_dump(exclude_none=True)
else:
raise ValueError(f"Unknown message type: {type(first_message)}")
if first_message.get("role") not in (
"system",
"developer",
):
return None, remaining_messages
instructions = flatten_input_text_content(first_message.get("content"))
return instructions, remaining_messages[1:]
def build_harmony_preamble(
*,
instructions: str | None = None,
tools: list[Tool | ChatCompletionToolsParam] | None = None,
reasoning_effort: str | None = None,
browser_description: str | None = None,
python_description: str | None = None,
container_description: str | None = None,
with_custom_tools: bool = False,
) -> list[Message]:
"""
Build the standard Harmony system/developer prefix for a request.
"""
developer_instructions = system_instructions = None
if envs.VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS:
system_instructions = instructions
else:
developer_instructions = instructions
messages = [
get_system_message(
reasoning_effort=reasoning_effort,
browser_description=browser_description,
python_description=python_description,
container_description=container_description,
instructions=system_instructions,
with_custom_tools=with_custom_tools,
)
return content
]
if developer_instructions or tools:
messages.append(
get_developer_message(
instructions=developer_instructions,
tools=tools,
)
)
return messages
def parse_chat_input_to_harmony_message(
@@ -283,7 +368,7 @@ def parse_chat_input_to_harmony_message(
tool_calls = chat_msg.get("tool_calls", [])
if role == "assistant" and tool_calls:
content = flatten_chat_text_content(chat_msg.get("content"))
content = flatten_input_text_content(chat_msg.get("content"))
if content:
commentary_msg = Message.from_role_and_content(Role.ASSISTANT, content)
commentary_msg = commentary_msg.with_channel("commentary")
@@ -313,8 +398,7 @@ def parse_chat_input_to_harmony_message(
if role == "tool":
tool_call_id = chat_msg.get("tool_call_id", "")
name = tool_id_names.get(tool_call_id, "")
content = chat_msg.get("content", "") or ""
content = flatten_chat_text_content(content)
content = flatten_input_text_content(chat_msg.get("content")) or ""
msg = (
Message.from_author_and_content(
@@ -349,7 +433,12 @@ def parse_chat_input_to_harmony_message(
# Send non-tool assistant messages to the final channel
msg = msg.with_channel("final")
msgs.append(msg)
# For user/system/developer messages, add them directly even if no content.
elif role in ("system", "developer"):
instructions = flatten_input_text_content(chat_msg.get("content"))
if instructions is not None:
msg = get_system_or_developer_message(role, instructions)
msgs.append(msg)
# For user messages, add them directly even if no content.
elif role != "assistant":
msg = Message.from_role_and_contents(role, contents)
msgs.append(msg)
+22 -10
View File
@@ -32,7 +32,8 @@ from openai_harmony import Author, Message, Role, StreamableParser, TextContent
from vllm.entrypoints.openai.parser.harmony_utils import (
BUILTIN_TOOL_TO_MCP_SERVER_LABEL,
extract_function_from_recipient,
flatten_chat_text_content,
flatten_input_text_content,
get_system_or_developer_message,
is_function_recipient,
)
from vllm.entrypoints.openai.responses.protocol import (
@@ -109,8 +110,7 @@ def _parse_chat_format_message(chat_msg: dict) -> list[Message]:
name = chat_msg.get("name", "")
if name and not name.startswith("functions."):
name = f"functions.{name}"
content = chat_msg.get("content", "") or ""
content = flatten_chat_text_content(content)
content = flatten_input_text_content(chat_msg.get("content")) or ""
# NOTE: .with_recipient("assistant") is required on tool messages
# to match parse_chat_input_to_harmony_message behavior and ensure
# proper routing in the Harmony protocol.
@@ -121,7 +121,15 @@ def _parse_chat_format_message(chat_msg: dict) -> list[Message]:
)
return [msg]
# Default: user/assistant/system messages
# System/developer messages into proper DeveloperContent
if role in ("system", "developer"):
text = flatten_input_text_content(chat_msg.get("content"))
if text:
msg = get_system_or_developer_message(role, text)
return [msg]
return []
# Default: user/assistant messages
content = chat_msg.get("content", "")
if isinstance(content, str):
contents = [TextContent(text=content)]
@@ -151,13 +159,17 @@ def response_input_to_harmony(
if "type" not in response_msg or response_msg["type"] == "message":
role = response_msg["role"]
content = response_msg["content"]
# Add prefix for developer messages.
# <|start|>developer<|message|># Instructions {instructions}<|end|>
text_prefix = "Instructions:\n" if role == "developer" else ""
if isinstance(content, str):
msg = Message.from_role_and_content(role, text_prefix + content)
if role in ("system", "developer"):
text = flatten_input_text_content(content)
if text:
msg = get_system_or_developer_message(role, text)
else:
# Empty content — skip, no message emitted.
return None
elif isinstance(content, str):
msg = Message.from_role_and_content(role, content)
else:
contents = [TextContent(text=text_prefix + c["text"]) for c in content]
contents = [TextContent(text=c.get("text", "")) for c in content]
msg = Message.from_role_and_contents(role, contents)
if role == "assistant":
msg = msg.with_channel("final")
+35 -58
View File
@@ -44,8 +44,8 @@ from vllm.entrypoints.openai.engine.serving import (
)
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.openai.parser.harmony_utils import (
get_developer_message,
get_system_message,
build_harmony_preamble,
extract_instructions_from_messages,
get_user_message,
has_custom_tools,
render_for_completion,
@@ -1078,37 +1078,9 @@ class OpenAIServingResponses(OpenAIServing):
output_items.extend(last_items)
return output_items
def _extract_system_message_from_request(
self, request: ResponsesRequest
) -> str | None:
system_msg = None
if not isinstance(request.input, str):
for response_msg in request.input:
if (
isinstance(response_msg, dict)
and response_msg.get("role") == "system"
):
content = response_msg.get("content")
if isinstance(content, str):
system_msg = content
elif isinstance(content, list):
for param in content:
if (
isinstance(param, dict)
and param.get("type") == "input_text"
):
system_msg = param.get("text")
break
break
return system_msg
def _construct_harmony_system_input_message(
self, request: ResponsesRequest, with_custom_tools: bool, tool_types: set[str]
) -> OpenAIHarmonyMessage:
model_identity = self._extract_system_message_from_request(request)
reasoning_effort = request.reasoning.effort if request.reasoning else None
def _get_harmony_builtin_tool_descriptions(
self, request: ResponsesRequest, tool_types: set[str]
) -> dict[str, str | None]:
# Extract allowed_tools from MCP tool requests
allowed_tools_map = _extract_allowed_tools_from_mcp_requests(request.tools)
@@ -1141,17 +1113,11 @@ class OpenAIServingResponses(OpenAIServing):
and self.tool_server.has_tool("container")
else None
)
sys_msg = get_system_message(
model_identity=model_identity,
reasoning_effort=reasoning_effort,
browser_description=browser_description,
python_description=python_description,
container_description=container_description,
instructions=request.instructions,
with_custom_tools=with_custom_tools,
)
return sys_msg
return {
"browser_description": browser_description,
"python_description": python_description,
"container_description": container_description,
}
def _construct_input_messages_with_harmony(
self,
@@ -1159,20 +1125,31 @@ class OpenAIServingResponses(OpenAIServing):
prev_response: ResponsesResponse | None,
) -> list[OpenAIHarmonyMessage]:
messages: list[OpenAIHarmonyMessage] = []
request_input = request.input
if prev_response is None:
# New conversation.
tool_types = extract_tool_types(request.tools)
with_custom_tools = has_custom_tools(tool_types)
sys_msg = self._construct_harmony_system_input_message(
request, with_custom_tools, tool_types
)
messages.append(sys_msg)
if with_custom_tools:
dev_msg = get_developer_message(
instructions=request.instructions, tools=request.tools
instructions = request.instructions
if instructions is None and isinstance(request_input, list):
instructions, request_input = extract_instructions_from_messages(
request_input
)
messages.append(dev_msg)
tool_descriptions = self._get_harmony_builtin_tool_descriptions(
request, tool_types
)
tools = request.tools if with_custom_tools else None
messages.extend(
build_harmony_preamble(
instructions=instructions,
tools=tools,
reasoning_effort=(
request.reasoning.effort if request.reasoning else None
),
with_custom_tools=with_custom_tools,
**tool_descriptions,
)
)
messages += construct_harmony_previous_input_messages(request)
else:
@@ -1208,20 +1185,20 @@ class OpenAIServingResponses(OpenAIServing):
messages.extend(prev_msgs)
# Append the new input.
# Responses API supports simple text inputs without chat format.
if isinstance(request.input, str):
if isinstance(request_input, str):
# Skip empty string input when previous_input_messages supplies
# the full conversation history --- an empty trailing user message
# confuses the model into thinking nothing was sent.
if request.input or not request.previous_input_messages:
messages.append(get_user_message(request.input))
if request_input or not request.previous_input_messages:
messages.append(get_user_message(request_input))
else:
if prev_response is not None:
prev_outputs = copy(prev_response.output)
else:
prev_outputs = []
for response_msg in request.input:
for response_msg in request_input:
new_msg = response_input_to_harmony(response_msg, prev_outputs)
if new_msg is not None and new_msg.author.role != "system":
if new_msg is not None:
messages.append(new_msg)
# User passes in a tool call request and its output. We need
+15 -17
View File
@@ -18,8 +18,8 @@ from vllm.entrypoints.openai.engine.protocol import (
)
from vllm.entrypoints.openai.models.serving import OpenAIModelRegistry
from vllm.entrypoints.openai.parser.harmony_utils import (
get_developer_message,
get_system_message,
build_harmony_preamble,
extract_instructions_from_messages,
parse_chat_inputs_to_harmony_messages,
render_for_completion,
)
@@ -405,6 +405,9 @@ class OpenAIServingRender:
# for more info: see comment in `maybe_serialize_tool_calls`
_mt.maybe_serialize_tool_calls(request) # type: ignore[arg-type]
chat_messages = list(request.messages)
instructions, chat_messages = extract_instructions_from_messages(chat_messages)
# Add system message.
# NOTE: In Chat Completion API, browsing is enabled by default
# if the model supports it. TODO: Support browsing.
@@ -412,23 +415,18 @@ class OpenAIServingRender:
assert not self.supports_code_interpreter
if (reasoning_effort := request.reasoning_effort) == "none":
raise ValueError(f"Harmony does not support {reasoning_effort=}")
sys_msg = get_system_message(
reasoning_effort=reasoning_effort,
browser_description=None,
python_description=None,
with_custom_tools=should_include_tools,
)
messages.append(sys_msg)
# Add developer message.
if request.tools:
dev_msg = get_developer_message(
tools=request.tools if should_include_tools else None # type: ignore[arg-type]
tools = request.tools if should_include_tools else None
messages.extend(
build_harmony_preamble(
instructions=instructions,
tools=tools, # type: ignore[arg-type]
reasoning_effort=reasoning_effort,
with_custom_tools=should_include_tools,
)
messages.append(dev_msg)
)
# Add user message.
messages.extend(parse_chat_inputs_to_harmony_messages(request.messages))
# Add remaining conversation messages.
messages.extend(parse_chat_inputs_to_harmony_messages(chat_messages))
# Render prompt token ids.
prompt_token_ids = render_for_completion(messages)