diff --git a/tests/entrypoints/openai/chat_completion/test_serving_chat.py b/tests/entrypoints/openai/chat_completion/test_serving_chat.py index 7c0a46a4e63..22077bd4a31 100644 --- a/tests/entrypoints/openai/chat_completion/test_serving_chat.py +++ b/tests/entrypoints/openai/chat_completion/test_serving_chat.py @@ -1364,6 +1364,57 @@ class TestServingChatWithHarmony: ], ) + @pytest.mark.asyncio + async def test_system_message_without_tools(self, serving_chat, stream): + """Leading system message produces a developer message with + DeveloperContent (# Instructions header).""" + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hello"}, + ] + req = ChatCompletionRequest(model=MODEL_NAME, messages=messages) + input_messages, _ = ( + serving_chat.openai_serving_render._make_request_with_harmony(req) + ) + verify_harmony_messages( + input_messages, + [ + {"role": "system"}, + { + "role": "developer", + "instructions": "You are a helpful assistant.", + }, + {"role": "user", "content": "Hello"}, + ], + ) + + @pytest.mark.asyncio + async def test_system_message_with_tools(self, serving_chat, stream, weather_tools): + """Leading system message is folded into the developer message + alongside tool definitions.""" + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What's the weather?"}, + ] + req = ChatCompletionRequest( + model=MODEL_NAME, messages=messages, tools=weather_tools + ) + input_messages, _ = ( + serving_chat.openai_serving_render._make_request_with_harmony(req) + ) + verify_harmony_messages( + input_messages, + [ + {"role": "system"}, + { + "role": "developer", + "instructions": "You are a helpful assistant.", + "tool_definitions": ["get_weather"], + }, + {"role": "user", "content": "What's the weather?"}, + ], + ) + @pytest.mark.asyncio async def test_tool_call_response_with_content( self, serving_chat, stream, weather_tools, weather_messages_start diff --git a/tests/entrypoints/openai/parser/test_harmony_render_parity.py b/tests/entrypoints/openai/parser/test_harmony_render_parity.py index b5ba3344990..5b771ff7bb1 100644 --- a/tests/entrypoints/openai/parser/test_harmony_render_parity.py +++ b/tests/entrypoints/openai/parser/test_harmony_render_parity.py @@ -45,6 +45,31 @@ class TestResponseInputToHarmonyRenderParity: # Single-message cases # ----------------------------------------------------------------------- + def test_developer_message(self): + """Both APIs must render developer messages identically using + DeveloperContent (with the '# Instructions' header).""" + chat_msgs = parse_chat_input_to_harmony_message( + {"role": "developer", "content": "Be concise."} + ) + resp_msgs = [ + response_input_to_harmony( + { + "type": "message", + "role": "developer", + "content": "Be concise.", + }, + prev_responses=[], + ) + ] + + expected = [{"role": "developer", "instructions": "Be concise."}] + verify_harmony_messages(chat_msgs, expected) + verify_harmony_messages(resp_msgs, expected) + + assert render_for_completion([_system()] + chat_msgs) == render_for_completion( + [_system()] + resp_msgs + ) + def test_user_message(self): chat_msgs = parse_chat_input_to_harmony_message( {"role": "user", "content": "What's the weather in Paris?"} diff --git a/tests/entrypoints/openai/parser/test_harmony_utils.py b/tests/entrypoints/openai/parser/test_harmony_utils.py index 2ec200d5837..092e916f89e 100644 --- a/tests/entrypoints/openai/parser/test_harmony_utils.py +++ b/tests/entrypoints/openai/parser/test_harmony_utils.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest -from openai_harmony import Message, Role +from openai_harmony import DeveloperContent, Message, Role from tests.entrypoints.openai.utils import verify_harmony_messages from vllm.entrypoints.openai.parser.harmony_utils import ( @@ -269,7 +269,8 @@ class TestCommonParseInputToHarmonyMessage: assert messages[0].recipient == "functions.get_current_time" def test_system_message(self, parse_function): - """Test parsing system message.""" + """Test parsing system messages, which are parsed into developer messages + with DeveloperContent.""" chat_msg = { "role": "system", "content": "You are a helpful assistant", @@ -278,9 +279,9 @@ class TestCommonParseInputToHarmonyMessage: messages = parse_function(chat_msg) assert len(messages) == 1 - # System messages are converted using Message.from_dict - # which should preserve the role - assert messages[0].author.role == Role.SYSTEM + assert messages[0].author.role == Role.DEVELOPER + assert isinstance(messages[0].content[0], DeveloperContent) + assert messages[0].content[0].instructions == "You are a helpful assistant" def test_developer_message(self, parse_function): """Test parsing developer message.""" @@ -293,6 +294,8 @@ class TestCommonParseInputToHarmonyMessage: assert len(messages) == 1 assert messages[0].author.role == Role.DEVELOPER + assert isinstance(messages[0].content[0], DeveloperContent) + assert messages[0].content[0].instructions == "Use concise language" def test_user_message_with_string_content(self, parse_function): """Test parsing user message with string content.""" diff --git a/tests/entrypoints/openai/responses/test_response_input_to_harmony.py b/tests/entrypoints/openai/responses/test_response_input_to_harmony.py index 8efd0157732..a86a1ca4e1d 100644 --- a/tests/entrypoints/openai/responses/test_response_input_to_harmony.py +++ b/tests/entrypoints/openai/responses/test_response_input_to_harmony.py @@ -12,7 +12,7 @@ from openai.types.responses import ResponseFunctionToolCall, ResponseReasoningIt from openai.types.responses.response_reasoning_item import ( Content as ReasoningTextContent, ) -from openai_harmony import Role +from openai_harmony import DeveloperContent, Role from vllm.entrypoints.openai.responses.harmony import response_input_to_harmony @@ -65,14 +65,16 @@ class TestResponseInputToHarmonyMessage: assert msg.content[0].text == "Hello" def test_system_message(self): + """System messages carry developer instructions and must be rendered + as developer messages with DeveloperContent.""" msg = response_input_to_harmony( {"type": "message", "role": "system", "content": "Be helpful."}, prev_responses=[], ) - assert msg.author.role == Role.SYSTEM - assert msg.content[0].text == "Be helpful." - assert msg.channel is None + assert msg.author.role == Role.DEVELOPER + assert isinstance(msg.content[0], DeveloperContent) + assert msg.content[0].instructions == "Be helpful." def test_assistant_message_gets_final_channel(self): msg = response_input_to_harmony( @@ -85,14 +87,16 @@ class TestResponseInputToHarmonyMessage: assert msg.content[0].text == "The answer is 42." def test_developer_message_gets_instructions_prefix(self): + """Developer messages must use DeveloperContent which adds the + '# Instructions' header the model was trained on.""" msg = response_input_to_harmony( {"type": "message", "role": "developer", "content": "Be concise."}, prev_responses=[], ) assert msg.author.role == Role.DEVELOPER - assert msg.content[0].text == "Instructions:\nBe concise." - assert msg.channel is None + assert isinstance(msg.content[0], DeveloperContent) + assert msg.content[0].instructions == "Be concise." def test_message_with_array_content(self): msg = response_input_to_harmony( @@ -112,7 +116,9 @@ class TestResponseInputToHarmonyMessage: assert msg.content[0].text == "Part one. " assert msg.content[1].text == "Part two." - def test_developer_message_array_content_gets_prefix_on_each_part(self): + def test_developer_message_array_content_concatenated(self): + """Array content in developer messages is flattened and rendered + via DeveloperContent with the '# Instructions' header.""" msg = response_input_to_harmony( { "type": "message", @@ -125,8 +131,9 @@ class TestResponseInputToHarmonyMessage: prev_responses=[], ) - assert msg.content[0].text == "Instructions:\nRule 1." - assert msg.content[1].text == "Instructions:\nRule 2." + assert msg.author.role == Role.DEVELOPER + assert isinstance(msg.content[0], DeveloperContent) + assert msg.content[0].instructions == "Rule 1.Rule 2." # ----------------------------------------------------------------------- # type="reasoning" diff --git a/tests/entrypoints/openai/utils.py b/tests/entrypoints/openai/utils.py index a791cab2a0c..36056a44d07 100644 --- a/tests/entrypoints/openai/utils.py +++ b/tests/entrypoints/openai/utils.py @@ -155,6 +155,8 @@ def verify_harmony_messages( assert msg.content[0].text == expected["content"] if "content_type" in expected: assert msg.content_type == expected["content_type"] + if "instructions" in expected: + assert msg.content[0].instructions == expected["instructions"] if "tool_definitions" in expected: # Check that the tool definitions match the expected list of tool names actual_tools = [t.name for t in msg.content[0].tools["functions"].tools] diff --git a/vllm/entrypoints/openai/parser/harmony_utils.py b/vllm/entrypoints/openai/parser/harmony_utils.py index e76fa38d3c3..cbd1f6bb78d 100644 --- a/vllm/entrypoints/openai/parser/harmony_utils.py +++ b/vllm/entrypoints/openai/parser/harmony_utils.py @@ -3,6 +3,7 @@ import datetime from collections.abc import Iterable, Sequence +from typing import Any from openai.types.responses.tool import Tool from openai_harmony import ( @@ -195,6 +196,12 @@ def get_user_message(content: str) -> Message: return Message.from_role_and_content(Role.USER, content) +def get_system_or_developer_message(role: str, instructions: str) -> Message: + if role == "system" and envs.VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS: + return get_system_message(instructions=instructions) + return get_developer_message(instructions=instructions) + + def parse_chat_inputs_to_harmony_messages(chat_msgs: list) -> list[Message]: """ Parse a list of messages from request.messages in the Chat Completion API to @@ -249,18 +256,96 @@ def auto_drop_analysis_messages(msgs: list[Message]) -> list[Message]: return cleaned_msgs -def flatten_chat_text_content(content: str | list | None) -> str | None: +def flatten_input_text_content(content: Any) -> str | None: """ - Extract the text parts from a chat message content field and flatten them - into a single string. + Extract text parts from a Chat Completion or Responses API content field and + flatten them into a single string. Returns None if no text content is found. """ - if isinstance(content, list): - return "".join( - item.get("text", "") - for item in content - if isinstance(item, dict) and item.get("type") == "text" + if content is None or isinstance(content, str): + return content + if not isinstance(content, list): + return None + + texts: list[str] = [] + for item in content: + if isinstance(item, str): + texts.append(item) + continue + if isinstance(item, dict): + text = item.get("text") + if text is not None: + texts.append(text) + return "".join(texts) if texts else None + + +def extract_instructions_from_messages( + messages: Sequence[Any], +) -> tuple[str | None, list[Any]]: + """ + Peel a leading system/developer Chat Completion or Responses message and + flatten its instruction text. + """ + remaining_messages = list(messages) + if not remaining_messages: + return None, remaining_messages + + first_message = remaining_messages[0] + if not isinstance(first_message, dict): + if hasattr(first_message, "to_dict"): + # Handle OpenAI Harmony Message + first_message = first_message.to_dict() + elif hasattr(first_message, "model_dump"): + first_message = first_message.model_dump(exclude_none=True) + else: + raise ValueError(f"Unknown message type: {type(first_message)}") + + if first_message.get("role") not in ( + "system", + "developer", + ): + return None, remaining_messages + + instructions = flatten_input_text_content(first_message.get("content")) + return instructions, remaining_messages[1:] + + +def build_harmony_preamble( + *, + instructions: str | None = None, + tools: list[Tool | ChatCompletionToolsParam] | None = None, + reasoning_effort: str | None = None, + browser_description: str | None = None, + python_description: str | None = None, + container_description: str | None = None, + with_custom_tools: bool = False, +) -> list[Message]: + """ + Build the standard Harmony system/developer prefix for a request. + """ + developer_instructions = system_instructions = None + if envs.VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS: + system_instructions = instructions + else: + developer_instructions = instructions + + messages = [ + get_system_message( + reasoning_effort=reasoning_effort, + browser_description=browser_description, + python_description=python_description, + container_description=container_description, + instructions=system_instructions, + with_custom_tools=with_custom_tools, ) - return content + ] + if developer_instructions or tools: + messages.append( + get_developer_message( + instructions=developer_instructions, + tools=tools, + ) + ) + return messages def parse_chat_input_to_harmony_message( @@ -283,7 +368,7 @@ def parse_chat_input_to_harmony_message( tool_calls = chat_msg.get("tool_calls", []) if role == "assistant" and tool_calls: - content = flatten_chat_text_content(chat_msg.get("content")) + content = flatten_input_text_content(chat_msg.get("content")) if content: commentary_msg = Message.from_role_and_content(Role.ASSISTANT, content) commentary_msg = commentary_msg.with_channel("commentary") @@ -313,8 +398,7 @@ def parse_chat_input_to_harmony_message( if role == "tool": tool_call_id = chat_msg.get("tool_call_id", "") name = tool_id_names.get(tool_call_id, "") - content = chat_msg.get("content", "") or "" - content = flatten_chat_text_content(content) + content = flatten_input_text_content(chat_msg.get("content")) or "" msg = ( Message.from_author_and_content( @@ -349,7 +433,12 @@ def parse_chat_input_to_harmony_message( # Send non-tool assistant messages to the final channel msg = msg.with_channel("final") msgs.append(msg) - # For user/system/developer messages, add them directly even if no content. + elif role in ("system", "developer"): + instructions = flatten_input_text_content(chat_msg.get("content")) + if instructions is not None: + msg = get_system_or_developer_message(role, instructions) + msgs.append(msg) + # For user messages, add them directly even if no content. elif role != "assistant": msg = Message.from_role_and_contents(role, contents) msgs.append(msg) diff --git a/vllm/entrypoints/openai/responses/harmony.py b/vllm/entrypoints/openai/responses/harmony.py index cfe5fb67bd2..8dee0d993d5 100644 --- a/vllm/entrypoints/openai/responses/harmony.py +++ b/vllm/entrypoints/openai/responses/harmony.py @@ -32,7 +32,8 @@ from openai_harmony import Author, Message, Role, StreamableParser, TextContent from vllm.entrypoints.openai.parser.harmony_utils import ( BUILTIN_TOOL_TO_MCP_SERVER_LABEL, extract_function_from_recipient, - flatten_chat_text_content, + flatten_input_text_content, + get_system_or_developer_message, is_function_recipient, ) from vllm.entrypoints.openai.responses.protocol import ( @@ -109,8 +110,7 @@ def _parse_chat_format_message(chat_msg: dict) -> list[Message]: name = chat_msg.get("name", "") if name and not name.startswith("functions."): name = f"functions.{name}" - content = chat_msg.get("content", "") or "" - content = flatten_chat_text_content(content) + content = flatten_input_text_content(chat_msg.get("content")) or "" # NOTE: .with_recipient("assistant") is required on tool messages # to match parse_chat_input_to_harmony_message behavior and ensure # proper routing in the Harmony protocol. @@ -121,7 +121,15 @@ def _parse_chat_format_message(chat_msg: dict) -> list[Message]: ) return [msg] - # Default: user/assistant/system messages + # System/developer messages into proper DeveloperContent + if role in ("system", "developer"): + text = flatten_input_text_content(chat_msg.get("content")) + if text: + msg = get_system_or_developer_message(role, text) + return [msg] + return [] + + # Default: user/assistant messages content = chat_msg.get("content", "") if isinstance(content, str): contents = [TextContent(text=content)] @@ -151,13 +159,17 @@ def response_input_to_harmony( if "type" not in response_msg or response_msg["type"] == "message": role = response_msg["role"] content = response_msg["content"] - # Add prefix for developer messages. - # <|start|>developer<|message|># Instructions {instructions}<|end|> - text_prefix = "Instructions:\n" if role == "developer" else "" - if isinstance(content, str): - msg = Message.from_role_and_content(role, text_prefix + content) + if role in ("system", "developer"): + text = flatten_input_text_content(content) + if text: + msg = get_system_or_developer_message(role, text) + else: + # Empty content — skip, no message emitted. + return None + elif isinstance(content, str): + msg = Message.from_role_and_content(role, content) else: - contents = [TextContent(text=text_prefix + c["text"]) for c in content] + contents = [TextContent(text=c.get("text", "")) for c in content] msg = Message.from_role_and_contents(role, contents) if role == "assistant": msg = msg.with_channel("final") diff --git a/vllm/entrypoints/openai/responses/serving.py b/vllm/entrypoints/openai/responses/serving.py index 112328def21..7ae57ac3578 100644 --- a/vllm/entrypoints/openai/responses/serving.py +++ b/vllm/entrypoints/openai/responses/serving.py @@ -44,8 +44,8 @@ from vllm.entrypoints.openai.engine.serving import ( ) from vllm.entrypoints.openai.models.serving import OpenAIServingModels from vllm.entrypoints.openai.parser.harmony_utils import ( - get_developer_message, - get_system_message, + build_harmony_preamble, + extract_instructions_from_messages, get_user_message, has_custom_tools, render_for_completion, @@ -1078,37 +1078,9 @@ class OpenAIServingResponses(OpenAIServing): output_items.extend(last_items) return output_items - def _extract_system_message_from_request( - self, request: ResponsesRequest - ) -> str | None: - system_msg = None - if not isinstance(request.input, str): - for response_msg in request.input: - if ( - isinstance(response_msg, dict) - and response_msg.get("role") == "system" - ): - content = response_msg.get("content") - if isinstance(content, str): - system_msg = content - elif isinstance(content, list): - for param in content: - if ( - isinstance(param, dict) - and param.get("type") == "input_text" - ): - system_msg = param.get("text") - break - break - return system_msg - - def _construct_harmony_system_input_message( - self, request: ResponsesRequest, with_custom_tools: bool, tool_types: set[str] - ) -> OpenAIHarmonyMessage: - model_identity = self._extract_system_message_from_request(request) - - reasoning_effort = request.reasoning.effort if request.reasoning else None - + def _get_harmony_builtin_tool_descriptions( + self, request: ResponsesRequest, tool_types: set[str] + ) -> dict[str, str | None]: # Extract allowed_tools from MCP tool requests allowed_tools_map = _extract_allowed_tools_from_mcp_requests(request.tools) @@ -1141,17 +1113,11 @@ class OpenAIServingResponses(OpenAIServing): and self.tool_server.has_tool("container") else None ) - - sys_msg = get_system_message( - model_identity=model_identity, - reasoning_effort=reasoning_effort, - browser_description=browser_description, - python_description=python_description, - container_description=container_description, - instructions=request.instructions, - with_custom_tools=with_custom_tools, - ) - return sys_msg + return { + "browser_description": browser_description, + "python_description": python_description, + "container_description": container_description, + } def _construct_input_messages_with_harmony( self, @@ -1159,20 +1125,31 @@ class OpenAIServingResponses(OpenAIServing): prev_response: ResponsesResponse | None, ) -> list[OpenAIHarmonyMessage]: messages: list[OpenAIHarmonyMessage] = [] + request_input = request.input if prev_response is None: # New conversation. tool_types = extract_tool_types(request.tools) with_custom_tools = has_custom_tools(tool_types) - - sys_msg = self._construct_harmony_system_input_message( - request, with_custom_tools, tool_types - ) - messages.append(sys_msg) - if with_custom_tools: - dev_msg = get_developer_message( - instructions=request.instructions, tools=request.tools + instructions = request.instructions + if instructions is None and isinstance(request_input, list): + instructions, request_input = extract_instructions_from_messages( + request_input ) - messages.append(dev_msg) + tool_descriptions = self._get_harmony_builtin_tool_descriptions( + request, tool_types + ) + tools = request.tools if with_custom_tools else None + messages.extend( + build_harmony_preamble( + instructions=instructions, + tools=tools, + reasoning_effort=( + request.reasoning.effort if request.reasoning else None + ), + with_custom_tools=with_custom_tools, + **tool_descriptions, + ) + ) messages += construct_harmony_previous_input_messages(request) else: @@ -1208,20 +1185,20 @@ class OpenAIServingResponses(OpenAIServing): messages.extend(prev_msgs) # Append the new input. # Responses API supports simple text inputs without chat format. - if isinstance(request.input, str): + if isinstance(request_input, str): # Skip empty string input when previous_input_messages supplies # the full conversation history --- an empty trailing user message # confuses the model into thinking nothing was sent. - if request.input or not request.previous_input_messages: - messages.append(get_user_message(request.input)) + if request_input or not request.previous_input_messages: + messages.append(get_user_message(request_input)) else: if prev_response is not None: prev_outputs = copy(prev_response.output) else: prev_outputs = [] - for response_msg in request.input: + for response_msg in request_input: new_msg = response_input_to_harmony(response_msg, prev_outputs) - if new_msg is not None and new_msg.author.role != "system": + if new_msg is not None: messages.append(new_msg) # User passes in a tool call request and its output. We need diff --git a/vllm/entrypoints/serve/render/serving.py b/vllm/entrypoints/serve/render/serving.py index e8e0c254460..9b51bc53daa 100644 --- a/vllm/entrypoints/serve/render/serving.py +++ b/vllm/entrypoints/serve/render/serving.py @@ -18,8 +18,8 @@ from vllm.entrypoints.openai.engine.protocol import ( ) from vllm.entrypoints.openai.models.serving import OpenAIModelRegistry from vllm.entrypoints.openai.parser.harmony_utils import ( - get_developer_message, - get_system_message, + build_harmony_preamble, + extract_instructions_from_messages, parse_chat_inputs_to_harmony_messages, render_for_completion, ) @@ -405,6 +405,9 @@ class OpenAIServingRender: # for more info: see comment in `maybe_serialize_tool_calls` _mt.maybe_serialize_tool_calls(request) # type: ignore[arg-type] + chat_messages = list(request.messages) + instructions, chat_messages = extract_instructions_from_messages(chat_messages) + # Add system message. # NOTE: In Chat Completion API, browsing is enabled by default # if the model supports it. TODO: Support browsing. @@ -412,23 +415,18 @@ class OpenAIServingRender: assert not self.supports_code_interpreter if (reasoning_effort := request.reasoning_effort) == "none": raise ValueError(f"Harmony does not support {reasoning_effort=}") - sys_msg = get_system_message( - reasoning_effort=reasoning_effort, - browser_description=None, - python_description=None, - with_custom_tools=should_include_tools, - ) - messages.append(sys_msg) - - # Add developer message. - if request.tools: - dev_msg = get_developer_message( - tools=request.tools if should_include_tools else None # type: ignore[arg-type] + tools = request.tools if should_include_tools else None + messages.extend( + build_harmony_preamble( + instructions=instructions, + tools=tools, # type: ignore[arg-type] + reasoning_effort=reasoning_effort, + with_custom_tools=should_include_tools, ) - messages.append(dev_msg) + ) - # Add user message. - messages.extend(parse_chat_inputs_to_harmony_messages(request.messages)) + # Add remaining conversation messages. + messages.extend(parse_chat_inputs_to_harmony_messages(chat_messages)) # Render prompt token ids. prompt_token_ids = render_for_completion(messages)