diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index c7338b4828d..97fc9c2bb91 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1299,12 +1299,11 @@ steps: source_file_dependencies: - vllm/ - tests/entrypoints/llm - - tests/entrypoints/offline_mode commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - - pytest -v -s entrypoints/llm/test_generate.py - - pytest -v -s entrypoints/offline_mode + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py --ignore=entrypoints/llm/offline_mode + - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process + - pytest -v -s entrypoints/llm/offline_mode # Needs to avoid interference with other tests - label: Entrypoints Integration (Pooling) # TBD timeout_in_minutes: 180 @@ -1346,7 +1345,7 @@ steps: - vllm/platforms/rocm.py commands: - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/offline_mode --ignore=entrypoints/openai --ignore=entrypoints/serve --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling --ignore=entrypoints/speech_to_text --ignore=tests/entrypoints/generate + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/serve --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling --ignore=entrypoints/speech_to_text --ignore=tests/entrypoints/generate - label: OpenAI API correctness # TBD timeout_in_minutes: 180 diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml index 548174ed748..613cb76eb4e 100644 --- a/.buildkite/test_areas/entrypoints.yaml +++ b/.buildkite/test_areas/entrypoints.yaml @@ -11,7 +11,7 @@ steps: - tests/entrypoints/ commands: - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/offline_mode --ignore=entrypoints/openai --ignore=entrypoints/serve --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling --ignore=entrypoints/speech_to_text --ignore=tests/entrypoints/generate + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/serve --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling --ignore=entrypoints/speech_to_text --ignore=tests/entrypoints/generate - label: Entrypoints Integration (LLM) key: entrypoints-integration-llm @@ -20,12 +20,11 @@ steps: source_file_dependencies: - vllm/ - tests/entrypoints/llm - - tests/entrypoints/offline_mode commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py --ignore=entrypoints/llm/offline_mode - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests + - pytest -v -s entrypoints/llm/offline_mode # Needs to avoid interference with other tests mirror: amd: device: mi325_1 diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index beaaa5d8642..a8947fe2324 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -34,10 +34,11 @@ /vllm/entrypoints/speech_to_text/realtime @njhill /vllm/entrypoints/speech_to_text @NickLucche /vllm/entrypoints/pooling @noooop -/vllm/entrypoints/sagemaker @DarkLight1337 +/vllm/entrypoints/serve/sagemaker @DarkLight1337 /vllm/entrypoints/serve @njhill /vllm/entrypoints/*.py @njhill /vllm/entrypoints/chat_utils.py @DarkLight1337 +/vllm/entrypoints/offline_utils.py @DarkLight1337 /vllm/entrypoints/llm.py @DarkLight1337 # Rust Frontend diff --git a/tests/entrypoints/offline_mode/__init__.py b/tests/entrypoints/llm/offline_mode/__init__.py similarity index 100% rename from tests/entrypoints/offline_mode/__init__.py rename to tests/entrypoints/llm/offline_mode/__init__.py diff --git a/tests/entrypoints/offline_mode/test_offline_mode.py b/tests/entrypoints/llm/offline_mode/test_offline_mode.py similarity index 100% rename from tests/entrypoints/offline_mode/test_offline_mode.py rename to tests/entrypoints/llm/offline_mode/test_offline_mode.py diff --git a/tests/entrypoints/sagemaker/__init__.py b/tests/entrypoints/serve/sagemaker/__init__.py similarity index 100% rename from tests/entrypoints/sagemaker/__init__.py rename to tests/entrypoints/serve/sagemaker/__init__.py diff --git a/tests/entrypoints/sagemaker/conftest.py b/tests/entrypoints/serve/sagemaker/conftest.py similarity index 97% rename from tests/entrypoints/sagemaker/conftest.py rename to tests/entrypoints/serve/sagemaker/conftest.py index 1c34d738fa7..d36c20ccd9a 100644 --- a/tests/entrypoints/sagemaker/conftest.py +++ b/tests/entrypoints/serve/sagemaker/conftest.py @@ -6,7 +6,7 @@ import pytest import pytest_asyncio -from ...utils import RemoteOpenAIServer +from tests.utils import RemoteOpenAIServer # Model name constants used across tests MODEL_NAME_SMOLLM = "HuggingFaceTB/SmolLM2-135M-Instruct" diff --git a/tests/entrypoints/sagemaker/test_sagemaker_handler_overrides.py b/tests/entrypoints/serve/sagemaker/test_sagemaker_handler_overrides.py similarity index 99% rename from tests/entrypoints/sagemaker/test_sagemaker_handler_overrides.py rename to tests/entrypoints/serve/sagemaker/test_sagemaker_handler_overrides.py index 0d4f8e88582..ebc51056bb3 100644 --- a/tests/entrypoints/sagemaker/test_sagemaker_handler_overrides.py +++ b/tests/entrypoints/serve/sagemaker/test_sagemaker_handler_overrides.py @@ -22,7 +22,8 @@ import tempfile import pytest import requests -from ...utils import RemoteOpenAIServer +from tests.utils import RemoteOpenAIServer + from .conftest import ( MODEL_NAME_SMOLLM, ) diff --git a/tests/entrypoints/sagemaker/test_sagemaker_lora_adapters.py b/tests/entrypoints/serve/sagemaker/test_sagemaker_lora_adapters.py similarity index 99% rename from tests/entrypoints/sagemaker/test_sagemaker_lora_adapters.py rename to tests/entrypoints/serve/sagemaker/test_sagemaker_lora_adapters.py index 01b3e650222..4a7d8640366 100644 --- a/tests/entrypoints/sagemaker/test_sagemaker_lora_adapters.py +++ b/tests/entrypoints/serve/sagemaker/test_sagemaker_lora_adapters.py @@ -4,7 +4,8 @@ import openai # use the official async_client for correctness check import pytest import requests -from ...utils import RemoteOpenAIServer +from tests.utils import RemoteOpenAIServer + from .conftest import MODEL_NAME_SMOLLM diff --git a/tests/entrypoints/sagemaker/test_sagemaker_middleware_integration.py b/tests/entrypoints/serve/sagemaker/test_sagemaker_middleware_integration.py similarity index 99% rename from tests/entrypoints/sagemaker/test_sagemaker_middleware_integration.py rename to tests/entrypoints/serve/sagemaker/test_sagemaker_middleware_integration.py index f1ed0c7e289..bc7574d6503 100644 --- a/tests/entrypoints/sagemaker/test_sagemaker_middleware_integration.py +++ b/tests/entrypoints/serve/sagemaker/test_sagemaker_middleware_integration.py @@ -12,7 +12,8 @@ import tempfile import pytest import requests -from ...utils import RemoteOpenAIServer +from tests.utils import RemoteOpenAIServer + from .conftest import ( MODEL_NAME_SMOLLM, ) diff --git a/tests/entrypoints/sagemaker/test_sagemaker_stateful_sessions.py b/tests/entrypoints/serve/sagemaker/test_sagemaker_stateful_sessions.py similarity index 99% rename from tests/entrypoints/sagemaker/test_sagemaker_stateful_sessions.py rename to tests/entrypoints/serve/sagemaker/test_sagemaker_stateful_sessions.py index 6206000385b..7267b4265cc 100644 --- a/tests/entrypoints/sagemaker/test_sagemaker_stateful_sessions.py +++ b/tests/entrypoints/serve/sagemaker/test_sagemaker_stateful_sessions.py @@ -6,7 +6,8 @@ import openai # use the official client for correctness check import pytest import requests -from ...utils import RemoteOpenAIServer +from tests.utils import RemoteOpenAIServer + from .conftest import ( HEADER_SAGEMAKER_CLOSED_SESSION_ID, HEADER_SAGEMAKER_NEW_SESSION_ID, diff --git a/tests/entrypoints/serve/utils/__init__.py b/tests/entrypoints/serve/utils/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/entrypoints/test_utils.py b/tests/entrypoints/serve/utils/test_api_utils.py similarity index 98% rename from tests/entrypoints/test_utils.py rename to tests/entrypoints/serve/utils/test_api_utils.py index ff65066ffd2..2dc6f76da6d 100644 --- a/tests/entrypoints/test_utils.py +++ b/tests/entrypoints/serve/utils/test_api_utils.py @@ -4,7 +4,7 @@ import pytest from vllm.entrypoints.openai.engine.protocol import StreamOptions -from vllm.entrypoints.utils import ( +from vllm.entrypoints.serve.utils.api_utils import ( get_max_tokens, sanitize_message, should_include_usage, diff --git a/tests/entrypoints/openai/test_fingerprint.py b/tests/entrypoints/serve/utils/test_fingerprint.py similarity index 97% rename from tests/entrypoints/openai/test_fingerprint.py rename to tests/entrypoints/serve/utils/test_fingerprint.py index b78ed38636c..46ec6255f4e 100644 --- a/tests/entrypoints/openai/test_fingerprint.py +++ b/tests/entrypoints/serve/utils/test_fingerprint.py @@ -6,7 +6,7 @@ from types import SimpleNamespace import pytest -from vllm.entrypoints.openai import fingerprint as fp +from vllm.entrypoints.serve.utils import fingerprint as fp def _cfg(tp=1, pp=1, dp=1, ep=False, digest="a3b21f94deadbeef"): diff --git a/tests/entrypoints/serve/utils/test_request_logger.py b/tests/entrypoints/serve/utils/test_request_logger.py new file mode 100644 index 00000000000..c17f2471e48 --- /dev/null +++ b/tests/entrypoints/serve/utils/test_request_logger.py @@ -0,0 +1,248 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from unittest.mock import MagicMock, patch + +from vllm.entrypoints.serve.utils.request_logger import RequestLogger + + +def test_request_logger_log_outputs(): + """Test the new log_outputs functionality.""" + # Create a mock logger to capture log calls + mock_logger = MagicMock() + + with patch("vllm.entrypoints.serve.utils.request_logger.logger", mock_logger): + request_logger = RequestLogger(max_log_len=None) + + # Test basic output logging + request_logger.log_outputs( + request_id="test-123", + outputs="Hello, world!", + output_token_ids=[1, 2, 3, 4], + finish_reason="stop", + is_streaming=False, + delta=False, + ) + + mock_logger.info.assert_called_once() + call_args = mock_logger.info.call_args.args + assert "Generated response %s%s" in call_args[0] + assert call_args[1] == "test-123" + assert call_args[3] == "Hello, world!" + assert call_args[4] == [1, 2, 3, 4] + assert call_args[5] == "stop" + + +def test_request_logger_log_outputs_streaming_delta(): + """Test log_outputs with streaming delta mode.""" + mock_logger = MagicMock() + + with patch("vllm.entrypoints.serve.utils.request_logger.logger", mock_logger): + request_logger = RequestLogger(max_log_len=None) + + # Test streaming delta logging + request_logger.log_outputs( + request_id="test-456", + outputs="Hello", + output_token_ids=[1], + finish_reason=None, + is_streaming=True, + delta=True, + ) + + mock_logger.info.assert_called_once() + call_args = mock_logger.info.call_args.args + assert "Generated response %s%s" in call_args[0] + assert call_args[1] == "test-456" + assert call_args[2] == " (streaming delta)" + assert call_args[3] == "Hello" + assert call_args[4] == [1] + assert call_args[5] is None + + +def test_request_logger_log_outputs_streaming_complete(): + """Test log_outputs with streaming complete mode.""" + mock_logger = MagicMock() + + with patch("vllm.entrypoints.serve.utils.request_logger.logger", mock_logger): + request_logger = RequestLogger(max_log_len=None) + + # Test streaming complete logging + request_logger.log_outputs( + request_id="test-789", + outputs="Complete response", + output_token_ids=[1, 2, 3], + finish_reason="length", + is_streaming=True, + delta=False, + ) + + mock_logger.info.assert_called_once() + call_args = mock_logger.info.call_args.args + assert "Generated response %s%s" in call_args[0] + assert call_args[1] == "test-789" + assert call_args[2] == " (streaming complete)" + assert call_args[3] == "Complete response" + assert call_args[4] == [1, 2, 3] + assert call_args[5] == "length" + + +def test_request_logger_log_outputs_with_truncation(): + """Test log_outputs respects max_log_len setting.""" + mock_logger = MagicMock() + + with patch("vllm.entrypoints.serve.utils.request_logger.logger", mock_logger): + # Set max_log_len to 10 + request_logger = RequestLogger(max_log_len=10) + + # Test output truncation + long_output = "This is a very long output that should be truncated" + long_token_ids = list(range(20)) # 20 tokens + + request_logger.log_outputs( + request_id="test-truncate", + outputs=long_output, + output_token_ids=long_token_ids, + finish_reason="stop", + is_streaming=False, + delta=False, + ) + + mock_logger.info.assert_called_once() + call_args = mock_logger.info.call_args + + # Check that output was truncated to first 10 characters + logged_output = call_args[0][3] + assert logged_output == "This is a " + assert len(logged_output) == 10 + + # Check that token IDs were truncated to first 10 tokens + logged_token_ids = call_args[0][4] + assert logged_token_ids == list(range(10)) + assert len(logged_token_ids) == 10 + + +def test_request_logger_log_outputs_none_values(): + """Test log_outputs handles None values correctly.""" + mock_logger = MagicMock() + + with patch("vllm.entrypoints.serve.utils.request_logger.logger", mock_logger): + request_logger = RequestLogger(max_log_len=None) + + # Test with None output_token_ids + request_logger.log_outputs( + request_id="test-none", + outputs="Test output", + output_token_ids=None, + finish_reason="stop", + is_streaming=False, + delta=False, + ) + + mock_logger.info.assert_called_once() + call_args = mock_logger.info.call_args.args + assert "Generated response %s%s" in call_args[0] + assert call_args[1] == "test-none" + assert call_args[3] == "Test output" + assert call_args[4] is None + assert call_args[5] == "stop" + + +def test_request_logger_log_outputs_empty_output(): + """Test log_outputs handles empty output correctly.""" + mock_logger = MagicMock() + + with patch("vllm.entrypoints.serve.utils.request_logger.logger", mock_logger): + request_logger = RequestLogger(max_log_len=5) + + # Test with empty output + request_logger.log_outputs( + request_id="test-empty", + outputs="", + output_token_ids=[], + finish_reason="stop", + is_streaming=False, + delta=False, + ) + + mock_logger.info.assert_called_once() + call_args = mock_logger.info.call_args.args + assert "Generated response %s%s" in call_args[0] + assert call_args[1] == "test-empty" + assert call_args[3] == "" + assert call_args[4] == [] + assert call_args[5] == "stop" + + +def test_request_logger_log_outputs_integration(): + """Test that log_outputs can be called alongside log_inputs.""" + mock_logger = MagicMock() + + with patch("vllm.entrypoints.serve.utils.request_logger.logger", mock_logger): + request_logger = RequestLogger(max_log_len=None) + + # Test that both methods can be called without interference + request_logger.log_inputs( + request_id="test-integration", + prompt="Test prompt", + prompt_token_ids=[1, 2, 3], + prompt_embeds=None, + params=None, + lora_request=None, + ) + + request_logger.log_outputs( + request_id="test-integration", + outputs="Test output", + output_token_ids=[4, 5, 6], + finish_reason="stop", + is_streaming=False, + delta=False, + ) + + # Should have been called twice - once for inputs, once for outputs + assert mock_logger.info.call_count == 2 + + # Check that the calls were made with correct patterns + input_call = mock_logger.info.call_args_list[0][0] + output_call = mock_logger.info.call_args_list[1][0] + + assert "Received request %s" in input_call[0] + assert input_call[1] == "test-integration" + + assert "Generated response %s%s" in output_call[0] + assert output_call[1] == "test-integration" + + +def test_streaming_complete_logs_full_text_content(): + """Test that streaming complete logging includes + full accumulated text, not just token count.""" + mock_logger = MagicMock() + + with patch("vllm.entrypoints.serve.utils.request_logger.logger", mock_logger): + request_logger = RequestLogger(max_log_len=None) + + # Test with actual content instead of token count format + full_response = "This is a complete response from streaming" + request_logger.log_outputs( + request_id="test-streaming-full-text", + outputs=full_response, + output_token_ids=None, + finish_reason="streaming_complete", + is_streaming=True, + delta=False, + ) + + mock_logger.info.assert_called_once() + call_args = mock_logger.info.call_args.args + + # Verify the logged output is the full text, not a token count format + logged_output = call_args[3] + assert logged_output == full_response + assert "tokens>" not in logged_output + assert "streaming_complete" not in logged_output + + # Verify other parameters + assert call_args[1] == "test-streaming-full-text" + assert call_args[2] == " (streaming complete)" + assert call_args[5] == "streaming_complete" diff --git a/tests/entrypoints/test_ssl_cert_refresher.py b/tests/entrypoints/serve/utils/test_ssl_cert_refresher.py similarity index 96% rename from tests/entrypoints/test_ssl_cert_refresher.py rename to tests/entrypoints/serve/utils/test_ssl_cert_refresher.py index b56fbd9fee7..57a856ce118 100644 --- a/tests/entrypoints/test_ssl_cert_refresher.py +++ b/tests/entrypoints/serve/utils/test_ssl_cert_refresher.py @@ -7,7 +7,7 @@ from ssl import SSLContext import pytest -from vllm.entrypoints.ssl import SSLCertRefresher +from vllm.entrypoints.serve.utils.ssl import SSLCertRefresher class MockSSLContext(SSLContext): diff --git a/tests/test_logger.py b/tests/test_logger.py index b4f44f52d4d..2ff100151b2 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -10,12 +10,11 @@ from dataclasses import dataclass from json.decoder import JSONDecodeError from tempfile import NamedTemporaryFile from typing import Any -from unittest.mock import MagicMock, patch +from unittest.mock import patch from uuid import uuid4 import pytest -from vllm.entrypoints.logger import RequestLogger from vllm.logger import ( _DATE_FORMAT, _FORMAT, @@ -269,248 +268,6 @@ def test_prepare_object_to_dump(): assert prepare_object_to_dump(CustomClass(1, "b")) == "CustomClass(a=1, b='b')" -def test_request_logger_log_outputs(): - """Test the new log_outputs functionality.""" - # Create a mock logger to capture log calls - mock_logger = MagicMock() - - with patch("vllm.entrypoints.logger.logger", mock_logger): - request_logger = RequestLogger(max_log_len=None) - - # Test basic output logging - request_logger.log_outputs( - request_id="test-123", - outputs="Hello, world!", - output_token_ids=[1, 2, 3, 4], - finish_reason="stop", - is_streaming=False, - delta=False, - ) - - mock_logger.info.assert_called_once() - call_args = mock_logger.info.call_args.args - assert "Generated response %s%s" in call_args[0] - assert call_args[1] == "test-123" - assert call_args[3] == "Hello, world!" - assert call_args[4] == [1, 2, 3, 4] - assert call_args[5] == "stop" - - -def test_request_logger_log_outputs_streaming_delta(): - """Test log_outputs with streaming delta mode.""" - mock_logger = MagicMock() - - with patch("vllm.entrypoints.logger.logger", mock_logger): - request_logger = RequestLogger(max_log_len=None) - - # Test streaming delta logging - request_logger.log_outputs( - request_id="test-456", - outputs="Hello", - output_token_ids=[1], - finish_reason=None, - is_streaming=True, - delta=True, - ) - - mock_logger.info.assert_called_once() - call_args = mock_logger.info.call_args.args - assert "Generated response %s%s" in call_args[0] - assert call_args[1] == "test-456" - assert call_args[2] == " (streaming delta)" - assert call_args[3] == "Hello" - assert call_args[4] == [1] - assert call_args[5] is None - - -def test_request_logger_log_outputs_streaming_complete(): - """Test log_outputs with streaming complete mode.""" - mock_logger = MagicMock() - - with patch("vllm.entrypoints.logger.logger", mock_logger): - request_logger = RequestLogger(max_log_len=None) - - # Test streaming complete logging - request_logger.log_outputs( - request_id="test-789", - outputs="Complete response", - output_token_ids=[1, 2, 3], - finish_reason="length", - is_streaming=True, - delta=False, - ) - - mock_logger.info.assert_called_once() - call_args = mock_logger.info.call_args.args - assert "Generated response %s%s" in call_args[0] - assert call_args[1] == "test-789" - assert call_args[2] == " (streaming complete)" - assert call_args[3] == "Complete response" - assert call_args[4] == [1, 2, 3] - assert call_args[5] == "length" - - -def test_request_logger_log_outputs_with_truncation(): - """Test log_outputs respects max_log_len setting.""" - mock_logger = MagicMock() - - with patch("vllm.entrypoints.logger.logger", mock_logger): - # Set max_log_len to 10 - request_logger = RequestLogger(max_log_len=10) - - # Test output truncation - long_output = "This is a very long output that should be truncated" - long_token_ids = list(range(20)) # 20 tokens - - request_logger.log_outputs( - request_id="test-truncate", - outputs=long_output, - output_token_ids=long_token_ids, - finish_reason="stop", - is_streaming=False, - delta=False, - ) - - mock_logger.info.assert_called_once() - call_args = mock_logger.info.call_args - - # Check that output was truncated to first 10 characters - logged_output = call_args[0][3] - assert logged_output == "This is a " - assert len(logged_output) == 10 - - # Check that token IDs were truncated to first 10 tokens - logged_token_ids = call_args[0][4] - assert logged_token_ids == list(range(10)) - assert len(logged_token_ids) == 10 - - -def test_request_logger_log_outputs_none_values(): - """Test log_outputs handles None values correctly.""" - mock_logger = MagicMock() - - with patch("vllm.entrypoints.logger.logger", mock_logger): - request_logger = RequestLogger(max_log_len=None) - - # Test with None output_token_ids - request_logger.log_outputs( - request_id="test-none", - outputs="Test output", - output_token_ids=None, - finish_reason="stop", - is_streaming=False, - delta=False, - ) - - mock_logger.info.assert_called_once() - call_args = mock_logger.info.call_args.args - assert "Generated response %s%s" in call_args[0] - assert call_args[1] == "test-none" - assert call_args[3] == "Test output" - assert call_args[4] is None - assert call_args[5] == "stop" - - -def test_request_logger_log_outputs_empty_output(): - """Test log_outputs handles empty output correctly.""" - mock_logger = MagicMock() - - with patch("vllm.entrypoints.logger.logger", mock_logger): - request_logger = RequestLogger(max_log_len=5) - - # Test with empty output - request_logger.log_outputs( - request_id="test-empty", - outputs="", - output_token_ids=[], - finish_reason="stop", - is_streaming=False, - delta=False, - ) - - mock_logger.info.assert_called_once() - call_args = mock_logger.info.call_args.args - assert "Generated response %s%s" in call_args[0] - assert call_args[1] == "test-empty" - assert call_args[3] == "" - assert call_args[4] == [] - assert call_args[5] == "stop" - - -def test_request_logger_log_outputs_integration(): - """Test that log_outputs can be called alongside log_inputs.""" - mock_logger = MagicMock() - - with patch("vllm.entrypoints.logger.logger", mock_logger): - request_logger = RequestLogger(max_log_len=None) - - # Test that both methods can be called without interference - request_logger.log_inputs( - request_id="test-integration", - prompt="Test prompt", - prompt_token_ids=[1, 2, 3], - prompt_embeds=None, - params=None, - lora_request=None, - ) - - request_logger.log_outputs( - request_id="test-integration", - outputs="Test output", - output_token_ids=[4, 5, 6], - finish_reason="stop", - is_streaming=False, - delta=False, - ) - - # Should have been called twice - once for inputs, once for outputs - assert mock_logger.info.call_count == 2 - - # Check that the calls were made with correct patterns - input_call = mock_logger.info.call_args_list[0][0] - output_call = mock_logger.info.call_args_list[1][0] - - assert "Received request %s" in input_call[0] - assert input_call[1] == "test-integration" - - assert "Generated response %s%s" in output_call[0] - assert output_call[1] == "test-integration" - - -def test_streaming_complete_logs_full_text_content(): - """Test that streaming complete logging includes - full accumulated text, not just token count.""" - mock_logger = MagicMock() - - with patch("vllm.entrypoints.logger.logger", mock_logger): - request_logger = RequestLogger(max_log_len=None) - - # Test with actual content instead of token count format - full_response = "This is a complete response from streaming" - request_logger.log_outputs( - request_id="test-streaming-full-text", - outputs=full_response, - output_token_ids=None, - finish_reason="streaming_complete", - is_streaming=True, - delta=False, - ) - - mock_logger.info.assert_called_once() - call_args = mock_logger.info.call_args.args - - # Verify the logged output is the full text, not a token count format - logged_output = call_args[3] - assert logged_output == full_response - assert "tokens>" not in logged_output - assert "streaming_complete" not in logged_output - - # Verify other parameters - assert call_args[1] == "test-streaming-full-text" - assert call_args[2] == " (streaming complete)" - assert call_args[5] == "streaming_complete" - - # Add vllm prefix to make sure logs go through the vllm logger test_logger = init_logger("vllm.test_logger") diff --git a/vllm/benchmarks/sweep/cli.py b/vllm/benchmarks/sweep/cli.py index 75549105fa9..a30f2ab0182 100644 --- a/vllm/benchmarks/sweep/cli.py +++ b/vllm/benchmarks/sweep/cli.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse -from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG +from vllm.entrypoints.serve.utils.api_utils import VLLM_SUBCMD_PARSER_EPILOG from .plot import SweepPlotArgs from .plot import main as plot_main diff --git a/vllm/entrypoints/anthropic/api_router.py b/vllm/entrypoints/anthropic/api_router.py index 1fe2be89962..50a8dae9ec7 100644 --- a/vllm/entrypoints/anthropic/api_router.py +++ b/vllm/entrypoints/anthropic/api_router.py @@ -17,9 +17,9 @@ from vllm.entrypoints.anthropic.protocol import ( ) from vllm.entrypoints.anthropic.serving import AnthropicServingMessages from vllm.entrypoints.openai.engine.protocol import ErrorResponse -from vllm.entrypoints.openai.utils import validate_json_request -from vllm.entrypoints.utils import ( +from vllm.entrypoints.serve.utils.api_utils import ( load_aware_call, + validate_json_request, with_cancellation, ) from vllm.logger import init_logger diff --git a/vllm/entrypoints/anthropic/serving.py b/vllm/entrypoints/anthropic/serving.py index 2bdec6f4ec3..806261b597b 100644 --- a/vllm/entrypoints/anthropic/serving.py +++ b/vllm/entrypoints/anthropic/serving.py @@ -29,7 +29,6 @@ from vllm.entrypoints.anthropic.protocol import ( AnthropicUsage, ) from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption -from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionNamedToolChoiceParam, ChatCompletionRequest, @@ -45,6 +44,7 @@ from vllm.entrypoints.openai.engine.protocol import ( StreamOptions, ) from vllm.entrypoints.openai.models.serving import OpenAIServingModels +from vllm.entrypoints.serve.utils.request_logger import RequestLogger if TYPE_CHECKING: from vllm.entrypoints.serve.render.serving import OpenAIServingRender diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 7512723515e..f950b52d881 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -22,7 +22,7 @@ import vllm.envs as envs from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.entrypoints.launcher import serve_http -from vllm.entrypoints.utils import with_cancellation +from vllm.entrypoints.serve.utils.api_utils import with_cancellation from vllm.logger import init_logger from vllm.sampling_params import SamplingParams from vllm.usage.usage_lib import UsageContext diff --git a/vllm/entrypoints/cli/benchmark/main.py b/vllm/entrypoints/cli/benchmark/main.py index f64de4cf673..1afac64b148 100644 --- a/vllm/entrypoints/cli/benchmark/main.py +++ b/vllm/entrypoints/cli/benchmark/main.py @@ -7,7 +7,7 @@ import typing from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase from vllm.entrypoints.cli.types import CLISubcommand -from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG +from vllm.entrypoints.serve.utils.api_utils import VLLM_SUBCMD_PARSER_EPILOG if typing.TYPE_CHECKING: from vllm.utils.argparse_utils import FlexibleArgumentParser diff --git a/vllm/entrypoints/cli/launch.py b/vllm/entrypoints/cli/launch.py index 0af9f32c3ee..50e46d81cc9 100644 --- a/vllm/entrypoints/cli/launch.py +++ b/vllm/entrypoints/cli/launch.py @@ -18,7 +18,7 @@ from vllm.entrypoints.openai.cli_args import ( make_arg_parser, validate_parsed_serve_args, ) -from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG +from vllm.entrypoints.serve.utils.api_utils import VLLM_SUBCMD_PARSER_EPILOG from vllm.logger import init_logger from vllm.utils.argparse_utils import FlexibleArgumentParser diff --git a/vllm/entrypoints/cli/main.py b/vllm/entrypoints/cli/main.py index ac7f9e0a7e0..fe0b339b3ed 100644 --- a/vllm/entrypoints/cli/main.py +++ b/vllm/entrypoints/cli/main.py @@ -21,7 +21,10 @@ def main(): import vllm.entrypoints.cli.openai import vllm.entrypoints.cli.run_batch import vllm.entrypoints.cli.serve - from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG, cli_env_setup + from vllm.entrypoints.serve.utils.api_utils import ( + VLLM_SUBCMD_PARSER_EPILOG, + cli_env_setup, + ) from vllm.utils.argparse_utils import FlexibleArgumentParser CMD_MODULES = [ diff --git a/vllm/entrypoints/cli/run_batch.py b/vllm/entrypoints/cli/run_batch.py index 64d1bec1f1f..85253adde14 100644 --- a/vllm/entrypoints/cli/run_batch.py +++ b/vllm/entrypoints/cli/run_batch.py @@ -7,7 +7,7 @@ import importlib.metadata import typing from vllm.entrypoints.cli.types import CLISubcommand -from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG +from vllm.entrypoints.serve.utils.api_utils import VLLM_SUBCMD_PARSER_EPILOG from vllm.logger import init_logger if typing.TYPE_CHECKING: diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index ea4bf1b62d1..415ab549cc7 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -15,7 +15,7 @@ from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_se from vllm.entrypoints.openai.dp_supervisor import ( run_dp_supervisor, ) -from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG +from vllm.entrypoints.serve.utils.api_utils import VLLM_SUBCMD_PARSER_EPILOG from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext from vllm.utils.argparse_utils import FlexibleArgumentParser diff --git a/vllm/entrypoints/generate/api_router.py b/vllm/entrypoints/generate/api_router.py index 713e2566bc5..b1e6cea44fe 100644 --- a/vllm/entrypoints/generate/api_router.py +++ b/vllm/entrypoints/generate/api_router.py @@ -10,7 +10,7 @@ if TYPE_CHECKING: from starlette.datastructures import State from vllm.engine.protocol import EngineClient - from vllm.entrypoints.logger import RequestLogger + from vllm.entrypoints.serve.utils.request_logger import RequestLogger from vllm.tasks import SupportedTask else: RequestLogger = object @@ -65,9 +65,9 @@ async def init_generate_state( ) from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat from vllm.entrypoints.openai.completion.serving import OpenAIServingCompletion - from vllm.entrypoints.openai.fingerprint import set_default_fingerprint_mode from vllm.entrypoints.openai.responses.serving import OpenAIServingResponses from vllm.entrypoints.serve.disagg.serving import ServingTokens + from vllm.entrypoints.serve.utils.fingerprint import set_default_fingerprint_mode # Applied before any serving class is constructed so that each one picks # up the chosen mode on its first cache miss. diff --git a/vllm/entrypoints/generate/factories.py b/vllm/entrypoints/generate/factories.py index 899601db3ca..8c963edc618 100644 --- a/vllm/entrypoints/generate/factories.py +++ b/vllm/entrypoints/generate/factories.py @@ -6,7 +6,7 @@ from vllm.config import ModelConfig from vllm.tasks import SupportedTask if TYPE_CHECKING: - from vllm.entrypoints.sagemaker.api_router import ( + from vllm.entrypoints.serve.sagemaker.api_router import ( EndpointFn, GetHandlerFn, RequestType, diff --git a/vllm/entrypoints/generate/generative_scoring/api_router.py b/vllm/entrypoints/generate/generative_scoring/api_router.py index e6918b7f03b..480dac822f1 100644 --- a/vllm/entrypoints/generate/generative_scoring/api_router.py +++ b/vllm/entrypoints/generate/generative_scoring/api_router.py @@ -10,8 +10,11 @@ from vllm.entrypoints.generate.generative_scoring.serving import ( ServingGenerativeScoring, ) from vllm.entrypoints.openai.engine.protocol import ErrorResponse -from vllm.entrypoints.openai.utils import validate_json_request -from vllm.entrypoints.utils import load_aware_call, with_cancellation +from vllm.entrypoints.serve.utils.api_utils import ( + load_aware_call, + validate_json_request, + with_cancellation, +) from vllm.logger import init_logger router = APIRouter() diff --git a/vllm/entrypoints/generate/generative_scoring/serving.py b/vllm/entrypoints/generate/generative_scoring/serving.py index 0592d0b29af..f656755ac03 100644 --- a/vllm/entrypoints/generate/generative_scoring/serving.py +++ b/vllm/entrypoints/generate/generative_scoring/serving.py @@ -18,7 +18,6 @@ from fastapi import Request from pydantic import Field from vllm.engine.protocol import EngineClient -from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.engine.protocol import ( ErrorResponse, OpenAIBaseModel, @@ -26,6 +25,7 @@ from vllm.entrypoints.openai.engine.protocol import ( ) from vllm.entrypoints.openai.engine.serving import OpenAIServing from vllm.entrypoints.openai.models.serving import OpenAIServingModels +from vllm.entrypoints.serve.utils.request_logger import RequestLogger from vllm.inputs import EngineInput, tokens_input from vllm.logger import init_logger from vllm.outputs import RequestOutput diff --git a/vllm/entrypoints/grpc_server.py b/vllm/entrypoints/grpc_server.py index b9173b302ca..59269dd1802 100644 --- a/vllm/entrypoints/grpc_server.py +++ b/vllm/entrypoints/grpc_server.py @@ -43,7 +43,7 @@ import uvloop from vllm import envs from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.entrypoints.utils import log_version_and_model +from vllm.entrypoints.serve.utils.api_utils import log_version_and_model from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext from vllm.utils.argparse_utils import FlexibleArgumentParser diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py index 8caeb80836f..a560db87ea2 100644 --- a/vllm/entrypoints/launcher.py +++ b/vllm/entrypoints/launcher.py @@ -12,11 +12,11 @@ from fastapi import FastAPI from vllm import envs from vllm.engine.protocol import EngineClient -from vllm.entrypoints.constants import ( +from vllm.entrypoints.serve.utils.constants import ( H11_MAX_HEADER_COUNT_DEFAULT, H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT, ) -from vllm.entrypoints.ssl import SSLCertRefresher +from vllm.entrypoints.serve.utils.ssl import SSLCertRefresher from vllm.logger import init_logger from vllm.utils.network_utils import find_process_using_port diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 802d7a6d796..7297243f918 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -40,7 +40,7 @@ from vllm.entrypoints.chat_utils import ( ) from vllm.entrypoints.generate.beam_search.offline import BeamSearchOfflineMixin from vllm.entrypoints.pooling.offline import PoolingOfflineMixin -from vllm.entrypoints.utils import log_non_default_args +from vllm.entrypoints.serve.utils.api_utils import log_non_default_args from vllm.inputs import PromptType from vllm.logger import init_logger from vllm.lora.request import LoRARequest diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 892f9d82d70..ad008d02f6b 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -27,12 +27,22 @@ from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import load_chat_template from vllm.entrypoints.launcher import serve_http -from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args from vllm.entrypoints.openai.engine.protocol import GenerationError from vllm.entrypoints.openai.models.protocol import BaseModelPath from vllm.entrypoints.openai.models.serving import OpenAIServingModels -from vllm.entrypoints.openai.server_utils import ( +from vllm.entrypoints.serve.elastic_ep.middleware import ScalingMiddleware +from vllm.entrypoints.serve.render.serving import OpenAIServingRender +from vllm.entrypoints.serve.sagemaker.api_router import sagemaker_standards_bootstrap +from vllm.entrypoints.serve.tokenize.serving import OpenAIServingTokenization +from vllm.entrypoints.serve.utils.api_utils import ( + cli_env_setup, + log_non_default_args, + log_version_and_model, + process_lora_modules, +) +from vllm.entrypoints.serve.utils.request_logger import RequestLogger +from vllm.entrypoints.serve.utils.server_utils import ( engine_error_handler, exception_handler, generation_error_handler, @@ -42,16 +52,6 @@ from vllm.entrypoints.openai.server_utils import ( log_response, validation_exception_handler, ) -from vllm.entrypoints.sagemaker.api_router import sagemaker_standards_bootstrap -from vllm.entrypoints.serve.elastic_ep.middleware import ScalingMiddleware -from vllm.entrypoints.serve.render.serving import OpenAIServingRender -from vllm.entrypoints.serve.tokenize.serving import OpenAIServingTokenization -from vllm.entrypoints.utils import ( - cli_env_setup, - log_non_default_args, - log_version_and_model, - process_lora_modules, -) from vllm.logger import init_logger from vllm.reasoning import ReasoningParserManager from vllm.tasks import POOLING_TASKS, SupportedTask @@ -187,7 +187,7 @@ def build_app( register_models_api_router(app) - from vllm.entrypoints.sagemaker.api_router import ( + from vllm.entrypoints.serve.sagemaker.api_router import ( attach_router as register_sagemaker_api_router, ) @@ -254,12 +254,12 @@ def build_app( # Ensure --api-key option from CLI takes precedence over VLLM_API_KEY if tokens := [key for key in (args.api_key or [envs.VLLM_API_KEY]) if key]: - from vllm.entrypoints.openai.server_utils import AuthenticationMiddleware + from vllm.entrypoints.serve.utils.server_utils import AuthenticationMiddleware app.add_middleware(AuthenticationMiddleware, tokens=tokens) if args.enable_request_id_headers: - from vllm.entrypoints.openai.server_utils import XRequestIdMiddleware + from vllm.entrypoints.serve.utils.server_utils import XRequestIdMiddleware app.add_middleware(XRequestIdMiddleware) diff --git a/vllm/entrypoints/openai/chat_completion/api_router.py b/vllm/entrypoints/openai/chat_completion/api_router.py index cdaaa27fcda..6f3289ede42 100644 --- a/vllm/entrypoints/openai/chat_completion/api_router.py +++ b/vllm/entrypoints/openai/chat_completion/api_router.py @@ -15,12 +15,12 @@ from vllm.entrypoints.openai.chat_completion.protocol import ( ) from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat from vllm.entrypoints.openai.engine.protocol import ErrorResponse -from vllm.entrypoints.openai.orca_metrics import metrics_header -from vllm.entrypoints.openai.utils import validate_json_request -from vllm.entrypoints.utils import ( +from vllm.entrypoints.serve.utils.api_utils import ( load_aware_call, + validate_json_request, with_cancellation, ) +from vllm.entrypoints.serve.utils.orca_metrics import metrics_header from vllm.logger import init_logger logger = init_logger(__name__) diff --git a/vllm/entrypoints/openai/chat_completion/batch_serving.py b/vllm/entrypoints/openai/chat_completion/batch_serving.py index 0dfcdd92515..852a26967a0 100644 --- a/vllm/entrypoints/openai/chat_completion/batch_serving.py +++ b/vllm/entrypoints/openai/chat_completion/batch_serving.py @@ -21,7 +21,7 @@ from vllm.entrypoints.openai.engine.protocol import ( RequestResponseMetadata, UsageInfo, ) -from vllm.entrypoints.utils import get_max_tokens +from vllm.entrypoints.serve.utils.api_utils import get_max_tokens from vllm.inputs import EngineInput from vllm.logger import init_logger from vllm.outputs import RequestOutput diff --git a/vllm/entrypoints/openai/chat_completion/serving.py b/vllm/entrypoints/openai/chat_completion/serving.py index a378fb79d3b..9dd9a34162e 100644 --- a/vllm/entrypoints/openai/chat_completion/serving.py +++ b/vllm/entrypoints/openai/chat_completion/serving.py @@ -21,7 +21,6 @@ from vllm.entrypoints.chat_utils import ( get_tool_call_id_type, make_tool_call_id, ) -from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionLogProb, ChatCompletionLogProbs, @@ -57,8 +56,11 @@ from vllm.entrypoints.openai.parser.harmony_utils import ( get_streamable_parser_for_assistant, parse_chat_output, ) -from vllm.entrypoints.openai.utils import maybe_filter_parallel_tool_calls -from vllm.entrypoints.utils import get_max_tokens, should_include_usage +from vllm.entrypoints.serve.utils.api_utils import get_max_tokens, should_include_usage +from vllm.entrypoints.serve.utils.request_logger import RequestLogger +from vllm.entrypoints.serve.utils.tool_calls_utils import ( + maybe_filter_parallel_tool_calls, +) from vllm.inputs import EngineInput from vllm.logger import init_logger from vllm.logprobs import Logprob diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index d130e83422a..1533895edcd 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -20,11 +20,11 @@ from vllm.entrypoints.chat_utils import ( ChatTemplateContentFormatOption, validate_chat_template, ) -from vllm.entrypoints.constants import ( +from vllm.entrypoints.openai.models.protocol import LoRAModulePath +from vllm.entrypoints.serve.utils.constants import ( H11_MAX_HEADER_COUNT_DEFAULT, H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT, ) -from vllm.entrypoints.openai.models.protocol import LoRAModulePath from vllm.logger import init_logger from vllm.tool_parsers import ToolParserManager from vllm.utils.argparse_utils import FlexibleArgumentParser diff --git a/vllm/entrypoints/openai/completion/api_router.py b/vllm/entrypoints/openai/completion/api_router.py index 4d8e0f88583..441aef165c4 100644 --- a/vllm/entrypoints/openai/completion/api_router.py +++ b/vllm/entrypoints/openai/completion/api_router.py @@ -13,12 +13,12 @@ from vllm.entrypoints.openai.completion.protocol import ( ) from vllm.entrypoints.openai.completion.serving import OpenAIServingCompletion from vllm.entrypoints.openai.engine.protocol import ErrorResponse -from vllm.entrypoints.openai.orca_metrics import metrics_header -from vllm.entrypoints.openai.utils import validate_json_request -from vllm.entrypoints.utils import ( +from vllm.entrypoints.serve.utils.api_utils import ( load_aware_call, + validate_json_request, with_cancellation, ) +from vllm.entrypoints.serve.utils.orca_metrics import metrics_header from vllm.logger import init_logger logger = init_logger(__name__) diff --git a/vllm/entrypoints/openai/completion/serving.py b/vllm/entrypoints/openai/completion/serving.py index f393954e2a0..ed85323d806 100644 --- a/vllm/entrypoints/openai/completion/serving.py +++ b/vllm/entrypoints/openai/completion/serving.py @@ -13,7 +13,6 @@ import pybase64 as base64 from fastapi import Request from vllm.engine.protocol import EngineClient -from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.completion.protocol import ( CompletionLogProbs, CompletionRequest, @@ -34,7 +33,8 @@ from vllm.entrypoints.openai.engine.serving import ( clamp_prompt_logprobs, ) from vllm.entrypoints.openai.models.serving import OpenAIServingModels -from vllm.entrypoints.utils import get_max_tokens, should_include_usage +from vllm.entrypoints.serve.utils.api_utils import get_max_tokens, should_include_usage +from vllm.entrypoints.serve.utils.request_logger import RequestLogger from vllm.exceptions import VLLMValidationError from vllm.inputs import EngineInput from vllm.logger import init_logger diff --git a/vllm/entrypoints/openai/engine/serving.py b/vllm/entrypoints/openai/engine/serving.py index 61b2656bac0..f3e07336e82 100644 --- a/vllm/entrypoints/openai/engine/serving.py +++ b/vllm/entrypoints/openai/engine/serving.py @@ -16,7 +16,6 @@ from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption from vllm.entrypoints.generate.beam_search.online import BeamSearchOnlineMixin -from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.chat_completion.protocol import ( BatchChatCompletionRequest, ChatCompletionRequest, @@ -39,12 +38,13 @@ from vllm.entrypoints.serve.tokenize.protocol import ( TokenizeCompletionRequest, TokenizeResponse, ) +from vllm.entrypoints.serve.utils.error_response import create_error_response +from vllm.entrypoints.serve.utils.request_logger import RequestLogger from vllm.entrypoints.speech_to_text.transcription.protocol import ( TranscriptionRequest, TranscriptionResponse, ) from vllm.entrypoints.speech_to_text.translation.protocol import TranslationRequest -from vllm.entrypoints.utils import create_error_response from vllm.inputs import EngineInput, PromptType from vllm.logger import init_logger from vllm.logprobs import Logprob, PromptLogprobs @@ -153,7 +153,7 @@ class OpenAIServing(BeamSearchOnlineMixin): # Computed once at startup (cached by ``vllm_config`` identity) and # stamped on non-streaming responses. Streaming chunks deliberately # omit it to avoid per-chunk overhead. - from vllm.entrypoints.openai.fingerprint import get_system_fingerprint + from vllm.entrypoints.serve.utils.fingerprint import get_system_fingerprint try: self.system_fingerprint: str | None = get_system_fingerprint( diff --git a/vllm/entrypoints/openai/models/serving.py b/vllm/entrypoints/openai/models/serving.py index 504d30f69d2..ea330678d09 100644 --- a/vllm/entrypoints/openai/models/serving.py +++ b/vllm/entrypoints/openai/models/serving.py @@ -18,7 +18,7 @@ from vllm.entrypoints.serve.lora.protocol import ( LoadLoRAAdapterRequest, UnloadLoRAAdapterRequest, ) -from vllm.entrypoints.utils import create_error_response +from vllm.entrypoints.serve.utils.error_response import create_error_response from vllm.exceptions import LoRAAdapterNotFoundError from vllm.logger import init_logger from vllm.lora.request import LoRARequest diff --git a/vllm/entrypoints/openai/parser/responses_parser.py b/vllm/entrypoints/openai/parser/responses_parser.py index 809b601fd21..1a3048b8d4f 100644 --- a/vllm/entrypoints/openai/parser/responses_parser.py +++ b/vllm/entrypoints/openai/parser/responses_parser.py @@ -12,11 +12,11 @@ from openai.types.responses.response_output_message import ResponseOutputMessage from openai.types.responses.response_output_text import ResponseOutputText from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption -from vllm.entrypoints.constants import MCP_PREFIX from vllm.entrypoints.openai.responses.protocol import ( ResponseInputOutputItem, ResponsesRequest, ) +from vllm.entrypoints.serve.utils.constants import MCP_PREFIX from vllm.outputs import CompletionOutput from vllm.parser.abstract_parser import Parser from vllm.tokenizers import TokenizerLike diff --git a/vllm/entrypoints/openai/responses/api_router.py b/vllm/entrypoints/openai/responses/api_router.py index 61077f1a7c5..7f83a44e67e 100644 --- a/vllm/entrypoints/openai/responses/api_router.py +++ b/vllm/entrypoints/openai/responses/api_router.py @@ -15,9 +15,9 @@ from vllm.entrypoints.openai.responses.protocol import ( StreamingResponsesResponse, ) from vllm.entrypoints.openai.responses.serving import OpenAIServingResponses -from vllm.entrypoints.openai.utils import validate_json_request -from vllm.entrypoints.utils import ( +from vllm.entrypoints.serve.utils.api_utils import ( load_aware_call, + validate_json_request, with_cancellation, ) from vllm.logger import init_logger diff --git a/vllm/entrypoints/openai/responses/context.py b/vllm/entrypoints/openai/responses/context.py index 62de02ef826..e72032c24aa 100644 --- a/vllm/entrypoints/openai/responses/context.py +++ b/vllm/entrypoints/openai/responses/context.py @@ -20,7 +20,6 @@ from vllm import envs from vllm.entrypoints.chat_utils import ( ChatTemplateContentFormatOption, ) -from vllm.entrypoints.constants import MCP_PREFIX from vllm.entrypoints.mcp.tool import Tool from vllm.entrypoints.mcp.tool_server import ToolServer from vllm.entrypoints.openai.engine.protocol import ( @@ -40,6 +39,7 @@ from vllm.entrypoints.openai.responses.protocol import ( ResponsesRequest, ) from vllm.entrypoints.openai.responses.utils import construct_tool_dicts +from vllm.entrypoints.serve.utils.constants import MCP_PREFIX from vllm.outputs import RequestOutput from vllm.parser.abstract_parser import Parser from vllm.tokenizers import TokenizerLike diff --git a/vllm/entrypoints/openai/responses/serving.py b/vllm/entrypoints/openai/responses/serving.py index eee02707a97..112328def21 100644 --- a/vllm/entrypoints/openai/responses/serving.py +++ b/vllm/entrypoints/openai/responses/serving.py @@ -32,7 +32,6 @@ from vllm.entrypoints.chat_utils import ( ChatTemplateContentFormatOption, get_tool_call_id_type, ) -from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.mcp.tool_server import ToolServer from vllm.entrypoints.openai.engine.protocol import ( DeltaMessage, @@ -93,7 +92,8 @@ from vllm.entrypoints.openai.responses.utils import ( extract_tool_types, ) from vllm.entrypoints.serve.render.serving import OpenAIServingRender -from vllm.entrypoints.utils import get_max_tokens +from vllm.entrypoints.serve.utils.api_utils import get_max_tokens +from vllm.entrypoints.serve.utils.request_logger import RequestLogger from vllm.exceptions import VLLMValidationError from vllm.inputs import EngineInput, tokens_input from vllm.logger import init_logger diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 327254e3acc..58975b4f86b 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -51,6 +51,7 @@ from vllm.entrypoints.pooling.scoring.protocol import ( ScoreRequest, ScoreResponse, ) +from vllm.entrypoints.serve.utils.error_response import create_error_response from vllm.entrypoints.speech_to_text.transcription.protocol import ( TranscriptionRequest, TranscriptionResponse, @@ -61,7 +62,6 @@ from vllm.entrypoints.speech_to_text.translation.protocol import ( TranslationResponse, TranslationResponseVerbose, ) -from vllm.entrypoints.utils import create_error_response from vllm.exceptions import VLLMValidationError from vllm.logger import init_logger from vllm.reasoning import ReasoningParserManager diff --git a/vllm/entrypoints/pooling/base/serving.py b/vllm/entrypoints/pooling/base/serving.py index d44d5f7f734..d849baba055 100644 --- a/vllm/entrypoints/pooling/base/serving.py +++ b/vllm/entrypoints/pooling/base/serving.py @@ -16,9 +16,9 @@ from vllm import PoolingParams, PoolingRequestOutput, envs from vllm.config import VllmConfig from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import ChatTemplateConfig -from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.engine.protocol import ErrorResponse from vllm.entrypoints.openai.models.serving import OpenAIServingModels +from vllm.entrypoints.serve.utils.request_logger import RequestLogger from vllm.exceptions import VLLMNotFoundError from vllm.inputs import EngineInput from vllm.lora.request import LoRARequest diff --git a/vllm/entrypoints/pooling/classify/api_router.py b/vllm/entrypoints/pooling/classify/api_router.py index 2d27628bc69..9e016a72e84 100644 --- a/vllm/entrypoints/pooling/classify/api_router.py +++ b/vllm/entrypoints/pooling/classify/api_router.py @@ -4,9 +4,9 @@ from fastapi import APIRouter, Depends, Request from fastapi.responses import Response -from vllm.entrypoints.openai.utils import validate_json_request -from vllm.entrypoints.utils import ( +from vllm.entrypoints.serve.utils.api_utils import ( load_aware_call, + validate_json_request, with_cancellation, ) diff --git a/vllm/entrypoints/pooling/embed/api_router.py b/vllm/entrypoints/pooling/embed/api_router.py index 4eb86e4e2d2..7ffb5840d5b 100644 --- a/vllm/entrypoints/pooling/embed/api_router.py +++ b/vllm/entrypoints/pooling/embed/api_router.py @@ -6,8 +6,11 @@ from http import HTTPStatus from fastapi import APIRouter, Depends, Request from vllm.entrypoints.openai.engine.protocol import ErrorResponse -from vllm.entrypoints.openai.utils import validate_json_request -from vllm.entrypoints.utils import load_aware_call, with_cancellation +from vllm.entrypoints.serve.utils.api_utils import ( + load_aware_call, + validate_json_request, + with_cancellation, +) from .protocol import CohereEmbedRequest, EmbeddingRequest from .serving import ServingEmbedding diff --git a/vllm/entrypoints/pooling/factories.py b/vllm/entrypoints/pooling/factories.py index 62f76a7aa28..dd3d873b311 100644 --- a/vllm/entrypoints/pooling/factories.py +++ b/vllm/entrypoints/pooling/factories.py @@ -21,12 +21,12 @@ if TYPE_CHECKING: from starlette.datastructures import State from vllm.engine.protocol import EngineClient - from vllm.entrypoints.logger import RequestLogger - from vllm.entrypoints.sagemaker.api_router import ( + from vllm.entrypoints.serve.sagemaker.api_router import ( EndpointFn, GetHandlerFn, RequestType, ) + from vllm.entrypoints.serve.utils.request_logger import RequestLogger else: RequestLogger = object diff --git a/vllm/entrypoints/pooling/pooling/api_router.py b/vllm/entrypoints/pooling/pooling/api_router.py index 0c77c050dc0..653a36f699a 100644 --- a/vllm/entrypoints/pooling/pooling/api_router.py +++ b/vllm/entrypoints/pooling/pooling/api_router.py @@ -5,8 +5,11 @@ from http import HTTPStatus from fastapi import APIRouter, Depends, Request from vllm.entrypoints.openai.engine.protocol import ErrorResponse -from vllm.entrypoints.openai.utils import validate_json_request -from vllm.entrypoints.utils import load_aware_call, with_cancellation +from vllm.entrypoints.serve.utils.api_utils import ( + load_aware_call, + validate_json_request, + with_cancellation, +) from .protocol import PoolingRequest from .serving import ServingPooling diff --git a/vllm/entrypoints/pooling/scoring/api_router.py b/vllm/entrypoints/pooling/scoring/api_router.py index cf583293eac..f67b5e912f3 100644 --- a/vllm/entrypoints/pooling/scoring/api_router.py +++ b/vllm/entrypoints/pooling/scoring/api_router.py @@ -5,8 +5,11 @@ from http import HTTPStatus from fastapi import APIRouter, Depends, Request from vllm.entrypoints.openai.engine.protocol import ErrorResponse -from vllm.entrypoints.openai.utils import validate_json_request -from vllm.entrypoints.utils import load_aware_call, with_cancellation +from vllm.entrypoints.serve.utils.api_utils import ( + load_aware_call, + validate_json_request, + with_cancellation, +) from vllm.logger import init_logger from .protocol import RerankRequest, ScoreRequest diff --git a/vllm/entrypoints/serve/disagg/api_router.py b/vllm/entrypoints/serve/disagg/api_router.py index e7c18a0914a..7cec4344b3b 100644 --- a/vllm/entrypoints/serve/disagg/api_router.py +++ b/vllm/entrypoints/serve/disagg/api_router.py @@ -13,7 +13,6 @@ from vllm.engine.protocol import EngineClient from vllm.entrypoints.openai.engine.protocol import ( ErrorResponse, ) -from vllm.entrypoints.openai.utils import validate_json_request from vllm.entrypoints.serve.disagg.protocol import ( GenerateRequest, GenerateResponse, @@ -22,8 +21,9 @@ from vllm.entrypoints.serve.disagg.serving import ( ServingTokens, ) from vllm.entrypoints.serve.tokenize.serving import OpenAIServingTokenization -from vllm.entrypoints.utils import ( +from vllm.entrypoints.serve.utils.api_utils import ( load_aware_call, + validate_json_request, with_cancellation, ) from vllm.logger import init_logger diff --git a/vllm/entrypoints/serve/disagg/serving.py b/vllm/entrypoints/serve/disagg/serving.py index 0cc227ee74d..72aeb843773 100644 --- a/vllm/entrypoints/serve/disagg/serving.py +++ b/vllm/entrypoints/serve/disagg/serving.py @@ -14,7 +14,6 @@ import pybase64 as base64 from fastapi import Request from vllm.engine.protocol import EngineClient -from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionLogProb, ChatCompletionLogProbs, @@ -38,7 +37,8 @@ from vllm.entrypoints.serve.disagg.protocol import ( GenerateStreamResponse, ) from vllm.entrypoints.serve.render.serving import OpenAIServingRender -from vllm.entrypoints.utils import get_max_tokens, should_include_usage +from vllm.entrypoints.serve.utils.api_utils import get_max_tokens, should_include_usage +from vllm.entrypoints.serve.utils.request_logger import RequestLogger from vllm.inputs import EngineInput, mm_input from vllm.logger import init_logger from vllm.logprobs import Logprob diff --git a/vllm/entrypoints/serve/elastic_ep/api_router.py b/vllm/entrypoints/serve/elastic_ep/api_router.py index 00e38b61167..e711a257ddd 100644 --- a/vllm/entrypoints/serve/elastic_ep/api_router.py +++ b/vllm/entrypoints/serve/elastic_ep/api_router.py @@ -12,11 +12,11 @@ from vllm.engine.protocol import EngineClient from vllm.entrypoints.openai.engine.protocol import ( ErrorResponse, ) -from vllm.entrypoints.openai.utils import validate_json_request from vllm.entrypoints.serve.elastic_ep.middleware import ( get_scaling_elastic_ep, set_scaling_elastic_ep, ) +from vllm.entrypoints.serve.utils.api_utils import validate_json_request from vllm.logger import init_logger logger = init_logger(__name__) diff --git a/vllm/entrypoints/serve/lora/api_router.py b/vllm/entrypoints/serve/lora/api_router.py index 39ca0ec91b2..511aeaa07ba 100644 --- a/vllm/entrypoints/serve/lora/api_router.py +++ b/vllm/entrypoints/serve/lora/api_router.py @@ -12,11 +12,11 @@ from vllm.entrypoints.openai.engine.protocol import ( ) from vllm.entrypoints.openai.models.api_router import models from vllm.entrypoints.openai.models.serving import OpenAIServingModels -from vllm.entrypoints.openai.utils import validate_json_request from vllm.entrypoints.serve.lora.protocol import ( LoadLoRAAdapterRequest, UnloadLoRAAdapterRequest, ) +from vllm.entrypoints.serve.utils.api_utils import validate_json_request from vllm.logger import init_logger logger = init_logger(__name__) diff --git a/vllm/entrypoints/serve/render/api_router.py b/vllm/entrypoints/serve/render/api_router.py index d8e6130709f..ac0c1ce67d8 100644 --- a/vllm/entrypoints/serve/render/api_router.py +++ b/vllm/entrypoints/serve/render/api_router.py @@ -8,9 +8,9 @@ from fastapi.responses import JSONResponse from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest from vllm.entrypoints.openai.completion.protocol import CompletionRequest from vllm.entrypoints.openai.engine.protocol import ErrorResponse -from vllm.entrypoints.openai.utils import validate_json_request from vllm.entrypoints.serve.disagg.protocol import GenerateRequest from vllm.entrypoints.serve.render.serving import OpenAIServingRender +from vllm.entrypoints.serve.utils.api_utils import validate_json_request from vllm.logger import init_logger logger = init_logger(__name__) diff --git a/vllm/entrypoints/serve/render/serving.py b/vllm/entrypoints/serve/render/serving.py index 782b2eaea24..e8e0c254460 100644 --- a/vllm/entrypoints/serve/render/serving.py +++ b/vllm/entrypoints/serve/render/serving.py @@ -11,7 +11,6 @@ from vllm.entrypoints.chat_utils import ( ChatTemplateContentFormatOption, ConversationMessage, ) -from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest from vllm.entrypoints.openai.completion.protocol import CompletionRequest from vllm.entrypoints.openai.engine.protocol import ( @@ -31,10 +30,9 @@ from vllm.entrypoints.serve.disagg.protocol import ( MultiModalFeatures, PlaceholderRangeInfo, ) -from vllm.entrypoints.utils import ( - create_error_response, - get_max_tokens, -) +from vllm.entrypoints.serve.utils.api_utils import get_max_tokens +from vllm.entrypoints.serve.utils.error_response import create_error_response +from vllm.entrypoints.serve.utils.request_logger import RequestLogger from vllm.inputs import ( EngineInput, MultiModalHashes, diff --git a/vllm/entrypoints/sagemaker/__init__.py b/vllm/entrypoints/serve/sagemaker/__init__.py similarity index 100% rename from vllm/entrypoints/sagemaker/__init__.py rename to vllm/entrypoints/serve/sagemaker/__init__.py diff --git a/vllm/entrypoints/sagemaker/api_router.py b/vllm/entrypoints/serve/sagemaker/api_router.py similarity index 98% rename from vllm/entrypoints/sagemaker/api_router.py rename to vllm/entrypoints/serve/sagemaker/api_router.py index 00dd7db2818..82c094d161f 100644 --- a/vllm/entrypoints/sagemaker/api_router.py +++ b/vllm/entrypoints/serve/sagemaker/api_router.py @@ -14,11 +14,11 @@ from vllm.config import ModelConfig from vllm.entrypoints.generate.factories import get_generate_invocation_types from vllm.entrypoints.openai.engine.protocol import ErrorResponse from vllm.entrypoints.openai.engine.serving import OpenAIServing -from vllm.entrypoints.openai.utils import validate_json_request from vllm.entrypoints.pooling.base.serving import PoolingServingBase from vllm.entrypoints.pooling.factories import get_pooling_invocation_types from vllm.entrypoints.serve.instrumentator.basic import base from vllm.entrypoints.serve.instrumentator.health import health +from vllm.entrypoints.serve.utils.api_utils import validate_json_request from vllm.tasks import SupportedTask # TODO: RequestType = TypeForm[BaseModel] when recognized by type checkers diff --git a/vllm/entrypoints/serve/tokenize/api_router.py b/vllm/entrypoints/serve/tokenize/api_router.py index d165b555385..eebb17c6427 100644 --- a/vllm/entrypoints/serve/tokenize/api_router.py +++ b/vllm/entrypoints/serve/tokenize/api_router.py @@ -12,7 +12,6 @@ from typing_extensions import assert_never from vllm.entrypoints.openai.engine.protocol import ( ErrorResponse, ) -from vllm.entrypoints.openai.utils import validate_json_request from vllm.entrypoints.serve.tokenize.protocol import ( DetokenizeRequest, DetokenizeResponse, @@ -20,7 +19,8 @@ from vllm.entrypoints.serve.tokenize.protocol import ( TokenizeResponse, ) from vllm.entrypoints.serve.tokenize.serving import OpenAIServingTokenization -from vllm.entrypoints.utils import ( +from vllm.entrypoints.serve.utils.api_utils import ( + validate_json_request, with_cancellation, ) from vllm.logger import init_logger diff --git a/vllm/entrypoints/serve/tokenize/serving.py b/vllm/entrypoints/serve/tokenize/serving.py index 9b573b69eb8..4f461c0194e 100644 --- a/vllm/entrypoints/serve/tokenize/serving.py +++ b/vllm/entrypoints/serve/tokenize/serving.py @@ -7,7 +7,6 @@ from fastapi import Request from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption -from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.engine.protocol import ErrorResponse from vllm.entrypoints.openai.engine.serving import OpenAIServing from vllm.entrypoints.openai.models.serving import OpenAIServingModels @@ -20,6 +19,7 @@ from vllm.entrypoints.serve.tokenize.protocol import ( TokenizeResponse, TokenizerInfoResponse, ) +from vllm.entrypoints.serve.utils.request_logger import RequestLogger from vllm.inputs import TokensPrompt, tokens_input from vllm.logger import init_logger from vllm.tokenizers import TokenizerLike diff --git a/vllm/entrypoints/serve/utils/__init__.py b/vllm/entrypoints/serve/utils/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/serve/utils/api_utils.py similarity index 84% rename from vllm/entrypoints/utils.py rename to vllm/entrypoints/serve/utils/api_utils.py index 8ec41098ad2..15de1b0690d 100644 --- a/vllm/entrypoints/utils.py +++ b/vllm/entrypoints/serve/utils/api_utils.py @@ -6,24 +6,19 @@ import dataclasses import functools import os from argparse import Namespace -from http import HTTPStatus from logging import Logger from string import Template from typing import Any import regex as re from fastapi import Request +from fastapi.exceptions import RequestValidationError from fastapi.responses import JSONResponse, StreamingResponse from starlette.background import BackgroundTask, BackgroundTasks from vllm import envs from vllm.engine.arg_utils import EngineArgs -from vllm.entrypoints.openai.engine.protocol import ( - ErrorInfo, - ErrorResponse, - GenerationError, - StreamOptions, -) +from vllm.entrypoints.openai.engine.protocol import StreamOptions from vllm.entrypoints.openai.models.protocol import LoRAModulePath from vllm.logger import current_formatter_type, init_logger from vllm.platforms import current_platform @@ -279,7 +274,7 @@ def log_non_default_args(args: Namespace | EngineArgs): def should_include_usage( - stream_options: "StreamOptions | None", enable_force_include_usage: bool + stream_options: StreamOptions | None, enable_force_include_usage: bool ) -> tuple[bool, bool]: if enable_force_include_usage: return True, True @@ -344,60 +339,10 @@ def log_version_and_model(lgr: Logger, version: str, model_name: str) -> None: lgr.info(message, version, model_name) -def create_error_response( - message: str | Exception, - err_type: str = "BadRequestError", - status_code: HTTPStatus = HTTPStatus.BAD_REQUEST, - param: str | None = None, -) -> ErrorResponse: - exc: Exception | None = None - - if isinstance(message, Exception): - exc = message - logger.debug( - "create_error_response called with %s: %s", type(exc).__name__, exc +async def validate_json_request(raw_request: Request): + content_type = raw_request.headers.get("content-type", "").lower() + media_type = content_type.split(";", maxsplit=1)[0] + if media_type != "application/json": + raise RequestValidationError( + errors=["Unsupported Media Type: Only 'application/json' is allowed"] ) - - from vllm.exceptions import VLLMNotFoundError, VLLMValidationError - - if isinstance(exc, VLLMValidationError): - err_type = "BadRequestError" - status_code = HTTPStatus.BAD_REQUEST - param = exc.parameter - elif isinstance(exc, VLLMNotFoundError): - err_type = "NotFoundError" - status_code = HTTPStatus.NOT_FOUND - param = None - elif isinstance(exc, (ValueError, TypeError, OverflowError)): - # Common validation errors from user input - err_type = "BadRequestError" - status_code = HTTPStatus.BAD_REQUEST - param = None - elif isinstance(exc, NotImplementedError): - err_type = "NotImplementedError" - status_code = HTTPStatus.NOT_IMPLEMENTED - param = None - elif isinstance(exc, GenerationError): - err_type = "InternalServerError" - status_code = exc.status_code - param = None - elif any(cls.__name__ == "TemplateError" for cls in type(exc).__mro__): - # jinja2.TemplateError and its subclasses (avoid importing jinja2) - err_type = "BadRequestError" - status_code = HTTPStatus.BAD_REQUEST - param = None - else: - err_type = "InternalServerError" - status_code = HTTPStatus.INTERNAL_SERVER_ERROR - param = None - - message = str(exc) - - return ErrorResponse( - error=ErrorInfo( - message=sanitize_message(message), - type=err_type, - code=status_code.value, - param=param, - ) - ) diff --git a/vllm/entrypoints/constants.py b/vllm/entrypoints/serve/utils/constants.py similarity index 100% rename from vllm/entrypoints/constants.py rename to vllm/entrypoints/serve/utils/constants.py diff --git a/vllm/entrypoints/serve/utils/error_response.py b/vllm/entrypoints/serve/utils/error_response.py new file mode 100644 index 00000000000..4dea1513a42 --- /dev/null +++ b/vllm/entrypoints/serve/utils/error_response.py @@ -0,0 +1,72 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from http import HTTPStatus + +from vllm.entrypoints.openai.engine.protocol import ( + ErrorInfo, + ErrorResponse, + GenerationError, +) +from vllm.entrypoints.serve.utils.api_utils import sanitize_message +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +def create_error_response( + message: str | Exception, + err_type: str = "BadRequestError", + status_code: HTTPStatus = HTTPStatus.BAD_REQUEST, + param: str | None = None, +) -> ErrorResponse: + exc: Exception | None = None + + if isinstance(message, Exception): + exc = message + logger.debug( + "create_error_response called with %s: %s", type(exc).__name__, exc + ) + + from vllm.exceptions import VLLMNotFoundError, VLLMValidationError + + if isinstance(exc, VLLMValidationError): + err_type = "BadRequestError" + status_code = HTTPStatus.BAD_REQUEST + param = exc.parameter + elif isinstance(exc, VLLMNotFoundError): + err_type = "NotFoundError" + status_code = HTTPStatus.NOT_FOUND + param = None + elif isinstance(exc, (ValueError, TypeError, OverflowError)): + # Common validation errors from user input + err_type = "BadRequestError" + status_code = HTTPStatus.BAD_REQUEST + param = None + elif isinstance(exc, NotImplementedError): + err_type = "NotImplementedError" + status_code = HTTPStatus.NOT_IMPLEMENTED + param = None + elif isinstance(exc, GenerationError): + err_type = "InternalServerError" + status_code = exc.status_code + param = None + elif any(cls.__name__ == "TemplateError" for cls in type(exc).__mro__): + # jinja2.TemplateError and its subclasses (avoid importing jinja2) + err_type = "BadRequestError" + status_code = HTTPStatus.BAD_REQUEST + param = None + else: + err_type = "InternalServerError" + status_code = HTTPStatus.INTERNAL_SERVER_ERROR + param = None + + message = str(exc) + + return ErrorResponse( + error=ErrorInfo( + message=sanitize_message(message), + type=err_type, + code=status_code.value, + param=param, + ) + ) diff --git a/vllm/entrypoints/openai/fingerprint.py b/vllm/entrypoints/serve/utils/fingerprint.py similarity index 100% rename from vllm/entrypoints/openai/fingerprint.py rename to vllm/entrypoints/serve/utils/fingerprint.py diff --git a/vllm/entrypoints/openai/orca_metrics.py b/vllm/entrypoints/serve/utils/orca_metrics.py similarity index 100% rename from vllm/entrypoints/openai/orca_metrics.py rename to vllm/entrypoints/serve/utils/orca_metrics.py diff --git a/vllm/entrypoints/logger.py b/vllm/entrypoints/serve/utils/request_logger.py similarity index 100% rename from vllm/entrypoints/logger.py rename to vllm/entrypoints/serve/utils/request_logger.py diff --git a/vllm/entrypoints/openai/server_utils.py b/vllm/entrypoints/serve/utils/server_utils.py similarity index 99% rename from vllm/entrypoints/openai/server_utils.py rename to vllm/entrypoints/serve/utils/server_utils.py index 269c33549e8..3b6dfde447e 100644 --- a/vllm/entrypoints/openai/server_utils.py +++ b/vllm/entrypoints/serve/utils/server_utils.py @@ -26,7 +26,10 @@ from vllm.entrypoints.openai.engine.protocol import ( ErrorResponse, GenerationError, ) -from vllm.entrypoints.utils import create_error_response, sanitize_message +from vllm.entrypoints.serve.utils.error_response import ( + create_error_response, + sanitize_message, +) from vllm.exceptions import VLLMValidationError from vllm.logger import init_logger from vllm.utils.gc_utils import freeze_gc_heap diff --git a/vllm/entrypoints/ssl.py b/vllm/entrypoints/serve/utils/ssl.py similarity index 100% rename from vllm/entrypoints/ssl.py rename to vllm/entrypoints/serve/utils/ssl.py diff --git a/vllm/entrypoints/openai/utils.py b/vllm/entrypoints/serve/utils/tool_calls_utils.py similarity index 73% rename from vllm/entrypoints/openai/utils.py rename to vllm/entrypoints/serve/utils/tool_calls_utils.py index 55e59510f54..648698c2a97 100644 --- a/vllm/entrypoints/openai/utils.py +++ b/vllm/entrypoints/serve/utils/tool_calls_utils.py @@ -2,9 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import TypeVar -from fastapi import Request -from fastapi.exceptions import RequestValidationError - from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, ChatCompletionResponseChoice, @@ -38,12 +35,3 @@ def maybe_filter_parallel_tool_calls( ] return choice - - -async def validate_json_request(raw_request: Request): - content_type = raw_request.headers.get("content-type", "").lower() - media_type = content_type.split(";", maxsplit=1)[0] - if media_type != "application/json": - raise RequestValidationError( - errors=["Unsupported Media Type: Only 'application/json' is allowed"] - ) diff --git a/vllm/entrypoints/speech_to_text/base/serving.py b/vllm/entrypoints/speech_to_text/base/serving.py index a0f02a2c783..06d266fd1f1 100644 --- a/vllm/entrypoints/speech_to_text/base/serving.py +++ b/vllm/entrypoints/speech_to_text/base/serving.py @@ -15,7 +15,6 @@ from transformers import PreTrainedTokenizerBase import vllm.envs as envs from vllm.engine.protocol import EngineClient -from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.engine.protocol import ( DeltaMessage, ErrorResponse, @@ -24,7 +23,8 @@ from vllm.entrypoints.openai.engine.protocol import ( ) from vllm.entrypoints.openai.engine.serving import OpenAIServing, SpeechToTextRequest from vllm.entrypoints.openai.models.serving import OpenAIServingModels -from vllm.entrypoints.utils import get_max_tokens +from vllm.entrypoints.serve.utils.api_utils import get_max_tokens +from vllm.entrypoints.serve.utils.request_logger import RequestLogger from vllm.exceptions import VLLMValidationError from vllm.inputs import EncoderDecoderInput, EngineInput from vllm.logger import init_logger diff --git a/vllm/entrypoints/speech_to_text/factories.py b/vllm/entrypoints/speech_to_text/factories.py index 3625f6d2a8d..1971e32b989 100644 --- a/vllm/entrypoints/speech_to_text/factories.py +++ b/vllm/entrypoints/speech_to_text/factories.py @@ -12,7 +12,7 @@ if TYPE_CHECKING: from starlette.datastructures import State from vllm.engine.protocol import EngineClient - from vllm.entrypoints.logger import RequestLogger + from vllm.entrypoints.serve.utils.request_logger import RequestLogger from vllm.tasks import SupportedTask else: RequestLogger = object diff --git a/vllm/entrypoints/speech_to_text/realtime/serving.py b/vllm/entrypoints/speech_to_text/realtime/serving.py index 710d1907a16..e5b5e951279 100644 --- a/vllm/entrypoints/speech_to_text/realtime/serving.py +++ b/vllm/entrypoints/speech_to_text/realtime/serving.py @@ -9,9 +9,9 @@ from typing import Literal, cast import numpy as np from vllm.engine.protocol import EngineClient, StreamingInput -from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.engine.serving import OpenAIServing from vllm.entrypoints.openai.models.serving import OpenAIServingModels +from vllm.entrypoints.serve.utils.request_logger import RequestLogger from vllm.inputs import PromptType from vllm.logger import init_logger from vllm.model_executor.models.interfaces import SupportsRealtime diff --git a/vllm/entrypoints/speech_to_text/transcription/api_router.py b/vllm/entrypoints/speech_to_text/transcription/api_router.py index c4de6810ca6..b676e22b109 100644 --- a/vllm/entrypoints/speech_to_text/transcription/api_router.py +++ b/vllm/entrypoints/speech_to_text/transcription/api_router.py @@ -9,7 +9,7 @@ from fastapi import APIRouter, Form, Request from fastapi.responses import JSONResponse, StreamingResponse from vllm.entrypoints.openai.engine.protocol import ErrorResponse -from vllm.entrypoints.utils import ( +from vllm.entrypoints.serve.utils.api_utils import ( load_aware_call, with_cancellation, ) diff --git a/vllm/entrypoints/speech_to_text/transcription/serving.py b/vllm/entrypoints/speech_to_text/transcription/serving.py index 123c4c234ec..0d5a3c9edbf 100644 --- a/vllm/entrypoints/speech_to_text/transcription/serving.py +++ b/vllm/entrypoints/speech_to_text/transcription/serving.py @@ -5,12 +5,12 @@ from collections.abc import AsyncGenerator from fastapi import Request from vllm.engine.protocol import EngineClient -from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.engine.protocol import ( ErrorResponse, RequestResponseMetadata, ) from vllm.entrypoints.openai.models.serving import OpenAIServingModels +from vllm.entrypoints.serve.utils.request_logger import RequestLogger from vllm.logger import init_logger from vllm.outputs import RequestOutput diff --git a/vllm/entrypoints/speech_to_text/translation/api_router.py b/vllm/entrypoints/speech_to_text/translation/api_router.py index a68b098834b..e846fbc05fb 100644 --- a/vllm/entrypoints/speech_to_text/translation/api_router.py +++ b/vllm/entrypoints/speech_to_text/translation/api_router.py @@ -9,7 +9,7 @@ from fastapi import APIRouter, Form, Request from fastapi.responses import JSONResponse, StreamingResponse from vllm.entrypoints.openai.engine.protocol import ErrorResponse -from vllm.entrypoints.utils import ( +from vllm.entrypoints.serve.utils.api_utils import ( load_aware_call, with_cancellation, ) diff --git a/vllm/entrypoints/speech_to_text/translation/serving.py b/vllm/entrypoints/speech_to_text/translation/serving.py index 257f8f74396..a3951250f12 100644 --- a/vllm/entrypoints/speech_to_text/translation/serving.py +++ b/vllm/entrypoints/speech_to_text/translation/serving.py @@ -5,12 +5,12 @@ from collections.abc import AsyncGenerator from fastapi import Request from vllm.engine.protocol import EngineClient -from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.engine.protocol import ( ErrorResponse, RequestResponseMetadata, ) from vllm.entrypoints.openai.models.serving import OpenAIServingModels +from vllm.entrypoints.serve.utils.request_logger import RequestLogger from vllm.logger import init_logger from vllm.outputs import RequestOutput diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index efbf2daf398..f11c92a805d 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -359,7 +359,7 @@ class RustFrontendProcessManager: ] if stats_update_address is not None: cmd.extend(["--coordinator-address", stats_update_address]) - from vllm.entrypoints.utils import jsonify_non_default_args + from vllm.entrypoints.serve.utils.api_utils import jsonify_non_default_args args_json = json.dumps( jsonify_non_default_args(args, exclude={"api_server_count"}),