[Frontend] Consolidate online serving utils. (#44479)

Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
This commit is contained in:
wang.yuqi
2026-06-04 14:49:31 +08:00
committed by GitHub
parent b4b4aaa70e
commit d01d0b4646
81 changed files with 466 additions and 435 deletions
+4 -5
View File
@@ -1299,12 +1299,11 @@ steps:
source_file_dependencies:
- vllm/
- tests/entrypoints/llm
- tests/entrypoints/offline_mode
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm/test_generate.py
- pytest -v -s entrypoints/offline_mode
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py --ignore=entrypoints/llm/offline_mode
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/llm/offline_mode # Needs to avoid interference with other tests
- label: Entrypoints Integration (Pooling) # TBD
timeout_in_minutes: 180
@@ -1346,7 +1345,7 @@ steps:
- vllm/platforms/rocm.py
commands:
- pytest -v -s entrypoints/openai/tool_parsers
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/offline_mode --ignore=entrypoints/openai --ignore=entrypoints/serve --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling --ignore=entrypoints/speech_to_text --ignore=tests/entrypoints/generate
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/serve --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling --ignore=entrypoints/speech_to_text --ignore=tests/entrypoints/generate
- label: OpenAI API correctness # TBD
timeout_in_minutes: 180
+3 -4
View File
@@ -11,7 +11,7 @@ steps:
- tests/entrypoints/
commands:
- pytest -v -s entrypoints/openai/tool_parsers
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/offline_mode --ignore=entrypoints/openai --ignore=entrypoints/serve --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling --ignore=entrypoints/speech_to_text --ignore=tests/entrypoints/generate
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/serve --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling --ignore=entrypoints/speech_to_text --ignore=tests/entrypoints/generate
- label: Entrypoints Integration (LLM)
key: entrypoints-integration-llm
@@ -20,12 +20,11 @@ steps:
source_file_dependencies:
- vllm/
- tests/entrypoints/llm
- tests/entrypoints/offline_mode
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py --ignore=entrypoints/llm/offline_mode
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- pytest -v -s entrypoints/llm/offline_mode # Needs to avoid interference with other tests
mirror:
amd:
device: mi325_1
+2 -1
View File
@@ -34,10 +34,11 @@
/vllm/entrypoints/speech_to_text/realtime @njhill
/vllm/entrypoints/speech_to_text @NickLucche
/vllm/entrypoints/pooling @noooop
/vllm/entrypoints/sagemaker @DarkLight1337
/vllm/entrypoints/serve/sagemaker @DarkLight1337
/vllm/entrypoints/serve @njhill
/vllm/entrypoints/*.py @njhill
/vllm/entrypoints/chat_utils.py @DarkLight1337
/vllm/entrypoints/offline_utils.py @DarkLight1337
/vllm/entrypoints/llm.py @DarkLight1337
# Rust Frontend
@@ -6,7 +6,7 @@
import pytest
import pytest_asyncio
from ...utils import RemoteOpenAIServer
from tests.utils import RemoteOpenAIServer
# Model name constants used across tests
MODEL_NAME_SMOLLM = "HuggingFaceTB/SmolLM2-135M-Instruct"
@@ -22,7 +22,8 @@ import tempfile
import pytest
import requests
from ...utils import RemoteOpenAIServer
from tests.utils import RemoteOpenAIServer
from .conftest import (
MODEL_NAME_SMOLLM,
)
@@ -4,7 +4,8 @@ import openai # use the official async_client for correctness check
import pytest
import requests
from ...utils import RemoteOpenAIServer
from tests.utils import RemoteOpenAIServer
from .conftest import MODEL_NAME_SMOLLM
@@ -12,7 +12,8 @@ import tempfile
import pytest
import requests
from ...utils import RemoteOpenAIServer
from tests.utils import RemoteOpenAIServer
from .conftest import (
MODEL_NAME_SMOLLM,
)
@@ -6,7 +6,8 @@ import openai # use the official client for correctness check
import pytest
import requests
from ...utils import RemoteOpenAIServer
from tests.utils import RemoteOpenAIServer
from .conftest import (
HEADER_SAGEMAKER_CLOSED_SESSION_ID,
HEADER_SAGEMAKER_NEW_SESSION_ID,
@@ -4,7 +4,7 @@
import pytest
from vllm.entrypoints.openai.engine.protocol import StreamOptions
from vllm.entrypoints.utils import (
from vllm.entrypoints.serve.utils.api_utils import (
get_max_tokens,
sanitize_message,
should_include_usage,
@@ -6,7 +6,7 @@ from types import SimpleNamespace
import pytest
from vllm.entrypoints.openai import fingerprint as fp
from vllm.entrypoints.serve.utils import fingerprint as fp
def _cfg(tp=1, pp=1, dp=1, ep=False, digest="a3b21f94deadbeef"):
@@ -0,0 +1,248 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from unittest.mock import MagicMock, patch
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
def test_request_logger_log_outputs():
    """Check the info record emitted for a plain, non-streaming output."""
    logger_target = "vllm.entrypoints.serve.utils.request_logger.logger"
    # Stand-in for the module logger so the emitted call can be inspected.
    fake_logger = MagicMock()

    with patch(logger_target, new=fake_logger):
        RequestLogger(max_log_len=None).log_outputs(
            request_id="test-123",
            outputs="Hello, world!",
            output_token_ids=[1, 2, 3, 4],
            finish_reason="stop",
            is_streaming=False,
            delta=False,
        )

        # Exactly one info record is expected; inspect its positional args.
        fake_logger.info.assert_called_once()
        logged = fake_logger.info.call_args.args
        assert "Generated response %s%s" in logged[0]
        expected_by_position = {
            1: "test-123",
            3: "Hello, world!",
            4: [1, 2, 3, 4],
            5: "stop",
        }
        for position, expected in expected_by_position.items():
            assert logged[position] == expected
def test_request_logger_log_outputs_streaming_delta():
    """Check the info record emitted for a streaming delta chunk."""
    logger_target = "vllm.entrypoints.serve.utils.request_logger.logger"
    fake_logger = MagicMock()

    # Arguments describing a single streamed delta with no finish reason yet.
    delta_call = dict(
        request_id="test-456",
        outputs="Hello",
        output_token_ids=[1],
        finish_reason=None,
        is_streaming=True,
        delta=True,
    )

    with patch(logger_target, new=fake_logger):
        streaming_logger = RequestLogger(max_log_len=None)
        streaming_logger.log_outputs(**delta_call)

        fake_logger.info.assert_called_once()
        logged = fake_logger.info.call_args.args
        assert "Generated response %s%s" in logged[0]
        assert logged[1] == "test-456"
        assert logged[2] == " (streaming delta)"
        assert logged[3] == "Hello"
        assert logged[4] == [1]
        assert logged[5] is None
def test_request_logger_log_outputs_streaming_complete():
    """Check the info record emitted when a streamed response completes."""
    logger_target = "vllm.entrypoints.serve.utils.request_logger.logger"
    fake_logger = MagicMock()

    with patch(logger_target, new=fake_logger):
        RequestLogger(max_log_len=None).log_outputs(
            request_id="test-789",
            outputs="Complete response",
            output_token_ids=[1, 2, 3],
            finish_reason="length",
            is_streaming=True,
            delta=False,
        )

        fake_logger.info.assert_called_once()
        logged = fake_logger.info.call_args.args
        assert "Generated response %s%s" in logged[0]
        # Positions 1..5 of the record, in order.
        expected_tail = [
            "test-789",
            " (streaming complete)",
            "Complete response",
            [1, 2, 3],
            "length",
        ]
        for position, expected in enumerate(expected_tail, start=1):
            assert logged[position] == expected
def test_request_logger_log_outputs_with_truncation():
    """Check that logged text and token IDs are cut to max_log_len items."""
    logger_target = "vllm.entrypoints.serve.utils.request_logger.logger"
    fake_logger = MagicMock()

    # Inputs deliberately longer than the configured limit of 10.
    oversized_text = "This is a very long output that should be truncated"
    oversized_ids = list(range(20))

    with patch(logger_target, new=fake_logger):
        limited_logger = RequestLogger(max_log_len=10)
        limited_logger.log_outputs(
            request_id="test-truncate",
            outputs=oversized_text,
            output_token_ids=oversized_ids,
            finish_reason="stop",
            is_streaming=False,
            delta=False,
        )

        fake_logger.info.assert_called_once()
        logged = fake_logger.info.call_args.args

        # Text is expected to keep only its first 10 characters.
        logged_text = logged[3]
        assert logged_text == "This is a "
        assert len(logged_text) == 10

        # Token IDs are expected to keep only their first 10 entries.
        logged_ids = logged[4]
        assert logged_ids == list(range(10))
        assert len(logged_ids) == 10
def test_request_logger_log_outputs_none_values():
    """Check that output_token_ids=None is passed through to the record."""
    logger_target = "vllm.entrypoints.serve.utils.request_logger.logger"
    fake_logger = MagicMock()

    call_kwargs = dict(
        request_id="test-none",
        outputs="Test output",
        output_token_ids=None,  # the case under test
        finish_reason="stop",
        is_streaming=False,
        delta=False,
    )

    with patch(logger_target, new=fake_logger):
        RequestLogger(max_log_len=None).log_outputs(**call_kwargs)

        fake_logger.info.assert_called_once()
        logged = fake_logger.info.call_args.args
        assert "Generated response %s%s" in logged[0]
        assert logged[1] == "test-none"
        assert logged[3] == "Test output"
        assert logged[4] is None
        assert logged[5] == "stop"
def test_request_logger_log_outputs_empty_output():
    """Check that empty text and an empty token list are logged unchanged."""
    logger_target = "vllm.entrypoints.serve.utils.request_logger.logger"
    fake_logger = MagicMock()

    with patch(logger_target, new=fake_logger):
        # A length limit is set even though there is nothing to truncate.
        limited_logger = RequestLogger(max_log_len=5)
        limited_logger.log_outputs(
            request_id="test-empty",
            outputs="",
            output_token_ids=[],
            finish_reason="stop",
            is_streaming=False,
            delta=False,
        )

        fake_logger.info.assert_called_once()
        logged = fake_logger.info.call_args.args
        assert "Generated response %s%s" in logged[0]
        for position, expected in ((1, "test-empty"), (3, ""), (4, []), (5, "stop")):
            assert logged[position] == expected
def test_request_logger_log_outputs_integration():
    """Check that log_inputs followed by log_outputs yields two info records."""
    logger_target = "vllm.entrypoints.serve.utils.request_logger.logger"
    fake_logger = MagicMock()
    shared_request_id = "test-integration"

    with patch(logger_target, new=fake_logger):
        both_ways_logger = RequestLogger(max_log_len=None)

        # Log the request side first ...
        both_ways_logger.log_inputs(
            request_id=shared_request_id,
            prompt="Test prompt",
            prompt_token_ids=[1, 2, 3],
            prompt_embeds=None,
            params=None,
            lora_request=None,
        )
        # ... then the response side, using the same request id.
        both_ways_logger.log_outputs(
            request_id=shared_request_id,
            outputs="Test output",
            output_token_ids=[4, 5, 6],
            finish_reason="stop",
            is_streaming=False,
            delta=False,
        )

        # One info record per method call.
        assert fake_logger.info.call_count == 2

        recorded_calls = fake_logger.info.call_args_list
        input_args = recorded_calls[0].args
        output_args = recorded_calls[1].args

        assert "Received request %s" in input_args[0]
        assert input_args[1] == "test-integration"
        assert "Generated response %s%s" in output_args[0]
        assert output_args[1] == "test-integration"
def test_streaming_complete_logs_full_text_content():
    """Check that a completed stream logs the accumulated text itself.

    The logged output must equal the full response string rather than a
    token-count style placeholder.
    """
    logger_target = "vllm.entrypoints.serve.utils.request_logger.logger"
    fake_logger = MagicMock()
    accumulated_text = "This is a complete response from streaming"

    with patch(logger_target, new=fake_logger):
        RequestLogger(max_log_len=None).log_outputs(
            request_id="test-streaming-full-text",
            outputs=accumulated_text,
            output_token_ids=None,
            finish_reason="streaming_complete",
            is_streaming=True,
            delta=False,
        )

        fake_logger.info.assert_called_once()
        logged = fake_logger.info.call_args.args

        # The text slot carries the response verbatim, with no placeholder
        # markers leaking into it.
        logged_text = logged[3]
        assert logged_text == accumulated_text
        for forbidden in ("tokens>", "streaming_complete"):
            assert forbidden not in logged_text

        # Remaining slots of the record.
        assert logged[1] == "test-streaming-full-text"
        assert logged[2] == " (streaming complete)"
        assert logged[5] == "streaming_complete"
@@ -7,7 +7,7 @@ from ssl import SSLContext
import pytest
from vllm.entrypoints.ssl import SSLCertRefresher
from vllm.entrypoints.serve.utils.ssl import SSLCertRefresher
class MockSSLContext(SSLContext):
+1 -244
View File
@@ -10,12 +10,11 @@ from dataclasses import dataclass
from json.decoder import JSONDecodeError
from tempfile import NamedTemporaryFile
from typing import Any
from unittest.mock import MagicMock, patch
from unittest.mock import patch
from uuid import uuid4
import pytest
from vllm.entrypoints.logger import RequestLogger
from vllm.logger import (
_DATE_FORMAT,
_FORMAT,
@@ -269,248 +268,6 @@ def test_prepare_object_to_dump():
assert prepare_object_to_dump(CustomClass(1, "b")) == "CustomClass(a=1, b='b')"
def test_request_logger_log_outputs():
    """Check the info record emitted for a plain, non-streaming output."""
    # Stand-in for the module logger so the emitted call can be inspected.
    fake_logger = MagicMock()
    call_kwargs = dict(
        request_id="test-123",
        outputs="Hello, world!",
        output_token_ids=[1, 2, 3, 4],
        finish_reason="stop",
        is_streaming=False,
        delta=False,
    )

    with patch("vllm.entrypoints.logger.logger", new=fake_logger):
        plain_logger = RequestLogger(max_log_len=None)
        plain_logger.log_outputs(**call_kwargs)

        # Exactly one info record is expected; inspect its positional args.
        fake_logger.info.assert_called_once()
        logged = fake_logger.info.call_args.args
        assert "Generated response %s%s" in logged[0]
        assert logged[1] == "test-123"
        assert logged[3] == "Hello, world!"
        assert logged[4] == [1, 2, 3, 4]
        assert logged[5] == "stop"
def test_request_logger_log_outputs_streaming_delta():
    """Check the info record emitted for a streaming delta chunk."""
    fake_logger = MagicMock()

    with patch("vllm.entrypoints.logger.logger", new=fake_logger):
        RequestLogger(max_log_len=None).log_outputs(
            request_id="test-456",
            outputs="Hello",
            output_token_ids=[1],
            finish_reason=None,
            is_streaming=True,
            delta=True,
        )

        fake_logger.info.assert_called_once()
        logged = fake_logger.info.call_args.args
        assert "Generated response %s%s" in logged[0]
        expected_by_position = {
            1: "test-456",
            2: " (streaming delta)",
            3: "Hello",
            4: [1],
        }
        for position, expected in expected_by_position.items():
            assert logged[position] == expected
        # No finish reason has been produced yet for a delta chunk.
        assert logged[5] is None
def test_request_logger_log_outputs_streaming_complete():
    """Check the info record emitted when a streamed response completes."""
    fake_logger = MagicMock()
    call_kwargs = dict(
        request_id="test-789",
        outputs="Complete response",
        output_token_ids=[1, 2, 3],
        finish_reason="length",
        is_streaming=True,
        delta=False,
    )

    with patch("vllm.entrypoints.logger.logger", new=fake_logger):
        RequestLogger(max_log_len=None).log_outputs(**call_kwargs)

        fake_logger.info.assert_called_once()
        logged = fake_logger.info.call_args.args
        assert "Generated response %s%s" in logged[0]
        assert logged[1] == "test-789"
        assert logged[2] == " (streaming complete)"
        assert logged[3] == "Complete response"
        assert logged[4] == [1, 2, 3]
        assert logged[5] == "length"
def test_request_logger_log_outputs_with_truncation():
    """Check that logged text and token IDs are cut to max_log_len items."""
    fake_logger = MagicMock()

    # Inputs deliberately longer than the configured limit of 10.
    oversized_text = "This is a very long output that should be truncated"
    oversized_ids = list(range(20))

    with patch("vllm.entrypoints.logger.logger", new=fake_logger):
        RequestLogger(max_log_len=10).log_outputs(
            request_id="test-truncate",
            outputs=oversized_text,
            output_token_ids=oversized_ids,
            finish_reason="stop",
            is_streaming=False,
            delta=False,
        )

        fake_logger.info.assert_called_once()
        positional = fake_logger.info.call_args.args
        logged_text, logged_ids = positional[3], positional[4]

        # Text is expected to keep only its first 10 characters.
        assert logged_text == "This is a "
        assert len(logged_text) == 10

        # Token IDs are expected to keep only their first 10 entries.
        assert logged_ids == list(range(10))
        assert len(logged_ids) == 10
def test_request_logger_log_outputs_none_values():
    """Check that output_token_ids=None is passed through to the record."""
    fake_logger = MagicMock()

    with patch("vllm.entrypoints.logger.logger", new=fake_logger):
        none_ids_logger = RequestLogger(max_log_len=None)
        none_ids_logger.log_outputs(
            request_id="test-none",
            outputs="Test output",
            output_token_ids=None,  # the case under test
            finish_reason="stop",
            is_streaming=False,
            delta=False,
        )

        fake_logger.info.assert_called_once()
        logged = fake_logger.info.call_args.args
        assert "Generated response %s%s" in logged[0]
        for position, expected in ((1, "test-none"), (3, "Test output"), (5, "stop")):
            assert logged[position] == expected
        assert logged[4] is None
def test_request_logger_log_outputs_empty_output():
    """Check that empty text and an empty token list are logged unchanged."""
    fake_logger = MagicMock()
    call_kwargs = dict(
        request_id="test-empty",
        outputs="",
        output_token_ids=[],
        finish_reason="stop",
        is_streaming=False,
        delta=False,
    )

    with patch("vllm.entrypoints.logger.logger", new=fake_logger):
        # A length limit is set even though there is nothing to truncate.
        RequestLogger(max_log_len=5).log_outputs(**call_kwargs)

        fake_logger.info.assert_called_once()
        logged = fake_logger.info.call_args.args
        assert "Generated response %s%s" in logged[0]
        assert logged[1] == "test-empty"
        assert logged[3] == ""
        assert logged[4] == []
        assert logged[5] == "stop"
def test_request_logger_log_outputs_integration():
    """Check that log_inputs followed by log_outputs yields two info records."""
    fake_logger = MagicMock()

    input_kwargs = dict(
        request_id="test-integration",
        prompt="Test prompt",
        prompt_token_ids=[1, 2, 3],
        prompt_embeds=None,
        params=None,
        lora_request=None,
    )
    output_kwargs = dict(
        request_id="test-integration",
        outputs="Test output",
        output_token_ids=[4, 5, 6],
        finish_reason="stop",
        is_streaming=False,
        delta=False,
    )

    with patch("vllm.entrypoints.logger.logger", new=fake_logger):
        both_ways_logger = RequestLogger(max_log_len=None)
        # Request side first, then response side, on the same logger.
        both_ways_logger.log_inputs(**input_kwargs)
        both_ways_logger.log_outputs(**output_kwargs)

        # One info record per method call.
        assert fake_logger.info.call_count == 2

        recorded_calls = fake_logger.info.call_args_list
        input_args = recorded_calls[0].args
        output_args = recorded_calls[1].args

        assert "Received request %s" in input_args[0]
        assert input_args[1] == "test-integration"
        assert "Generated response %s%s" in output_args[0]
        assert output_args[1] == "test-integration"
def test_streaming_complete_logs_full_text_content():
    """Check that a completed stream logs the accumulated text itself.

    The logged output must equal the full response string rather than a
    token-count style placeholder.
    """
    fake_logger = MagicMock()
    accumulated_text = "This is a complete response from streaming"

    with patch("vllm.entrypoints.logger.logger", new=fake_logger):
        full_text_logger = RequestLogger(max_log_len=None)
        full_text_logger.log_outputs(
            request_id="test-streaming-full-text",
            outputs=accumulated_text,
            output_token_ids=None,
            finish_reason="streaming_complete",
            is_streaming=True,
            delta=False,
        )

        fake_logger.info.assert_called_once()
        logged = fake_logger.info.call_args.args

        # The text slot carries the response verbatim, with no placeholder
        # markers leaking into it.
        logged_text = logged[3]
        assert logged_text == accumulated_text
        assert "tokens>" not in logged_text
        assert "streaming_complete" not in logged_text

        # Remaining slots of the record.
        remaining = {
            1: "test-streaming-full-text",
            2: " (streaming complete)",
            5: "streaming_complete",
        }
        for position, expected in remaining.items():
            assert logged[position] == expected
# Add vllm prefix to make sure logs go through the vllm logger
test_logger = init_logger("vllm.test_logger")
+1 -1
View File
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG
from vllm.entrypoints.serve.utils.api_utils import VLLM_SUBCMD_PARSER_EPILOG
from .plot import SweepPlotArgs
from .plot import main as plot_main
+2 -2
View File
@@ -17,9 +17,9 @@ from vllm.entrypoints.anthropic.protocol import (
)
from vllm.entrypoints.anthropic.serving import AnthropicServingMessages
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.utils import (
from vllm.entrypoints.serve.utils.api_utils import (
load_aware_call,
validate_json_request,
with_cancellation,
)
from vllm.logger import init_logger
+1 -1
View File
@@ -29,7 +29,6 @@ from vllm.entrypoints.anthropic.protocol import (
AnthropicUsage,
)
from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionNamedToolChoiceParam,
ChatCompletionRequest,
@@ -45,6 +44,7 @@ from vllm.entrypoints.openai.engine.protocol import (
StreamOptions,
)
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
if TYPE_CHECKING:
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
+1 -1
View File
@@ -22,7 +22,7 @@ import vllm.envs as envs
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.entrypoints.launcher import serve_http
from vllm.entrypoints.utils import with_cancellation
from vllm.entrypoints.serve.utils.api_utils import with_cancellation
from vllm.logger import init_logger
from vllm.sampling_params import SamplingParams
from vllm.usage.usage_lib import UsageContext
+1 -1
View File
@@ -7,7 +7,7 @@ import typing
from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
from vllm.entrypoints.cli.types import CLISubcommand
from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG
from vllm.entrypoints.serve.utils.api_utils import VLLM_SUBCMD_PARSER_EPILOG
if typing.TYPE_CHECKING:
from vllm.utils.argparse_utils import FlexibleArgumentParser
+1 -1
View File
@@ -18,7 +18,7 @@ from vllm.entrypoints.openai.cli_args import (
make_arg_parser,
validate_parsed_serve_args,
)
from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG
from vllm.entrypoints.serve.utils.api_utils import VLLM_SUBCMD_PARSER_EPILOG
from vllm.logger import init_logger
from vllm.utils.argparse_utils import FlexibleArgumentParser
+4 -1
View File
@@ -21,7 +21,10 @@ def main():
import vllm.entrypoints.cli.openai
import vllm.entrypoints.cli.run_batch
import vllm.entrypoints.cli.serve
from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG, cli_env_setup
from vllm.entrypoints.serve.utils.api_utils import (
VLLM_SUBCMD_PARSER_EPILOG,
cli_env_setup,
)
from vllm.utils.argparse_utils import FlexibleArgumentParser
CMD_MODULES = [
+1 -1
View File
@@ -7,7 +7,7 @@ import importlib.metadata
import typing
from vllm.entrypoints.cli.types import CLISubcommand
from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG
from vllm.entrypoints.serve.utils.api_utils import VLLM_SUBCMD_PARSER_EPILOG
from vllm.logger import init_logger
if typing.TYPE_CHECKING:
+1 -1
View File
@@ -15,7 +15,7 @@ from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_se
from vllm.entrypoints.openai.dp_supervisor import (
run_dp_supervisor,
)
from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG
from vllm.entrypoints.serve.utils.api_utils import VLLM_SUBCMD_PARSER_EPILOG
from vllm.logger import init_logger
from vllm.usage.usage_lib import UsageContext
from vllm.utils.argparse_utils import FlexibleArgumentParser
+2 -2
View File
@@ -10,7 +10,7 @@ if TYPE_CHECKING:
from starlette.datastructures import State
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
from vllm.tasks import SupportedTask
else:
RequestLogger = object
@@ -65,9 +65,9 @@ async def init_generate_state(
)
from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
from vllm.entrypoints.openai.completion.serving import OpenAIServingCompletion
from vllm.entrypoints.openai.fingerprint import set_default_fingerprint_mode
from vllm.entrypoints.openai.responses.serving import OpenAIServingResponses
from vllm.entrypoints.serve.disagg.serving import ServingTokens
from vllm.entrypoints.serve.utils.fingerprint import set_default_fingerprint_mode
# Applied before any serving class is constructed so that each one picks
# up the chosen mode on its first cache miss.
+1 -1
View File
@@ -6,7 +6,7 @@ from vllm.config import ModelConfig
from vllm.tasks import SupportedTask
if TYPE_CHECKING:
from vllm.entrypoints.sagemaker.api_router import (
from vllm.entrypoints.serve.sagemaker.api_router import (
EndpointFn,
GetHandlerFn,
RequestType,
@@ -10,8 +10,11 @@ from vllm.entrypoints.generate.generative_scoring.serving import (
ServingGenerativeScoring,
)
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.utils import load_aware_call, with_cancellation
from vllm.entrypoints.serve.utils.api_utils import (
load_aware_call,
validate_json_request,
with_cancellation,
)
from vllm.logger import init_logger
router = APIRouter()
@@ -18,7 +18,6 @@ from fastapi import Request
from pydantic import Field
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse,
OpenAIBaseModel,
@@ -26,6 +25,7 @@ from vllm.entrypoints.openai.engine.protocol import (
)
from vllm.entrypoints.openai.engine.serving import OpenAIServing
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
from vllm.inputs import EngineInput, tokens_input
from vllm.logger import init_logger
from vllm.outputs import RequestOutput
+1 -1
View File
@@ -43,7 +43,7 @@ import uvloop
from vllm import envs
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.utils import log_version_and_model
from vllm.entrypoints.serve.utils.api_utils import log_version_and_model
from vllm.logger import init_logger
from vllm.usage.usage_lib import UsageContext
from vllm.utils.argparse_utils import FlexibleArgumentParser
+2 -2
View File
@@ -12,11 +12,11 @@ from fastapi import FastAPI
from vllm import envs
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.constants import (
from vllm.entrypoints.serve.utils.constants import (
H11_MAX_HEADER_COUNT_DEFAULT,
H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT,
)
from vllm.entrypoints.ssl import SSLCertRefresher
from vllm.entrypoints.serve.utils.ssl import SSLCertRefresher
from vllm.logger import init_logger
from vllm.utils.network_utils import find_process_using_port
+1 -1
View File
@@ -40,7 +40,7 @@ from vllm.entrypoints.chat_utils import (
)
from vllm.entrypoints.generate.beam_search.offline import BeamSearchOfflineMixin
from vllm.entrypoints.pooling.offline import PoolingOfflineMixin
from vllm.entrypoints.utils import log_non_default_args
from vllm.entrypoints.serve.utils.api_utils import log_non_default_args
from vllm.inputs import PromptType
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
+15 -15
View File
@@ -27,12 +27,22 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.chat_utils import load_chat_template
from vllm.entrypoints.launcher import serve_http
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args
from vllm.entrypoints.openai.engine.protocol import GenerationError
from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.openai.server_utils import (
from vllm.entrypoints.serve.elastic_ep.middleware import ScalingMiddleware
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
from vllm.entrypoints.serve.sagemaker.api_router import sagemaker_standards_bootstrap
from vllm.entrypoints.serve.tokenize.serving import OpenAIServingTokenization
from vllm.entrypoints.serve.utils.api_utils import (
cli_env_setup,
log_non_default_args,
log_version_and_model,
process_lora_modules,
)
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
from vllm.entrypoints.serve.utils.server_utils import (
engine_error_handler,
exception_handler,
generation_error_handler,
@@ -42,16 +52,6 @@ from vllm.entrypoints.openai.server_utils import (
log_response,
validation_exception_handler,
)
from vllm.entrypoints.sagemaker.api_router import sagemaker_standards_bootstrap
from vllm.entrypoints.serve.elastic_ep.middleware import ScalingMiddleware
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
from vllm.entrypoints.serve.tokenize.serving import OpenAIServingTokenization
from vllm.entrypoints.utils import (
cli_env_setup,
log_non_default_args,
log_version_and_model,
process_lora_modules,
)
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParserManager
from vllm.tasks import POOLING_TASKS, SupportedTask
@@ -187,7 +187,7 @@ def build_app(
register_models_api_router(app)
from vllm.entrypoints.sagemaker.api_router import (
from vllm.entrypoints.serve.sagemaker.api_router import (
attach_router as register_sagemaker_api_router,
)
@@ -254,12 +254,12 @@ def build_app(
# Ensure --api-key option from CLI takes precedence over VLLM_API_KEY
if tokens := [key for key in (args.api_key or [envs.VLLM_API_KEY]) if key]:
from vllm.entrypoints.openai.server_utils import AuthenticationMiddleware
from vllm.entrypoints.serve.utils.server_utils import AuthenticationMiddleware
app.add_middleware(AuthenticationMiddleware, tokens=tokens)
if args.enable_request_id_headers:
from vllm.entrypoints.openai.server_utils import XRequestIdMiddleware
from vllm.entrypoints.serve.utils.server_utils import XRequestIdMiddleware
app.add_middleware(XRequestIdMiddleware)
@@ -15,12 +15,12 @@ from vllm.entrypoints.openai.chat_completion.protocol import (
)
from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.orca_metrics import metrics_header
from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.utils import (
from vllm.entrypoints.serve.utils.api_utils import (
load_aware_call,
validate_json_request,
with_cancellation,
)
from vllm.entrypoints.serve.utils.orca_metrics import metrics_header
from vllm.logger import init_logger
logger = init_logger(__name__)
@@ -21,7 +21,7 @@ from vllm.entrypoints.openai.engine.protocol import (
RequestResponseMetadata,
UsageInfo,
)
from vllm.entrypoints.utils import get_max_tokens
from vllm.entrypoints.serve.utils.api_utils import get_max_tokens
from vllm.inputs import EngineInput
from vllm.logger import init_logger
from vllm.outputs import RequestOutput
@@ -21,7 +21,6 @@ from vllm.entrypoints.chat_utils import (
get_tool_call_id_type,
make_tool_call_id,
)
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionLogProb,
ChatCompletionLogProbs,
@@ -57,8 +56,11 @@ from vllm.entrypoints.openai.parser.harmony_utils import (
get_streamable_parser_for_assistant,
parse_chat_output,
)
from vllm.entrypoints.openai.utils import maybe_filter_parallel_tool_calls
from vllm.entrypoints.utils import get_max_tokens, should_include_usage
from vllm.entrypoints.serve.utils.api_utils import get_max_tokens, should_include_usage
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
from vllm.entrypoints.serve.utils.tool_calls_utils import (
maybe_filter_parallel_tool_calls,
)
from vllm.inputs import EngineInput
from vllm.logger import init_logger
from vllm.logprobs import Logprob
+2 -2
View File
@@ -20,11 +20,11 @@ from vllm.entrypoints.chat_utils import (
ChatTemplateContentFormatOption,
validate_chat_template,
)
from vllm.entrypoints.constants import (
from vllm.entrypoints.openai.models.protocol import LoRAModulePath
from vllm.entrypoints.serve.utils.constants import (
H11_MAX_HEADER_COUNT_DEFAULT,
H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT,
)
from vllm.entrypoints.openai.models.protocol import LoRAModulePath
from vllm.logger import init_logger
from vllm.tool_parsers import ToolParserManager
from vllm.utils.argparse_utils import FlexibleArgumentParser
@@ -13,12 +13,12 @@ from vllm.entrypoints.openai.completion.protocol import (
)
from vllm.entrypoints.openai.completion.serving import OpenAIServingCompletion
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.orca_metrics import metrics_header
from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.utils import (
from vllm.entrypoints.serve.utils.api_utils import (
load_aware_call,
validate_json_request,
with_cancellation,
)
from vllm.entrypoints.serve.utils.orca_metrics import metrics_header
from vllm.logger import init_logger
logger = init_logger(__name__)
@@ -13,7 +13,6 @@ import pybase64 as base64
from fastapi import Request
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.completion.protocol import (
CompletionLogProbs,
CompletionRequest,
@@ -34,7 +33,8 @@ from vllm.entrypoints.openai.engine.serving import (
clamp_prompt_logprobs,
)
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.utils import get_max_tokens, should_include_usage
from vllm.entrypoints.serve.utils.api_utils import get_max_tokens, should_include_usage
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
from vllm.exceptions import VLLMValidationError
from vllm.inputs import EngineInput
from vllm.logger import init_logger
+3 -3
View File
@@ -16,7 +16,6 @@ from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
from vllm.entrypoints.generate.beam_search.online import BeamSearchOnlineMixin
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.chat_completion.protocol import (
BatchChatCompletionRequest,
ChatCompletionRequest,
@@ -39,12 +38,13 @@ from vllm.entrypoints.serve.tokenize.protocol import (
TokenizeCompletionRequest,
TokenizeResponse,
)
from vllm.entrypoints.serve.utils.error_response import create_error_response
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
from vllm.entrypoints.speech_to_text.transcription.protocol import (
TranscriptionRequest,
TranscriptionResponse,
)
from vllm.entrypoints.speech_to_text.translation.protocol import TranslationRequest
from vllm.entrypoints.utils import create_error_response
from vllm.inputs import EngineInput, PromptType
from vllm.logger import init_logger
from vllm.logprobs import Logprob, PromptLogprobs
@@ -153,7 +153,7 @@ class OpenAIServing(BeamSearchOnlineMixin):
# Computed once at startup (cached by ``vllm_config`` identity) and
# stamped on non-streaming responses. Streaming chunks deliberately
# omit it to avoid per-chunk overhead.
from vllm.entrypoints.openai.fingerprint import get_system_fingerprint
from vllm.entrypoints.serve.utils.fingerprint import get_system_fingerprint
try:
self.system_fingerprint: str | None = get_system_fingerprint(
+1 -1
View File
@@ -18,7 +18,7 @@ from vllm.entrypoints.serve.lora.protocol import (
LoadLoRAAdapterRequest,
UnloadLoRAAdapterRequest,
)
from vllm.entrypoints.utils import create_error_response
from vllm.entrypoints.serve.utils.error_response import create_error_response
from vllm.exceptions import LoRAAdapterNotFoundError
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
@@ -12,11 +12,11 @@ from openai.types.responses.response_output_message import ResponseOutputMessage
from openai.types.responses.response_output_text import ResponseOutputText
from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
from vllm.entrypoints.constants import MCP_PREFIX
from vllm.entrypoints.openai.responses.protocol import (
ResponseInputOutputItem,
ResponsesRequest,
)
from vllm.entrypoints.serve.utils.constants import MCP_PREFIX
from vllm.outputs import CompletionOutput
from vllm.parser.abstract_parser import Parser
from vllm.tokenizers import TokenizerLike
@@ -15,9 +15,9 @@ from vllm.entrypoints.openai.responses.protocol import (
StreamingResponsesResponse,
)
from vllm.entrypoints.openai.responses.serving import OpenAIServingResponses
from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.utils import (
from vllm.entrypoints.serve.utils.api_utils import (
load_aware_call,
validate_json_request,
with_cancellation,
)
from vllm.logger import init_logger
+1 -1
View File
@@ -20,7 +20,6 @@ from vllm import envs
from vllm.entrypoints.chat_utils import (
ChatTemplateContentFormatOption,
)
from vllm.entrypoints.constants import MCP_PREFIX
from vllm.entrypoints.mcp.tool import Tool
from vllm.entrypoints.mcp.tool_server import ToolServer
from vllm.entrypoints.openai.engine.protocol import (
@@ -40,6 +39,7 @@ from vllm.entrypoints.openai.responses.protocol import (
ResponsesRequest,
)
from vllm.entrypoints.openai.responses.utils import construct_tool_dicts
from vllm.entrypoints.serve.utils.constants import MCP_PREFIX
from vllm.outputs import RequestOutput
from vllm.parser.abstract_parser import Parser
from vllm.tokenizers import TokenizerLike
+2 -2
View File
@@ -32,7 +32,6 @@ from vllm.entrypoints.chat_utils import (
ChatTemplateContentFormatOption,
get_tool_call_id_type,
)
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.mcp.tool_server import ToolServer
from vllm.entrypoints.openai.engine.protocol import (
DeltaMessage,
@@ -93,7 +92,8 @@ from vllm.entrypoints.openai.responses.utils import (
extract_tool_types,
)
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
from vllm.entrypoints.utils import get_max_tokens
from vllm.entrypoints.serve.utils.api_utils import get_max_tokens
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
from vllm.exceptions import VLLMValidationError
from vllm.inputs import EngineInput, tokens_input
from vllm.logger import init_logger
+1 -1
View File
@@ -51,6 +51,7 @@ from vllm.entrypoints.pooling.scoring.protocol import (
ScoreRequest,
ScoreResponse,
)
from vllm.entrypoints.serve.utils.error_response import create_error_response
from vllm.entrypoints.speech_to_text.transcription.protocol import (
TranscriptionRequest,
TranscriptionResponse,
@@ -61,7 +62,6 @@ from vllm.entrypoints.speech_to_text.translation.protocol import (
TranslationResponse,
TranslationResponseVerbose,
)
from vllm.entrypoints.utils import create_error_response
from vllm.exceptions import VLLMValidationError
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParserManager
+1 -1
View File
@@ -16,9 +16,9 @@ from vllm import PoolingParams, PoolingRequestOutput, envs
from vllm.config import VllmConfig
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.chat_utils import ChatTemplateConfig
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
from vllm.exceptions import VLLMNotFoundError
from vllm.inputs import EngineInput
from vllm.lora.request import LoRARequest
@@ -4,9 +4,9 @@
from fastapi import APIRouter, Depends, Request
from fastapi.responses import Response
from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.utils import (
from vllm.entrypoints.serve.utils.api_utils import (
load_aware_call,
validate_json_request,
with_cancellation,
)
+5 -2
View File
@@ -6,8 +6,11 @@ from http import HTTPStatus
from fastapi import APIRouter, Depends, Request
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.utils import load_aware_call, with_cancellation
from vllm.entrypoints.serve.utils.api_utils import (
load_aware_call,
validate_json_request,
with_cancellation,
)
from .protocol import CohereEmbedRequest, EmbeddingRequest
from .serving import ServingEmbedding
+2 -2
View File
@@ -21,12 +21,12 @@ if TYPE_CHECKING:
from starlette.datastructures import State
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.sagemaker.api_router import (
from vllm.entrypoints.serve.sagemaker.api_router import (
EndpointFn,
GetHandlerFn,
RequestType,
)
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
else:
RequestLogger = object
@@ -5,8 +5,11 @@ from http import HTTPStatus
from fastapi import APIRouter, Depends, Request
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.utils import load_aware_call, with_cancellation
from vllm.entrypoints.serve.utils.api_utils import (
load_aware_call,
validate_json_request,
with_cancellation,
)
from .protocol import PoolingRequest
from .serving import ServingPooling
@@ -5,8 +5,11 @@ from http import HTTPStatus
from fastapi import APIRouter, Depends, Request
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.utils import load_aware_call, with_cancellation
from vllm.entrypoints.serve.utils.api_utils import (
load_aware_call,
validate_json_request,
with_cancellation,
)
from vllm.logger import init_logger
from .protocol import RerankRequest, ScoreRequest
+2 -2
View File
@@ -13,7 +13,6 @@ from vllm.engine.protocol import EngineClient
from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse,
)
from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.serve.disagg.protocol import (
GenerateRequest,
GenerateResponse,
@@ -22,8 +21,9 @@ from vllm.entrypoints.serve.disagg.serving import (
ServingTokens,
)
from vllm.entrypoints.serve.tokenize.serving import OpenAIServingTokenization
from vllm.entrypoints.utils import (
from vllm.entrypoints.serve.utils.api_utils import (
load_aware_call,
validate_json_request,
with_cancellation,
)
from vllm.logger import init_logger
+2 -2
View File
@@ -14,7 +14,6 @@ import pybase64 as base64
from fastapi import Request
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionLogProb,
ChatCompletionLogProbs,
@@ -38,7 +37,8 @@ from vllm.entrypoints.serve.disagg.protocol import (
GenerateStreamResponse,
)
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
from vllm.entrypoints.utils import get_max_tokens, should_include_usage
from vllm.entrypoints.serve.utils.api_utils import get_max_tokens, should_include_usage
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
from vllm.inputs import EngineInput, mm_input
from vllm.logger import init_logger
from vllm.logprobs import Logprob
@@ -12,11 +12,11 @@ from vllm.engine.protocol import EngineClient
from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse,
)
from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.serve.elastic_ep.middleware import (
get_scaling_elastic_ep,
set_scaling_elastic_ep,
)
from vllm.entrypoints.serve.utils.api_utils import validate_json_request
from vllm.logger import init_logger
logger = init_logger(__name__)
+1 -1
View File
@@ -12,11 +12,11 @@ from vllm.entrypoints.openai.engine.protocol import (
)
from vllm.entrypoints.openai.models.api_router import models
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.serve.lora.protocol import (
LoadLoRAAdapterRequest,
UnloadLoRAAdapterRequest,
)
from vllm.entrypoints.serve.utils.api_utils import validate_json_request
from vllm.logger import init_logger
logger = init_logger(__name__)
+1 -1
View File
@@ -8,9 +8,9 @@ from fastapi.responses import JSONResponse
from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
from vllm.entrypoints.openai.completion.protocol import CompletionRequest
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.serve.disagg.protocol import GenerateRequest
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
from vllm.entrypoints.serve.utils.api_utils import validate_json_request
from vllm.logger import init_logger
logger = init_logger(__name__)
+3 -5
View File
@@ -11,7 +11,6 @@ from vllm.entrypoints.chat_utils import (
ChatTemplateContentFormatOption,
ConversationMessage,
)
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
from vllm.entrypoints.openai.completion.protocol import CompletionRequest
from vllm.entrypoints.openai.engine.protocol import (
@@ -31,10 +30,9 @@ from vllm.entrypoints.serve.disagg.protocol import (
MultiModalFeatures,
PlaceholderRangeInfo,
)
from vllm.entrypoints.utils import (
create_error_response,
get_max_tokens,
)
from vllm.entrypoints.serve.utils.api_utils import get_max_tokens
from vllm.entrypoints.serve.utils.error_response import create_error_response
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
from vllm.inputs import (
EngineInput,
MultiModalHashes,
@@ -14,11 +14,11 @@ from vllm.config import ModelConfig
from vllm.entrypoints.generate.factories import get_generate_invocation_types
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.engine.serving import OpenAIServing
from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.pooling.base.serving import PoolingServingBase
from vllm.entrypoints.pooling.factories import get_pooling_invocation_types
from vllm.entrypoints.serve.instrumentator.basic import base
from vllm.entrypoints.serve.instrumentator.health import health
from vllm.entrypoints.serve.utils.api_utils import validate_json_request
from vllm.tasks import SupportedTask
# TODO: RequestType = TypeForm[BaseModel] when recognized by type checkers
@@ -12,7 +12,6 @@ from typing_extensions import assert_never
from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse,
)
from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.serve.tokenize.protocol import (
DetokenizeRequest,
DetokenizeResponse,
@@ -20,7 +19,8 @@ from vllm.entrypoints.serve.tokenize.protocol import (
TokenizeResponse,
)
from vllm.entrypoints.serve.tokenize.serving import OpenAIServingTokenization
from vllm.entrypoints.utils import (
from vllm.entrypoints.serve.utils.api_utils import (
validate_json_request,
with_cancellation,
)
from vllm.logger import init_logger
+1 -1
View File
@@ -7,7 +7,6 @@ from fastapi import Request
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.engine.serving import OpenAIServing
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
@@ -20,6 +19,7 @@ from vllm.entrypoints.serve.tokenize.protocol import (
TokenizeResponse,
TokenizerInfoResponse,
)
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
from vllm.inputs import TokensPrompt, tokens_input
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike
@@ -6,24 +6,19 @@ import dataclasses
import functools
import os
from argparse import Namespace
from http import HTTPStatus
from logging import Logger
from string import Template
from typing import Any
import regex as re
from fastapi import Request
from fastapi.exceptions import RequestValidationError
from fastapi.responses import JSONResponse, StreamingResponse
from starlette.background import BackgroundTask, BackgroundTasks
from vllm import envs
from vllm.engine.arg_utils import EngineArgs
from vllm.entrypoints.openai.engine.protocol import (
ErrorInfo,
ErrorResponse,
GenerationError,
StreamOptions,
)
from vllm.entrypoints.openai.engine.protocol import StreamOptions
from vllm.entrypoints.openai.models.protocol import LoRAModulePath
from vllm.logger import current_formatter_type, init_logger
from vllm.platforms import current_platform
@@ -279,7 +274,7 @@ def log_non_default_args(args: Namespace | EngineArgs):
def should_include_usage(
stream_options: "StreamOptions | None", enable_force_include_usage: bool
stream_options: StreamOptions | None, enable_force_include_usage: bool
) -> tuple[bool, bool]:
if enable_force_include_usage:
return True, True
@@ -344,60 +339,10 @@ def log_version_and_model(lgr: Logger, version: str, model_name: str) -> None:
lgr.info(message, version, model_name)
def create_error_response(
message: str | Exception,
err_type: str = "BadRequestError",
status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
param: str | None = None,
) -> ErrorResponse:
exc: Exception | None = None
if isinstance(message, Exception):
exc = message
logger.debug(
"create_error_response called with %s: %s", type(exc).__name__, exc
async def validate_json_request(raw_request: Request):
    """Reject a request whose ``Content-Type`` is not ``application/json``.

    Only the media-type part of the header is compared: the value is
    lower-cased and anything from the first ``;`` onwards (for example
    ``charset=utf-8``) is ignored. A missing header is treated as an empty
    string and is therefore rejected.

    Args:
        raw_request: Incoming request; only ``raw_request.headers`` is read.

    Raises:
        RequestValidationError: If the media type is anything other than
            ``application/json``.
    """
    content_type = raw_request.headers.get("content-type", "").lower()
    # Optional whitespace is permitted around the ";" that introduces
    # parameters ("application/json ; charset=utf-8"), so strip the media
    # type before comparing instead of rejecting such headers.
    media_type = content_type.split(";", maxsplit=1)[0].strip()
    if media_type != "application/json":
        raise RequestValidationError(
            errors=["Unsupported Media Type: Only 'application/json' is allowed"]
        )
from vllm.exceptions import VLLMNotFoundError, VLLMValidationError
if isinstance(exc, VLLMValidationError):
err_type = "BadRequestError"
status_code = HTTPStatus.BAD_REQUEST
param = exc.parameter
elif isinstance(exc, VLLMNotFoundError):
err_type = "NotFoundError"
status_code = HTTPStatus.NOT_FOUND
param = None
elif isinstance(exc, (ValueError, TypeError, OverflowError)):
# Common validation errors from user input
err_type = "BadRequestError"
status_code = HTTPStatus.BAD_REQUEST
param = None
elif isinstance(exc, NotImplementedError):
err_type = "NotImplementedError"
status_code = HTTPStatus.NOT_IMPLEMENTED
param = None
elif isinstance(exc, GenerationError):
err_type = "InternalServerError"
status_code = exc.status_code
param = None
elif any(cls.__name__ == "TemplateError" for cls in type(exc).__mro__):
# jinja2.TemplateError and its subclasses (avoid importing jinja2)
err_type = "BadRequestError"
status_code = HTTPStatus.BAD_REQUEST
param = None
else:
err_type = "InternalServerError"
status_code = HTTPStatus.INTERNAL_SERVER_ERROR
param = None
message = str(exc)
return ErrorResponse(
error=ErrorInfo(
message=sanitize_message(message),
type=err_type,
code=status_code.value,
param=param,
)
)
@@ -0,0 +1,72 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from http import HTTPStatus
from vllm.entrypoints.openai.engine.protocol import (
ErrorInfo,
ErrorResponse,
GenerationError,
)
from vllm.entrypoints.serve.utils.api_utils import sanitize_message
from vllm.logger import init_logger
logger = init_logger(__name__)
def create_error_response(
    message: str | Exception,
    err_type: str = "BadRequestError",
    status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
    param: str | None = None,
) -> ErrorResponse:
    """Build an ``ErrorResponse`` from a plain message or from an exception.

    When ``message`` is a string, the caller-supplied ``err_type``,
    ``status_code`` and ``param`` are used unchanged. When ``message`` is an
    exception, those three arguments are ignored and derived from the
    exception's type instead, and the response text becomes ``str(message)``.

    Args:
        message: Error text, or the exception to report.
        err_type: Error type name used for string messages.
        status_code: HTTP status used for string messages.
        param: Offending parameter name used for string messages.

    Returns:
        An ``ErrorResponse`` wrapping an ``ErrorInfo`` whose message has been
        passed through ``sanitize_message`` and whose ``code`` is the numeric
        value of the selected status.
    """
    if isinstance(message, Exception):
        err = message
        logger.debug(
            "create_error_response called with %s: %s", type(err).__name__, err
        )

        from vllm.exceptions import VLLMNotFoundError, VLLMValidationError

        # Start from the fallback used for unrecognised exceptions and let
        # the first matching case below override it. The order of the checks
        # matters: earlier, more specific types win.
        err_type = "InternalServerError"
        status_code = HTTPStatus.INTERNAL_SERVER_ERROR
        param = None

        if isinstance(err, VLLMValidationError):
            err_type, status_code = "BadRequestError", HTTPStatus.BAD_REQUEST
            param = err.parameter
        elif isinstance(err, VLLMNotFoundError):
            err_type, status_code = "NotFoundError", HTTPStatus.NOT_FOUND
        elif isinstance(err, (ValueError, TypeError, OverflowError)):
            # Common validation errors from user input
            err_type, status_code = "BadRequestError", HTTPStatus.BAD_REQUEST
        elif isinstance(err, NotImplementedError):
            err_type, status_code = "NotImplementedError", HTTPStatus.NOT_IMPLEMENTED
        elif isinstance(err, GenerationError):
            # Type stays "InternalServerError"; the exception supplies the code.
            status_code = err.status_code
        elif any(klass.__name__ == "TemplateError" for klass in type(err).__mro__):
            # jinja2.TemplateError and its subclasses (avoid importing jinja2)
            err_type, status_code = "BadRequestError", HTTPStatus.BAD_REQUEST

        message = str(err)

    info = ErrorInfo(
        message=sanitize_message(message),
        type=err_type,
        code=status_code.value,
        param=param,
    )
    return ErrorResponse(error=info)
@@ -26,7 +26,10 @@ from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse,
GenerationError,
)
from vllm.entrypoints.utils import create_error_response, sanitize_message
from vllm.entrypoints.serve.utils.error_response import (
create_error_response,
sanitize_message,
)
from vllm.exceptions import VLLMValidationError
from vllm.logger import init_logger
from vllm.utils.gc_utils import freeze_gc_heap
@@ -2,9 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import TypeVar
from fastapi import Request
from fastapi.exceptions import RequestValidationError
from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest,
ChatCompletionResponseChoice,
@@ -38,12 +35,3 @@ def maybe_filter_parallel_tool_calls(
]
return choice
async def validate_json_request(raw_request: Request):
    """Reject a request whose ``Content-Type`` is not ``application/json``.

    Only the media-type part of the header is compared: the value is
    lower-cased and anything from the first ``;`` onwards (for example
    ``charset=utf-8``) is ignored. A missing header is treated as an empty
    string and is therefore rejected.

    Args:
        raw_request: Incoming request; only ``raw_request.headers`` is read.

    Raises:
        RequestValidationError: If the media type is anything other than
            ``application/json``.
    """
    content_type = raw_request.headers.get("content-type", "").lower()
    # Optional whitespace is permitted around the ";" that introduces
    # parameters ("application/json ; charset=utf-8"), so strip the media
    # type before comparing instead of rejecting such headers.
    media_type = content_type.split(";", maxsplit=1)[0].strip()
    if media_type != "application/json":
        raise RequestValidationError(
            errors=["Unsupported Media Type: Only 'application/json' is allowed"]
        )
@@ -15,7 +15,6 @@ from transformers import PreTrainedTokenizerBase
import vllm.envs as envs
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.engine.protocol import (
DeltaMessage,
ErrorResponse,
@@ -24,7 +23,8 @@ from vllm.entrypoints.openai.engine.protocol import (
)
from vllm.entrypoints.openai.engine.serving import OpenAIServing, SpeechToTextRequest
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.utils import get_max_tokens
from vllm.entrypoints.serve.utils.api_utils import get_max_tokens
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
from vllm.exceptions import VLLMValidationError
from vllm.inputs import EncoderDecoderInput, EngineInput
from vllm.logger import init_logger
+1 -1
View File
@@ -12,7 +12,7 @@ if TYPE_CHECKING:
from starlette.datastructures import State
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
from vllm.tasks import SupportedTask
else:
RequestLogger = object
@@ -9,9 +9,9 @@ from typing import Literal, cast
import numpy as np
from vllm.engine.protocol import EngineClient, StreamingInput
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.engine.serving import OpenAIServing
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
from vllm.inputs import PromptType
from vllm.logger import init_logger
from vllm.model_executor.models.interfaces import SupportsRealtime
@@ -9,7 +9,7 @@ from fastapi import APIRouter, Form, Request
from fastapi.responses import JSONResponse, StreamingResponse
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.utils import (
from vllm.entrypoints.serve.utils.api_utils import (
load_aware_call,
with_cancellation,
)
@@ -5,12 +5,12 @@ from collections.abc import AsyncGenerator
from fastapi import Request
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse,
RequestResponseMetadata,
)
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
from vllm.logger import init_logger
from vllm.outputs import RequestOutput
@@ -9,7 +9,7 @@ from fastapi import APIRouter, Form, Request
from fastapi.responses import JSONResponse, StreamingResponse
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.utils import (
from vllm.entrypoints.serve.utils.api_utils import (
load_aware_call,
with_cancellation,
)
@@ -5,12 +5,12 @@ from collections.abc import AsyncGenerator
from fastapi import Request
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse,
RequestResponseMetadata,
)
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
from vllm.logger import init_logger
from vllm.outputs import RequestOutput
+1 -1
View File
@@ -359,7 +359,7 @@ class RustFrontendProcessManager:
]
if stats_update_address is not None:
cmd.extend(["--coordinator-address", stats_update_address])
from vllm.entrypoints.utils import jsonify_non_default_args
from vllm.entrypoints.serve.utils.api_utils import jsonify_non_default_args
args_json = json.dumps(
jsonify_non_default_args(args, exclude={"api_server_count"}),