mirror of
https://github.com/vllm-project/vllm.git
synced 2026-06-06 00:16:14 +00:00
[Frontend] Consolidate online serving utils. (#44479)
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
This commit is contained in:
@@ -1299,12 +1299,11 @@ steps:
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/entrypoints/llm
|
||||
- tests/entrypoints/offline_mode
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s entrypoints/llm/test_generate.py
|
||||
- pytest -v -s entrypoints/offline_mode
|
||||
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py --ignore=entrypoints/llm/offline_mode
|
||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/llm/offline_mode # Needs to avoid interference with other tests
|
||||
|
||||
- label: Entrypoints Integration (Pooling) # TBD
|
||||
timeout_in_minutes: 180
|
||||
@@ -1346,7 +1345,7 @@ steps:
|
||||
- vllm/platforms/rocm.py
|
||||
commands:
|
||||
- pytest -v -s entrypoints/openai/tool_parsers
|
||||
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/offline_mode --ignore=entrypoints/openai --ignore=entrypoints/serve --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling --ignore=entrypoints/speech_to_text --ignore=tests/entrypoints/generate
|
||||
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/serve --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling --ignore=entrypoints/speech_to_text --ignore=tests/entrypoints/generate
|
||||
|
||||
- label: OpenAI API correctness # TBD
|
||||
timeout_in_minutes: 180
|
||||
|
||||
@@ -11,7 +11,7 @@ steps:
|
||||
- tests/entrypoints/
|
||||
commands:
|
||||
- pytest -v -s entrypoints/openai/tool_parsers
|
||||
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/offline_mode --ignore=entrypoints/openai --ignore=entrypoints/serve --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling --ignore=entrypoints/speech_to_text --ignore=tests/entrypoints/generate
|
||||
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/serve --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling --ignore=entrypoints/speech_to_text --ignore=tests/entrypoints/generate
|
||||
|
||||
- label: Entrypoints Integration (LLM)
|
||||
key: entrypoints-integration-llm
|
||||
@@ -20,12 +20,11 @@ steps:
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/entrypoints/llm
|
||||
- tests/entrypoints/offline_mode
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py --ignore=entrypoints/llm/offline_mode
|
||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||
- pytest -v -s entrypoints/llm/offline_mode # Needs to avoid interference with other tests
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_1
|
||||
|
||||
+2
-1
@@ -34,10 +34,11 @@
|
||||
/vllm/entrypoints/speech_to_text/realtime @njhill
|
||||
/vllm/entrypoints/speech_to_text @NickLucche
|
||||
/vllm/entrypoints/pooling @noooop
|
||||
/vllm/entrypoints/sagemaker @DarkLight1337
|
||||
/vllm/entrypoints/serve/sagemaker @DarkLight1337
|
||||
/vllm/entrypoints/serve @njhill
|
||||
/vllm/entrypoints/*.py @njhill
|
||||
/vllm/entrypoints/chat_utils.py @DarkLight1337
|
||||
/vllm/entrypoints/offline_utils.py @DarkLight1337
|
||||
/vllm/entrypoints/llm.py @DarkLight1337
|
||||
|
||||
# Rust Frontend
|
||||
|
||||
+1
-1
@@ -6,7 +6,7 @@
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
|
||||
# Model name constants used across tests
|
||||
MODEL_NAME_SMOLLM = "HuggingFaceTB/SmolLM2-135M-Instruct"
|
||||
+2
-1
@@ -22,7 +22,8 @@ import tempfile
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
|
||||
from .conftest import (
|
||||
MODEL_NAME_SMOLLM,
|
||||
)
|
||||
+2
-1
@@ -4,7 +4,8 @@ import openai # use the official async_client for correctness check
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
|
||||
from .conftest import MODEL_NAME_SMOLLM
|
||||
|
||||
|
||||
+2
-1
@@ -12,7 +12,8 @@ import tempfile
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
|
||||
from .conftest import (
|
||||
MODEL_NAME_SMOLLM,
|
||||
)
|
||||
+2
-1
@@ -6,7 +6,8 @@ import openai # use the official client for correctness check
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
|
||||
from .conftest import (
|
||||
HEADER_SAGEMAKER_CLOSED_SESSION_ID,
|
||||
HEADER_SAGEMAKER_NEW_SESSION_ID,
|
||||
@@ -4,7 +4,7 @@
|
||||
import pytest
|
||||
|
||||
from vllm.entrypoints.openai.engine.protocol import StreamOptions
|
||||
from vllm.entrypoints.utils import (
|
||||
from vllm.entrypoints.serve.utils.api_utils import (
|
||||
get_max_tokens,
|
||||
sanitize_message,
|
||||
should_include_usage,
|
||||
+1
-1
@@ -6,7 +6,7 @@ from types import SimpleNamespace
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.entrypoints.openai import fingerprint as fp
|
||||
from vllm.entrypoints.serve.utils import fingerprint as fp
|
||||
|
||||
|
||||
def _cfg(tp=1, pp=1, dp=1, ep=False, digest="a3b21f94deadbeef"):
|
||||
@@ -0,0 +1,248 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
|
||||
|
||||
|
||||
def test_request_logger_log_outputs():
|
||||
"""Test the new log_outputs functionality."""
|
||||
# Create a mock logger to capture log calls
|
||||
mock_logger = MagicMock()
|
||||
|
||||
with patch("vllm.entrypoints.serve.utils.request_logger.logger", mock_logger):
|
||||
request_logger = RequestLogger(max_log_len=None)
|
||||
|
||||
# Test basic output logging
|
||||
request_logger.log_outputs(
|
||||
request_id="test-123",
|
||||
outputs="Hello, world!",
|
||||
output_token_ids=[1, 2, 3, 4],
|
||||
finish_reason="stop",
|
||||
is_streaming=False,
|
||||
delta=False,
|
||||
)
|
||||
|
||||
mock_logger.info.assert_called_once()
|
||||
call_args = mock_logger.info.call_args.args
|
||||
assert "Generated response %s%s" in call_args[0]
|
||||
assert call_args[1] == "test-123"
|
||||
assert call_args[3] == "Hello, world!"
|
||||
assert call_args[4] == [1, 2, 3, 4]
|
||||
assert call_args[5] == "stop"
|
||||
|
||||
|
||||
def test_request_logger_log_outputs_streaming_delta():
|
||||
"""Test log_outputs with streaming delta mode."""
|
||||
mock_logger = MagicMock()
|
||||
|
||||
with patch("vllm.entrypoints.serve.utils.request_logger.logger", mock_logger):
|
||||
request_logger = RequestLogger(max_log_len=None)
|
||||
|
||||
# Test streaming delta logging
|
||||
request_logger.log_outputs(
|
||||
request_id="test-456",
|
||||
outputs="Hello",
|
||||
output_token_ids=[1],
|
||||
finish_reason=None,
|
||||
is_streaming=True,
|
||||
delta=True,
|
||||
)
|
||||
|
||||
mock_logger.info.assert_called_once()
|
||||
call_args = mock_logger.info.call_args.args
|
||||
assert "Generated response %s%s" in call_args[0]
|
||||
assert call_args[1] == "test-456"
|
||||
assert call_args[2] == " (streaming delta)"
|
||||
assert call_args[3] == "Hello"
|
||||
assert call_args[4] == [1]
|
||||
assert call_args[5] is None
|
||||
|
||||
|
||||
def test_request_logger_log_outputs_streaming_complete():
|
||||
"""Test log_outputs with streaming complete mode."""
|
||||
mock_logger = MagicMock()
|
||||
|
||||
with patch("vllm.entrypoints.serve.utils.request_logger.logger", mock_logger):
|
||||
request_logger = RequestLogger(max_log_len=None)
|
||||
|
||||
# Test streaming complete logging
|
||||
request_logger.log_outputs(
|
||||
request_id="test-789",
|
||||
outputs="Complete response",
|
||||
output_token_ids=[1, 2, 3],
|
||||
finish_reason="length",
|
||||
is_streaming=True,
|
||||
delta=False,
|
||||
)
|
||||
|
||||
mock_logger.info.assert_called_once()
|
||||
call_args = mock_logger.info.call_args.args
|
||||
assert "Generated response %s%s" in call_args[0]
|
||||
assert call_args[1] == "test-789"
|
||||
assert call_args[2] == " (streaming complete)"
|
||||
assert call_args[3] == "Complete response"
|
||||
assert call_args[4] == [1, 2, 3]
|
||||
assert call_args[5] == "length"
|
||||
|
||||
|
||||
def test_request_logger_log_outputs_with_truncation():
|
||||
"""Test log_outputs respects max_log_len setting."""
|
||||
mock_logger = MagicMock()
|
||||
|
||||
with patch("vllm.entrypoints.serve.utils.request_logger.logger", mock_logger):
|
||||
# Set max_log_len to 10
|
||||
request_logger = RequestLogger(max_log_len=10)
|
||||
|
||||
# Test output truncation
|
||||
long_output = "This is a very long output that should be truncated"
|
||||
long_token_ids = list(range(20)) # 20 tokens
|
||||
|
||||
request_logger.log_outputs(
|
||||
request_id="test-truncate",
|
||||
outputs=long_output,
|
||||
output_token_ids=long_token_ids,
|
||||
finish_reason="stop",
|
||||
is_streaming=False,
|
||||
delta=False,
|
||||
)
|
||||
|
||||
mock_logger.info.assert_called_once()
|
||||
call_args = mock_logger.info.call_args
|
||||
|
||||
# Check that output was truncated to first 10 characters
|
||||
logged_output = call_args[0][3]
|
||||
assert logged_output == "This is a "
|
||||
assert len(logged_output) == 10
|
||||
|
||||
# Check that token IDs were truncated to first 10 tokens
|
||||
logged_token_ids = call_args[0][4]
|
||||
assert logged_token_ids == list(range(10))
|
||||
assert len(logged_token_ids) == 10
|
||||
|
||||
|
||||
def test_request_logger_log_outputs_none_values():
|
||||
"""Test log_outputs handles None values correctly."""
|
||||
mock_logger = MagicMock()
|
||||
|
||||
with patch("vllm.entrypoints.serve.utils.request_logger.logger", mock_logger):
|
||||
request_logger = RequestLogger(max_log_len=None)
|
||||
|
||||
# Test with None output_token_ids
|
||||
request_logger.log_outputs(
|
||||
request_id="test-none",
|
||||
outputs="Test output",
|
||||
output_token_ids=None,
|
||||
finish_reason="stop",
|
||||
is_streaming=False,
|
||||
delta=False,
|
||||
)
|
||||
|
||||
mock_logger.info.assert_called_once()
|
||||
call_args = mock_logger.info.call_args.args
|
||||
assert "Generated response %s%s" in call_args[0]
|
||||
assert call_args[1] == "test-none"
|
||||
assert call_args[3] == "Test output"
|
||||
assert call_args[4] is None
|
||||
assert call_args[5] == "stop"
|
||||
|
||||
|
||||
def test_request_logger_log_outputs_empty_output():
|
||||
"""Test log_outputs handles empty output correctly."""
|
||||
mock_logger = MagicMock()
|
||||
|
||||
with patch("vllm.entrypoints.serve.utils.request_logger.logger", mock_logger):
|
||||
request_logger = RequestLogger(max_log_len=5)
|
||||
|
||||
# Test with empty output
|
||||
request_logger.log_outputs(
|
||||
request_id="test-empty",
|
||||
outputs="",
|
||||
output_token_ids=[],
|
||||
finish_reason="stop",
|
||||
is_streaming=False,
|
||||
delta=False,
|
||||
)
|
||||
|
||||
mock_logger.info.assert_called_once()
|
||||
call_args = mock_logger.info.call_args.args
|
||||
assert "Generated response %s%s" in call_args[0]
|
||||
assert call_args[1] == "test-empty"
|
||||
assert call_args[3] == ""
|
||||
assert call_args[4] == []
|
||||
assert call_args[5] == "stop"
|
||||
|
||||
|
||||
def test_request_logger_log_outputs_integration():
|
||||
"""Test that log_outputs can be called alongside log_inputs."""
|
||||
mock_logger = MagicMock()
|
||||
|
||||
with patch("vllm.entrypoints.serve.utils.request_logger.logger", mock_logger):
|
||||
request_logger = RequestLogger(max_log_len=None)
|
||||
|
||||
# Test that both methods can be called without interference
|
||||
request_logger.log_inputs(
|
||||
request_id="test-integration",
|
||||
prompt="Test prompt",
|
||||
prompt_token_ids=[1, 2, 3],
|
||||
prompt_embeds=None,
|
||||
params=None,
|
||||
lora_request=None,
|
||||
)
|
||||
|
||||
request_logger.log_outputs(
|
||||
request_id="test-integration",
|
||||
outputs="Test output",
|
||||
output_token_ids=[4, 5, 6],
|
||||
finish_reason="stop",
|
||||
is_streaming=False,
|
||||
delta=False,
|
||||
)
|
||||
|
||||
# Should have been called twice - once for inputs, once for outputs
|
||||
assert mock_logger.info.call_count == 2
|
||||
|
||||
# Check that the calls were made with correct patterns
|
||||
input_call = mock_logger.info.call_args_list[0][0]
|
||||
output_call = mock_logger.info.call_args_list[1][0]
|
||||
|
||||
assert "Received request %s" in input_call[0]
|
||||
assert input_call[1] == "test-integration"
|
||||
|
||||
assert "Generated response %s%s" in output_call[0]
|
||||
assert output_call[1] == "test-integration"
|
||||
|
||||
|
||||
def test_streaming_complete_logs_full_text_content():
|
||||
"""Test that streaming complete logging includes
|
||||
full accumulated text, not just token count."""
|
||||
mock_logger = MagicMock()
|
||||
|
||||
with patch("vllm.entrypoints.serve.utils.request_logger.logger", mock_logger):
|
||||
request_logger = RequestLogger(max_log_len=None)
|
||||
|
||||
# Test with actual content instead of token count format
|
||||
full_response = "This is a complete response from streaming"
|
||||
request_logger.log_outputs(
|
||||
request_id="test-streaming-full-text",
|
||||
outputs=full_response,
|
||||
output_token_ids=None,
|
||||
finish_reason="streaming_complete",
|
||||
is_streaming=True,
|
||||
delta=False,
|
||||
)
|
||||
|
||||
mock_logger.info.assert_called_once()
|
||||
call_args = mock_logger.info.call_args.args
|
||||
|
||||
# Verify the logged output is the full text, not a token count format
|
||||
logged_output = call_args[3]
|
||||
assert logged_output == full_response
|
||||
assert "tokens>" not in logged_output
|
||||
assert "streaming_complete" not in logged_output
|
||||
|
||||
# Verify other parameters
|
||||
assert call_args[1] == "test-streaming-full-text"
|
||||
assert call_args[2] == " (streaming complete)"
|
||||
assert call_args[5] == "streaming_complete"
|
||||
+1
-1
@@ -7,7 +7,7 @@ from ssl import SSLContext
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.entrypoints.ssl import SSLCertRefresher
|
||||
from vllm.entrypoints.serve.utils.ssl import SSLCertRefresher
|
||||
|
||||
|
||||
class MockSSLContext(SSLContext):
|
||||
+1
-244
@@ -10,12 +10,11 @@ from dataclasses import dataclass
|
||||
from json.decoder import JSONDecodeError
|
||||
from tempfile import NamedTemporaryFile
|
||||
from typing import Any
|
||||
from unittest.mock import MagicMock, patch
|
||||
from unittest.mock import patch
|
||||
from uuid import uuid4
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
from vllm.logger import (
|
||||
_DATE_FORMAT,
|
||||
_FORMAT,
|
||||
@@ -269,248 +268,6 @@ def test_prepare_object_to_dump():
|
||||
assert prepare_object_to_dump(CustomClass(1, "b")) == "CustomClass(a=1, b='b')"
|
||||
|
||||
|
||||
def test_request_logger_log_outputs():
|
||||
"""Test the new log_outputs functionality."""
|
||||
# Create a mock logger to capture log calls
|
||||
mock_logger = MagicMock()
|
||||
|
||||
with patch("vllm.entrypoints.logger.logger", mock_logger):
|
||||
request_logger = RequestLogger(max_log_len=None)
|
||||
|
||||
# Test basic output logging
|
||||
request_logger.log_outputs(
|
||||
request_id="test-123",
|
||||
outputs="Hello, world!",
|
||||
output_token_ids=[1, 2, 3, 4],
|
||||
finish_reason="stop",
|
||||
is_streaming=False,
|
||||
delta=False,
|
||||
)
|
||||
|
||||
mock_logger.info.assert_called_once()
|
||||
call_args = mock_logger.info.call_args.args
|
||||
assert "Generated response %s%s" in call_args[0]
|
||||
assert call_args[1] == "test-123"
|
||||
assert call_args[3] == "Hello, world!"
|
||||
assert call_args[4] == [1, 2, 3, 4]
|
||||
assert call_args[5] == "stop"
|
||||
|
||||
|
||||
def test_request_logger_log_outputs_streaming_delta():
|
||||
"""Test log_outputs with streaming delta mode."""
|
||||
mock_logger = MagicMock()
|
||||
|
||||
with patch("vllm.entrypoints.logger.logger", mock_logger):
|
||||
request_logger = RequestLogger(max_log_len=None)
|
||||
|
||||
# Test streaming delta logging
|
||||
request_logger.log_outputs(
|
||||
request_id="test-456",
|
||||
outputs="Hello",
|
||||
output_token_ids=[1],
|
||||
finish_reason=None,
|
||||
is_streaming=True,
|
||||
delta=True,
|
||||
)
|
||||
|
||||
mock_logger.info.assert_called_once()
|
||||
call_args = mock_logger.info.call_args.args
|
||||
assert "Generated response %s%s" in call_args[0]
|
||||
assert call_args[1] == "test-456"
|
||||
assert call_args[2] == " (streaming delta)"
|
||||
assert call_args[3] == "Hello"
|
||||
assert call_args[4] == [1]
|
||||
assert call_args[5] is None
|
||||
|
||||
|
||||
def test_request_logger_log_outputs_streaming_complete():
|
||||
"""Test log_outputs with streaming complete mode."""
|
||||
mock_logger = MagicMock()
|
||||
|
||||
with patch("vllm.entrypoints.logger.logger", mock_logger):
|
||||
request_logger = RequestLogger(max_log_len=None)
|
||||
|
||||
# Test streaming complete logging
|
||||
request_logger.log_outputs(
|
||||
request_id="test-789",
|
||||
outputs="Complete response",
|
||||
output_token_ids=[1, 2, 3],
|
||||
finish_reason="length",
|
||||
is_streaming=True,
|
||||
delta=False,
|
||||
)
|
||||
|
||||
mock_logger.info.assert_called_once()
|
||||
call_args = mock_logger.info.call_args.args
|
||||
assert "Generated response %s%s" in call_args[0]
|
||||
assert call_args[1] == "test-789"
|
||||
assert call_args[2] == " (streaming complete)"
|
||||
assert call_args[3] == "Complete response"
|
||||
assert call_args[4] == [1, 2, 3]
|
||||
assert call_args[5] == "length"
|
||||
|
||||
|
||||
def test_request_logger_log_outputs_with_truncation():
|
||||
"""Test log_outputs respects max_log_len setting."""
|
||||
mock_logger = MagicMock()
|
||||
|
||||
with patch("vllm.entrypoints.logger.logger", mock_logger):
|
||||
# Set max_log_len to 10
|
||||
request_logger = RequestLogger(max_log_len=10)
|
||||
|
||||
# Test output truncation
|
||||
long_output = "This is a very long output that should be truncated"
|
||||
long_token_ids = list(range(20)) # 20 tokens
|
||||
|
||||
request_logger.log_outputs(
|
||||
request_id="test-truncate",
|
||||
outputs=long_output,
|
||||
output_token_ids=long_token_ids,
|
||||
finish_reason="stop",
|
||||
is_streaming=False,
|
||||
delta=False,
|
||||
)
|
||||
|
||||
mock_logger.info.assert_called_once()
|
||||
call_args = mock_logger.info.call_args
|
||||
|
||||
# Check that output was truncated to first 10 characters
|
||||
logged_output = call_args[0][3]
|
||||
assert logged_output == "This is a "
|
||||
assert len(logged_output) == 10
|
||||
|
||||
# Check that token IDs were truncated to first 10 tokens
|
||||
logged_token_ids = call_args[0][4]
|
||||
assert logged_token_ids == list(range(10))
|
||||
assert len(logged_token_ids) == 10
|
||||
|
||||
|
||||
def test_request_logger_log_outputs_none_values():
|
||||
"""Test log_outputs handles None values correctly."""
|
||||
mock_logger = MagicMock()
|
||||
|
||||
with patch("vllm.entrypoints.logger.logger", mock_logger):
|
||||
request_logger = RequestLogger(max_log_len=None)
|
||||
|
||||
# Test with None output_token_ids
|
||||
request_logger.log_outputs(
|
||||
request_id="test-none",
|
||||
outputs="Test output",
|
||||
output_token_ids=None,
|
||||
finish_reason="stop",
|
||||
is_streaming=False,
|
||||
delta=False,
|
||||
)
|
||||
|
||||
mock_logger.info.assert_called_once()
|
||||
call_args = mock_logger.info.call_args.args
|
||||
assert "Generated response %s%s" in call_args[0]
|
||||
assert call_args[1] == "test-none"
|
||||
assert call_args[3] == "Test output"
|
||||
assert call_args[4] is None
|
||||
assert call_args[5] == "stop"
|
||||
|
||||
|
||||
def test_request_logger_log_outputs_empty_output():
|
||||
"""Test log_outputs handles empty output correctly."""
|
||||
mock_logger = MagicMock()
|
||||
|
||||
with patch("vllm.entrypoints.logger.logger", mock_logger):
|
||||
request_logger = RequestLogger(max_log_len=5)
|
||||
|
||||
# Test with empty output
|
||||
request_logger.log_outputs(
|
||||
request_id="test-empty",
|
||||
outputs="",
|
||||
output_token_ids=[],
|
||||
finish_reason="stop",
|
||||
is_streaming=False,
|
||||
delta=False,
|
||||
)
|
||||
|
||||
mock_logger.info.assert_called_once()
|
||||
call_args = mock_logger.info.call_args.args
|
||||
assert "Generated response %s%s" in call_args[0]
|
||||
assert call_args[1] == "test-empty"
|
||||
assert call_args[3] == ""
|
||||
assert call_args[4] == []
|
||||
assert call_args[5] == "stop"
|
||||
|
||||
|
||||
def test_request_logger_log_outputs_integration():
|
||||
"""Test that log_outputs can be called alongside log_inputs."""
|
||||
mock_logger = MagicMock()
|
||||
|
||||
with patch("vllm.entrypoints.logger.logger", mock_logger):
|
||||
request_logger = RequestLogger(max_log_len=None)
|
||||
|
||||
# Test that both methods can be called without interference
|
||||
request_logger.log_inputs(
|
||||
request_id="test-integration",
|
||||
prompt="Test prompt",
|
||||
prompt_token_ids=[1, 2, 3],
|
||||
prompt_embeds=None,
|
||||
params=None,
|
||||
lora_request=None,
|
||||
)
|
||||
|
||||
request_logger.log_outputs(
|
||||
request_id="test-integration",
|
||||
outputs="Test output",
|
||||
output_token_ids=[4, 5, 6],
|
||||
finish_reason="stop",
|
||||
is_streaming=False,
|
||||
delta=False,
|
||||
)
|
||||
|
||||
# Should have been called twice - once for inputs, once for outputs
|
||||
assert mock_logger.info.call_count == 2
|
||||
|
||||
# Check that the calls were made with correct patterns
|
||||
input_call = mock_logger.info.call_args_list[0][0]
|
||||
output_call = mock_logger.info.call_args_list[1][0]
|
||||
|
||||
assert "Received request %s" in input_call[0]
|
||||
assert input_call[1] == "test-integration"
|
||||
|
||||
assert "Generated response %s%s" in output_call[0]
|
||||
assert output_call[1] == "test-integration"
|
||||
|
||||
|
||||
def test_streaming_complete_logs_full_text_content():
|
||||
"""Test that streaming complete logging includes
|
||||
full accumulated text, not just token count."""
|
||||
mock_logger = MagicMock()
|
||||
|
||||
with patch("vllm.entrypoints.logger.logger", mock_logger):
|
||||
request_logger = RequestLogger(max_log_len=None)
|
||||
|
||||
# Test with actual content instead of token count format
|
||||
full_response = "This is a complete response from streaming"
|
||||
request_logger.log_outputs(
|
||||
request_id="test-streaming-full-text",
|
||||
outputs=full_response,
|
||||
output_token_ids=None,
|
||||
finish_reason="streaming_complete",
|
||||
is_streaming=True,
|
||||
delta=False,
|
||||
)
|
||||
|
||||
mock_logger.info.assert_called_once()
|
||||
call_args = mock_logger.info.call_args.args
|
||||
|
||||
# Verify the logged output is the full text, not a token count format
|
||||
logged_output = call_args[3]
|
||||
assert logged_output == full_response
|
||||
assert "tokens>" not in logged_output
|
||||
assert "streaming_complete" not in logged_output
|
||||
|
||||
# Verify other parameters
|
||||
assert call_args[1] == "test-streaming-full-text"
|
||||
assert call_args[2] == " (streaming complete)"
|
||||
assert call_args[5] == "streaming_complete"
|
||||
|
||||
|
||||
# Add vllm prefix to make sure logs go through the vllm logger
|
||||
test_logger = init_logger("vllm.test_logger")
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import argparse
|
||||
|
||||
from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG
|
||||
from vllm.entrypoints.serve.utils.api_utils import VLLM_SUBCMD_PARSER_EPILOG
|
||||
|
||||
from .plot import SweepPlotArgs
|
||||
from .plot import main as plot_main
|
||||
|
||||
@@ -17,9 +17,9 @@ from vllm.entrypoints.anthropic.protocol import (
|
||||
)
|
||||
from vllm.entrypoints.anthropic.serving import AnthropicServingMessages
|
||||
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
|
||||
from vllm.entrypoints.openai.utils import validate_json_request
|
||||
from vllm.entrypoints.utils import (
|
||||
from vllm.entrypoints.serve.utils.api_utils import (
|
||||
load_aware_call,
|
||||
validate_json_request,
|
||||
with_cancellation,
|
||||
)
|
||||
from vllm.logger import init_logger
|
||||
|
||||
@@ -29,7 +29,6 @@ from vllm.entrypoints.anthropic.protocol import (
|
||||
AnthropicUsage,
|
||||
)
|
||||
from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
from vllm.entrypoints.openai.chat_completion.protocol import (
|
||||
ChatCompletionNamedToolChoiceParam,
|
||||
ChatCompletionRequest,
|
||||
@@ -45,6 +44,7 @@ from vllm.entrypoints.openai.engine.protocol import (
|
||||
StreamOptions,
|
||||
)
|
||||
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
|
||||
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
|
||||
|
||||
@@ -22,7 +22,7 @@ import vllm.envs as envs
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.engine.async_llm_engine import AsyncLLMEngine
|
||||
from vllm.entrypoints.launcher import serve_http
|
||||
from vllm.entrypoints.utils import with_cancellation
|
||||
from vllm.entrypoints.serve.utils.api_utils import with_cancellation
|
||||
from vllm.logger import init_logger
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.usage.usage_lib import UsageContext
|
||||
|
||||
@@ -7,7 +7,7 @@ import typing
|
||||
|
||||
from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
|
||||
from vllm.entrypoints.cli.types import CLISubcommand
|
||||
from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG
|
||||
from vllm.entrypoints.serve.utils.api_utils import VLLM_SUBCMD_PARSER_EPILOG
|
||||
|
||||
if typing.TYPE_CHECKING:
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
@@ -18,7 +18,7 @@ from vllm.entrypoints.openai.cli_args import (
|
||||
make_arg_parser,
|
||||
validate_parsed_serve_args,
|
||||
)
|
||||
from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG
|
||||
from vllm.entrypoints.serve.utils.api_utils import VLLM_SUBCMD_PARSER_EPILOG
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
@@ -21,7 +21,10 @@ def main():
|
||||
import vllm.entrypoints.cli.openai
|
||||
import vllm.entrypoints.cli.run_batch
|
||||
import vllm.entrypoints.cli.serve
|
||||
from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG, cli_env_setup
|
||||
from vllm.entrypoints.serve.utils.api_utils import (
|
||||
VLLM_SUBCMD_PARSER_EPILOG,
|
||||
cli_env_setup,
|
||||
)
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
CMD_MODULES = [
|
||||
|
||||
@@ -7,7 +7,7 @@ import importlib.metadata
|
||||
import typing
|
||||
|
||||
from vllm.entrypoints.cli.types import CLISubcommand
|
||||
from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG
|
||||
from vllm.entrypoints.serve.utils.api_utils import VLLM_SUBCMD_PARSER_EPILOG
|
||||
from vllm.logger import init_logger
|
||||
|
||||
if typing.TYPE_CHECKING:
|
||||
|
||||
@@ -15,7 +15,7 @@ from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_se
|
||||
from vllm.entrypoints.openai.dp_supervisor import (
|
||||
run_dp_supervisor,
|
||||
)
|
||||
from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG
|
||||
from vllm.entrypoints.serve.utils.api_utils import VLLM_SUBCMD_PARSER_EPILOG
|
||||
from vllm.logger import init_logger
|
||||
from vllm.usage.usage_lib import UsageContext
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
@@ -10,7 +10,7 @@ if TYPE_CHECKING:
|
||||
from starlette.datastructures import State
|
||||
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
|
||||
from vllm.tasks import SupportedTask
|
||||
else:
|
||||
RequestLogger = object
|
||||
@@ -65,9 +65,9 @@ async def init_generate_state(
|
||||
)
|
||||
from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
|
||||
from vllm.entrypoints.openai.completion.serving import OpenAIServingCompletion
|
||||
from vllm.entrypoints.openai.fingerprint import set_default_fingerprint_mode
|
||||
from vllm.entrypoints.openai.responses.serving import OpenAIServingResponses
|
||||
from vllm.entrypoints.serve.disagg.serving import ServingTokens
|
||||
from vllm.entrypoints.serve.utils.fingerprint import set_default_fingerprint_mode
|
||||
|
||||
# Applied before any serving class is constructed so that each one picks
|
||||
# up the chosen mode on its first cache miss.
|
||||
|
||||
@@ -6,7 +6,7 @@ from vllm.config import ModelConfig
|
||||
from vllm.tasks import SupportedTask
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.entrypoints.sagemaker.api_router import (
|
||||
from vllm.entrypoints.serve.sagemaker.api_router import (
|
||||
EndpointFn,
|
||||
GetHandlerFn,
|
||||
RequestType,
|
||||
|
||||
@@ -10,8 +10,11 @@ from vllm.entrypoints.generate.generative_scoring.serving import (
|
||||
ServingGenerativeScoring,
|
||||
)
|
||||
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
|
||||
from vllm.entrypoints.openai.utils import validate_json_request
|
||||
from vllm.entrypoints.utils import load_aware_call, with_cancellation
|
||||
from vllm.entrypoints.serve.utils.api_utils import (
|
||||
load_aware_call,
|
||||
validate_json_request,
|
||||
with_cancellation,
|
||||
)
|
||||
from vllm.logger import init_logger
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
@@ -18,7 +18,6 @@ from fastapi import Request
|
||||
from pydantic import Field
|
||||
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
from vllm.entrypoints.openai.engine.protocol import (
|
||||
ErrorResponse,
|
||||
OpenAIBaseModel,
|
||||
@@ -26,6 +25,7 @@ from vllm.entrypoints.openai.engine.protocol import (
|
||||
)
|
||||
from vllm.entrypoints.openai.engine.serving import OpenAIServing
|
||||
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
|
||||
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
|
||||
from vllm.inputs import EngineInput, tokens_input
|
||||
from vllm.logger import init_logger
|
||||
from vllm.outputs import RequestOutput
|
||||
|
||||
@@ -43,7 +43,7 @@ import uvloop
|
||||
|
||||
from vllm import envs
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.entrypoints.utils import log_version_and_model
|
||||
from vllm.entrypoints.serve.utils.api_utils import log_version_and_model
|
||||
from vllm.logger import init_logger
|
||||
from vllm.usage.usage_lib import UsageContext
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
@@ -12,11 +12,11 @@ from fastapi import FastAPI
|
||||
|
||||
from vllm import envs
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.constants import (
|
||||
from vllm.entrypoints.serve.utils.constants import (
|
||||
H11_MAX_HEADER_COUNT_DEFAULT,
|
||||
H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT,
|
||||
)
|
||||
from vllm.entrypoints.ssl import SSLCertRefresher
|
||||
from vllm.entrypoints.serve.utils.ssl import SSLCertRefresher
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.network_utils import find_process_using_port
|
||||
|
||||
|
||||
@@ -40,7 +40,7 @@ from vllm.entrypoints.chat_utils import (
|
||||
)
|
||||
from vllm.entrypoints.generate.beam_search.offline import BeamSearchOfflineMixin
|
||||
from vllm.entrypoints.pooling.offline import PoolingOfflineMixin
|
||||
from vllm.entrypoints.utils import log_non_default_args
|
||||
from vllm.entrypoints.serve.utils.api_utils import log_non_default_args
|
||||
from vllm.inputs import PromptType
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
@@ -27,12 +27,22 @@ from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.chat_utils import load_chat_template
|
||||
from vllm.entrypoints.launcher import serve_http
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args
|
||||
from vllm.entrypoints.openai.engine.protocol import GenerationError
|
||||
from vllm.entrypoints.openai.models.protocol import BaseModelPath
|
||||
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
|
||||
from vllm.entrypoints.openai.server_utils import (
|
||||
from vllm.entrypoints.serve.elastic_ep.middleware import ScalingMiddleware
|
||||
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
|
||||
from vllm.entrypoints.serve.sagemaker.api_router import sagemaker_standards_bootstrap
|
||||
from vllm.entrypoints.serve.tokenize.serving import OpenAIServingTokenization
|
||||
from vllm.entrypoints.serve.utils.api_utils import (
|
||||
cli_env_setup,
|
||||
log_non_default_args,
|
||||
log_version_and_model,
|
||||
process_lora_modules,
|
||||
)
|
||||
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
|
||||
from vllm.entrypoints.serve.utils.server_utils import (
|
||||
engine_error_handler,
|
||||
exception_handler,
|
||||
generation_error_handler,
|
||||
@@ -42,16 +52,6 @@ from vllm.entrypoints.openai.server_utils import (
|
||||
log_response,
|
||||
validation_exception_handler,
|
||||
)
|
||||
from vllm.entrypoints.sagemaker.api_router import sagemaker_standards_bootstrap
|
||||
from vllm.entrypoints.serve.elastic_ep.middleware import ScalingMiddleware
|
||||
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
|
||||
from vllm.entrypoints.serve.tokenize.serving import OpenAIServingTokenization
|
||||
from vllm.entrypoints.utils import (
|
||||
cli_env_setup,
|
||||
log_non_default_args,
|
||||
log_version_and_model,
|
||||
process_lora_modules,
|
||||
)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.reasoning import ReasoningParserManager
|
||||
from vllm.tasks import POOLING_TASKS, SupportedTask
|
||||
@@ -187,7 +187,7 @@ def build_app(
|
||||
|
||||
register_models_api_router(app)
|
||||
|
||||
from vllm.entrypoints.sagemaker.api_router import (
|
||||
from vllm.entrypoints.serve.sagemaker.api_router import (
|
||||
attach_router as register_sagemaker_api_router,
|
||||
)
|
||||
|
||||
@@ -254,12 +254,12 @@ def build_app(
|
||||
|
||||
# Ensure --api-key option from CLI takes precedence over VLLM_API_KEY
|
||||
if tokens := [key for key in (args.api_key or [envs.VLLM_API_KEY]) if key]:
|
||||
from vllm.entrypoints.openai.server_utils import AuthenticationMiddleware
|
||||
from vllm.entrypoints.serve.utils.server_utils import AuthenticationMiddleware
|
||||
|
||||
app.add_middleware(AuthenticationMiddleware, tokens=tokens)
|
||||
|
||||
if args.enable_request_id_headers:
|
||||
from vllm.entrypoints.openai.server_utils import XRequestIdMiddleware
|
||||
from vllm.entrypoints.serve.utils.server_utils import XRequestIdMiddleware
|
||||
|
||||
app.add_middleware(XRequestIdMiddleware)
|
||||
|
||||
|
||||
@@ -15,12 +15,12 @@ from vllm.entrypoints.openai.chat_completion.protocol import (
|
||||
)
|
||||
from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
|
||||
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
|
||||
from vllm.entrypoints.openai.orca_metrics import metrics_header
|
||||
from vllm.entrypoints.openai.utils import validate_json_request
|
||||
from vllm.entrypoints.utils import (
|
||||
from vllm.entrypoints.serve.utils.api_utils import (
|
||||
load_aware_call,
|
||||
validate_json_request,
|
||||
with_cancellation,
|
||||
)
|
||||
from vllm.entrypoints.serve.utils.orca_metrics import metrics_header
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@@ -21,7 +21,7 @@ from vllm.entrypoints.openai.engine.protocol import (
|
||||
RequestResponseMetadata,
|
||||
UsageInfo,
|
||||
)
|
||||
from vllm.entrypoints.utils import get_max_tokens
|
||||
from vllm.entrypoints.serve.utils.api_utils import get_max_tokens
|
||||
from vllm.inputs import EngineInput
|
||||
from vllm.logger import init_logger
|
||||
from vllm.outputs import RequestOutput
|
||||
|
||||
@@ -21,7 +21,6 @@ from vllm.entrypoints.chat_utils import (
|
||||
get_tool_call_id_type,
|
||||
make_tool_call_id,
|
||||
)
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
from vllm.entrypoints.openai.chat_completion.protocol import (
|
||||
ChatCompletionLogProb,
|
||||
ChatCompletionLogProbs,
|
||||
@@ -57,8 +56,11 @@ from vllm.entrypoints.openai.parser.harmony_utils import (
|
||||
get_streamable_parser_for_assistant,
|
||||
parse_chat_output,
|
||||
)
|
||||
from vllm.entrypoints.openai.utils import maybe_filter_parallel_tool_calls
|
||||
from vllm.entrypoints.utils import get_max_tokens, should_include_usage
|
||||
from vllm.entrypoints.serve.utils.api_utils import get_max_tokens, should_include_usage
|
||||
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
|
||||
from vllm.entrypoints.serve.utils.tool_calls_utils import (
|
||||
maybe_filter_parallel_tool_calls,
|
||||
)
|
||||
from vllm.inputs import EngineInput
|
||||
from vllm.logger import init_logger
|
||||
from vllm.logprobs import Logprob
|
||||
|
||||
@@ -20,11 +20,11 @@ from vllm.entrypoints.chat_utils import (
|
||||
ChatTemplateContentFormatOption,
|
||||
validate_chat_template,
|
||||
)
|
||||
from vllm.entrypoints.constants import (
|
||||
from vllm.entrypoints.openai.models.protocol import LoRAModulePath
|
||||
from vllm.entrypoints.serve.utils.constants import (
|
||||
H11_MAX_HEADER_COUNT_DEFAULT,
|
||||
H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT,
|
||||
)
|
||||
from vllm.entrypoints.openai.models.protocol import LoRAModulePath
|
||||
from vllm.logger import init_logger
|
||||
from vllm.tool_parsers import ToolParserManager
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
@@ -13,12 +13,12 @@ from vllm.entrypoints.openai.completion.protocol import (
|
||||
)
|
||||
from vllm.entrypoints.openai.completion.serving import OpenAIServingCompletion
|
||||
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
|
||||
from vllm.entrypoints.openai.orca_metrics import metrics_header
|
||||
from vllm.entrypoints.openai.utils import validate_json_request
|
||||
from vllm.entrypoints.utils import (
|
||||
from vllm.entrypoints.serve.utils.api_utils import (
|
||||
load_aware_call,
|
||||
validate_json_request,
|
||||
with_cancellation,
|
||||
)
|
||||
from vllm.entrypoints.serve.utils.orca_metrics import metrics_header
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@@ -13,7 +13,6 @@ import pybase64 as base64
|
||||
from fastapi import Request
|
||||
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
from vllm.entrypoints.openai.completion.protocol import (
|
||||
CompletionLogProbs,
|
||||
CompletionRequest,
|
||||
@@ -34,7 +33,8 @@ from vllm.entrypoints.openai.engine.serving import (
|
||||
clamp_prompt_logprobs,
|
||||
)
|
||||
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
|
||||
from vllm.entrypoints.utils import get_max_tokens, should_include_usage
|
||||
from vllm.entrypoints.serve.utils.api_utils import get_max_tokens, should_include_usage
|
||||
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
|
||||
from vllm.exceptions import VLLMValidationError
|
||||
from vllm.inputs import EngineInput
|
||||
from vllm.logger import init_logger
|
||||
|
||||
@@ -16,7 +16,6 @@ from vllm.config import ModelConfig
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
|
||||
from vllm.entrypoints.generate.beam_search.online import BeamSearchOnlineMixin
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
from vllm.entrypoints.openai.chat_completion.protocol import (
|
||||
BatchChatCompletionRequest,
|
||||
ChatCompletionRequest,
|
||||
@@ -39,12 +38,13 @@ from vllm.entrypoints.serve.tokenize.protocol import (
|
||||
TokenizeCompletionRequest,
|
||||
TokenizeResponse,
|
||||
)
|
||||
from vllm.entrypoints.serve.utils.error_response import create_error_response
|
||||
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
|
||||
from vllm.entrypoints.speech_to_text.transcription.protocol import (
|
||||
TranscriptionRequest,
|
||||
TranscriptionResponse,
|
||||
)
|
||||
from vllm.entrypoints.speech_to_text.translation.protocol import TranslationRequest
|
||||
from vllm.entrypoints.utils import create_error_response
|
||||
from vllm.inputs import EngineInput, PromptType
|
||||
from vllm.logger import init_logger
|
||||
from vllm.logprobs import Logprob, PromptLogprobs
|
||||
@@ -153,7 +153,7 @@ class OpenAIServing(BeamSearchOnlineMixin):
|
||||
# Computed once at startup (cached by ``vllm_config`` identity) and
|
||||
# stamped on non-streaming responses. Streaming chunks deliberately
|
||||
# omit it to avoid per-chunk overhead.
|
||||
from vllm.entrypoints.openai.fingerprint import get_system_fingerprint
|
||||
from vllm.entrypoints.serve.utils.fingerprint import get_system_fingerprint
|
||||
|
||||
try:
|
||||
self.system_fingerprint: str | None = get_system_fingerprint(
|
||||
|
||||
@@ -18,7 +18,7 @@ from vllm.entrypoints.serve.lora.protocol import (
|
||||
LoadLoRAAdapterRequest,
|
||||
UnloadLoRAAdapterRequest,
|
||||
)
|
||||
from vllm.entrypoints.utils import create_error_response
|
||||
from vllm.entrypoints.serve.utils.error_response import create_error_response
|
||||
from vllm.exceptions import LoRAAdapterNotFoundError
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
@@ -12,11 +12,11 @@ from openai.types.responses.response_output_message import ResponseOutputMessage
|
||||
from openai.types.responses.response_output_text import ResponseOutputText
|
||||
|
||||
from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
|
||||
from vllm.entrypoints.constants import MCP_PREFIX
|
||||
from vllm.entrypoints.openai.responses.protocol import (
|
||||
ResponseInputOutputItem,
|
||||
ResponsesRequest,
|
||||
)
|
||||
from vllm.entrypoints.serve.utils.constants import MCP_PREFIX
|
||||
from vllm.outputs import CompletionOutput
|
||||
from vllm.parser.abstract_parser import Parser
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
|
||||
@@ -15,9 +15,9 @@ from vllm.entrypoints.openai.responses.protocol import (
|
||||
StreamingResponsesResponse,
|
||||
)
|
||||
from vllm.entrypoints.openai.responses.serving import OpenAIServingResponses
|
||||
from vllm.entrypoints.openai.utils import validate_json_request
|
||||
from vllm.entrypoints.utils import (
|
||||
from vllm.entrypoints.serve.utils.api_utils import (
|
||||
load_aware_call,
|
||||
validate_json_request,
|
||||
with_cancellation,
|
||||
)
|
||||
from vllm.logger import init_logger
|
||||
|
||||
@@ -20,7 +20,6 @@ from vllm import envs
|
||||
from vllm.entrypoints.chat_utils import (
|
||||
ChatTemplateContentFormatOption,
|
||||
)
|
||||
from vllm.entrypoints.constants import MCP_PREFIX
|
||||
from vllm.entrypoints.mcp.tool import Tool
|
||||
from vllm.entrypoints.mcp.tool_server import ToolServer
|
||||
from vllm.entrypoints.openai.engine.protocol import (
|
||||
@@ -40,6 +39,7 @@ from vllm.entrypoints.openai.responses.protocol import (
|
||||
ResponsesRequest,
|
||||
)
|
||||
from vllm.entrypoints.openai.responses.utils import construct_tool_dicts
|
||||
from vllm.entrypoints.serve.utils.constants import MCP_PREFIX
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.parser.abstract_parser import Parser
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
|
||||
@@ -32,7 +32,6 @@ from vllm.entrypoints.chat_utils import (
|
||||
ChatTemplateContentFormatOption,
|
||||
get_tool_call_id_type,
|
||||
)
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
from vllm.entrypoints.mcp.tool_server import ToolServer
|
||||
from vllm.entrypoints.openai.engine.protocol import (
|
||||
DeltaMessage,
|
||||
@@ -93,7 +92,8 @@ from vllm.entrypoints.openai.responses.utils import (
|
||||
extract_tool_types,
|
||||
)
|
||||
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
|
||||
from vllm.entrypoints.utils import get_max_tokens
|
||||
from vllm.entrypoints.serve.utils.api_utils import get_max_tokens
|
||||
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
|
||||
from vllm.exceptions import VLLMValidationError
|
||||
from vllm.inputs import EngineInput, tokens_input
|
||||
from vllm.logger import init_logger
|
||||
|
||||
@@ -51,6 +51,7 @@ from vllm.entrypoints.pooling.scoring.protocol import (
|
||||
ScoreRequest,
|
||||
ScoreResponse,
|
||||
)
|
||||
from vllm.entrypoints.serve.utils.error_response import create_error_response
|
||||
from vllm.entrypoints.speech_to_text.transcription.protocol import (
|
||||
TranscriptionRequest,
|
||||
TranscriptionResponse,
|
||||
@@ -61,7 +62,6 @@ from vllm.entrypoints.speech_to_text.translation.protocol import (
|
||||
TranslationResponse,
|
||||
TranslationResponseVerbose,
|
||||
)
|
||||
from vllm.entrypoints.utils import create_error_response
|
||||
from vllm.exceptions import VLLMValidationError
|
||||
from vllm.logger import init_logger
|
||||
from vllm.reasoning import ReasoningParserManager
|
||||
|
||||
@@ -16,9 +16,9 @@ from vllm import PoolingParams, PoolingRequestOutput, envs
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.chat_utils import ChatTemplateConfig
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
|
||||
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
|
||||
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
|
||||
from vllm.exceptions import VLLMNotFoundError
|
||||
from vllm.inputs import EngineInput
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
@@ -4,9 +4,9 @@
|
||||
from fastapi import APIRouter, Depends, Request
|
||||
from fastapi.responses import Response
|
||||
|
||||
from vllm.entrypoints.openai.utils import validate_json_request
|
||||
from vllm.entrypoints.utils import (
|
||||
from vllm.entrypoints.serve.utils.api_utils import (
|
||||
load_aware_call,
|
||||
validate_json_request,
|
||||
with_cancellation,
|
||||
)
|
||||
|
||||
|
||||
@@ -6,8 +6,11 @@ from http import HTTPStatus
|
||||
from fastapi import APIRouter, Depends, Request
|
||||
|
||||
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
|
||||
from vllm.entrypoints.openai.utils import validate_json_request
|
||||
from vllm.entrypoints.utils import load_aware_call, with_cancellation
|
||||
from vllm.entrypoints.serve.utils.api_utils import (
|
||||
load_aware_call,
|
||||
validate_json_request,
|
||||
with_cancellation,
|
||||
)
|
||||
|
||||
from .protocol import CohereEmbedRequest, EmbeddingRequest
|
||||
from .serving import ServingEmbedding
|
||||
|
||||
@@ -21,12 +21,12 @@ if TYPE_CHECKING:
|
||||
from starlette.datastructures import State
|
||||
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
from vllm.entrypoints.sagemaker.api_router import (
|
||||
from vllm.entrypoints.serve.sagemaker.api_router import (
|
||||
EndpointFn,
|
||||
GetHandlerFn,
|
||||
RequestType,
|
||||
)
|
||||
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
|
||||
|
||||
else:
|
||||
RequestLogger = object
|
||||
|
||||
@@ -5,8 +5,11 @@ from http import HTTPStatus
|
||||
from fastapi import APIRouter, Depends, Request
|
||||
|
||||
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
|
||||
from vllm.entrypoints.openai.utils import validate_json_request
|
||||
from vllm.entrypoints.utils import load_aware_call, with_cancellation
|
||||
from vllm.entrypoints.serve.utils.api_utils import (
|
||||
load_aware_call,
|
||||
validate_json_request,
|
||||
with_cancellation,
|
||||
)
|
||||
|
||||
from .protocol import PoolingRequest
|
||||
from .serving import ServingPooling
|
||||
|
||||
@@ -5,8 +5,11 @@ from http import HTTPStatus
|
||||
from fastapi import APIRouter, Depends, Request
|
||||
|
||||
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
|
||||
from vllm.entrypoints.openai.utils import validate_json_request
|
||||
from vllm.entrypoints.utils import load_aware_call, with_cancellation
|
||||
from vllm.entrypoints.serve.utils.api_utils import (
|
||||
load_aware_call,
|
||||
validate_json_request,
|
||||
with_cancellation,
|
||||
)
|
||||
from vllm.logger import init_logger
|
||||
|
||||
from .protocol import RerankRequest, ScoreRequest
|
||||
|
||||
@@ -13,7 +13,6 @@ from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.openai.engine.protocol import (
|
||||
ErrorResponse,
|
||||
)
|
||||
from vllm.entrypoints.openai.utils import validate_json_request
|
||||
from vllm.entrypoints.serve.disagg.protocol import (
|
||||
GenerateRequest,
|
||||
GenerateResponse,
|
||||
@@ -22,8 +21,9 @@ from vllm.entrypoints.serve.disagg.serving import (
|
||||
ServingTokens,
|
||||
)
|
||||
from vllm.entrypoints.serve.tokenize.serving import OpenAIServingTokenization
|
||||
from vllm.entrypoints.utils import (
|
||||
from vllm.entrypoints.serve.utils.api_utils import (
|
||||
load_aware_call,
|
||||
validate_json_request,
|
||||
with_cancellation,
|
||||
)
|
||||
from vllm.logger import init_logger
|
||||
|
||||
@@ -14,7 +14,6 @@ import pybase64 as base64
|
||||
from fastapi import Request
|
||||
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
from vllm.entrypoints.openai.chat_completion.protocol import (
|
||||
ChatCompletionLogProb,
|
||||
ChatCompletionLogProbs,
|
||||
@@ -38,7 +37,8 @@ from vllm.entrypoints.serve.disagg.protocol import (
|
||||
GenerateStreamResponse,
|
||||
)
|
||||
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
|
||||
from vllm.entrypoints.utils import get_max_tokens, should_include_usage
|
||||
from vllm.entrypoints.serve.utils.api_utils import get_max_tokens, should_include_usage
|
||||
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
|
||||
from vllm.inputs import EngineInput, mm_input
|
||||
from vllm.logger import init_logger
|
||||
from vllm.logprobs import Logprob
|
||||
|
||||
@@ -12,11 +12,11 @@ from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.openai.engine.protocol import (
|
||||
ErrorResponse,
|
||||
)
|
||||
from vllm.entrypoints.openai.utils import validate_json_request
|
||||
from vllm.entrypoints.serve.elastic_ep.middleware import (
|
||||
get_scaling_elastic_ep,
|
||||
set_scaling_elastic_ep,
|
||||
)
|
||||
from vllm.entrypoints.serve.utils.api_utils import validate_json_request
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@@ -12,11 +12,11 @@ from vllm.entrypoints.openai.engine.protocol import (
|
||||
)
|
||||
from vllm.entrypoints.openai.models.api_router import models
|
||||
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
|
||||
from vllm.entrypoints.openai.utils import validate_json_request
|
||||
from vllm.entrypoints.serve.lora.protocol import (
|
||||
LoadLoRAAdapterRequest,
|
||||
UnloadLoRAAdapterRequest,
|
||||
)
|
||||
from vllm.entrypoints.serve.utils.api_utils import validate_json_request
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@@ -8,9 +8,9 @@ from fastapi.responses import JSONResponse
|
||||
from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
|
||||
from vllm.entrypoints.openai.completion.protocol import CompletionRequest
|
||||
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
|
||||
from vllm.entrypoints.openai.utils import validate_json_request
|
||||
from vllm.entrypoints.serve.disagg.protocol import GenerateRequest
|
||||
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
|
||||
from vllm.entrypoints.serve.utils.api_utils import validate_json_request
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@@ -11,7 +11,6 @@ from vllm.entrypoints.chat_utils import (
|
||||
ChatTemplateContentFormatOption,
|
||||
ConversationMessage,
|
||||
)
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
|
||||
from vllm.entrypoints.openai.completion.protocol import CompletionRequest
|
||||
from vllm.entrypoints.openai.engine.protocol import (
|
||||
@@ -31,10 +30,9 @@ from vllm.entrypoints.serve.disagg.protocol import (
|
||||
MultiModalFeatures,
|
||||
PlaceholderRangeInfo,
|
||||
)
|
||||
from vllm.entrypoints.utils import (
|
||||
create_error_response,
|
||||
get_max_tokens,
|
||||
)
|
||||
from vllm.entrypoints.serve.utils.api_utils import get_max_tokens
|
||||
from vllm.entrypoints.serve.utils.error_response import create_error_response
|
||||
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
|
||||
from vllm.inputs import (
|
||||
EngineInput,
|
||||
MultiModalHashes,
|
||||
|
||||
+1
-1
@@ -14,11 +14,11 @@ from vllm.config import ModelConfig
|
||||
from vllm.entrypoints.generate.factories import get_generate_invocation_types
|
||||
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
|
||||
from vllm.entrypoints.openai.engine.serving import OpenAIServing
|
||||
from vllm.entrypoints.openai.utils import validate_json_request
|
||||
from vllm.entrypoints.pooling.base.serving import PoolingServingBase
|
||||
from vllm.entrypoints.pooling.factories import get_pooling_invocation_types
|
||||
from vllm.entrypoints.serve.instrumentator.basic import base
|
||||
from vllm.entrypoints.serve.instrumentator.health import health
|
||||
from vllm.entrypoints.serve.utils.api_utils import validate_json_request
|
||||
from vllm.tasks import SupportedTask
|
||||
|
||||
# TODO: RequestType = TypeForm[BaseModel] when recognized by type checkers
|
||||
@@ -12,7 +12,6 @@ from typing_extensions import assert_never
|
||||
from vllm.entrypoints.openai.engine.protocol import (
|
||||
ErrorResponse,
|
||||
)
|
||||
from vllm.entrypoints.openai.utils import validate_json_request
|
||||
from vllm.entrypoints.serve.tokenize.protocol import (
|
||||
DetokenizeRequest,
|
||||
DetokenizeResponse,
|
||||
@@ -20,7 +19,8 @@ from vllm.entrypoints.serve.tokenize.protocol import (
|
||||
TokenizeResponse,
|
||||
)
|
||||
from vllm.entrypoints.serve.tokenize.serving import OpenAIServingTokenization
|
||||
from vllm.entrypoints.utils import (
|
||||
from vllm.entrypoints.serve.utils.api_utils import (
|
||||
validate_json_request,
|
||||
with_cancellation,
|
||||
)
|
||||
from vllm.logger import init_logger
|
||||
|
||||
@@ -7,7 +7,6 @@ from fastapi import Request
|
||||
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
|
||||
from vllm.entrypoints.openai.engine.serving import OpenAIServing
|
||||
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
|
||||
@@ -20,6 +19,7 @@ from vllm.entrypoints.serve.tokenize.protocol import (
|
||||
TokenizeResponse,
|
||||
TokenizerInfoResponse,
|
||||
)
|
||||
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
|
||||
from vllm.inputs import TokensPrompt, tokens_input
|
||||
from vllm.logger import init_logger
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
|
||||
@@ -6,24 +6,19 @@ import dataclasses
|
||||
import functools
|
||||
import os
|
||||
from argparse import Namespace
|
||||
from http import HTTPStatus
|
||||
from logging import Logger
|
||||
from string import Template
|
||||
from typing import Any
|
||||
|
||||
import regex as re
|
||||
from fastapi import Request
|
||||
from fastapi.exceptions import RequestValidationError
|
||||
from fastapi.responses import JSONResponse, StreamingResponse
|
||||
from starlette.background import BackgroundTask, BackgroundTasks
|
||||
|
||||
from vllm import envs
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.entrypoints.openai.engine.protocol import (
|
||||
ErrorInfo,
|
||||
ErrorResponse,
|
||||
GenerationError,
|
||||
StreamOptions,
|
||||
)
|
||||
from vllm.entrypoints.openai.engine.protocol import StreamOptions
|
||||
from vllm.entrypoints.openai.models.protocol import LoRAModulePath
|
||||
from vllm.logger import current_formatter_type, init_logger
|
||||
from vllm.platforms import current_platform
|
||||
@@ -279,7 +274,7 @@ def log_non_default_args(args: Namespace | EngineArgs):
|
||||
|
||||
|
||||
def should_include_usage(
|
||||
stream_options: "StreamOptions | None", enable_force_include_usage: bool
|
||||
stream_options: StreamOptions | None, enable_force_include_usage: bool
|
||||
) -> tuple[bool, bool]:
|
||||
if enable_force_include_usage:
|
||||
return True, True
|
||||
@@ -344,60 +339,10 @@ def log_version_and_model(lgr: Logger, version: str, model_name: str) -> None:
|
||||
lgr.info(message, version, model_name)
|
||||
|
||||
|
||||
def create_error_response(
|
||||
message: str | Exception,
|
||||
err_type: str = "BadRequestError",
|
||||
status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
|
||||
param: str | None = None,
|
||||
) -> ErrorResponse:
|
||||
exc: Exception | None = None
|
||||
|
||||
if isinstance(message, Exception):
|
||||
exc = message
|
||||
logger.debug(
|
||||
"create_error_response called with %s: %s", type(exc).__name__, exc
|
||||
async def validate_json_request(raw_request: Request):
|
||||
content_type = raw_request.headers.get("content-type", "").lower()
|
||||
media_type = content_type.split(";", maxsplit=1)[0]
|
||||
if media_type != "application/json":
|
||||
raise RequestValidationError(
|
||||
errors=["Unsupported Media Type: Only 'application/json' is allowed"]
|
||||
)
|
||||
|
||||
from vllm.exceptions import VLLMNotFoundError, VLLMValidationError
|
||||
|
||||
if isinstance(exc, VLLMValidationError):
|
||||
err_type = "BadRequestError"
|
||||
status_code = HTTPStatus.BAD_REQUEST
|
||||
param = exc.parameter
|
||||
elif isinstance(exc, VLLMNotFoundError):
|
||||
err_type = "NotFoundError"
|
||||
status_code = HTTPStatus.NOT_FOUND
|
||||
param = None
|
||||
elif isinstance(exc, (ValueError, TypeError, OverflowError)):
|
||||
# Common validation errors from user input
|
||||
err_type = "BadRequestError"
|
||||
status_code = HTTPStatus.BAD_REQUEST
|
||||
param = None
|
||||
elif isinstance(exc, NotImplementedError):
|
||||
err_type = "NotImplementedError"
|
||||
status_code = HTTPStatus.NOT_IMPLEMENTED
|
||||
param = None
|
||||
elif isinstance(exc, GenerationError):
|
||||
err_type = "InternalServerError"
|
||||
status_code = exc.status_code
|
||||
param = None
|
||||
elif any(cls.__name__ == "TemplateError" for cls in type(exc).__mro__):
|
||||
# jinja2.TemplateError and its subclasses (avoid importing jinja2)
|
||||
err_type = "BadRequestError"
|
||||
status_code = HTTPStatus.BAD_REQUEST
|
||||
param = None
|
||||
else:
|
||||
err_type = "InternalServerError"
|
||||
status_code = HTTPStatus.INTERNAL_SERVER_ERROR
|
||||
param = None
|
||||
|
||||
message = str(exc)
|
||||
|
||||
return ErrorResponse(
|
||||
error=ErrorInfo(
|
||||
message=sanitize_message(message),
|
||||
type=err_type,
|
||||
code=status_code.value,
|
||||
param=param,
|
||||
)
|
||||
)
|
||||
@@ -0,0 +1,72 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from http import HTTPStatus
|
||||
|
||||
from vllm.entrypoints.openai.engine.protocol import (
|
||||
ErrorInfo,
|
||||
ErrorResponse,
|
||||
GenerationError,
|
||||
)
|
||||
from vllm.entrypoints.serve.utils.api_utils import sanitize_message
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def create_error_response(
|
||||
message: str | Exception,
|
||||
err_type: str = "BadRequestError",
|
||||
status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
|
||||
param: str | None = None,
|
||||
) -> ErrorResponse:
|
||||
exc: Exception | None = None
|
||||
|
||||
if isinstance(message, Exception):
|
||||
exc = message
|
||||
logger.debug(
|
||||
"create_error_response called with %s: %s", type(exc).__name__, exc
|
||||
)
|
||||
|
||||
from vllm.exceptions import VLLMNotFoundError, VLLMValidationError
|
||||
|
||||
if isinstance(exc, VLLMValidationError):
|
||||
err_type = "BadRequestError"
|
||||
status_code = HTTPStatus.BAD_REQUEST
|
||||
param = exc.parameter
|
||||
elif isinstance(exc, VLLMNotFoundError):
|
||||
err_type = "NotFoundError"
|
||||
status_code = HTTPStatus.NOT_FOUND
|
||||
param = None
|
||||
elif isinstance(exc, (ValueError, TypeError, OverflowError)):
|
||||
# Common validation errors from user input
|
||||
err_type = "BadRequestError"
|
||||
status_code = HTTPStatus.BAD_REQUEST
|
||||
param = None
|
||||
elif isinstance(exc, NotImplementedError):
|
||||
err_type = "NotImplementedError"
|
||||
status_code = HTTPStatus.NOT_IMPLEMENTED
|
||||
param = None
|
||||
elif isinstance(exc, GenerationError):
|
||||
err_type = "InternalServerError"
|
||||
status_code = exc.status_code
|
||||
param = None
|
||||
elif any(cls.__name__ == "TemplateError" for cls in type(exc).__mro__):
|
||||
# jinja2.TemplateError and its subclasses (avoid importing jinja2)
|
||||
err_type = "BadRequestError"
|
||||
status_code = HTTPStatus.BAD_REQUEST
|
||||
param = None
|
||||
else:
|
||||
err_type = "InternalServerError"
|
||||
status_code = HTTPStatus.INTERNAL_SERVER_ERROR
|
||||
param = None
|
||||
|
||||
message = str(exc)
|
||||
|
||||
return ErrorResponse(
|
||||
error=ErrorInfo(
|
||||
message=sanitize_message(message),
|
||||
type=err_type,
|
||||
code=status_code.value,
|
||||
param=param,
|
||||
)
|
||||
)
|
||||
+4
-1
@@ -26,7 +26,10 @@ from vllm.entrypoints.openai.engine.protocol import (
|
||||
ErrorResponse,
|
||||
GenerationError,
|
||||
)
|
||||
from vllm.entrypoints.utils import create_error_response, sanitize_message
|
||||
from vllm.entrypoints.serve.utils.error_response import (
|
||||
create_error_response,
|
||||
sanitize_message,
|
||||
)
|
||||
from vllm.exceptions import VLLMValidationError
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.gc_utils import freeze_gc_heap
|
||||
@@ -2,9 +2,6 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import TypeVar
|
||||
|
||||
from fastapi import Request
|
||||
from fastapi.exceptions import RequestValidationError
|
||||
|
||||
from vllm.entrypoints.openai.chat_completion.protocol import (
|
||||
ChatCompletionRequest,
|
||||
ChatCompletionResponseChoice,
|
||||
@@ -38,12 +35,3 @@ def maybe_filter_parallel_tool_calls(
|
||||
]
|
||||
|
||||
return choice
|
||||
|
||||
|
||||
async def validate_json_request(raw_request: Request):
|
||||
content_type = raw_request.headers.get("content-type", "").lower()
|
||||
media_type = content_type.split(";", maxsplit=1)[0]
|
||||
if media_type != "application/json":
|
||||
raise RequestValidationError(
|
||||
errors=["Unsupported Media Type: Only 'application/json' is allowed"]
|
||||
)
|
||||
@@ -15,7 +15,6 @@ from transformers import PreTrainedTokenizerBase
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
from vllm.entrypoints.openai.engine.protocol import (
|
||||
DeltaMessage,
|
||||
ErrorResponse,
|
||||
@@ -24,7 +23,8 @@ from vllm.entrypoints.openai.engine.protocol import (
|
||||
)
|
||||
from vllm.entrypoints.openai.engine.serving import OpenAIServing, SpeechToTextRequest
|
||||
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
|
||||
from vllm.entrypoints.utils import get_max_tokens
|
||||
from vllm.entrypoints.serve.utils.api_utils import get_max_tokens
|
||||
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
|
||||
from vllm.exceptions import VLLMValidationError
|
||||
from vllm.inputs import EncoderDecoderInput, EngineInput
|
||||
from vllm.logger import init_logger
|
||||
|
||||
@@ -12,7 +12,7 @@ if TYPE_CHECKING:
|
||||
from starlette.datastructures import State
|
||||
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
|
||||
from vllm.tasks import SupportedTask
|
||||
else:
|
||||
RequestLogger = object
|
||||
|
||||
@@ -9,9 +9,9 @@ from typing import Literal, cast
|
||||
import numpy as np
|
||||
|
||||
from vllm.engine.protocol import EngineClient, StreamingInput
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
from vllm.entrypoints.openai.engine.serving import OpenAIServing
|
||||
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
|
||||
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
|
||||
from vllm.inputs import PromptType
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.models.interfaces import SupportsRealtime
|
||||
|
||||
@@ -9,7 +9,7 @@ from fastapi import APIRouter, Form, Request
|
||||
from fastapi.responses import JSONResponse, StreamingResponse
|
||||
|
||||
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
|
||||
from vllm.entrypoints.utils import (
|
||||
from vllm.entrypoints.serve.utils.api_utils import (
|
||||
load_aware_call,
|
||||
with_cancellation,
|
||||
)
|
||||
|
||||
@@ -5,12 +5,12 @@ from collections.abc import AsyncGenerator
|
||||
from fastapi import Request
|
||||
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
from vllm.entrypoints.openai.engine.protocol import (
|
||||
ErrorResponse,
|
||||
RequestResponseMetadata,
|
||||
)
|
||||
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
|
||||
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
|
||||
from vllm.logger import init_logger
|
||||
from vllm.outputs import RequestOutput
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@ from fastapi import APIRouter, Form, Request
|
||||
from fastapi.responses import JSONResponse, StreamingResponse
|
||||
|
||||
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
|
||||
from vllm.entrypoints.utils import (
|
||||
from vllm.entrypoints.serve.utils.api_utils import (
|
||||
load_aware_call,
|
||||
with_cancellation,
|
||||
)
|
||||
|
||||
@@ -5,12 +5,12 @@ from collections.abc import AsyncGenerator
|
||||
from fastapi import Request
|
||||
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
from vllm.entrypoints.openai.engine.protocol import (
|
||||
ErrorResponse,
|
||||
RequestResponseMetadata,
|
||||
)
|
||||
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
|
||||
from vllm.entrypoints.serve.utils.request_logger import RequestLogger
|
||||
from vllm.logger import init_logger
|
||||
from vllm.outputs import RequestOutput
|
||||
|
||||
|
||||
+1
-1
@@ -359,7 +359,7 @@ class RustFrontendProcessManager:
|
||||
]
|
||||
if stats_update_address is not None:
|
||||
cmd.extend(["--coordinator-address", stats_update_address])
|
||||
from vllm.entrypoints.utils import jsonify_non_default_args
|
||||
from vllm.entrypoints.serve.utils.api_utils import jsonify_non_default_args
|
||||
|
||||
args_json = json.dumps(
|
||||
jsonify_non_default_args(args, exclude={"api_server_count"}),
|
||||
|
||||
Reference in New Issue
Block a user