From b623f7ea95118c353eaed0301e9a6e7774b0e861 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 2 Jun 2026 21:30:21 +0800 Subject: [PATCH] [Frontend] Consolidate dev entrypoints. (#44170) Signed-off-by: wang.yuqi --- .buildkite/test-amd.yaml | 24 ++++---- .buildkite/test_areas/entrypoints.yaml | 14 ++--- .buildkite/test_areas/rust_frontend.yaml | 8 +-- docs/serving/online_serving/README.md | 34 ++++++++++- .../{rpc => serve/dev}/__init__.py | 0 .../entrypoints/serve/dev/rpc}/__init__.py | 0 .../dev}/rpc/test_collective_rpc.py | 2 +- .../{instrumentator => dev}/test_sleep.py | 0 vllm/entrypoints/openai/api_server.py | 15 ++--- vllm/entrypoints/serve/__init__.py | 56 ++++++++++--------- .../serve/{rlhf => dev}/__init__.py | 0 .../serve/{rpc => dev/cache}/__init__.py | 0 .../serve/{ => dev}/cache/api_router.py | 3 - .../serve/{sleep => dev/rlhf}/__init__.py | 0 .../serve/{ => dev}/rlhf/api_router.py | 3 - vllm/entrypoints/serve/dev/rpc/__init__.py | 0 .../serve/{ => dev}/rpc/api_router.py | 3 - .../serve/dev/server_info/__init__.py | 0 .../server_info/api_router.py} | 6 +- vllm/entrypoints/serve/dev/sleep/__init__.py | 0 .../serve/{ => dev}/sleep/api_router.py | 4 -- .../serve/instrumentator/__init__.py | 7 --- 22 files changed, 95 insertions(+), 84 deletions(-) rename tests/entrypoints/{rpc => serve/dev}/__init__.py (100%) rename {vllm/entrypoints/serve/cache => tests/entrypoints/serve/dev/rpc}/__init__.py (100%) rename tests/entrypoints/{ => serve/dev}/rpc/test_collective_rpc.py (96%) rename tests/entrypoints/serve/{instrumentator => dev}/test_sleep.py (100%) rename vllm/entrypoints/serve/{rlhf => dev}/__init__.py (100%) rename vllm/entrypoints/serve/{rpc => dev/cache}/__init__.py (100%) rename vllm/entrypoints/serve/{ => dev}/cache/api_router.py (96%) rename vllm/entrypoints/serve/{sleep => dev/rlhf}/__init__.py (100%) rename vllm/entrypoints/serve/{ => dev}/rlhf/api_router.py (98%) create mode 100644 vllm/entrypoints/serve/dev/rpc/__init__.py rename vllm/entrypoints/serve/{ => dev}/rpc/api_router.py (95%) create mode 100644 vllm/entrypoints/serve/dev/server_info/__init__.py rename vllm/entrypoints/serve/{instrumentator/server_info.py => dev/server_info/api_router.py} (93%) create mode 100644 vllm/entrypoints/serve/dev/sleep/__init__.py rename vllm/entrypoints/serve/{ => dev}/sleep/api_router.py (95%) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index a04e88b3d7e..a7e26280c90 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1238,14 +1238,11 @@ steps: working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - - tests/entrypoints/rpc - - tests/entrypoints/serve/instrumentator - - tests/tool_use + - tests/entrypoints/serve commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/serve/instrumentator - - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc - - pytest -v -s tool_use + - pytest -v -s entrypoints/serve --ignore=entrypoints/serve/dev/rpc + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/serve/dev/rpc - label: Entrypoints Integration (API Server openai - Part 1) # TBD timeout_in_minutes: 180 @@ -1276,11 +1273,13 @@ steps: - tests/entrypoints/openai - tests/entrypoints/test_chat_utils - tests/entrypoints/generate + - tests/tool_use commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s entrypoints/openai/completion --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py - pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/generate + - pytest -v -s tool_use - label: Entrypoints Integration (API Server openai - Part 3) # TBD timeout_in_minutes: 180 @@ -1370,7 +1369,7 @@ steps: - vllm/platforms/rocm.py commands: - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/serve/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling --ignore=entrypoints/speech_to_text --ignore=tests/entrypoints/generate + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/offline_mode --ignore=entrypoints/openai --ignore=entrypoints/serve --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling --ignore=entrypoints/speech_to_text --ignore=tests/entrypoints/generate - label: OpenAI API correctness # TBD timeout_in_minutes: 180 @@ -2747,14 +2746,11 @@ steps: working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - - tests/entrypoints/rpc - - tests/entrypoints/serve/instrumentator - - tests/tool_use + - tests/entrypoints/serve commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/serve/instrumentator - - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc - - pytest -v -s tool_use + - pytest -v -s entrypoints/serve --ignore=entrypoints/serve/dev/rpc + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/serve/dev/rpc - label: Entrypoints Integration (API Server openai - Part 1) # TBD timeout_in_minutes: 180 @@ -2785,11 +2781,13 @@ steps: - tests/entrypoints/openai - tests/entrypoints/test_chat_utils - tests/entrypoints/generate + - tests/tool_use commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s entrypoints/openai/completion --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py - pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/generate + - pytest -v -s tool_use - label: Entrypoints Integration (API Server openai - Part 3) # TBD timeout_in_minutes: 180 diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml index ebaec9954a3..548174ed748 100644 --- a/.buildkite/test_areas/entrypoints.yaml +++ b/.buildkite/test_areas/entrypoints.yaml @@ -11,7 +11,7 @@ steps: - tests/entrypoints/ commands: - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/serve/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling --ignore=entrypoints/speech_to_text --ignore=tests/entrypoints/generate + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/offline_mode --ignore=entrypoints/openai --ignore=entrypoints/serve --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling --ignore=entrypoints/speech_to_text --ignore=tests/entrypoints/generate - label: Entrypoints Integration (LLM) key: entrypoints-integration-llm @@ -61,10 +61,12 @@ steps: - tests/entrypoints/openai - tests/entrypoints/test_chat_utils - tests/entrypoints/generate + - tests/tool_use commands: - pytest -v -s entrypoints/openai/completion --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py - pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/generate + - pytest -v -s tool_use mirror: amd: device: mi325_1 @@ -100,14 +102,11 @@ steps: working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - - tests/entrypoints/rpc - - tests/entrypoints/serve/instrumentator - - tests/tool_use + - tests/entrypoints/serve commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/serve/instrumentator - - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc - - pytest -v -s tool_use + - pytest -v -s entrypoints/serve --ignore=entrypoints/serve/dev/rpc + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/serve/dev/rpc mirror: amd: device: mi325_1 @@ -155,6 +154,5 @@ steps: source_file_dependencies: - csrc/ - vllm/entrypoints/openai/ - - vllm/model_executor/models/whisper.py commands: # LMEval - pytest -s entrypoints/openai/correctness/ diff --git a/.buildkite/test_areas/rust_frontend.yaml b/.buildkite/test_areas/rust_frontend.yaml index df37022725f..16d69f77345 100644 --- a/.buildkite/test_areas/rust_frontend.yaml +++ b/.buildkite/test_areas/rust_frontend.yaml @@ -45,19 +45,19 @@ steps: - vllm/entrypoints/serve/ - vllm/v1/engine/ - tests/utils.py - # - tests/entrypoints/rpc/test_collective_rpc.py + # - tests/entrypoints/serve/dev/rpc/test_collective_rpc.py - tests/entrypoints/serve/disagg/test_serving_tokens.py - tests/entrypoints/serve/instrumentator/test_basic.py - tests/entrypoints/serve/instrumentator/test_metrics.py - # - tests/entrypoints/serve/instrumentator/test_sleep.py + # - tests/entrypoints/serve/dev/test_sleep.py commands: - export VLLM_USE_RUST_FRONTEND=1 - export VLLM_WORKER_MULTIPROC_METHOD=spawn - # - pytest -v -s entrypoints/rpc/test_collective_rpc.py + # - pytest -v -s entrypoints/serve/dev/rpc/test_collective_rpc.py - pytest -v -s entrypoints/serve/instrumentator/test_basic.py -k "not show_version and not server_load" - pytest -v -s entrypoints/serve/disagg/test_serving_tokens.py -k "not stream and not lora and not test_generate_logprobs and not stop_string_workflow" - pytest -v -s entrypoints/serve/instrumentator/test_metrics.py -k "text and not show and not run_batch and not test_metrics_counts and not test_metrics_exist" - # - pytest -v -s entrypoints/serve/instrumentator/test_sleep.py + # - pytest -v -s entrypoints/serve/dev/test_sleep.py - label: Rust Frontend Core Correctness timeout_in_minutes: 30 diff --git a/docs/serving/online_serving/README.md b/docs/serving/online_serving/README.md index c8437704447..9fa1763108c 100644 --- a/docs/serving/online_serving/README.md +++ b/docs/serving/online_serving/README.md @@ -100,14 +100,44 @@ For further details on renderer APIs, please refer to [this page](renderer.md). - `/version` - Version information - `/load` - Server load metrics -## Sleep Mode APIs +## Server in development mode + +When using the flag VLLM_SERVER_DEV_MODE=1, you enable development endpoints. + +**SECURITY WARNING: These endpoints should NOT be used in production!** + +### Cache Management APIs + +- `/reset_prefix_cache` - Reset prefix cache (can disrupt service) +- `/reset_mm_cache` - Reset multimodal cache (can disrupt service) +- `/reset_encoder_cache` - Reset encoder cache (can disrupt service) + +### Weight Transfer APIs (RL Training) + +For further details on Weight Transfer, please refer to [this page](../../training/weight_transfer/README.md). + +- `/pause` - Pause generation (causes denial of service) +- `/resume` - Resume generation +- `/is_paused` - Check if generation is paused +- `/init_weight_transfer_engine` - Initialize weight transfer engine for RLHF +- `/update_weights` - Update model weights (can alter model behavior) +- `/get_world_size` - Get distributed world size + +### Collective RPC + +- `/collective_rpc` - Execute arbitrary RPC methods on the engine (extremely dangerous) + +### Server info + +- `/server_info` - Get detailed server configuration + +### Sleep Mode APIs For further details on sleep mode, please refer to [this page](../../features/sleep_mode.md). - `/sleep` - Put engine to sleep (causes denial of service) - `/wake_up` - Wake engine from sleep - `/is_sleeping` - Check if engine is sleeping -- `/collective_rpc` - Execute arbitrary RPC methods on the engine (extremely dangerous) ## Chat Template diff --git a/tests/entrypoints/rpc/__init__.py b/tests/entrypoints/serve/dev/__init__.py similarity index 100% rename from tests/entrypoints/rpc/__init__.py rename to tests/entrypoints/serve/dev/__init__.py diff --git a/vllm/entrypoints/serve/cache/__init__.py b/tests/entrypoints/serve/dev/rpc/__init__.py similarity index 100% rename from vllm/entrypoints/serve/cache/__init__.py rename to tests/entrypoints/serve/dev/rpc/__init__.py diff --git a/tests/entrypoints/rpc/test_collective_rpc.py b/tests/entrypoints/serve/dev/rpc/test_collective_rpc.py similarity index 96% rename from tests/entrypoints/rpc/test_collective_rpc.py rename to tests/entrypoints/serve/dev/rpc/test_collective_rpc.py index 56d93a42731..eb9aa7663c9 100644 --- a/tests/entrypoints/rpc/test_collective_rpc.py +++ b/tests/entrypoints/serve/dev/rpc/test_collective_rpc.py @@ -37,7 +37,7 @@ def server(): "--max-num-seqs", "128", "--worker-extension-cls", - "tests.entrypoints.rpc.test_collective_rpc.TestWorkerExtension", + "tests.entrypoints.serve.dev.rpc.test_collective_rpc.TestWorkerExtension", ] with RemoteOpenAIServer( MODEL_NAME, diff --git a/tests/entrypoints/serve/instrumentator/test_sleep.py b/tests/entrypoints/serve/dev/test_sleep.py similarity index 100% rename from tests/entrypoints/serve/instrumentator/test_sleep.py rename to tests/entrypoints/serve/dev/test_sleep.py diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 5455f1ca427..892f9d82d70 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -43,9 +43,7 @@ from vllm.entrypoints.openai.server_utils import ( validation_exception_handler, ) from vllm.entrypoints.sagemaker.api_router import sagemaker_standards_bootstrap -from vllm.entrypoints.serve.elastic_ep.middleware import ( - ScalingMiddleware, -) +from vllm.entrypoints.serve.elastic_ep.middleware import ScalingMiddleware from vllm.entrypoints.serve.render.serving import OpenAIServingRender from vllm.entrypoints.serve.tokenize.serving import OpenAIServingTokenization from vllm.entrypoints.utils import ( @@ -195,6 +193,11 @@ def build_app( register_sagemaker_api_router(app, supported_tasks, model_config) + if envs.VLLM_SERVER_DEV_MODE: + from vllm.entrypoints.serve import register_vllm_dev_api_routers + + register_vllm_dev_api_routers(app) + if "generate" in supported_tasks: from vllm.entrypoints.generate.api_router import ( register_generate_api_routers, @@ -208,12 +211,6 @@ def build_app( attach_disagg_router(app) - from vllm.entrypoints.serve.rlhf.api_router import ( - attach_router as attach_rlhf_router, - ) - - attach_rlhf_router(app) - from vllm.entrypoints.serve.elastic_ep.api_router import ( attach_router as elastic_ep_attach_router, ) diff --git a/vllm/entrypoints/serve/__init__.py b/vllm/entrypoints/serve/__init__.py index 8233d3324d6..57491d45f63 100644 --- a/vllm/entrypoints/serve/__init__.py +++ b/vllm/entrypoints/serve/__init__.py @@ -3,18 +3,15 @@ from fastapi import FastAPI -import vllm.envs as envs from vllm.logger import init_logger logger = init_logger(__name__) def register_vllm_serve_api_routers(app: FastAPI): - if envs.VLLM_SERVER_DEV_MODE: - logger.warning( - "SECURITY WARNING: Development endpoints are enabled! " - "This should NOT be used in production!" - ) + from .instrumentator import register_instrumentator_api_routers + + register_instrumentator_api_routers(app) from vllm.entrypoints.serve.lora.api_router import ( attach_router as attach_lora_router, @@ -28,30 +25,37 @@ def register_vllm_serve_api_routers(app: FastAPI): attach_profile_router(app) - from vllm.entrypoints.serve.sleep.api_router import ( - attach_router as attach_sleep_router, - ) - - attach_sleep_router(app) - - from vllm.entrypoints.serve.rpc.api_router import ( - attach_router as attach_rpc_router, - ) - - attach_rpc_router(app) - - from vllm.entrypoints.serve.cache.api_router import ( - attach_router as attach_cache_router, - ) - - attach_cache_router(app) - from vllm.entrypoints.serve.tokenize.api_router import ( attach_router as attach_tokenize_router, ) attach_tokenize_router(app) - from .instrumentator import register_instrumentator_api_routers - register_instrumentator_api_routers(app) +def register_vllm_dev_api_routers(app: FastAPI): + logger.warning( + "SECURITY WARNING: Development endpoints are enabled! " + "This should NOT be used in production!" + ) + + from .dev.cache.api_router import attach_router as attach_cache_router + + attach_cache_router(app) + + from .dev.rlhf.api_router import attach_router as attach_rlhf_router + + attach_rlhf_router(app) + + from .dev.rpc.api_router import attach_router as attach_rpc_router + + attach_rpc_router(app) + + from .dev.server_info.api_router import ( + attach_router as attach_server_info_router, + ) + + attach_server_info_router(app) + + from .dev.sleep.api_router import attach_router as attach_sleep_router + + attach_sleep_router(app) diff --git a/vllm/entrypoints/serve/rlhf/__init__.py b/vllm/entrypoints/serve/dev/__init__.py similarity index 100% rename from vllm/entrypoints/serve/rlhf/__init__.py rename to vllm/entrypoints/serve/dev/__init__.py diff --git a/vllm/entrypoints/serve/rpc/__init__.py b/vllm/entrypoints/serve/dev/cache/__init__.py similarity index 100% rename from vllm/entrypoints/serve/rpc/__init__.py rename to vllm/entrypoints/serve/dev/cache/__init__.py diff --git a/vllm/entrypoints/serve/cache/api_router.py b/vllm/entrypoints/serve/dev/cache/api_router.py similarity index 96% rename from vllm/entrypoints/serve/cache/api_router.py rename to vllm/entrypoints/serve/dev/cache/api_router.py index 10015f02caa..c274717c0a8 100644 --- a/vllm/entrypoints/serve/cache/api_router.py +++ b/vllm/entrypoints/serve/dev/cache/api_router.py @@ -5,7 +5,6 @@ from fastapi import APIRouter, FastAPI, Query, Request from fastapi.responses import Response -import vllm.envs as envs from vllm.engine.protocol import EngineClient from vllm.logger import init_logger @@ -67,6 +66,4 @@ async def reset_encoder_cache(raw_request: Request): def attach_router(app: FastAPI): - if not envs.VLLM_SERVER_DEV_MODE: - return app.include_router(router) diff --git a/vllm/entrypoints/serve/sleep/__init__.py b/vllm/entrypoints/serve/dev/rlhf/__init__.py similarity index 100% rename from vllm/entrypoints/serve/sleep/__init__.py rename to vllm/entrypoints/serve/dev/rlhf/__init__.py diff --git a/vllm/entrypoints/serve/rlhf/api_router.py b/vllm/entrypoints/serve/dev/rlhf/api_router.py similarity index 98% rename from vllm/entrypoints/serve/rlhf/api_router.py rename to vllm/entrypoints/serve/dev/rlhf/api_router.py index dcae3889dc7..6237de87769 100644 --- a/vllm/entrypoints/serve/rlhf/api_router.py +++ b/vllm/entrypoints/serve/dev/rlhf/api_router.py @@ -8,7 +8,6 @@ from typing import Annotated from fastapi import APIRouter, FastAPI, HTTPException, Query, Request from fastapi.responses import JSONResponse -import vllm.envs as envs from vllm.distributed.weight_transfer.base import ( WeightTransferInitRequest, WeightTransferUpdateRequest, @@ -186,6 +185,4 @@ async def get_world_size( def attach_router(app: FastAPI): - if not envs.VLLM_SERVER_DEV_MODE: - return app.include_router(router) diff --git a/vllm/entrypoints/serve/dev/rpc/__init__.py b/vllm/entrypoints/serve/dev/rpc/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/vllm/entrypoints/serve/rpc/api_router.py b/vllm/entrypoints/serve/dev/rpc/api_router.py similarity index 95% rename from vllm/entrypoints/serve/rpc/api_router.py rename to vllm/entrypoints/serve/dev/rpc/api_router.py index 54f582c408d..99b904c2f63 100644 --- a/vllm/entrypoints/serve/rpc/api_router.py +++ b/vllm/entrypoints/serve/dev/rpc/api_router.py @@ -8,7 +8,6 @@ from typing import Any from fastapi import APIRouter, FastAPI, HTTPException, Request from fastapi.responses import JSONResponse, Response -import vllm.envs as envs from vllm.engine.protocol import EngineClient from vllm.logger import init_logger @@ -56,6 +55,4 @@ async def collective_rpc(raw_request: Request): def attach_router(app: FastAPI): - if not envs.VLLM_SERVER_DEV_MODE: - return app.include_router(router) diff --git a/vllm/entrypoints/serve/dev/server_info/__init__.py b/vllm/entrypoints/serve/dev/server_info/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/vllm/entrypoints/serve/instrumentator/server_info.py b/vllm/entrypoints/serve/dev/server_info/api_router.py similarity index 93% rename from vllm/entrypoints/serve/instrumentator/server_info.py rename to vllm/entrypoints/serve/dev/server_info/api_router.py index 60967c5a66a..64b7cdeb2fb 100644 --- a/vllm/entrypoints/serve/instrumentator/server_info.py +++ b/vllm/entrypoints/serve/dev/server_info/api_router.py @@ -7,7 +7,7 @@ import functools from typing import Annotated, Literal import pydantic -from fastapi import APIRouter, Query, Request +from fastapi import APIRouter, FastAPI, Query, Request from fastapi.responses import JSONResponse import vllm.envs as envs @@ -57,3 +57,7 @@ async def show_server_info( "system_env": await asyncio.to_thread(_get_system_env_info_cached), } return JSONResponse(content=server_info) + + +def attach_router(app: FastAPI): + app.include_router(router) diff --git a/vllm/entrypoints/serve/dev/sleep/__init__.py b/vllm/entrypoints/serve/dev/sleep/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/vllm/entrypoints/serve/sleep/api_router.py b/vllm/entrypoints/serve/dev/sleep/api_router.py similarity index 95% rename from vllm/entrypoints/serve/sleep/api_router.py rename to vllm/entrypoints/serve/dev/sleep/api_router.py index 46fa1c3f43f..0861c867732 100644 --- a/vllm/entrypoints/serve/sleep/api_router.py +++ b/vllm/entrypoints/serve/dev/sleep/api_router.py @@ -5,7 +5,6 @@ from fastapi import APIRouter, FastAPI, Request from fastapi.responses import JSONResponse, Response -import vllm.envs as envs from vllm.engine.protocol import EngineClient from vllm.logger import init_logger @@ -50,7 +49,4 @@ async def is_sleeping(raw_request: Request): def attach_router(app: FastAPI): - if not envs.VLLM_SERVER_DEV_MODE: - return - app.include_router(router) diff --git a/vllm/entrypoints/serve/instrumentator/__init__.py b/vllm/entrypoints/serve/instrumentator/__init__.py index 8abce02325a..c987394ad03 100644 --- a/vllm/entrypoints/serve/instrumentator/__init__.py +++ b/vllm/entrypoints/serve/instrumentator/__init__.py @@ -3,8 +3,6 @@ from fastapi import FastAPI -from vllm import envs - def register_instrumentator_api_routers(app: FastAPI): from .basic import router as basic_router @@ -22,8 +20,3 @@ def register_instrumentator_api_routers(app: FastAPI): from .offline_docs import attach_router as offline_docs_attach_router offline_docs_attach_router(app) - - if envs.VLLM_SERVER_DEV_MODE: - from .server_info import router as server_info_router - - app.include_router(server_info_router)