From a43bc34baf2a2e367cdbbdaec44de8cf5de92d1a Mon Sep 17 00:00:00 2001 From: Ethan Feng Date: Sat, 9 May 2026 10:03:52 +0800 Subject: [PATCH] [Docs] Update server entrypoint examples (#42077) Signed-off-by: Ethan Feng --- docs/deployment/frameworks/runpod.md | 3 +-- docs/deployment/integrations/kthena.md | 5 ++--- docs/design/lora_resolver_plugins.md | 3 +-- docs/design/optimization_levels.md | 2 +- 4 files changed, 5 insertions(+), 8 deletions(-) diff --git a/docs/deployment/frameworks/runpod.md b/docs/deployment/frameworks/runpod.md index 61ca3c4e68c..b00350386e4 100644 --- a/docs/deployment/frameworks/runpod.md +++ b/docs/deployment/frameworks/runpod.md @@ -12,8 +12,7 @@ vLLM can be deployed on [RunPod](https://www.runpod.io/), a cloud GPU platform t SSH into your RunPod pod and launch the vLLM OpenAI-compatible server: ```bash -python -m vllm.entrypoints.openai.api_server \ - --model <model> \ +vllm serve <model> \ --host 0.0.0.0 \ --port 8000 ``` diff --git a/docs/deployment/integrations/kthena.md b/docs/deployment/integrations/kthena.md index 0989e5d67f0..03ef190e558 100644 --- a/docs/deployment/integrations/kthena.md +++ b/docs/deployment/integrations/kthena.md @@ -79,9 +79,8 @@ Key points from the example YAML: - -c - > bash /vllm-workspace/examples/ray_serving/multi-node-serving.sh leader --ray_cluster_size=2; - python3 -m vllm.entrypoints.openai.api_server + vllm serve meta-llama/Llama-3.1-405B-Instruct --port 8080 - --model meta-llama/Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline-parallel-size 2 ``` @@ -145,7 +144,7 @@ spec: - sh - -c - "bash /vllm-workspace/examples/ray_serving/multi-node-serving.sh leader --ray_cluster_size=2; - python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline-parallel-size 2" + vllm serve meta-llama/Llama-3.1-405B-Instruct --port 8080 --tensor-parallel-size 8 --pipeline-parallel-size 2" resources: limits: nvidia.com/gpu: "8" diff --git
a/docs/design/lora_resolver_plugins.md b/docs/design/lora_resolver_plugins.md index ad644cbc50a..019c4eef93d 100644 --- a/docs/design/lora_resolver_plugins.md +++ b/docs/design/lora_resolver_plugins.md @@ -62,8 +62,7 @@ The filesystem resolver is installed with vLLM by default and enables loading Lo 3. **Start vLLM server**: Your base model can be `meta-llama/Llama-2-7b-hf`. Please make sure you set up the Hugging Face token in your env var `export HF_TOKEN=xxx235`. ```bash - python -m vllm.entrypoints.openai.api_server \ - --model your-base-model \ + vllm serve your-base-model \ --enable-lora ``` diff --git a/docs/design/optimization_levels.md b/docs/design/optimization_levels.md index dd0936ca9e5..3261aed7479 100644 --- a/docs/design/optimization_levels.md +++ b/docs/design/optimization_levels.md @@ -16,7 +16,7 @@ User-set flags take precedence over optimization level defaults. ```bash # CLI usage -python -m vllm.entrypoints.api_server --model RedHatAI/Llama-3.2-1B-FP8 -O1 +vllm serve RedHatAI/Llama-3.2-1B-FP8 -O1 # Python API usage from vllm.entrypoints.llm import LLM