mirror of
https://github.com/vllm-project/vllm.git
synced 2026-06-06 00:16:14 +00:00
[Doc] Sync CLI guide with actual help modes and launch subcommand (#40326)
Signed-off-by: Rui Wang <raygorous@gmail.com> Co-authored-by: Rui Wang <raygorous@gmail.com>
This commit is contained in:
@@ -6,3 +6,5 @@ nav:
|
||||
- run-batch.md
|
||||
- vllm bench:
|
||||
- bench/**/*.md
|
||||
- vllm launch:
|
||||
- launch/**/*.md
|
||||
|
||||
+20
-8
@@ -9,7 +9,7 @@ vllm --help
|
||||
Available Commands:
|
||||
|
||||
```bash
|
||||
vllm {chat,complete,serve,bench,collect-env,run-batch}
|
||||
vllm {chat,complete,serve,launch,bench,collect-env,run-batch}
|
||||
```
|
||||
|
||||
## serve
|
||||
@@ -37,24 +37,36 @@ vllm serve meta-llama/Llama-2-7b-hf --uds /tmp/vllm.sock
|
||||
Check with --help for more options:
|
||||
|
||||
```bash
|
||||
# To list all groups
|
||||
vllm serve --help=listgroup
|
||||
# To list all flags
|
||||
vllm serve --help=all
|
||||
|
||||
# To view a argument group
|
||||
# To view an argument group
|
||||
vllm serve --help=ModelConfig
|
||||
|
||||
# To view a single argument
|
||||
vllm serve --help=max-num-seqs
|
||||
|
||||
# To search by keyword
|
||||
# To search by keyword or flag name
|
||||
vllm serve --help=max
|
||||
|
||||
# To view full help with pager (less/more)
|
||||
vllm serve --help=page
|
||||
```
|
||||
|
||||
See [vllm serve](./serve.md) for the full reference of all available arguments.
|
||||
|
||||
## launch
|
||||
|
||||
Launch individual vLLM components.
|
||||
|
||||
```bash
|
||||
# Launch the rendering server component
|
||||
vllm launch render meta-llama/Llama-3.2-1B-Instruct
|
||||
|
||||
# Inspect all available flags for the render component
|
||||
vllm launch render --help=all
|
||||
```
|
||||
|
||||
See [vllm launch render](./launch/render.md) for the current launch
|
||||
component reference.
|
||||
|
||||
## chat
|
||||
|
||||
Generate chat completions via the running API server.
|
||||
|
||||
@@ -0,0 +1,22 @@
|
||||
# vllm launch render
|
||||
|
||||
## Overview
|
||||
|
||||
`vllm launch render` starts a GPU-less rendering server for preprocessing and
|
||||
postprocessing only.
|
||||
|
||||
```bash
|
||||
vllm launch render meta-llama/Llama-3.2-1B-Instruct --port 8100
|
||||
```
|
||||
|
||||
This command reuses the standard serving parser, so model, frontend,
|
||||
networking, and related CLI options follow the same conventions as
|
||||
[`vllm serve`](../serve.md).
|
||||
|
||||
## JSON CLI Arguments
|
||||
|
||||
--8<-- "docs/cli/json_tip.inc.md"
|
||||
|
||||
## Arguments
|
||||
|
||||
--8<-- "docs/generated/argparse/launch_render.inc.md"
|
||||
@@ -147,6 +147,7 @@ AsyncEngineArgs = auto_mock("vllm.engine.arg_utils", "AsyncEngineArgs")
|
||||
EngineArgs = auto_mock("vllm.engine.arg_utils", "EngineArgs")
|
||||
ChatCommand = auto_mock("vllm.entrypoints.cli.openai", "ChatCommand")
|
||||
CompleteCommand = auto_mock("vllm.entrypoints.cli.openai", "CompleteCommand")
|
||||
RenderSubcommand = auto_mock("vllm.entrypoints.cli.launch", "RenderSubcommand")
|
||||
openai_cli_args = auto_mock("vllm.entrypoints.openai", "cli_args")
|
||||
openai_run_batch = auto_mock("vllm.entrypoints.openai", "run_batch")
|
||||
|
||||
@@ -260,6 +261,7 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
|
||||
"serve": create_parser(openai_cli_args.make_arg_parser),
|
||||
"chat": create_parser(ChatCommand.add_cli_args),
|
||||
"complete": create_parser(CompleteCommand.add_cli_args),
|
||||
"launch_render": create_parser(RenderSubcommand.add_cli_args),
|
||||
"run-batch": create_parser(openai_run_batch.make_arg_parser),
|
||||
# Benchmark CLI
|
||||
"bench_latency": create_parser(bench_latency.add_cli_args),
|
||||
|
||||
Reference in New Issue
Block a user