diff --git a/docs/cli/.nav.yml b/docs/cli/.nav.yml index d2d2905703e..586685c5a10 100644 --- a/docs/cli/.nav.yml +++ b/docs/cli/.nav.yml @@ -6,3 +6,5 @@ nav: - run-batch.md - vllm bench: - bench/**/*.md + - vllm launch: + - launch/**/*.md diff --git a/docs/cli/README.md b/docs/cli/README.md index b27bd3b647b..08e986a7463 100644 --- a/docs/cli/README.md +++ b/docs/cli/README.md @@ -9,7 +9,7 @@ vllm --help Available Commands: ```bash -vllm {chat,complete,serve,bench,collect-env,run-batch} +vllm {chat,complete,serve,launch,bench,collect-env,run-batch} ``` ## serve @@ -37,24 +37,36 @@ vllm serve meta-llama/Llama-2-7b-hf --uds /tmp/vllm.sock Check with --help for more options: ```bash -# To list all groups -vllm serve --help=listgroup +# To list all flags +vllm serve --help=all -# To view a argument group +# To view an argument group vllm serve --help=ModelConfig # To view a single argument vllm serve --help=max-num-seqs -# To search by keyword +# To search by keyword or flag name vllm serve --help=max - -# To view full help with pager (less/more) -vllm serve --help=page ``` See [vllm serve](./serve.md) for the full reference of all available arguments. +## launch + +Launch individual vLLM components. + +```bash +# Launch the rendering server component +vllm launch render meta-llama/Llama-3.2-1B-Instruct + +# Inspect all available flags for the render component +vllm launch render --help=all +``` + +See [vllm launch render](./launch/render.md) for the current launch +component reference. + ## chat Generate chat completions via the running API server. diff --git a/docs/cli/launch/render.md b/docs/cli/launch/render.md new file mode 100644 index 00000000000..4d15e5f1162 --- /dev/null +++ b/docs/cli/launch/render.md @@ -0,0 +1,22 @@ +# vllm launch render + +## Overview + +`vllm launch render` starts a GPU-less rendering server for preprocessing and +postprocessing only. + +```bash +vllm launch render meta-llama/Llama-3.2-1B-Instruct --port 8100 +``` + +This command reuses the standard serving parser, so model, frontend, +networking, and related CLI options follow the same conventions as +[`vllm serve`](../serve.md). + +## JSON CLI Arguments + +--8<-- "docs/cli/json_tip.inc.md" + +## Arguments + +--8<-- "docs/generated/argparse/launch_render.inc.md" diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py index 2c19dc1763f..4548e33f881 100644 --- a/docs/mkdocs/hooks/generate_argparse.py +++ b/docs/mkdocs/hooks/generate_argparse.py @@ -147,6 +147,7 @@ AsyncEngineArgs = auto_mock("vllm.engine.arg_utils", "AsyncEngineArgs") EngineArgs = auto_mock("vllm.engine.arg_utils", "EngineArgs") ChatCommand = auto_mock("vllm.entrypoints.cli.openai", "ChatCommand") CompleteCommand = auto_mock("vllm.entrypoints.cli.openai", "CompleteCommand") +RenderSubcommand = auto_mock("vllm.entrypoints.cli.launch", "RenderSubcommand") openai_cli_args = auto_mock("vllm.entrypoints.openai", "cli_args") openai_run_batch = auto_mock("vllm.entrypoints.openai", "run_batch") @@ -260,6 +261,7 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool): "serve": create_parser(openai_cli_args.make_arg_parser), "chat": create_parser(ChatCommand.add_cli_args), "complete": create_parser(CompleteCommand.add_cli_args), + "launch_render": create_parser(RenderSubcommand.add_cli_args), "run-batch": create_parser(openai_run_batch.make_arg_parser), # Benchmark CLI "bench_latency": create_parser(bench_latency.add_cli_args),