[TRTLLM-8242][feat] Add stability tags for serve subcommand (#10012)
Signed-off-by: Pengyun Lin <81065165+LinPoly@users.noreply.github.com>
This commit is contained in:
parent 0937df2c68
commit c04cf4334e
@@ -299,6 +299,8 @@ To configure the nested level arguments like ``moe_config.backend``, the yaml fi
Syntax
------

This syntax section lists all command line arguments for ``trtllm-serve``'s subcommands. Some of the arguments are accompanied by a stability tag indicating their development status. Refer to our `API Reference <https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html>`__ for details.

.. click:: tensorrt_llm.commands.serve:main
    :prog: trtllm-serve
    :nested: full
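For context, a minimal sketch of the tag format the paragraph above refers to. The serve.py change further down prepends an RST-style ``:tag:`` role to each tagged option's help text; the tag value and help text below are examples taken from this diff, and the snippet is illustrative rather than part of the commit:

    tag = "beta"                      # one of: stable, beta, prototype, deprecated
    help_str = "Port of the server."  # example help text from this diff
    tagged_help = f":tag:`{tag}` {help_str}"
    # tagged_help == ":tag:`beta` Port of the server."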
@@ -7,7 +7,7 @@ import socket
import subprocess  # nosec B404
import sys
from pathlib import Path
from typing import Any, Dict, Mapping, Optional, Sequence
from typing import Any, Dict, Literal, Mapping, Optional, Sequence

import click
import torch
@@ -42,6 +42,13 @@ from tensorrt_llm.tools.importlib_utils import import_custom_module_from_dir
_child_p_global: Optional[subprocess.Popen] = None


def help_info_with_stability_tag(
        help_str: str, tag: Literal["stable", "beta", "prototype",
                                    "deprecated"]) -> str:
    """Append stability info to help string."""
    return f":tag:`{tag}` {help_str}"


def _signal_handler_cleanup_child(signum, frame):
    """Signal handler to clean up the child process."""
    global _child_p_global
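A minimal usage sketch of the new helper with click, assuming only that click is installed. The command name ``demo`` and the option ``--example_port`` are hypothetical and exist only for illustration; the helper body is copied from the hunk above:

    from typing import Literal

    import click


    def help_info_with_stability_tag(
            help_str: str, tag: Literal["stable", "beta", "prototype",
                                        "deprecated"]) -> str:
        """Append stability info to help string (copied from the diff above)."""
        return f":tag:`{tag}` {help_str}"


    @click.command()
    @click.option("--example_port",  # hypothetical option, for illustration only
                  type=int,
                  default=8000,
                  help=help_info_with_stability_tag("Port of the server.", "beta"))
    def demo(example_port: int) -> None:
        # "demo --help" lists the option with the tagged help text
        # ":tag:`beta` Port of the server."
        click.echo(f"listening on {example_port}")


    if __name__ == "__main__":
        demo()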
@@ -279,27 +286,33 @@ class ChoiceWithAlias(click.Choice):
@click.option("--tokenizer",
type=str,
default=None,
help="Path | Name of the tokenizer."
"Specify this value only if using TensorRT engine as model.")
help=help_info_with_stability_tag("Path | Name of the tokenizer.",
"beta"))
@click.option(
"--custom_tokenizer",
type=str,
default=None,
help=
"Custom tokenizer type: alias (e.g., 'deepseek_v32') or Python import path "
"(e.g., 'tensorrt_llm.tokenizer.deepseek_v32.DeepseekV32Tokenizer'). [Experimental]"
)
help=help_info_with_stability_tag(
"Custom tokenizer type: alias (e.g., 'deepseek_v32') or Python import path "
"(e.g., 'tensorrt_llm.tokenizer.deepseek_v32.DeepseekV32Tokenizer').",
"prototype"))
@click.option("--host",
type=str,
default="localhost",
help="Hostname of the server.")
@click.option("--port", type=int, default=8000, help="Port of the server.")
help=help_info_with_stability_tag("Hostname of the server.",
"beta"))
@click.option("--port",
type=int,
default=8000,
help=help_info_with_stability_tag("Port of the server.", "beta"))
@click.option(
"--backend",
type=ChoiceWithAlias(["pytorch", "tensorrt", "_autodeploy"],
{"trt": "tensorrt"}),
default="pytorch",
help="The backend to use to serve the model. Default is pytorch backend.")
help=help_info_with_stability_tag(
"The backend to use to serve the model. Default is pytorch backend.",
"beta"))
@click.option(
"--custom_module_dirs",
type=click.Path(exists=True,
@@ -308,143 +321,170 @@ class ChoiceWithAlias(click.Choice):
resolve_path=True),
default=None,
multiple=True,
help="Paths to custom module directories to import.",
help=help_info_with_stability_tag(
"Paths to custom module directories to import.", "prototype"),
)
@click.option('--log_level',
type=click.Choice(severity_map.keys()),
default='info',
help="The logging level.")
help=help_info_with_stability_tag("The logging level.", "beta"))
@click.option("--max_beam_width",
type=int,
default=BuildConfig.model_fields["max_beam_width"].default,
help="Maximum number of beams for beam search decoding.")
help=help_info_with_stability_tag(
"Maximum number of beams for beam search decoding.", "beta"))
@click.option("--max_batch_size",
type=int,
default=BuildConfig.model_fields["max_batch_size"].default,
help="Maximum number of requests that the engine can schedule.")
help=help_info_with_stability_tag(
"Maximum number of requests that the engine can schedule.",
"beta"))
@click.option(
"--max_num_tokens",
type=int,
default=BuildConfig.model_fields["max_num_tokens"].default,
help=
"Maximum number of batched input tokens after padding is removed in each batch."
)
help=help_info_with_stability_tag(
"Maximum number of batched input tokens after padding is removed in each batch.",
"beta"))
@click.option(
"--max_seq_len",
type=int,
default=BuildConfig.model_fields["max_seq_len"].default,
help="Maximum total length of one request, including prompt and outputs. "
"If unspecified, the value is deduced from the model config.")
help=help_info_with_stability_tag(
"Maximum total length of one request, including prompt and outputs. "
"If unspecified, the value is deduced from the model config.", "beta"))
@click.option("--tensor_parallel_size",
"--tp_size",
type=int,
default=1,
help='Tensor parallelism size.')
help=help_info_with_stability_tag('Tensor parallelism size.',
'beta'))
@click.option("--pipeline_parallel_size",
"--pp_size",
type=int,
default=1,
help='Pipeline parallelism size.')
help=help_info_with_stability_tag('Pipeline parallelism size.',
'beta'))
@click.option("--context_parallel_size",
"--cp_size",
type=int,
default=1,
help='Context parallelism size.')
help=help_info_with_stability_tag('Context parallelism size.',
'beta'))
@click.option("--moe_expert_parallel_size",
"--ep_size",
type=int,
default=None,
help="expert parallelism size")
help=help_info_with_stability_tag("expert parallelism size",
"beta"))
@click.option("--moe_cluster_parallel_size",
"--cluster_size",
type=int,
default=None,
help="expert cluster parallelism size")
@click.option("--gpus_per_node",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Number of GPUs per node. Default to None, and it will be "
|
||||
"detected automatically.")
|
||||
help=help_info_with_stability_tag(
|
||||
"expert cluster parallelism size", "beta"))
|
||||
@click.option(
|
||||
"--gpus_per_node",
|
||||
type=int,
|
||||
default=None,
|
||||
help=help_info_with_stability_tag(
|
||||
"Number of GPUs per node. Default to None, and it will be detected automatically.",
|
||||
"beta"))
|
||||
@click.option("--free_gpu_memory_fraction",
|
||||
"--kv_cache_free_gpu_memory_fraction",
|
||||
type=float,
|
||||
default=0.9,
|
||||
help="Free GPU memory fraction reserved for KV Cache, "
|
||||
"after allocating model weights and buffers.")
|
||||
@click.option(
|
||||
"--num_postprocess_workers",
|
||||
type=int,
|
||||
default=0,
|
||||
help="[Experimental] Number of workers to postprocess raw responses "
|
||||
"to comply with OpenAI protocol.")
|
||||
help=help_info_with_stability_tag(
|
||||
"Free GPU memory fraction reserved for KV Cache, "
|
||||
"after allocating model weights and buffers.", "beta"))
|
||||
@click.option("--num_postprocess_workers",
|
||||
type=int,
|
||||
default=0,
|
||||
help=help_info_with_stability_tag(
|
||||
"Number of workers to postprocess raw responses "
|
||||
"to comply with OpenAI protocol.", "prototype"))
|
||||
@click.option("--trust_remote_code",
|
||||
is_flag=True,
|
||||
default=False,
|
||||
help="Flag for HF transformers.")
|
||||
help=help_info_with_stability_tag("Flag for HF transformers.",
|
||||
"beta"))
|
||||
@click.option("--revision",
|
||||
type=str,
|
||||
default=None,
|
||||
help="The revision to use for the HuggingFace model "
|
||||
"(branch name, tag name, or commit id).")
|
||||
help=help_info_with_stability_tag(
|
||||
"The revision to use for the HuggingFace model "
|
||||
"(branch name, tag name, or commit id).", "beta"))
|
||||
@click.option(
|
||||
"--config",
|
||||
"--extra_llm_api_options",
|
||||
"extra_llm_api_options",
|
||||
type=str,
|
||||
default=None,
|
||||
help=
|
||||
"Path to a YAML file that overwrites the parameters specified by trtllm-serve. "
|
||||
"Can be specified as either --config or --extra_llm_api_options.")
|
||||
help=help_info_with_stability_tag(
|
||||
"Path to a YAML file that overwrites the parameters specified by trtllm-serve. "
|
||||
"Can be specified as either --config or --extra_llm_api_options.",
|
||||
"prototype"))
|
||||
@click.option(
|
||||
"--reasoning_parser",
|
||||
type=click.Choice(ReasoningParserFactory.parsers.keys()),
|
||||
default=None,
|
||||
help="[Experimental] Specify the parser for reasoning models.",
|
||||
help=help_info_with_stability_tag(
|
||||
"Specify the parser for reasoning models.", "prototype"),
|
||||
)
|
||||
@click.option(
|
||||
"--tool_parser",
|
||||
type=click.Choice(ToolParserFactory.parsers.keys()),
|
||||
default=None,
|
||||
help="[Experimental] Specify the parser for tool models.",
|
||||
help=help_info_with_stability_tag("Specify the parser for tool models.",
|
||||
"prototype"),
|
||||
)
|
||||
@click.option("--metadata_server_config_file",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to metadata server config file")
|
||||
help=help_info_with_stability_tag(
|
||||
"Path to metadata server config file", "prototype"))
|
||||
@click.option(
|
||||
"--server_role",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Server role. Specify this value only if running in disaggregated mode."
|
||||
)
|
||||
help=help_info_with_stability_tag(
|
||||
"Server role. Specify this value only if running in disaggregated mode.",
|
||||
"prototype"))
|
||||
@click.option(
|
||||
"--fail_fast_on_attention_window_too_large",
|
||||
is_flag=True,
|
||||
default=False,
|
||||
help=
|
||||
"Exit with runtime error when attention window is too large to fit even a single sequence in the KV cache."
|
||||
)
|
||||
help=help_info_with_stability_tag(
|
||||
"Exit with runtime error when attention window is too large to fit even a single sequence in the KV cache.",
|
||||
"prototype"))
|
||||
@click.option("--otlp_traces_endpoint",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Target URL to which OpenTelemetry traces will be sent.")
|
||||
help=help_info_with_stability_tag(
|
||||
"Target URL to which OpenTelemetry traces will be sent.",
|
||||
"prototype"))
|
||||
@click.option("--disagg_cluster_uri",
|
||||
type=str,
|
||||
default=None,
|
||||
help="URI of the disaggregated cluster.")
|
||||
help=help_info_with_stability_tag(
|
||||
"URI of the disaggregated cluster.", "prototype"))
|
||||
@click.option("--enable_chunked_prefill",
|
||||
is_flag=True,
|
||||
default=False,
|
||||
help="Enable chunked prefill")
|
||||
help=help_info_with_stability_tag("Enable chunked prefill",
|
||||
"prototype"))
|
||||
@click.option("--media_io_kwargs",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Keyword arguments for media I/O.")
|
||||
help=help_info_with_stability_tag(
|
||||
"Keyword arguments for media I/O.", "prototype"))
|
||||
@click.option("--chat_template",
|
||||
type=str,
|
||||
default=None,
|
||||
help="[Experimental] Specify a custom chat template. "
|
||||
"Can be a file path or one-liner template string")
|
||||
help=help_info_with_stability_tag(
|
||||
"Specify a custom chat template. "
|
||||
"Can be a file path or one-liner template string",
|
||||
"prototype"))
|
||||
def serve(
|
||||
model: str, tokenizer: Optional[str], custom_tokenizer: Optional[str],
|
||||
host: str, port: int, log_level: str, backend: str, max_beam_width: int,
|
||||
|
||||