# Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-14 06:27:45 +08:00)
methods:
  __init__:
    parameters:
      # Parallelism
      gpus_per_node:
        annotation: Optional[int]
        default: null
        status: beta
      moe_cluster_parallel_size:
        annotation: Optional[int]
        default: null
        status: beta
      enable_attention_dp:
        annotation: bool
        default: False
        status: beta
      cp_config:
        annotation: Optional[dict]
        default: null
        status: prototype
      pp_partition:
        annotation: Optional[List[int]]
        default: null
        status: prototype
      # Stats
      iter_stats_max_iterations:
        annotation: Optional[int]
        default: null
        status: prototype
      request_stats_max_iterations:
        annotation: Optional[int]
        default: null
        status: prototype
      return_perf_metrics:
        annotation: bool
        default: False
        status: prototype
      # Bindings and mirrored configs
      peft_cache_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.PeftCacheConfig]
        default: null
        status: prototype
      scheduler_config:
        annotation: tensorrt_llm.llmapi.llm_args.SchedulerConfig
        default: null
        status: prototype
      cache_transceiver_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.CacheTransceiverConfig]
        default: null
        status: prototype
      gather_generation_logits:
        annotation: bool
        default: False
        status: prototype
      num_postprocess_workers:
        annotation: int
        default: 0
        status: prototype
      postprocess_tokenizer_dir:
        annotation: Optional[str]
        default: null
        status: prototype
      # reasoning
      reasoning_parser:
        annotation: Optional[str]
        default: null
        status: prototype
      # Runtime behavior
      fail_fast_on_attention_window_too_large:
        annotation: bool
        default: false
        status: prototype
      garbage_collection_gen0_threshold:
        annotation: int
        default: 20000
        status: beta
      # Misc
      backend:
        annotation: Optional[str]
        default: null
        status: deprecated
      build_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.BuildConfig]
        default: null
        status: deprecated
      cuda_graph_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.CudaGraphConfig]
        default: null
        status: beta
      attention_dp_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.AttentionDpConfig]
        default: null
        status: beta
      checkpoint_loader:
        annotation: Optional[tensorrt_llm._torch.models.checkpoints.BaseCheckpointLoader]
        default: null
        status: prototype
      checkpoint_format:
        annotation: Optional[str]
        default: null
        status: prototype
      mm_encoder_only:
        annotation: bool
        default: False
        status: prototype
      disable_overlap_scheduler:
        annotation: bool
        default: False
        status: beta
      disable_flashinfer_sampling:
        annotation: bool
        default: True
        status: prototype
      moe_config:
        annotation: tensorrt_llm.llmapi.llm_args.MoeConfig
        status: beta
        default: null
      attn_backend:
        annotation: str
        default: TRTLLM
        status: beta
      sampler_type:
        annotation: Union[str, tensorrt_llm.llmapi.llm_args.SamplerType]
        default: auto
        status: beta
      enable_iter_perf_stats:
        annotation: bool
        default: False
        status: prototype
      enable_iter_req_stats:
        annotation: bool
        default: False
        status: prototype
      batch_wait_timeout_ms:
        annotation: float
        default: 0
        status: prototype
      batch_wait_timeout_iters:
        annotation: int
        default: 0
        status: prototype
      batch_wait_max_tokens_ratio:
        annotation: float
        default: 0
        status: prototype
      print_iter_log:
        annotation: bool
        default: False
        status: beta
      perf_metrics_max_requests:
        annotation: int
        default: 0
        status: prototype
      torch_compile_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.TorchCompileConfig]
        default: null
        status: prototype
      enable_autotuner:
        annotation: bool
        default: True
        status: prototype
      enable_layerwise_nvtx_marker:
        annotation: bool
        default: False
        status: beta
      enable_min_latency:
        annotation: bool
        default: False
        status: beta
      force_dynamic_quantization:
        annotation: bool
        default: False
        status: prototype
      allreduce_strategy:
        annotation: Optional[Literal['AUTO', 'NCCL', 'UB', 'MINLATENCY', 'ONESHOT', 'TWOSHOT', 'LOWPRECISION', 'MNNVL', 'NCCL_SYMMETRIC']]
        default: AUTO
        status: beta
      decoding_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.DecodingConfig]
        default: null
        status: deprecated
      kv_connector_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.KvCacheConnectorConfig]
        default: null
        status: prototype
      enable_lm_head_tp_in_adp:
        annotation: bool
        default: False
        status: prototype
      orchestrator_type:
        annotation: Optional[Literal["rpc", "ray"]]
        default: null
        status: prototype
      sparse_attention_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.SparseAttentionConfig]
        default: null
        status: prototype
      otlp_traces_endpoint:
        annotation: Optional[str]
        default: null
        status: prototype
      ray_worker_extension_cls:
        annotation: Optional[str]
        default: null
        status: prototype
      enable_sleep:
        annotation: bool
        default: False
        status: prototype
    return_annotation: None
  generate:
    parameters:
      disaggregated_params:
        annotation: Union[tensorrt_llm.disaggregated_params.DisaggregatedParams, Sequence[tensorrt_llm.disaggregated_params.DisaggregatedParams], NoneType]
        default: null
      kv_cache_retention_config:
        annotation: Union[tensorrt_llm.bindings.executor.KvCacheRetentionConfig, Sequence[tensorrt_llm.bindings.executor.KvCacheRetentionConfig], NoneType]
        default: null
      scheduling_params:
        annotation: Union[tensorrt_llm.scheduling_params.SchedulingParams, List[tensorrt_llm.scheduling_params.SchedulingParams], NoneType]
        default: null
      cache_salt:
        annotation: Union[str, Sequence[str], NoneType]
        default: null
    return_annotation: Union[tensorrt_llm.llmapi.llm.RequestOutput, List[tensorrt_llm.llmapi.llm.RequestOutput]]
  generate_async:
    parameters:
      disaggregated_params:
        annotation: Optional[tensorrt_llm.disaggregated_params.DisaggregatedParams]
        default: null
      kv_cache_retention_config:
        annotation: Optional[tensorrt_llm.bindings.executor.KvCacheRetentionConfig]
        default: null
      scheduling_params:
        annotation: Optional[tensorrt_llm.scheduling_params.SchedulingParams]
        default: null
        status: prototype
      cache_salt:
        annotation: Optional[str]
        default: null
      trace_headers:
        annotation: Optional[Mapping[str, str]]
        default: null
        status: prototype
    return_annotation: tensorrt_llm.llmapi.llm.RequestOutput
  get_kv_cache_events:
    parameters:
      timeout:
        annotation: Optional[float]
        default: 2
    return_annotation: List[dict]
    status: beta
  get_kv_cache_events_async:
    parameters:
      timeout:
        annotation: Optional[float]
        default: 2
    return_annotation: tensorrt_llm.executor.result.IterationResult
    status: beta
  get_stats:
    parameters:
      timeout:
        annotation: Optional[float]
        default: 2
    return_annotation: List[dict]
    status: beta
  get_stats_async:
    parameters:
      timeout:
        annotation: Optional[float]
        default: 2
    return_annotation: tensorrt_llm.executor.result.IterationResult
    status: beta
  shutdown:
    parameters: {}
    return_annotation: None
    status: beta
properties:
  llm_id:
    annotation: str
    default: inspect._empty