# TensorRT-LLM/tests/unittest/api_stability/references/llm.yaml

methods:
  __init__:
    parameters:
      # Parallelism
      gpus_per_node:
        annotation: Optional[int]
        default: null
        status: beta
      moe_cluster_parallel_size:
        annotation: Optional[int]
        default: null
        status: beta
      enable_attention_dp:
        annotation: bool
        default: False
        status: beta
      cp_config:
        annotation: Optional[dict]
        default: null
        status: prototype
      pp_partition:
        annotation: Optional[List[int]]
        default: null
        status: prototype
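      #
      # Example: the parallelism knobs above are ordinary constructor
      # arguments of tensorrt_llm.LLM. A minimal sketch (model and
      # values are illustrative, not taken from this reference):
      #
      #   from tensorrt_llm import LLM
      #
      #   llm = LLM(
      #       model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # illustrative
      #       gpus_per_node=8,
      #       enable_attention_dp=True,  # beta
      #   )
      #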
      # Stats
      iter_stats_max_iterations:
        annotation: Optional[int]
        default: null
        status: prototype
      request_stats_max_iterations:
        annotation: Optional[int]
        default: null
        status: prototype
      return_perf_metrics:
        annotation: bool
        default: False
        status: prototype
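      #
      # Example: the stats parameters bound how many iterations of
      # statistics the engine retains; they pair with get_stats() and
      # get_stats_async() below. A hedged sketch (values illustrative):
      #
      #   llm = LLM(
      #       model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
      #       iter_stats_max_iterations=1000,
      #       return_perf_metrics=True,  # prototype
      #   )
      #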
      # Bindings and mirrored configs
      peft_cache_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.PeftCacheConfig]
        default: null
        status: prototype
      scheduler_config:
        annotation: tensorrt_llm.llmapi.llm_args.SchedulerConfig
        default: null
        status: prototype
      cache_transceiver_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.CacheTransceiverConfig]
        default: null
        status: prototype
      gather_generation_logits:
        annotation: bool
        default: False
        status: prototype
      num_postprocess_workers:
        annotation: int
        default: 0
        status: prototype
      postprocess_tokenizer_dir:
        annotation: Optional[str]
        default: null
        status: prototype
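      #
      # Example: the mirrored configs are model classes from
      # tensorrt_llm.llmapi.llm_args and can be passed pre-built. A
      # hedged sketch, assuming a default-constructed SchedulerConfig
      # is acceptable (not stated in this reference):
      #
      #   from tensorrt_llm.llmapi.llm_args import SchedulerConfig
      #
      #   llm = LLM(
      #       model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
      #       scheduler_config=SchedulerConfig(),
      #       num_postprocess_workers=2,  # illustrative
      #   )
      #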
      # Reasoning
      reasoning_parser:
        annotation: Optional[str]
        default: null
        status: prototype
      # Runtime behavior
      fail_fast_on_attention_window_too_large:
        annotation: bool
        default: False
        status: prototype
      garbage_collection_gen0_threshold:
        annotation: int
        default: 20000
        status: beta
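      #
      # Example: the runtime-behavior flags are plain scalars. A short
      # sketch (the threshold shown is just the documented default):
      #
      #   llm = LLM(
      #       model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
      #       fail_fast_on_attention_window_too_large=True,
      #       garbage_collection_gen0_threshold=20000,
      #   )
      #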
      # Misc
      backend:
        annotation: Optional[str]
        default: null
        status: deprecated
      build_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.BuildConfig]
        default: null
        status: deprecated
      cuda_graph_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.CudaGraphConfig]
        default: null
        status: beta
      attention_dp_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.AttentionDpConfig]
        default: null
        status: beta
      checkpoint_loader:
        annotation: Optional[tensorrt_llm._torch.models.checkpoints.BaseCheckpointLoader]
        default: null
        status: prototype
      checkpoint_format:
        annotation: Optional[str]
        default: null
        status: prototype
      mm_encoder_only:
        annotation: bool
        default: False
        status: prototype
      disable_overlap_scheduler:
        annotation: bool
        default: False
        status: beta
      disable_flashinfer_sampling:
        annotation: bool
        default: True
        status: prototype
      moe_config:
        annotation: tensorrt_llm.llmapi.llm_args.MoeConfig
        default: null
        status: beta
      attn_backend:
        annotation: str
        default: TRTLLM
        status: beta
      sampler_type:
        annotation: Union[str, tensorrt_llm.llmapi.llm_args.SamplerType]
        default: auto
        status: beta
      enable_iter_perf_stats:
        annotation: bool
        default: False
        status: prototype
      enable_iter_req_stats:
        annotation: bool
        default: False
        status: prototype
      batch_wait_timeout_ms:
        annotation: float
        default: 0
        status: prototype
      batch_wait_timeout_iters:
        annotation: int
        default: 0
        status: prototype
      batch_wait_max_tokens_ratio:
        annotation: float
        default: 0
        status: prototype
      print_iter_log:
        annotation: bool
        default: False
        status: beta
      perf_metrics_max_requests:
        annotation: int
        default: 0
        status: prototype
      torch_compile_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.TorchCompileConfig]
        default: null
        status: prototype
      enable_autotuner:
        annotation: bool
        default: True
        status: prototype
      enable_layerwise_nvtx_marker:
        annotation: bool
        default: False
        status: beta
      enable_min_latency:
        annotation: bool
        default: False
        status: beta
      force_dynamic_quantization:
        annotation: bool
        default: False
        status: prototype
      allreduce_strategy:
        annotation: Optional[Literal['AUTO', 'NCCL', 'UB', 'MINLATENCY', 'ONESHOT', 'TWOSHOT', 'LOWPRECISION', 'MNNVL', 'NCCL_SYMMETRIC']]
        default: AUTO
        status: beta
      decoding_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.DecodingConfig]
        default: null
        status: deprecated
      kv_connector_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.KvCacheConnectorConfig]
        default: null
        status: prototype
      enable_lm_head_tp_in_adp:
        annotation: bool
        default: False
        status: prototype
      orchestrator_type:
        annotation: Optional[Literal["rpc", "ray"]]
        default: null
        status: prototype
      sparse_attention_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.SparseAttentionConfig]
        default: null
        status: prototype
      otlp_traces_endpoint:
        annotation: Optional[str]
        default: null
        status: prototype
      ray_worker_extension_cls:
        annotation: Optional[str]
        default: null
        status: prototype
      enable_sleep:
        annotation: bool
        default: False
        status: prototype
    return_annotation: None
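  #
  # Example: several of the Misc parameters take config objects, e.g.
  # cuda_graph_config above. A hedged sketch, assuming CudaGraphConfig
  # is importable from tensorrt_llm.llmapi and has an enable_padding
  # field (both assumptions, not stated in this reference):
  #
  #   from tensorrt_llm.llmapi import CudaGraphConfig
  #
  #   llm = LLM(
  #       model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
  #       cuda_graph_config=CudaGraphConfig(enable_padding=True),
  #       attn_backend="TRTLLM",  # the documented default
  #   )
  #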
  generate:
    parameters:
      disaggregated_params:
        annotation: Union[tensorrt_llm.disaggregated_params.DisaggregatedParams, Sequence[tensorrt_llm.disaggregated_params.DisaggregatedParams], NoneType]
        default: null
      kv_cache_retention_config:
        annotation: Union[tensorrt_llm.bindings.executor.KvCacheRetentionConfig, Sequence[tensorrt_llm.bindings.executor.KvCacheRetentionConfig], NoneType]
        default: null
      scheduling_params:
        annotation: Union[tensorrt_llm.scheduling_params.SchedulingParams, List[tensorrt_llm.scheduling_params.SchedulingParams], NoneType]
        default: null
      cache_salt:
        annotation: Union[str, Sequence[str], NoneType]
        default: null
    return_annotation: Union[tensorrt_llm.llmapi.llm.RequestOutput, List[tensorrt_llm.llmapi.llm.RequestOutput]]
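  #
  # Example: generate() accepts a single value or a batch for the
  # parameters above, and the return type mirrors the input shape. A
  # minimal sketch (prompts and salts are illustrative):
  #
  #   from tensorrt_llm import SamplingParams
  #
  #   outputs = llm.generate(
  #       ["Hello, my name is", "The capital of France is"],
  #       sampling_params=SamplingParams(max_tokens=32),
  #       cache_salt=["user-a", "user-b"],  # optional, one per prompt
  #   )
  #   for out in outputs:
  #       print(out.outputs[0].text)
  #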
  generate_async:
    parameters:
      disaggregated_params:
        annotation: Optional[tensorrt_llm.disaggregated_params.DisaggregatedParams]
        default: null
      kv_cache_retention_config:
        annotation: Optional[tensorrt_llm.bindings.executor.KvCacheRetentionConfig]
        default: null
      scheduling_params:
        annotation: Optional[tensorrt_llm.scheduling_params.SchedulingParams]
        default: null
        status: prototype
      cache_salt:
        annotation: Optional[str]
        default: null
      trace_headers:
        annotation: Optional[Mapping[str, str]]
        default: null
        status: prototype
    return_annotation: tensorrt_llm.llmapi.llm.RequestOutput
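  #
  # Example: generate_async() handles one request and returns a
  # RequestOutput. A hedged sketch of the streaming pattern used in
  # the TensorRT-LLM examples (the streaming kwarg is not listed in
  # this reference):
  #
  #   from tensorrt_llm import SamplingParams
  #
  #   async def stream(prompt: str):
  #       async for out in llm.generate_async(
  #               prompt,
  #               sampling_params=SamplingParams(max_tokens=32),
  #               streaming=True):
  #           print(out.outputs[0].text)
  #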
  get_kv_cache_events:
    parameters:
      timeout:
        annotation: Optional[float]
        default: 2
    return_annotation: List[dict]
    status: beta
  get_kv_cache_events_async:
    parameters:
      timeout:
        annotation: Optional[float]
        default: 2
    return_annotation: tensorrt_llm.executor.result.IterationResult
    status: beta
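  #
  # Example: the KV-cache event getters poll the engine's event
  # buffer. A hedged sketch, assuming event emission was enabled at
  # construction via KvCacheConfig(event_buffer_max_size=...) (an
  # assumption, not stated in this reference):
  #
  #   events = llm.get_kv_cache_events(timeout=2)
  #   for event in events:
  #       print(event)
  #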
  get_stats:
    parameters:
      timeout:
        annotation: Optional[float]
        default: 2
    return_annotation: List[dict]
    status: beta
  get_stats_async:
    parameters:
      timeout:
        annotation: Optional[float]
        default: 2
    return_annotation: tensorrt_llm.executor.result.IterationResult
    status: beta
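  #
  # Example: get_stats() returns one dict per completed iteration and
  # is typically called after generate(); per-iteration metrics may
  # require enable_iter_perf_stats=True above. A minimal sketch:
  #
  #   llm.generate("Hello", sampling_params=SamplingParams(max_tokens=8))
  #   for stats in llm.get_stats(timeout=2):
  #       print(stats)
  #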
  shutdown:
    parameters: {}
    return_annotation: None
    status: beta
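  #
  # Example: shutdown() releases engine resources; the LLM object also
  # works as a context manager, which releases them on exit. A minimal
  # sketch:
  #
  #   from tensorrt_llm import LLM
  #
  #   with LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0") as llm:
  #       llm.generate("Hello")
  #   # resources are released here, equivalent to llm.shutdown()
  #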
properties:
  llm_id:
    annotation: str
    default: inspect._empty