methods:
  __init__:
    parameters:
      # Parallelism
      gpus_per_node:
        annotation: Optional[int]
        default: null
        status: beta
      moe_cluster_parallel_size:
        annotation: Optional[int]
        default: null
        status: beta
      enable_attention_dp:
        annotation: bool
        default: False
        status: beta
      cp_config:
        annotation: Optional[dict]
        default: null
        status: prototype
      pp_partition:
        annotation: Optional[List[int]]
        default: null
        status: prototype
      # Stats
      iter_stats_max_iterations:
        annotation: Optional[int]
        default: null
        status: prototype
      request_stats_max_iterations:
        annotation: Optional[int]
        default: null
        status: prototype
      return_perf_metrics:
        annotation: bool
        default: False
        status: prototype
      # Bindings and mirrored configs
      peft_cache_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.PeftCacheConfig]
        default: null
        status: prototype
      scheduler_config:
        annotation: tensorrt_llm.llmapi.llm_args.SchedulerConfig
        default: null
        status: prototype
      cache_transceiver_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.CacheTransceiverConfig]
        default: null
        status: prototype
      gather_generation_logits:
        annotation: bool
        default: False
        status: prototype
      num_postprocess_workers:
        annotation: int
        default: 0
        status: prototype
      postprocess_tokenizer_dir:
        annotation: Optional[str]
        default: null
        status: prototype
      custom_tokenizer:
        annotation: Optional[str]
        default: null
        status: prototype
      # Reasoning
      reasoning_parser:
        annotation: Optional[str]
        default: null
        status: prototype
      # Runtime behavior
      fail_fast_on_attention_window_too_large:
        annotation: bool
        default: False
        status: prototype
      garbage_collection_gen0_threshold:
        annotation: int
        default: 20000
        status: beta
      # Misc
      backend:
        annotation: Optional[str]
        default: null
        status: deprecated
      build_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.BuildConfig]
        default: null
        status: deprecated
      cuda_graph_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.CudaGraphConfig]
        default: null
        status: beta
      attention_dp_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.AttentionDpConfig]
        default: null
        status: beta
      checkpoint_loader:
        annotation: Optional[tensorrt_llm._torch.models.checkpoints.BaseCheckpointLoader]
        default: null
        status: prototype
      checkpoint_format:
        annotation: Optional[str]
        default: null
        status: prototype
      mm_encoder_only:
        annotation: bool
        default: False
        status: prototype
      disable_overlap_scheduler:
        annotation: bool
        default: False
        status: beta
      disable_flashinfer_sampling:
        annotation: bool
        default: False
        status: prototype
      moe_config:
        annotation: tensorrt_llm.llmapi.llm_args.MoeConfig
        default: null
        status: beta
      nvfp4_gemm_config:
        annotation: tensorrt_llm.llmapi.llm_args.Nvfp4GemmConfig
        default: null
        status: beta
      attn_backend:
        annotation: str
        default: TRTLLM
        status: beta
      sampler_type:
        annotation: Union[str, tensorrt_llm.llmapi.llm_args.SamplerType]
        default: auto
        status: beta
      sampler_force_async_worker:
        annotation: bool
        default: False
        status: prototype
      enable_iter_perf_stats:
        annotation: bool
        default: False
        status: prototype
      enable_iter_req_stats:
        annotation: bool
        default: False
        status: prototype
      batch_wait_timeout_ms:
        annotation: float
        default: 0
        status: prototype
      batch_wait_timeout_iters:
        annotation: int
        default: 0
        status: prototype
      batch_wait_max_tokens_ratio:
        annotation: float
        default: 0
        status: prototype
      print_iter_log:
        annotation: bool
        default: False
        status: beta
      perf_metrics_max_requests:
        annotation: int
        default: 0
        status: prototype
      torch_compile_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.TorchCompileConfig]
        default: null
        status: prototype
      enable_autotuner:
        annotation: bool
        default: True
        status: prototype
      enable_layerwise_nvtx_marker:
        annotation: bool
        default: False
        status: beta
      enable_min_latency:
        annotation: bool
        default: False
        status: beta
      force_dynamic_quantization:
        annotation: bool
        default: False
        status: prototype
      allreduce_strategy:
        annotation: Optional[Literal['AUTO', 'NCCL', 'UB', 'MINLATENCY', 'ONESHOT', 'TWOSHOT', 'LOWPRECISION', 'MNNVL', 'NCCL_SYMMETRIC']]
        default: AUTO
        status: beta
      decoding_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.DecodingConfig]
        default: null
        status: deprecated
      kv_connector_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.KvCacheConnectorConfig]
        default: null
        status: prototype
      enable_lm_head_tp_in_adp:
        annotation: bool
        default: False
        status: prototype
      orchestrator_type:
        annotation: Optional[Literal["rpc", "ray"]]
        default: null
        status: prototype
      sparse_attention_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.SparseAttentionConfig]
        default: null
        status: prototype
      otlp_traces_endpoint:
        annotation: Optional[str]
        default: null
        status: prototype
      ray_worker_extension_cls:
        annotation: Optional[str]
        default: null
        status: prototype
      ray_placement_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.RayPlacementConfig]
        default: null
        status: prototype
      enable_sleep:
        annotation: bool
        default: False
        status: prototype
      env_overrides:
        annotation: Optional[Dict[str, str]]
        default: null
        status: prototype
    return_annotation: None
  generate:
    parameters:
      disaggregated_params:
        annotation: Union[tensorrt_llm.disaggregated_params.DisaggregatedParams, Sequence[tensorrt_llm.disaggregated_params.DisaggregatedParams], NoneType]
        default: null
      kv_cache_retention_config:
        annotation: Union[tensorrt_llm.bindings.executor.KvCacheRetentionConfig, Sequence[tensorrt_llm.bindings.executor.KvCacheRetentionConfig], NoneType]
        default: null
      scheduling_params:
        annotation: Union[tensorrt_llm.scheduling_params.SchedulingParams, List[tensorrt_llm.scheduling_params.SchedulingParams], NoneType]
        default: null
      cache_salt:
        annotation: Union[str, Sequence[str], NoneType]
        default: null
    return_annotation: Union[tensorrt_llm.llmapi.llm.RequestOutput, List[tensorrt_llm.llmapi.llm.RequestOutput]]
  generate_async:
    parameters:
      disaggregated_params:
        annotation: Optional[tensorrt_llm.disaggregated_params.DisaggregatedParams]
        default: null
      kv_cache_retention_config:
        annotation: Optional[tensorrt_llm.bindings.executor.KvCacheRetentionConfig]
        default: null
      scheduling_params:
        annotation: Optional[tensorrt_llm.scheduling_params.SchedulingParams]
        default: null
        status: prototype
      cache_salt:
        annotation: Optional[str]
        default: null
      trace_headers:
        annotation: Optional[Mapping[str, str]]
        default: null
        status: prototype
    return_annotation: tensorrt_llm.llmapi.llm.RequestOutput
  get_kv_cache_events:
    parameters:
      timeout:
        annotation: Optional[float]
        default: 2
    return_annotation: List[dict]
    status: beta
  get_kv_cache_events_async:
    parameters:
      timeout:
        annotation: Optional[float]
        default: 2
    return_annotation: tensorrt_llm.executor.result.IterationResult
    status: beta
  get_stats:
    parameters:
      timeout:
        annotation: Optional[float]
        default: 2
    return_annotation: List[dict]
    status: beta
  get_stats_async:
    parameters:
      timeout:
        annotation: Optional[float]
        default: 2
    return_annotation: tensorrt_llm.executor.result.IterationResult
    status: beta
  shutdown:
    parameters: {}
    return_annotation: None
    status: beta
properties:
  llm_id:
    annotation: str
    default: inspect._empty
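
# Usage sketch (comment only, not part of the schema): constructing an LLM
# with a few of the keywords listed under __init__ and issuing a synchronous
# generate() call. The checkpoint path is a placeholder, and the `model`
# argument itself belongs to the base constructor rather than this excerpt;
# every other keyword shown is taken from the parameter list above.
#
#   from tensorrt_llm import LLM, SamplingParams
#
#   llm = LLM(
#       model="/path/to/model",        # placeholder checkpoint location
#       attn_backend="TRTLLM",         # default attention backend (beta)
#       enable_iter_perf_stats=True,   # collect per-iteration stats (prototype)
#       print_iter_log=True,           # log each engine iteration (beta)
#   )
#   outputs = llm.generate(
#       ["Hello, my name is"],         # batched input -> List[RequestOutput]
#       sampling_params=SamplingParams(max_tokens=32),
#   )
#   print(outputs[0].outputs[0].text)
#   llm.shutdown()                     # see shutdown() above (beta)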
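
# Async sketch (comment only), assuming the usual asyncio driver pattern:
# generate_async() returns a single awaitable RequestOutput, and
# get_stats_async() returns an IterationResult that can be consumed with
# `async for`, matching the signatures above. Stats are only populated when
# enable_iter_perf_stats is set at construction time.
#
#   import asyncio
#   from tensorrt_llm import LLM, SamplingParams
#
#   async def main():
#       llm = LLM(model="/path/to/model",  # placeholder checkpoint location
#                 enable_iter_perf_stats=True)
#       output = await llm.generate_async(
#           "Hello, my name is",
#           sampling_params=SamplingParams(max_tokens=32),
#       )
#       print(output.outputs[0].text)
#       async for stats in llm.get_stats_async(timeout=2):
#           print(stats)               # one stats dict per engine iteration
#       llm.shutdown()
#
#   asyncio.run(main())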