# Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-14 06:27:45 +08:00)
methods:
  __init__:
    parameters:
      # Parallelism
      gpus_per_node:
        annotation: Optional[int]
        default: null
        status: beta
      moe_cluster_parallel_size:
        annotation: Optional[int]
        default: null
        status: beta
      enable_attention_dp:
        annotation: bool
        default: False
        status: beta
      cp_config:
        annotation: Optional[dict]
        default: null
        status: prototype
      pp_partition:
        annotation: Optional[List[int]]
        default: null
        status: prototype
      # Stats
      iter_stats_max_iterations:
        annotation: Optional[int]
        default: null
        status: prototype
      request_stats_max_iterations:
        annotation: Optional[int]
        default: null
        status: prototype
      return_perf_metrics:
        annotation: bool
        default: False
        status: prototype
      # Bindings and mirrored configs
      peft_cache_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.PeftCacheConfig]
        default: null
        status: prototype
      scheduler_config:
        annotation: tensorrt_llm.llmapi.llm_args.SchedulerConfig
        default: null
        status: prototype
      cache_transceiver_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.CacheTransceiverConfig]
        default: null
        status: prototype
      gather_generation_logits:
        annotation: bool
        default: False
        status: prototype
      num_postprocess_workers:
        annotation: int
        default: 0
        status: prototype
      postprocess_tokenizer_dir:
        annotation: Optional[str]
        default: null
        status: prototype
      # reasoning
      reasoning_parser:
        annotation: Optional[str]
        default: null
        status: prototype
      # Runtime behavior
      fail_fast_on_attention_window_too_large:
        annotation: bool
        default: false
        status: prototype
      garbage_collection_gen0_threshold:
        annotation: int
        default: 20000
        status: beta
      # Misc
      backend:
        annotation: Optional[str]
        default: null
        status: deprecated
      build_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.BuildConfig]
        default: null
        status: deprecated
      cuda_graph_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.CudaGraphConfig]
        default: null
        status: beta
      attention_dp_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.AttentionDpConfig]
        default: null
        status: beta
      checkpoint_loader:
        annotation: Optional[tensorrt_llm._torch.models.checkpoints.BaseCheckpointLoader]
        default: null
        status: prototype
      checkpoint_format:
        annotation: Optional[str]
        default: null
        status: prototype
      mm_encoder_only:
        annotation: bool
        default: False
        status: prototype
      disable_overlap_scheduler:
        annotation: bool
        default: False
        status: beta
      disable_flashinfer_sampling:
        annotation: bool
        default: True
        status: prototype
      moe_config:
        annotation: tensorrt_llm.llmapi.llm_args.MoeConfig
        status: beta
        default: null
      attn_backend:
        annotation: str
        default: TRTLLM
        status: beta
      sampler_type:
        annotation: Union[str, tensorrt_llm.llmapi.llm_args.SamplerType]
        default: auto
        status: beta
      enable_iter_perf_stats:
        annotation: bool
        default: False
        status: prototype
      enable_iter_req_stats:
        annotation: bool
        default: False
        status: prototype
      batch_wait_timeout_ms:
        annotation: float
        default: 0
        status: prototype
      batch_wait_timeout_iters:
        annotation: int
        default: 0
        status: prototype
      batch_wait_max_tokens_ratio:
        annotation: float
        default: 0
        status: prototype
      print_iter_log:
        annotation: bool
        default: False
        status: beta
      perf_metrics_max_requests:
        annotation: int
        default: 0
        status: prototype
      torch_compile_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.TorchCompileConfig]
        default: null
        status: prototype
      enable_autotuner:
        annotation: bool
        default: True
        status: prototype
      enable_layerwise_nvtx_marker:
        annotation: bool
        default: False
        status: beta
      enable_min_latency:
        annotation: bool
        default: False
        status: beta
      force_dynamic_quantization:
        annotation: bool
        default: False
        status: prototype
      allreduce_strategy:
        annotation: Optional[Literal['AUTO', 'NCCL', 'UB', 'MINLATENCY', 'ONESHOT', 'TWOSHOT', 'LOWPRECISION', 'MNNVL', 'NCCL_SYMMETRIC']]
        default: AUTO
        status: beta
      decoding_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.DecodingConfig]
        default: null
        status: deprecated
      kv_connector_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.KvCacheConnectorConfig]
        default: null
        status: prototype
      enable_lm_head_tp_in_adp:
        annotation: bool
        default: False
        status: prototype
      orchestrator_type:
        annotation: Optional[Literal["rpc", "ray"]]
        default: null
        status: prototype
      sparse_attention_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.SparseAttentionConfig]
        default: null
        status: prototype
      otlp_traces_endpoint:
        annotation: Optional[str]
        default: null
        status: prototype
      ray_worker_extension_cls:
        annotation: Optional[str]
        default: null
        status: prototype
      enable_sleep:
        annotation: bool
        default: False
        status: prototype
    return_annotation: None
  generate:
    parameters:
      disaggregated_params:
        annotation: Union[tensorrt_llm.disaggregated_params.DisaggregatedParams, Sequence[tensorrt_llm.disaggregated_params.DisaggregatedParams], NoneType]
        default: null
      kv_cache_retention_config:
        annotation: Union[tensorrt_llm.bindings.executor.KvCacheRetentionConfig, Sequence[tensorrt_llm.bindings.executor.KvCacheRetentionConfig], NoneType]
        default: null
      scheduling_params:
        annotation: Union[tensorrt_llm.scheduling_params.SchedulingParams, List[tensorrt_llm.scheduling_params.SchedulingParams], NoneType]
        default: null
      cache_salt:
        annotation: Union[str, Sequence[str], NoneType]
        default: null
    return_annotation: Union[tensorrt_llm.llmapi.llm.RequestOutput, List[tensorrt_llm.llmapi.llm.RequestOutput]]
  generate_async:
    parameters:
      disaggregated_params:
        annotation: Optional[tensorrt_llm.disaggregated_params.DisaggregatedParams]
        default: null
      kv_cache_retention_config:
        annotation: Optional[tensorrt_llm.bindings.executor.KvCacheRetentionConfig]
        default: null
      scheduling_params:
        annotation: Optional[tensorrt_llm.scheduling_params.SchedulingParams]
        default: null
        status: prototype
      cache_salt:
        annotation: Optional[str]
        default: null
      trace_headers:
        annotation: Optional[Mapping[str, str]]
        default: null
        status: prototype
    return_annotation: tensorrt_llm.llmapi.llm.RequestOutput
  get_kv_cache_events:
    parameters:
      timeout:
        annotation: Optional[float]
        default: 2
    return_annotation: List[dict]
    status: beta
  get_kv_cache_events_async:
    parameters:
      timeout:
        annotation: Optional[float]
        default: 2
    return_annotation: tensorrt_llm.executor.result.IterationResult
    status: beta
  get_stats:
    parameters:
      timeout:
        annotation: Optional[float]
        default: 2
    return_annotation: List[dict]
    status: beta
  get_stats_async:
    parameters:
      timeout:
        annotation: Optional[float]
        default: 2
    return_annotation: tensorrt_llm.executor.result.IterationResult
    status: beta
  shutdown:
    parameters: {}
    return_annotation: None
    status: beta
properties:
  llm_id:
    annotation: str
    default: inspect._empty