# TensorRT-LLMs/tests/unittest/api_stability/references/llm.yaml

methods:
  # Reference signature for `__init__`. Every entry under `parameters` records
  # the parameter's type annotation (as text) and its default value.
  # `default: null` is a YAML null; `default: inspect._empty` is a plain string
  # (presumably the marker for "no default" -- confirm against the test harness).
  __init__:
    parameters:
      # Parallelism
      cp_config:
        annotation: Optional[dict]
        default: null
      auto_parallel:
        annotation: bool
        default: false
      auto_parallel_world_size:
        annotation: Optional[int]
        default: null
      embedding_parallel_mode:
        annotation: str
        default: SHARDING_ALONG_VOCAB
      moe_cluster_parallel_size:
        annotation: Optional[int]
        default: null
      # Engine building
      build_config:
        annotation: Optional[tensorrt_llm.builder.BuildConfig]
        default: null
      enable_build_cache:
        annotation: Union[tensorrt_llm.llmapi.build_cache.BuildCacheConfig, bool]
        default: false
      fast_build:
        annotation: bool
        default: false
      # Bindings and mirrored configs
      batching_type:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.BatchingType]
        default: null
      peft_cache_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.PeftCacheConfig]
        default: null
      # NOTE(review): unlike its siblings this annotation is not Optional[...]
      # although the default is null -- presumably the real default is built
      # elsewhere (e.g. by a factory); confirm before "fixing" either field.
      scheduler_config:
        annotation: tensorrt_llm.llmapi.llm_args.SchedulerConfig
        default: null
      extended_runtime_perf_knob_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.ExtendedRuntimePerfKnobConfig]
        default: null
      decoding_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.DecodingConfig]
        default: null
      cache_transceiver_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.CacheTransceiverConfig]
        default: null
      # Misc
      backend:
        annotation: Optional[str]
        default: null
      enable_attention_dp:
        annotation: bool
        default: false
      normalize_log_probs:
        annotation: bool
        default: false
      gather_generation_logits:
        annotation: bool
        default: false
      gpus_per_node:
        annotation: Optional[int]
        default: null
      iter_stats_max_iterations:
        annotation: Optional[int]
        default: null
      request_stats_max_iterations:
        annotation: Optional[int]
        default: null
      workspace:
        annotation: Optional[str]
        default: null
      # LoRA
      max_lora_rank:
        annotation: Optional[int]
        default: null
      max_loras:
        annotation: int
        default: 4
      max_cpu_loras:
        annotation: int
        default: 4
      # Post-processing workers
      num_postprocess_workers:
        annotation: int
        default: 0
      postprocess_tokenizer_dir:
        annotation: Optional[str]
        default: null
      # Reasoning
      reasoning_parser:
        annotation: Optional[str]
        default: null
      # Catch-all keyword arguments; the only entry here without a real default.
      kwargs:
        annotation: Any
        default: inspect._empty
    # `None` is the literal string "None" in YAML (only null/Null/NULL/~ are
    # null), so this value is text, unlike the `default: null` entries above.
    return_annotation: None
  # Reference signature for `generate`. Both listed parameters accept a single
  # object, a Sequence of them, or nothing (`NoneType` in the Union, default null).
  # NOTE(review): only two parameters are listed -- presumably the rest of the
  # signature is tracked in another reference file; confirm.
  generate:
    parameters:
      disaggregated_params:
        annotation: Union[tensorrt_llm.disaggregated_params.DisaggregatedParams, Sequence[tensorrt_llm.disaggregated_params.DisaggregatedParams], NoneType]
        default: null
      kv_cache_retention_config:
        annotation: Union[tensorrt_llm.bindings.executor.KvCacheRetentionConfig, Sequence[tensorrt_llm.bindings.executor.KvCacheRetentionConfig], NoneType]
        default: null
    # Either a single RequestOutput or a List of them.
    return_annotation: Union[tensorrt_llm.llmapi.llm.RequestOutput, List[tensorrt_llm.llmapi.llm.RequestOutput]]
  # Reference signature for `generate_async`. The two listed parameters take a
  # single optional object each (no Sequence form), both defaulting to null.
  # NOTE(review): only two parameters are listed -- presumably the rest of the
  # signature is tracked in another reference file; confirm.
  generate_async:
    parameters:
      disaggregated_params:
        annotation: Optional[tensorrt_llm.disaggregated_params.DisaggregatedParams]
        default: null
      kv_cache_retention_config:
        annotation: Optional[tensorrt_llm.bindings.executor.KvCacheRetentionConfig]
        default: null
    # A single RequestOutput (no List form).
    return_annotation: tensorrt_llm.llmapi.llm.RequestOutput
  # Reference signature for `get_kv_cache_events`: one optional `timeout`
  # parameter, returning List[dict].
  get_kv_cache_events:
    parameters:
      timeout:
        annotation: Optional[float]
        # Recorded as the integer 2 (not 2.0) even though the annotation is float.
        # NOTE(review): unit is presumably seconds -- confirm.
        default: 2
    return_annotation: List[dict]
  # Reference signature for `get_kv_cache_events_async`: one optional `timeout`
  # parameter, returning an IterationResult rather than List[dict].
  get_kv_cache_events_async:
    parameters:
      timeout:
        annotation: Optional[float]
        # Recorded as the integer 2 (not 2.0) even though the annotation is float.
        # NOTE(review): unit is presumably seconds -- confirm.
        default: 2
    return_annotation: tensorrt_llm.executor.result.IterationResult
  # Reference signature for `get_stats`: one optional `timeout` parameter,
  # returning List[dict].
  get_stats:
    parameters:
      timeout:
        annotation: Optional[float]
        # Recorded as the integer 2 (not 2.0) even though the annotation is float.
        # NOTE(review): unit is presumably seconds -- confirm.
        default: 2
    return_annotation: List[dict]
  # Reference signature for `get_stats_async`: one optional `timeout` parameter,
  # returning an IterationResult rather than List[dict].
  get_stats_async:
    parameters:
      timeout:
        annotation: Optional[float]
        # Recorded as the integer 2 (not 2.0) even though the annotation is float.
        # NOTE(review): unit is presumably seconds -- confirm.
        default: 2
    return_annotation: tensorrt_llm.executor.result.IterationResult
  # Reference signature for `save`: a single `engine_dir` string parameter.
  save:
    parameters:
      engine_dir:
        annotation: str
        # Plain string, presumably the marker for "no default" (i.e. a required
        # parameter) -- confirm against the test harness.
        default: inspect._empty
    # `None` is the literal string "None" in YAML, not a YAML null.
    return_annotation: None
  # Reference signature for `shutdown`: no parameters are listed.
  shutdown:
    # Explicit empty mapping; a bare `parameters:` would parse as null instead.
    parameters: {}
    # `None` is the literal string "None" in YAML, not a YAML null.
    return_annotation: None
# Reference entries for properties. Each records an annotation and a `default`
# field, using the same two keys as the method parameters.
properties:
  workspace:
    annotation: pathlib.Path
    # Plain string, presumably the marker for "no default" -- confirm against
    # the test harness.
    default: inspect._empty
  llm_id:
    annotation: str
    default: inspect._empty