# tests/unittest/api_stability/references/llm.yaml
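# Reference snapshot of the public tensorrt_llm.llmapi.llm.LLM signatures,
# presumably consumed by the api_stability unit tests to detect unintended
# changes to the LLM API (defaults of inspect._empty denote required arguments).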

methods:
  __init__:
    parameters:
      # Parallelism
      cp_config:
        annotation: Optional[dict]
        default: null
      auto_parallel:
        annotation: bool
        default: false
      auto_parallel_world_size:
        annotation: Optional[int]
        default: null
      embedding_parallel_mode:
        annotation: str
        default: SHARDING_ALONG_VOCAB
      moe_cluster_parallel_size:
        annotation: Optional[int]
        default: null
      # Engine building
      build_config:
        annotation: Optional[tensorrt_llm.builder.BuildConfig]
        default: null
      enable_build_cache:
        annotation: Union[tensorrt_llm.llmapi.build_cache.BuildCacheConfig, bool]
        default: false
      fast_build:
        annotation: bool
        default: false
      # Bindings and mirrored configs
      batching_type:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.BatchingType]
        default: null
      peft_cache_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.PeftCacheConfig]
        default: null
      scheduler_config:
        annotation: tensorrt_llm.llmapi.llm_args.SchedulerConfig
        default: null
      extended_runtime_perf_knob_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.ExtendedRuntimePerfKnobConfig]
        default: null
      decoding_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.DecodingConfig]
        default: null
      cache_transceiver_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.CacheTransceiverConfig]
        default: null
      # Misc
      backend:
        annotation: Optional[str]
        default: null
      enable_attention_dp:
        annotation: bool
        default: false
      normalize_log_probs:
        annotation: bool
        default: false
      gather_generation_logits:
        annotation: bool
        default: false
      gpus_per_node:
        annotation: Optional[int]
        default: null
      iter_stats_max_iterations:
        annotation: Optional[int]
        default: null
      request_stats_max_iterations:
        annotation: Optional[int]
        default: null
      workspace:
        annotation: Optional[str]
        default: null
      # LoRA
      max_lora_rank:
        annotation: Optional[int]
        default: null
      max_loras:
        annotation: int
        default: 4
      max_cpu_loras:
        annotation: int
        default: 4
      # Postprocess workers
      num_postprocess_workers:
        annotation: int
        default: 0
      postprocess_tokenizer_dir:
        annotation: Optional[str]
        default: null
      # Reasoning
      reasoning_parser:
        annotation: Optional[str]
        default: null
      # kwargs
      kwargs:
        annotation: Any
        default: inspect._empty
    return_annotation: None
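  # The constructor parameters above map onto keyword arguments of the LLM entry
  # point. A minimal sketch (the model path is a placeholder, and only arguments
  # listed in this reference are shown; everything else is assumed from the
  # broader LLM API):
  #
  #   from tensorrt_llm import LLM
  #
  #   llm = LLM(
  #       model="/path/to/model",          # hypothetical checkpoint location
  #       backend=None,                    # Optional[str], default null above
  #       gather_generation_logits=False,  # bool, default false above
  #       max_loras=4,                     # LoRA slots, default 4 above
  #   )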
  generate:
    parameters:
      disaggregated_params:
        annotation: Union[tensorrt_llm.disaggregated_params.DisaggregatedParams, Sequence[tensorrt_llm.disaggregated_params.DisaggregatedParams], NoneType]
        default: null
      kv_cache_retention_config:
        annotation: Union[tensorrt_llm.bindings.executor.KvCacheRetentionConfig, Sequence[tensorrt_llm.bindings.executor.KvCacheRetentionConfig], NoneType]
        default: null
    return_annotation: Union[tensorrt_llm.llmapi.llm.RequestOutput, List[tensorrt_llm.llmapi.llm.RequestOutput]]
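  # Sketch of a synchronous generate call matching the signature above (the prompt
  # and sampling arguments are assumed from the broader LLM API; this reference
  # only enumerates the two parameters listed):
  #
  #   from tensorrt_llm import SamplingParams
  #
  #   outputs = llm.generate(
  #       ["Hello, my name is"],                          # hypothetical prompt
  #       sampling_params=SamplingParams(max_tokens=32),
  #       disaggregated_params=None,                      # one value or one per prompt
  #       kv_cache_retention_config=None,
  #   )
  #   # A list of prompts yields List[RequestOutput]; a single prompt yields one RequestOutput.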
  generate_async:
    parameters:
      disaggregated_params:
        annotation: Optional[tensorrt_llm.disaggregated_params.DisaggregatedParams]
        default: null
      kv_cache_retention_config:
        annotation: Optional[tensorrt_llm.bindings.executor.KvCacheRetentionConfig]
        default: null
    return_annotation: tensorrt_llm.llmapi.llm.RequestOutput
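  # generate_async submits a single request and returns a RequestOutput. A hedged
  # sketch (the .result() accessor is assumed from the broader LLM API, not from
  # this reference):
  #
  #   future = llm.generate_async("Hello", sampling_params=SamplingParams(max_tokens=16))
  #   output = future.result()   # block until the request completes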
  get_kv_cache_events:
    parameters:
      timeout:
        annotation: Optional[float]
        default: 2
    return_annotation: List[dict]
  get_kv_cache_events_async:
    parameters:
      timeout:
        annotation: Optional[float]
        default: 2
    return_annotation: tensorrt_llm.executor.result.IterationResult
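  # Both event getters take an optional timeout in seconds (default 2). A sketch:
  #
  #   events = llm.get_kv_cache_events(timeout=2)   # List[dict], may be empty
  #   for event in events:
  #       print(event)
  #
  # The async variant returns an IterationResult intended for iteration from a coroutine.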
  get_stats:
    parameters:
      timeout:
        annotation: Optional[float]
        default: 2
    return_annotation: List[dict]
  get_stats_async:
    parameters:
      timeout:
        annotation: Optional[float]
        default: 2
    return_annotation: tensorrt_llm.executor.result.IterationResult
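  # get_stats follows the same timeout convention and returns one dict per iteration:
  #
  #   for stats in llm.get_stats(timeout=2):   # List[dict] of per-iteration stats
  #       print(stats)
  #
  # The exact dict layout is not part of this reference.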
  save:
    parameters:
      engine_dir:
        annotation: str
        default: inspect._empty
    return_annotation: None
  shutdown:
    parameters: {}
    return_annotation: None
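  # save writes the built engine to a directory; shutdown releases executor resources.
  # A hedged sketch (the output path is a placeholder):
  #
  #   llm.save("/tmp/engine_dir")   # hypothetical output path
  #   llm.shutdown()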
properties:
  workspace:
    annotation: pathlib.Path
    default: inspect._empty
  llm_id:
    annotation: str
    default: inspect._empty
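# The two read-only properties expose the working directory and a unique identifier:
#
#   print(llm.workspace)   # pathlib.Path
#   print(llm.llm_id)      # str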