# Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-02-08 04:01:51 +08:00)
# 158 lines, 4.6 KiB, YAML
# Method signatures. Each method entry has a `parameters` mapping (one entry per
# parameter, carrying its type `annotation` and its `default`) and a
# `return_annotation`.
# `inspect._empty` is the sentinel Python's `inspect` module uses to mean
# "no default value" (it is what `inspect.Parameter.empty` refers to).
methods:
  __init__:
    parameters:
      # Parallelism
      cp_config:
        annotation: Optional[dict]
        default: null
      auto_parallel:
        annotation: bool
        default: false
      auto_parallel_world_size:
        annotation: Optional[int]
        default: null
      embedding_parallel_mode:
        annotation: str
        default: SHARDING_ALONG_VOCAB
      moe_cluster_parallel_size:
        annotation: Optional[int]
        default: null
      # Engine building
      build_config:
        annotation: Optional[tensorrt_llm.builder.BuildConfig]
        default: null
      enable_build_cache:
        annotation: Union[tensorrt_llm.llmapi.build_cache.BuildCacheConfig, bool]
        default: false
      fast_build:
        annotation: bool
        default: false
      # Bindings and mirrored configs
      batching_type:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.BatchingType]
        default: null
      peft_cache_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.PeftCacheConfig]
        default: null
      # NOTE(review): unlike the sibling config entries, this annotation is not
      # Optional[...] even though the default is null -- confirm it mirrors the
      # actual signature before changing it.
      scheduler_config:
        annotation: tensorrt_llm.llmapi.llm_args.SchedulerConfig
        default: null
      extended_runtime_perf_knob_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.ExtendedRuntimePerfKnobConfig]
        default: null
      decoding_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.DecodingConfig]
        default: null
      cache_transceiver_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.CacheTransceiverConfig]
        default: null
      # Miscellaneous
      backend:
        annotation: Optional[str]
        default: null
      enable_attention_dp:
        annotation: bool
        default: false
      normalize_log_probs:
        annotation: bool
        default: false
      gather_generation_logits:
        annotation: bool
        default: false
      gpus_per_node:
        annotation: Optional[int]
        default: null
      iter_stats_max_iterations:
        annotation: Optional[int]
        default: null
      request_stats_max_iterations:
        annotation: Optional[int]
        default: null
      workspace:
        annotation: Optional[str]
        default: null
      # LoRA
      max_lora_rank:
        annotation: Optional[int]
        default: null
      max_loras:
        annotation: int
        default: 4
      max_cpu_loras:
        annotation: int
        default: 4
      # Post-processing workers
      num_postprocess_workers:
        annotation: int
        default: 0
      postprocess_tokenizer_dir:
        annotation: Optional[str]
        default: null
      # Reasoning
      reasoning_parser:
        annotation: Optional[str]
        default: null
      # Catch-all keyword arguments
      kwargs:
        annotation: Any
        default: inspect._empty
    return_annotation: None
  generate:
    parameters:
      disaggregated_params:
        annotation: Union[tensorrt_llm.disaggregated_params.DisaggregatedParams, Sequence[tensorrt_llm.disaggregated_params.DisaggregatedParams], NoneType]
        default: null
      kv_cache_retention_config:
        annotation: Union[tensorrt_llm.bindings.executor.KvCacheRetentionConfig, Sequence[tensorrt_llm.bindings.executor.KvCacheRetentionConfig], NoneType]
        default: null
    return_annotation: Union[tensorrt_llm.llmapi.llm.RequestOutput, List[tensorrt_llm.llmapi.llm.RequestOutput]]
  generate_async:
    parameters:
      disaggregated_params:
        annotation: Optional[tensorrt_llm.disaggregated_params.DisaggregatedParams]
        default: null
      kv_cache_retention_config:
        annotation: Optional[tensorrt_llm.bindings.executor.KvCacheRetentionConfig]
        default: null
    return_annotation: tensorrt_llm.llmapi.llm.RequestOutput
  get_kv_cache_events:
    parameters:
      timeout:
        annotation: Optional[float]
        default: 2
    return_annotation: List[dict]
  get_kv_cache_events_async:
    parameters:
      timeout:
        annotation: Optional[float]
        default: 2
    return_annotation: tensorrt_llm.executor.result.IterationResult
  get_stats:
    parameters:
      timeout:
        annotation: Optional[float]
        default: 2
    return_annotation: List[dict]
  get_stats_async:
    parameters:
      timeout:
        annotation: Optional[float]
        default: 2
    return_annotation: tensorrt_llm.executor.result.IterationResult
  save:
    parameters:
      engine_dir:
        annotation: str
        default: inspect._empty
    return_annotation: None
  shutdown:
    parameters: {}
    return_annotation: None
# Property signatures. Each property entry carries its type `annotation` and a
# `default`; `inspect._empty` is the sentinel Python's `inspect` module uses to
# mean "no default value".
properties:
  workspace:
    annotation: pathlib.Path
    default: inspect._empty
  llm_id:
    annotation: str
    default: inspect._empty