methods:
  __init__:
    parameters:
      # Explicit arguments
      model:
        annotation: Union[str, pathlib.Path]
        default: inspect._empty
      tokenizer:
        annotation: Union[str, pathlib.Path, transformers.tokenization_utils_base.PreTrainedTokenizerBase,
          tensorrt_llm.llmapi.tokenizer.TokenizerBase, NoneType]
        default: null
      tokenizer_mode:
        annotation: Literal['auto', 'slow']
        default: auto
      skip_tokenizer_init:
        annotation: bool
        default: false
      trust_remote_code:
        annotation: bool
        default: false
      tensor_parallel_size:
        annotation: int
        default: 1
      dtype:
        annotation: str
        default: auto
      revision:
        annotation: Optional[str]
        default: null
      tokenizer_revision:
        annotation: Optional[str]
        default: null
      # Parallelism
      pipeline_parallel_size:
        annotation: int
        default: 1
      context_parallel_size:
        annotation: int
        default: 1
      moe_tensor_parallel_size:
        annotation: Optional[int]
        default: null
      moe_expert_parallel_size:
        annotation: Optional[int]
        default: null
      # LoRA
      enable_lora:
        annotation: bool
        default: false
      max_lora_rank:
        annotation: Optional[int]
        default: null
      max_loras:
        annotation: int
        default: 4
      max_cpu_loras:
        annotation: int
        default: 4
      # Prompt tuning
      enable_prompt_adapter:
        annotation: bool
        default: false
      max_prompt_adapter_token:
        annotation: int
        default: 0
      # Logits processor and guided decoding
      batched_logits_processor:
        annotation: Optional[tensorrt_llm.sampling_params.BatchedLogitsProcessor]
        default: null
      guided_decoding_backend:
        annotation: Optional[str]
        default: null
      # Quantization and calibration
      quant_config:
        annotation: Optional[tensorrt_llm.models.modeling_utils.QuantConfig]
        default: null
      calib_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_utils.CalibConfig]
        default: null
      # Speculative decoding
      speculative_config:
        annotation: Union[tensorrt_llm.llmapi.llm_args.LookaheadDecodingConfig, tensorrt_llm.llmapi.llm_utils.MedusaDecodingConfig,
          tensorrt_llm.llmapi.llm_utils.EagleDecodingConfig, tensorrt_llm.llmapi.MTPDecodingConfig, NoneType]
        default: null
      # Misc
      load_format:
        annotation: Literal['auto', 'dummy']
        default: auto
      enable_tqdm:
        annotation: bool
        default: false
      enable_chunked_prefill:
        annotation: bool
        default: false
    return_annotation: None
properties: {}
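
For context, the parameters recorded above are the keyword arguments of the tensorrt_llm LLM constructor. A minimal usage sketch follows; the checkpoint path and chosen argument values are placeholders for illustration, not taken from this file:

# Minimal sketch: construct an LLM with a few of the committed
# parameters recorded above. The model path is a placeholder.
from tensorrt_llm import LLM

llm = LLM(
    model="path/to/hf_checkpoint",  # Union[str, pathlib.Path], required
    tokenizer=None,                 # default: load the model's own tokenizer
    tokenizer_mode="auto",          # Literal['auto', 'slow']
    dtype="auto",
    tensor_parallel_size=1,
    trust_remote_code=False,
)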
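
This file reads as an API-stability reference: each committed parameter is pinned to its annotation and default so that accidental signature drift can be caught. A sketch of such a check, assuming the reference is saved as llm.yaml and that comparing parameter names is enough (a real harness might also compare annotations and defaults):

# Sketch: verify the committed reference against the live signature.
import inspect

import yaml

from tensorrt_llm import LLM

with open("llm.yaml") as f:
    reference = yaml.safe_load(f)

committed = reference["methods"]["__init__"]["parameters"]
live = inspect.signature(LLM.__init__).parameters

missing = [name for name in committed if name not in live]
assert not missing, f"parameters removed from LLM.__init__: {missing}"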