test: [TRTLLM-4334] Create 1.0 criteria scope from API stability references (#3069)

* committed APIs validation

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>

* fix

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>

* clean name

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>

* separate

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>

* add TODOs

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>

* fix naming

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>

* fix

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>

---------

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
Enwei Zhu 2025-03-26 18:14:35 +08:00 committed by GitHub
parent ea3739ee62
commit 224469b096
20 changed files with 509 additions and 632 deletions
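
In brief: each public class now has an optional second reference tier. references/ still pins the full API surface, while the new references_committed/ directory pins the subset committed for 1.0. When a committed file exists, the harness merges it into the full reference and checks the live snapshot two ways: it must contain every committed API unchanged (assert_containing) and match the merged reference exactly (assert_equal). A minimal sketch of the containment rule on plain dicts, with illustrative names standing in for the snapshot classes:

# Sketch only: dicts stand in for ParamSnapshot/MethodSnapshot.
def assert_containing(snapshot: dict, committed: dict) -> None:
    for name, spec in committed.items():
        # A committed API may not be removed...
        assert name in snapshot, f"committed API '{name}' was removed"
        # ...nor changed in annotation or default.
        assert snapshot[name] == spec, f"committed API '{name}' changed"

live = {"model": {"annotation": "str"}, "dtype": {"annotation": "str"}}
committed = {"model": {"annotation": "str"}}
assert_containing(live, committed)  # passes: the committed subset is intact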

View File

@@ -17,7 +17,6 @@ class LLM(BaseLLM):
skip_tokenizer_init: bool = False,
trust_remote_code: bool = False,
tensor_parallel_size: int = 1,
pipeline_parallel_size: int = 1,
dtype: str = "auto",
revision: Optional[str] = None,
tokenizer_revision: Optional[str] = None,
@@ -26,6 +25,5 @@ class LLM(BaseLLM):
kwargs_dict = dict(kwargs)
kwargs_dict['backend'] = 'pytorch'
super().__init__(model, tokenizer, tokenizer_mode, skip_tokenizer_init,
trust_remote_code, tensor_parallel_size,
pipeline_parallel_size, dtype, revision,
tokenizer_revision, **kwargs_dict)
trust_remote_code, tensor_parallel_size, dtype,
revision, tokenizer_revision, **kwargs_dict)

View File

@@ -98,7 +98,6 @@ class LLM:
skip_tokenizer_init: bool = False,
trust_remote_code: bool = False,
tensor_parallel_size: int = 1,
pipeline_parallel_size: int = 1,
dtype: str = "auto",
revision: Optional[str] = None,
tokenizer_revision: Optional[str] = None,
@@ -116,7 +115,6 @@ class LLM:
skip_tokenizer_init=skip_tokenizer_init,
trust_remote_code=trust_remote_code,
tensor_parallel_size=tensor_parallel_size,
pipeline_parallel_size=pipeline_parallel_size,
dtype=dtype,
revision=revision,
tokenizer_revision=tokenizer_revision,

View File

@@ -650,8 +650,6 @@ LLMARGS_EXPLICIT_DOCSTRING = """
tensor_parallel_size(int): The number of processes for tensor parallelism. Defaults to 1.
pipeline_parallel_size(int): The number of processes for pipeline parallelism. Defaults to 1.
dtype (str): The data type for the model weights and activations. Defaults to "auto".
Can be "float16", "bfloat16", "float32", or "auto". If "auto", the data type will be automatically inferred from the source model.
If the source data type is "float32", it will be converted to "float16".
@@ -662,6 +660,8 @@ LLMARGS_EXPLICIT_DOCSTRING = """
"""
LLMARGS_IMPLICIT_DOCSTRING = """
pipeline_parallel_size(int): The number of processes for pipeline parallelism. Defaults to 1.
    context_parallel_size(int): The number of processes for context parallelism. Defaults to 1.
    gpus_per_node (int, optional): The number of GPUs per node. None means automatic configuration. Defaults to None.
@@ -769,8 +769,6 @@ class LlmArgs:
tensor_parallel_size: int = 1
pipeline_parallel_size: int = 1
dtype: str = "auto"
revision: Optional[str] = None
@@ -778,6 +776,8 @@ class LlmArgs:
tokenizer_revision: Optional[str] = None
# Below are all remaining arguments
pipeline_parallel_size: int = 1
context_parallel_size: int = 1
gpus_per_node: Optional[int] = None
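
Note the relocation above: pipeline_parallel_size leaves the explicit LLM signature but remains an LlmArgs field under the implicit arguments, so callers can still pass it and it reaches LlmArgs through **kwargs. A hedged usage sketch (import path and model path are assumptions):

from tensorrt_llm import LLM  # top-level import path assumed

llm = LLM(
    model="/path/to/model",      # placeholder path
    tensor_parallel_size=2,      # explicit, committed argument
    pipeline_parallel_size=2,    # implicit: forwarded via **kwargs to LlmArgs
)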

View File

@@ -1,4 +1,5 @@
# autoflake: skip_file
import copy
import inspect
import os
import pathlib
@@ -28,13 +29,12 @@ def repr_annotation(field_type: type) -> str:
@dataclass(slots=True)
class ParamSnapshot:
name: str
annotation: type
default: Any = None
@classmethod
def from_inspect(cls, param: inspect.Parameter):
return cls(param.name, param.annotation, param.default)
return cls(param.annotation, param.default)
@classmethod
def from_docstring(cls, param: docstring_parser.common.DocstringParam):
@@ -57,7 +57,7 @@ class ParamSnapshot:
except (NameError, SyntaxError):
default = param.default
return cls(param.arg_name, annotation, default)
return cls(annotation, default)
@classmethod
def from_dict(cls, d: dict):
@@ -77,19 +77,17 @@ class ParamSnapshot:
return d
def assert_equal(self, other: 'ParamSnapshot'):
assert self.name == other.name
assert self.annotation == other.annotation
assert self.default == other.default
@dataclass(slots=True)
class MethodSnapshot:
name: str
parameters: Dict[str, ParamSnapshot]
return_annotation: type
@classmethod
def from_inspect(cls, name: str, method: MethodType):
def from_inspect(cls, method: MethodType):
signature = inspect.signature(method)
parameters = {}
for param_name, param in signature.parameters.items():
@@ -99,10 +97,10 @@ class MethodSnapshot:
return_annotation = signature.return_annotation
if isinstance(return_annotation, str):
return_annotation = eval(return_annotation)
return cls(name, parameters, return_annotation)
return cls(parameters, return_annotation)
@classmethod
def from_docstring(cls, name: str, method: MethodType):
def from_docstring(cls, method: MethodType):
doc = docstring_parser.parse(method.__doc__)
parameters = {}
for param in doc.params:
@@ -112,7 +110,7 @@ class MethodSnapshot:
return_annotation = None
else:
return_annotation = eval(doc.returns.type_name)
return cls(name, parameters, return_annotation)
return cls(parameters, return_annotation)
@classmethod
def from_dict(cls, d: dict):
@@ -132,13 +130,23 @@ class MethodSnapshot:
d['return_annotation'] = repr_annotation(d['return_annotation'])
return d
def merge(self, other: 'MethodSnapshot'):
assert self.parameters.keys().isdisjoint(other.parameters.keys())
self.parameters.update(copy.deepcopy(other.parameters))
assert self.return_annotation == other.return_annotation
def assert_equal(self, other: 'MethodSnapshot'):
assert self.name == other.name
assert self.parameters.keys() == other.parameters.keys()
for name, param in self.parameters.items():
param.assert_equal(other.parameters[name])
assert self.return_annotation == other.return_annotation
def assert_containing(self, other: 'MethodSnapshot'):
for name, param in other.parameters.items():
assert name in self.parameters
self.parameters[name].assert_equal(param)
assert self.return_annotation == other.return_annotation
@dataclass(slots=True)
class ClassSnapshot:
@@ -153,16 +161,14 @@ class ClassSnapshot:
inst, predicate=inspect.ismethod):
if method_name.startswith("_") and method_name != "__init__":
continue
methods[method_name] = MethodSnapshot.from_inspect(
method_name, method)
methods[method_name] = MethodSnapshot.from_inspect(method)
properties = {}
for prop_name, prop in inspect.getmembers(
snapshot_cls, predicate=lambda x: isinstance(x, property)):
if prop_name.startswith("_"):
continue
annotation = inspect.signature(prop.fget).return_annotation
properties[prop_name] = ParamSnapshot(prop_name, annotation,
inspect._empty)
properties[prop_name] = ParamSnapshot(annotation, inspect._empty)
return cls(methods, properties)
@classmethod
@@ -175,10 +181,9 @@ class ClassSnapshot:
continue
if method_name == "__init__":
methods["__init__"] = MethodSnapshot.from_docstring(
"__init__", snapshot_cls)
snapshot_cls)
else:
methods[method_name] = MethodSnapshot.from_docstring(
method_name, method)
methods[method_name] = MethodSnapshot.from_docstring(method)
properties = {}
doc = docstring_parser.parse(snapshot_cls.__doc__)
for param in doc.params:
@@ -210,6 +215,19 @@ class ClassSnapshot:
}
return d
def merge(self, other: 'ClassSnapshot'):
for name, method in self.methods.items():
if name in other.methods:
method.merge(other.methods[name])
new_methods = {
name: method
for name, method in other.methods.items()
if name not in self.methods
}
self.methods.update(copy.deepcopy(new_methods))
assert self.properties.keys().isdisjoint(other.properties.keys())
self.properties.update(copy.deepcopy(other.properties))
def assert_equal(self, other: 'ClassSnapshot'):
assert self.methods.keys() == other.methods.keys()
for name, method in self.methods.items():
@@ -218,30 +236,47 @@ class ClassSnapshot:
for name, prop in self.properties.items():
prop.assert_equal(other.properties[name])
def assert_containing(self, other: 'ClassSnapshot'):
for name, method in other.methods.items():
assert name in self.methods
self.methods[name].assert_containing(method)
for name, prop in other.properties.items():
assert name in self.properties
self.properties[name].assert_equal(prop)
class ApiStabilityTestHarness:
TEST_CLASS = None
REFERENCE_COMMITTED_DIR = f"{os.path.dirname(__file__)}/references_committed"
REFERENCE_DIR = f"{os.path.dirname(__file__)}/references"
REFERENCE_FILE = None
@classmethod
def reference_path(cls):
return f"{cls.REFERENCE_DIR}/{cls.REFERENCE_FILE}"
@classmethod
def setup_class(cls):
with open(cls.reference_path()) as f:
with open(f"{cls.REFERENCE_DIR}/{cls.REFERENCE_FILE}") as f:
cls.reference = ClassSnapshot.from_dict(yaml.safe_load(f))
cls.error_msg = (
f"API stability validation failed. "
f"This is probably because you changed {cls.TEST_CLASS.__name__}'s APIs, please ask for reviews from the code owners."
)
if os.path.exists(
f"{cls.REFERENCE_COMMITTED_DIR}/{cls.REFERENCE_FILE}"):
with open(
f"{cls.REFERENCE_COMMITTED_DIR}/{cls.REFERENCE_FILE}") as f:
cls.reference_committed = ClassSnapshot.from_dict(
yaml.safe_load(f))
cls.reference.merge(cls.reference_committed)
else:
cls.reference_committed = None
cls.error_msg = f"API validation failed because you changed {cls.TEST_CLASS.__name__}'s APIs, please ask for reviews from the code owners."
cls.error_msg_committed = f"API validation failed because you changed {cls.TEST_CLASS.__name__}'s committed APIs, please ask for approval."
def create_snapshot_from_inspect(self):
return ClassSnapshot.from_inspect(self.TEST_CLASS)
def test_signature(self):
snapshot = self.create_snapshot_from_inspect()
if self.reference_committed is not None:
try:
snapshot.assert_containing(self.reference_committed)
except AssertionError as e:
raise AssertionError(self.error_msg_committed) from e
try:
snapshot.assert_equal(self.reference)
except AssertionError as e:
@@ -252,6 +287,11 @@ class ApiStabilityTestHarness:
def test_docstring(self):
snapshot = self.create_snapshot_from_docstring()
if self.reference_committed is not None:
try:
snapshot.assert_containing(self.reference_committed)
except AssertionError as e:
raise AssertionError(self.error_msg_committed) from e
try:
snapshot.assert_equal(self.reference)
except AssertionError as e:
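
For orientation, a concrete test binds the harness to one class and one reference file; the hypothetical subclass below (class name and file name are illustrative) inherits test_signature and test_docstring, and automatically picks up the committed tier whenever references_committed/ holds a file of the same name:

from tensorrt_llm import LLM  # import path assumed


class TestLLM(ApiStabilityTestHarness):
    TEST_CLASS = LLM             # class whose API is validated
    REFERENCE_FILE = "llm.yaml"  # full reference under references/; a
                                 # same-named file under references_committed/
                                 # activates the committed-API checks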

View File

@@ -1,26 +1,20 @@
methods:
__call__:
name: __call__
parameters:
client_ids:
annotation: List[Optional[int]]
default: inspect._empty
name: client_ids
logits:
annotation: List[torch.Tensor]
default: inspect._empty
name: logits
req_ids:
annotation: List[int]
default: inspect._empty
name: req_ids
stream_ptr:
annotation: int
default: inspect._empty
name: stream_ptr
token_ids:
annotation: List[List[List[int]]]
default: inspect._empty
name: token_ids
return_annotation: None
properties: {}

View File

@@ -1,46 +1,35 @@
methods:
__init__:
name: __init__
parameters:
calib_batch_size:
annotation: int
default: 1
name: calib_batch_size
calib_batches:
annotation: int
default: 512
name: calib_batches
calib_dataset:
annotation: str
default: cnn_dailymail
name: calib_dataset
calib_max_seq_length:
annotation: int
default: 512
name: calib_max_seq_length
device:
annotation: Literal['cuda', 'cpu']
default: cuda
name: device
random_seed:
annotation: int
default: 1234
name: random_seed
tokenizer_max_seq_length:
annotation: int
default: 2048
name: tokenizer_max_seq_length
return_annotation: None
from_dict:
name: from_dict
parameters:
config:
annotation: dict
default: inspect._empty
name: config
return_annotation: tensorrt_llm.llmapi.llm_utils.CalibConfig
to_dict:
name: to_dict
parameters: {}
return_annotation: dict
properties: {}

View File

@@ -1,58 +1,26 @@
methods:
__init__:
name: __init__
parameters:
cumulative_logprob:
annotation: Optional[float]
default: null
name: cumulative_logprob
disaggregated_params:
annotation: Optional[tensorrt_llm.disaggregated_params.DisaggregatedParams]
default: null
name: disaggregated_params
finish_reason:
annotation: Optional[Literal['stop', 'length', 'timeout', 'cancelled']]
default: null
name: finish_reason
generation_logits:
annotation: Optional[torch.Tensor]
default: null
name: generation_logits
index:
annotation: int
default: inspect._empty
name: index
logprobs:
annotation: Optional[List[float]]
default: null
name: logprobs
stop_reason:
annotation: Union[int, str, NoneType]
disaggregated_params:
annotation: Optional[tensorrt_llm.disaggregated_params.DisaggregatedParams]
default: null
name: stop_reason
text:
annotation: str
default: ''
name: text
token_ids:
annotation: Optional[List[int]]
default: null
name: token_ids
return_annotation: None
properties:
length:
annotation: int
default: inspect._empty
name: length
logprobs_diff:
annotation: List[float]
default: inspect._empty
name: logprobs_diff
text_diff:
annotation: str
default: inspect._empty
name: text_diff
token_ids_diff:
annotation: List[int]
default: inspect._empty
name: token_ids_diff

View File

@@ -1,22 +1,17 @@
methods:
__init__:
name: __init__
parameters:
grammar:
annotation: Optional[str]
default: null
name: grammar
json:
annotation: Union[str, pydantic.main.BaseModel, dict, NoneType]
default: null
name: json
json_object:
annotation: bool
default: false
name: json_object
regex:
annotation: Optional[str]
default: null
name: regex
return_annotation: None
properties: {}

View File

@@ -1,182 +1,69 @@
methods:
__init__:
name: __init__
parameters:
dtype:
annotation: str
default: auto
name: dtype
kwargs:
annotation: Any
default: inspect._empty
name: kwargs
model:
annotation: Union[str, pathlib.Path]
default: inspect._empty
name: model
revision:
annotation: Optional[str]
default: null
name: revision
skip_tokenizer_init:
annotation: bool
default: false
name: skip_tokenizer_init
tensor_parallel_size:
annotation: int
default: 1
name: tensor_parallel_size
pipeline_parallel_size:
annotation: int
default: 1
name: pipeline_parallel_size
tokenizer:
annotation: Union[str, pathlib.Path, transformers.tokenization_utils_base.PreTrainedTokenizerBase,
tensorrt_llm.llmapi.tokenizer.TokenizerBase, NoneType]
default: null
name: tokenizer
tokenizer_mode:
annotation: Literal['auto', 'slow']
default: auto
name: tokenizer_mode
tokenizer_revision:
annotation: Optional[str]
default: null
name: tokenizer_revision
trust_remote_code:
annotation: bool
default: false
name: trust_remote_code
parameters: {}
return_annotation: None
generate:
name: generate
parameters:
# TODO [TRTLLM-3925]
disaggregated_params:
annotation: Optional[tensorrt_llm.disaggregated_params.DisaggregatedParams]
default: null
name: disaggregated_params
inputs:
annotation: Union[str, List[int], tensorrt_llm.inputs.data.TextPrompt, tensorrt_llm.inputs.data.TokensPrompt,
Sequence[Union[str, List[int], tensorrt_llm.inputs.data.TextPrompt, tensorrt_llm.inputs.data.TokensPrompt]]]
default: inspect._empty
name: inputs
kv_cache_retention_config:
annotation: Optional[tensorrt_llm.bindings.executor.KvCacheRetentionConfig]
default: null
name: kv_cache_retention_config
lora_request:
annotation: Union[tensorrt_llm.executor.request.LoRARequest, Sequence[tensorrt_llm.executor.request.LoRARequest],
NoneType]
default: null
name: lora_request
prompt_adapter_request:
annotation: Union[tensorrt_llm.executor.request.PromptAdapterRequest, Sequence[tensorrt_llm.executor.request.PromptAdapterRequest],
NoneType]
default: null
name: prompt_adapter_request
queries:
annotation: Union[str, List[int], tensorrt_llm.inputs.data.TextPrompt, tensorrt_llm.inputs.data.TokensPrompt,
Sequence[Union[str, List[int], tensorrt_llm.inputs.data.TextPrompt, tensorrt_llm.inputs.data.TokensPrompt]],
NoneType]
default: null
name: queries
sampling_params:
annotation: Union[tensorrt_llm.sampling_params.SamplingParams, List[tensorrt_llm.sampling_params.SamplingParams],
NoneType]
default: null
name: sampling_params
use_tqdm:
annotation: bool
default: true
name: use_tqdm
return_annotation: Union[tensorrt_llm.llmapi.llm.RequestOutput, List[tensorrt_llm.llmapi.llm.RequestOutput]]
generate_async:
name: generate_async
parameters:
disaggregated_params:
annotation: Optional[tensorrt_llm.disaggregated_params.DisaggregatedParams]
default: null
name: disaggregated_params
inputs:
annotation: Union[str, List[int], tensorrt_llm.inputs.data.TextPrompt, tensorrt_llm.inputs.data.TokensPrompt]
default: inspect._empty
name: inputs
kv_cache_retention_config:
annotation: Optional[tensorrt_llm.bindings.executor.KvCacheRetentionConfig]
default: null
name: kv_cache_retention_config
lora_request:
annotation: Optional[tensorrt_llm.executor.request.LoRARequest]
default: null
name: lora_request
prompt_adapter_request:
annotation: Optional[tensorrt_llm.executor.request.PromptAdapterRequest]
default: null
name: prompt_adapter_request
queries:
annotation: Union[str, List[int], tensorrt_llm.inputs.data.TextPrompt, tensorrt_llm.inputs.data.TokensPrompt,
NoneType]
default: null
name: queries
sampling_params:
annotation: Optional[tensorrt_llm.sampling_params.SamplingParams]
default: null
name: sampling_params
streaming:
annotation: bool
default: false
name: streaming
return_annotation: tensorrt_llm.llmapi.llm.RequestOutput
get_kv_cache_events:
name: get_kv_cache_events
parameters:
timeout:
annotation: Optional[float]
default: 2
name: timeout
return_annotation: List[dict]
get_kv_cache_events_async:
name: get_kv_cache_events_async
parameters:
timeout:
annotation: Optional[float]
default: 2
name: timeout
return_annotation: tensorrt_llm.executor.result.IterationResult
get_stats:
name: get_stats
parameters:
timeout:
annotation: Optional[float]
default: 2
name: timeout
return_annotation: List[dict]
get_stats_async:
name: get_stats_async
parameters:
timeout:
annotation: Optional[float]
default: 2
name: timeout
return_annotation: tensorrt_llm.executor.result.IterationResult
save:
name: save
parameters:
engine_dir:
annotation: str
default: inspect._empty
name: engine_dir
return_annotation: None
shutdown:
name: shutdown
parameters: {}
return_annotation: None
properties:
tokenizer:
annotation: Optional[tensorrt_llm.llmapi.tokenizer.TokenizerBase]
default: inspect._empty
name: tokenizer
workspace:
annotation: pathlib.Path
default: inspect._empty
name: workspace

View File

@@ -1,220 +1,87 @@
methods:
__init__:
name: __init__
parameters:
auto_parallel:
annotation: bool
default: false
name: auto_parallel
auto_parallel_world_size:
annotation: int
default: 1
name: auto_parallel_world_size
backend:
annotation: Optional[str]
default: null
name: backend
batched_logits_processor:
annotation: Optional[tensorrt_llm.sampling_params.BatchedLogitsProcessor]
default: null
name: batched_logits_processor
batching_type:
annotation: Optional[tensorrt_llm.bindings.executor.BatchingType]
default: null
name: batching_type
build_config:
annotation: Optional[tensorrt_llm.builder.BuildConfig]
default: null
name: build_config
calib_config:
annotation: Optional[tensorrt_llm.llmapi.llm_utils.CalibConfig]
default: null
name: calib_config
context_parallel_size:
annotation: int
default: 1
name: context_parallel_size
# Parallelism
cp_config:
annotation: Optional[dict]
default: null
name: cp_config
decoding_config:
annotation: Optional[tensorrt_llm.bindings.executor.DecodingConfig]
default: null
name: decoding_config
dtype:
annotation: str
default: auto
name: dtype
auto_parallel:
annotation: bool
default: false
auto_parallel_world_size:
annotation: int
default: 1
embedding_parallel_mode:
annotation: str
default: SHARDING_ALONG_VOCAB
name: embedding_parallel_mode
enable_attention_dp:
annotation: bool
default: false
name: enable_attention_dp
# Engine building
build_config:
annotation: Optional[tensorrt_llm.builder.BuildConfig]
default: null
enable_build_cache:
annotation: Union[tensorrt_llm.llmapi.build_cache.BuildCacheConfig, bool]
default: false
name: enable_build_cache
enable_chunked_prefill:
annotation: bool
default: false
name: enable_chunked_prefill
enable_lora:
annotation: bool
default: false
name: enable_lora
enable_prompt_adapter:
annotation: bool
default: false
name: enable_prompt_adapter
enable_tqdm:
annotation: bool
default: false
name: enable_tqdm
extended_runtime_perf_knob_config:
annotation: Optional[tensorrt_llm.bindings.executor.ExtendedRuntimePerfKnobConfig]
default: null
name: extended_runtime_perf_knob_config
fast_build:
annotation: bool
default: false
name: fast_build
gather_generation_logits:
annotation: bool
default: false
name: gather_generation_logits
gpus_per_node:
annotation: Optional[int]
# Bindings and mirrored configs
batching_type:
annotation: Optional[tensorrt_llm.bindings.executor.BatchingType]
default: null
name: gpus_per_node
guided_decoding_backend:
annotation: Optional[str]
default: null
name: guided_decoding_backend
iter_stats_max_iterations:
annotation: Optional[int]
default: null
name: iter_stats_max_iterations
kv_cache_config:
annotation: Optional[tensorrt_llm.llmapi.llm_args.KvCacheConfig]
default: null
name: kv_cache_config
load_format:
annotation: Literal['auto', 'dummy']
default: auto
name: load_format
max_batch_size:
annotation: Optional[int]
default: null
name: max_batch_size
max_cpu_loras:
annotation: int
default: 4
name: max_cpu_loras
max_lora_rank:
annotation: Optional[int]
default: null
name: max_lora_rank
max_loras:
annotation: int
default: 4
name: max_loras
max_num_tokens:
annotation: Optional[int]
default: null
name: max_num_tokens
max_prompt_adapter_token:
annotation: int
default: 0
name: max_prompt_adapter_token
model:
annotation: Union[str, pathlib.Path]
default: inspect._empty
name: model
moe_expert_parallel_size:
annotation: Optional[int]
default: null
name: moe_expert_parallel_size
moe_tensor_parallel_size:
annotation: Optional[int]
default: null
name: moe_tensor_parallel_size
normalize_log_probs:
annotation: bool
default: false
name: normalize_log_probs
peft_cache_config:
annotation: Optional[tensorrt_llm.llmapi.llm_args.PeftCacheConfig]
default: null
name: peft_cache_config
pipeline_parallel_size:
annotation: int
default: 1
name: pipeline_parallel_size
quant_config:
annotation: Optional[tensorrt_llm.models.modeling_utils.QuantConfig]
default: null
name: quant_config
request_stats_max_iterations:
annotation: Optional[int]
default: null
name: request_stats_max_iterations
revision:
annotation: Optional[str]
default: null
name: revision
scheduler_config:
annotation: Optional[tensorrt_llm.llmapi.llm_args.SchedulerConfig]
default: null
name: scheduler_config
skip_tokenizer_init:
annotation: bool
default: false
name: skip_tokenizer_init
speculative_config:
annotation: Union[tensorrt_llm.llmapi.llm_args.LookaheadDecodingConfig, tensorrt_llm.llmapi.llm_utils.MedusaDecodingConfig,
tensorrt_llm.llmapi.llm_utils.EagleDecodingConfig, tensorrt_llm.llmapi.MTPDecodingConfig, NoneType]
extended_runtime_perf_knob_config:
annotation: Optional[tensorrt_llm.bindings.executor.ExtendedRuntimePerfKnobConfig]
default: null
name: speculative_config
tensor_parallel_size:
annotation: int
default: 1
name: tensor_parallel_size
tokenizer:
annotation: Union[str, pathlib.Path, transformers.tokenization_utils_base.PreTrainedTokenizerBase,
tensorrt_llm.llmapi.tokenizer.TokenizerBase, NoneType]
decoding_config:
annotation: Optional[tensorrt_llm.bindings.executor.DecodingConfig]
default: null
name: tokenizer
tokenizer_mode:
annotation: Literal['auto', 'slow']
default: auto
name: tokenizer_mode
tokenizer_revision:
# Misc
backend:
annotation: Optional[str]
default: null
name: tokenizer_revision
trust_remote_code:
max_batch_size:
annotation: Optional[int]
default: null
max_num_tokens:
annotation: Optional[int]
default: null
enable_attention_dp:
annotation: bool
default: false
name: trust_remote_code
normalize_log_probs:
annotation: bool
default: false
gather_generation_logits:
annotation: bool
default: false
gpus_per_node:
annotation: Optional[int]
default: null
iter_stats_max_iterations:
annotation: Optional[int]
default: null
request_stats_max_iterations:
annotation: Optional[int]
default: null
workspace:
annotation: Optional[str]
default: null
name: workspace
return_annotation: None
from_kwargs:
name: from_kwargs
parameters:
kwargs:
annotation: Any
default: inspect._empty
name: kwargs
return_annotation: tensorrt_llm.llmapi.llm_utils.LlmArgs
to_dict:
name: to_dict
parameters: {}
return_annotation: dict
properties: {}

View File

@@ -1,26 +1,20 @@
methods:
__call__:
name: __call__
parameters:
client_id:
annotation: Optional[int]
default: inspect._empty
name: client_id
logits:
annotation: torch.Tensor
default: inspect._empty
name: logits
req_id:
annotation: int
default: inspect._empty
name: req_id
stream_ptr:
annotation: int
default: inspect._empty
name: stream_ptr
token_ids:
annotation: List[List[int]]
default: inspect._empty
name: token_ids
return_annotation: None
properties: {}

View File

@@ -1,54 +1,41 @@
methods:
__init__:
name: __init__
parameters:
clamp_val:
annotation: Optional[List[float]]
default: null
name: clamp_val
exclude_modules:
annotation: Optional[List[str]]
default: null
name: exclude_modules
group_size:
annotation: int
default: 128
name: group_size
has_zero_point:
annotation: bool
default: false
name: has_zero_point
kv_cache_quant_algo:
annotation: Optional[tensorrt_llm.quantization.mode.QuantAlgo]
default: null
name: kv_cache_quant_algo
pre_quant_scale:
annotation: bool
default: false
name: pre_quant_scale
quant_algo:
annotation: Optional[tensorrt_llm.quantization.mode.QuantAlgo]
default: null
name: quant_algo
smoothquant_val:
annotation: float
default: 0.5
name: smoothquant_val
use_meta_recipe:
annotation: bool
default: false
name: use_meta_recipe
return_annotation: None
from_dict:
name: from_dict
parameters:
config:
annotation: dict
default: inspect._empty
name: config
return_annotation: tensorrt_llm.models.modeling_utils.QuantConfig
to_dict:
name: to_dict
parameters: {}
return_annotation: dict
properties: {}

View File

@@ -1,50 +1,11 @@
methods:
__init__:
name: __init__
parameters: {}
return_annotation: None
abort:
name: abort
parameters: {}
return_annotation: None
aborted:
name: aborted
parameters: {}
return_annotation: bool
aresult:
name: aresult
parameters: {}
return_annotation: tensorrt_llm.executor.result.GenerationResult
result:
name: result
parameters:
timeout:
annotation: Optional[float]
default: None
name: timeout
return_annotation: tensorrt_llm.executor.result.GenerationResult
properties:
context_logits:
annotation: Optional[torch.Tensor]
default: inspect._empty
name: context_logits
finished:
annotation: bool
default: inspect._empty
name: finished
outputs:
annotation: List[tensorrt_llm.executor.result.CompletionOutput]
default: inspect._empty
name: outputs
prompt:
annotation: Optional[str]
default: inspect._empty
name: prompt
prompt_token_ids:
annotation: List[int]
default: inspect._empty
name: prompt_token_ids
request_id:
annotation: int
default: inspect._empty
name: request_id
properties: {}

View File

@@ -1,206 +1,28 @@
methods:
__init__:
name: __init__
parameters:
add_special_tokens:
annotation: bool
default: true
name: add_special_tokens
# Experimental features
additional_model_outputs:
annotation: Optional[List[tensorrt_llm.sampling_params.AdditionalModelOutput]]
default: null
name: additional_model_outputs
apply_batched_logits_processor:
annotation: bool
default: false
name: apply_batched_logits_processor
bad:
annotation: Union[List[str], str, NoneType]
default: null
name: bad
bad_token_ids:
annotation: Optional[List[int]]
default: null
name: bad_token_ids
beam_search_diversity_rate:
annotation: Optional[float]
default: null
name: beam_search_diversity_rate
beam_width:
annotation: int
default: 1
name: beam_width
best_of:
annotation: Optional[int]
default: null
name: best_of
detokenize:
annotation: bool
default: true
name: detokenize
early_stopping:
annotation: Optional[int]
default: null
name: early_stopping
embedding_bias:
annotation: Optional[torch.Tensor]
default: null
name: embedding_bias
end_id:
annotation: Optional[int]
default: null
name: end_id
exclude_input_from_output:
annotation: bool
default: true
name: exclude_input_from_output
frequency_penalty:
annotation: Optional[float]
default: null
name: frequency_penalty
guided_decoding:
annotation: Optional[tensorrt_llm.sampling_params.GuidedDecodingParams]
default: null
name: guided_decoding
ignore_eos:
annotation: bool
default: false
name: ignore_eos
include_stop_str_in_output:
annotation: bool
default: false
name: include_stop_str_in_output
length_penalty:
annotation: Optional[float]
default: null
name: length_penalty
logits_processor:
annotation: Optional[tensorrt_llm.sampling_params.LogitsProcessor]
default: null
name: logits_processor
lookahead_config:
annotation: Optional[tensorrt_llm.bindings.executor.LookaheadDecodingConfig]
default: null
name: lookahead_config
max_new_tokens:
annotation: Optional[int]
default: null
name: max_new_tokens
max_tokens:
annotation: int
default: 32
name: max_tokens
min_length:
annotation: Optional[int]
default: null
name: min_length
min_p:
annotation: Optional[float]
default: null
name: min_p
min_tokens:
annotation: Optional[int]
default: null
name: min_tokens
n:
annotation: int
default: 1
name: n
no_repeat_ngram_size:
annotation: Optional[int]
default: null
name: no_repeat_ngram_size
num_return_sequences:
annotation: Optional[int]
default: null
name: num_return_sequences
pad_id:
annotation: Optional[int]
default: null
name: pad_id
presence_penalty:
annotation: Optional[float]
default: null
name: presence_penalty
random_seed:
annotation: Optional[int]
default: null
name: random_seed
repetition_penalty:
annotation: Optional[float]
default: null
name: repetition_penalty
return_context_logits:
annotation: bool
default: false
name: return_context_logits
return_encoder_output:
annotation: bool
default: false
name: return_encoder_output
return_generation_logits:
annotation: bool
default: false
name: return_generation_logits
return_log_probs:
annotation: bool
default: false
name: return_log_probs
return_perf_metrics:
annotation: bool
default: false
name: return_perf_metrics
seed:
# TODO [TRTLLM-3716]: Deprecated arguments
beam_width:
annotation: int
default: 1
max_new_tokens:
annotation: Optional[int]
default: null
name: seed
skip_special_tokens:
annotation: bool
default: true
name: skip_special_tokens
spaces_between_special_tokens:
annotation: bool
default: true
name: spaces_between_special_tokens
stop:
annotation: Union[List[str], str, NoneType]
default: null
name: stop
stop_token_ids:
annotation: Optional[List[int]]
default: null
name: stop_token_ids
temperature:
annotation: Optional[float]
default: null
name: temperature
top_k:
min_length:
annotation: Optional[int]
default: null
name: top_k
top_p:
annotation: Optional[float]
default: null
name: top_p
top_p_decay:
annotation: Optional[float]
default: null
name: top_p_decay
top_p_min:
annotation: Optional[float]
default: null
name: top_p_min
top_p_reset_ids:
num_return_sequences:
annotation: Optional[int]
default: null
name: top_p_reset_ids
truncate_prompt_tokens:
random_seed:
annotation: Optional[int]
default: null
name: truncate_prompt_tokens
use_beam_search:
annotation: bool
default: false
name: use_beam_search
return_annotation: None
properties: {}

View File

@@ -0,0 +1,27 @@
methods:
__init__:
parameters:
index:
annotation: int
default: inspect._empty
text:
annotation: str
default: ''
token_ids:
annotation: Optional[List[int]]
default: null
finish_reason:
annotation: Optional[Literal['stop', 'length', 'timeout', 'cancelled']]
default: null
stop_reason:
annotation: Union[int, str, NoneType]
default: null
generation_logits:
annotation: Optional[torch.Tensor]
default: null
# TODO [TRTLLM-1049]
# logprobs:
# annotation: Optional[SampleLogprobs]
# default: null
return_annotation: None
properties: {}

View File

@@ -0,0 +1,79 @@
methods:
__init__:
parameters:
model:
annotation: Union[str, pathlib.Path]
default: inspect._empty
tokenizer:
annotation: Union[str, pathlib.Path, transformers.tokenization_utils_base.PreTrainedTokenizerBase,
tensorrt_llm.llmapi.tokenizer.TokenizerBase, NoneType]
default: null
tokenizer_mode:
annotation: Literal['auto', 'slow']
default: auto
skip_tokenizer_init:
annotation: bool
default: false
trust_remote_code:
annotation: bool
default: false
tensor_parallel_size:
annotation: int
default: 1
dtype:
annotation: str
default: auto
revision:
annotation: Optional[str]
default: null
tokenizer_revision:
annotation: Optional[str]
default: null
kwargs:
annotation: Any
default: inspect._empty
return_annotation: None
generate:
parameters:
inputs:
annotation: Union[str, List[int], tensorrt_llm.inputs.data.TextPrompt, tensorrt_llm.inputs.data.TokensPrompt,
Sequence[Union[str, List[int], tensorrt_llm.inputs.data.TextPrompt, tensorrt_llm.inputs.data.TokensPrompt]]]
default: inspect._empty
sampling_params:
annotation: Union[tensorrt_llm.sampling_params.SamplingParams, List[tensorrt_llm.sampling_params.SamplingParams],
NoneType]
default: null
lora_request:
annotation: Union[tensorrt_llm.executor.request.LoRARequest, Sequence[tensorrt_llm.executor.request.LoRARequest],
NoneType]
default: null
prompt_adapter_request:
annotation: Union[tensorrt_llm.executor.request.PromptAdapterRequest, Sequence[tensorrt_llm.executor.request.PromptAdapterRequest],
NoneType]
default: null
use_tqdm:
annotation: bool
default: true
return_annotation: Union[tensorrt_llm.llmapi.llm.RequestOutput, List[tensorrt_llm.llmapi.llm.RequestOutput]]
generate_async:
parameters:
inputs:
annotation: Union[str, List[int], tensorrt_llm.inputs.data.TextPrompt, tensorrt_llm.inputs.data.TokensPrompt]
default: inspect._empty
sampling_params:
annotation: Optional[tensorrt_llm.sampling_params.SamplingParams]
default: null
lora_request:
annotation: Optional[tensorrt_llm.executor.request.LoRARequest]
default: null
prompt_adapter_request:
annotation: Optional[tensorrt_llm.executor.request.PromptAdapterRequest]
default: null
streaming:
annotation: bool
default: false
return_annotation: tensorrt_llm.llmapi.llm.RequestOutput
properties:
tokenizer:
annotation: Optional[tensorrt_llm.llmapi.tokenizer.TokenizerBase]
default: inspect._empty

View File

@@ -0,0 +1,96 @@
methods:
__init__:
parameters:
# Explicit arguments
model:
annotation: Union[str, pathlib.Path]
default: inspect._empty
tokenizer:
annotation: Union[str, pathlib.Path, transformers.tokenization_utils_base.PreTrainedTokenizerBase,
tensorrt_llm.llmapi.tokenizer.TokenizerBase, NoneType]
default: null
tokenizer_mode:
annotation: Literal['auto', 'slow']
default: auto
skip_tokenizer_init:
annotation: bool
default: false
trust_remote_code:
annotation: bool
default: false
tensor_parallel_size:
annotation: int
default: 1
dtype:
annotation: str
default: auto
revision:
annotation: Optional[str]
default: null
tokenizer_revision:
annotation: Optional[str]
default: null
# Parallelism
pipeline_parallel_size:
annotation: int
default: 1
context_parallel_size:
annotation: int
default: 1
moe_tensor_parallel_size:
annotation: Optional[int]
default: null
moe_expert_parallel_size:
annotation: Optional[int]
default: null
# LoRA
enable_lora:
annotation: bool
default: false
max_lora_rank:
annotation: Optional[int]
default: null
max_loras:
annotation: int
default: 4
max_cpu_loras:
annotation: int
default: 4
# Prompt tuning
enable_prompt_adapter:
annotation: bool
default: false
max_prompt_adapter_token:
annotation: int
default: 0
# Logits processor and guided decoding
batched_logits_processor:
annotation: Optional[tensorrt_llm.sampling_params.BatchedLogitsProcessor]
default: null
guided_decoding_backend:
annotation: Optional[str]
default: null
# Quantization and calibration
quant_config:
annotation: Optional[tensorrt_llm.models.modeling_utils.QuantConfig]
default: null
calib_config:
annotation: Optional[tensorrt_llm.llmapi.llm_utils.CalibConfig]
default: null
# Speculative decoding
speculative_config:
annotation: Union[tensorrt_llm.llmapi.llm_args.LookaheadDecodingConfig, tensorrt_llm.llmapi.llm_utils.MedusaDecodingConfig,
tensorrt_llm.llmapi.llm_utils.EagleDecodingConfig, tensorrt_llm.llmapi.MTPDecodingConfig, NoneType]
default: null
# Misc
load_format:
annotation: Literal['auto', 'dummy']
default: auto
enable_tqdm:
annotation: bool
default: false
enable_chunked_prefill:
annotation: bool
default: false
return_annotation: None
properties: {}

View File

@@ -0,0 +1,33 @@
methods:
aresult:
parameters: {}
return_annotation: tensorrt_llm.executor.result.GenerationResult
result:
parameters:
timeout:
annotation: Optional[float]
default: None
return_annotation: tensorrt_llm.executor.result.GenerationResult
properties:
request_id:
annotation: int
default: inspect._empty
prompt:
annotation: Optional[str]
default: inspect._empty
prompt_token_ids:
annotation: List[int]
default: inspect._empty
outputs:
annotation: List[tensorrt_llm.executor.result.CompletionOutput]
default: inspect._empty
context_logits:
annotation: Optional[torch.Tensor]
default: inspect._empty
finished:
annotation: bool
default: inspect._empty
# TODO [TRTLLM-876]
# prompt_logprobs:
# annotation: Optional[PromptLogprobs]
# default: inspect._empty

View File

@@ -0,0 +1,142 @@
methods:
__init__:
parameters:
# General
n:
annotation: int
default: 1
best_of:
annotation: Optional[int]
default: null
use_beam_search:
annotation: bool
default: false
beam_search_diversity_rate:
annotation: Optional[float]
default: null
early_stopping:
annotation: Optional[int]
default: null
max_tokens:
annotation: int
default: 32
min_tokens:
annotation: Optional[int]
default: null
end_id:
annotation: Optional[int]
default: null
pad_id:
annotation: Optional[int]
default: null
# Sampling
seed:
annotation: Optional[int]
default: null
temperature:
annotation: Optional[float]
default: null
top_k:
annotation: Optional[int]
default: null
top_p:
annotation: Optional[float]
default: null
top_p_decay:
annotation: Optional[float]
default: null
top_p_min:
annotation: Optional[float]
default: null
top_p_reset_ids:
annotation: Optional[int]
default: null
min_p:
annotation: Optional[float]
default: null
# Penalties
repetition_penalty:
annotation: Optional[float]
default: null
presence_penalty:
annotation: Optional[float]
default: null
frequency_penalty:
annotation: Optional[float]
default: null
length_penalty:
annotation: Optional[float]
default: null
no_repeat_ngram_size:
annotation: Optional[int]
default: null
# Stop words and bad words
stop:
annotation: Union[List[str], str, NoneType]
default: null
stop_token_ids:
annotation: Optional[List[int]]
default: null
include_stop_str_in_output:
annotation: bool
default: false
bad:
annotation: Union[List[str], str, NoneType]
default: null
bad_token_ids:
annotation: Optional[List[int]]
default: null
# Logits processor and guided decoding
logits_processor:
annotation: Optional[tensorrt_llm.sampling_params.LogitsProcessor]
default: null
apply_batched_logits_processor:
annotation: bool
default: false
guided_decoding:
annotation: Optional[tensorrt_llm.sampling_params.GuidedDecodingParams]
default: null
embedding_bias:
annotation: Optional[torch.Tensor]
default: null
# Speculative decoding
lookahead_config:
annotation: Optional[tensorrt_llm.bindings.executor.LookaheadDecodingConfig]
default: null
# Tokenizer behavior
ignore_eos:
annotation: bool
default: false
detokenize:
annotation: bool
default: true
add_special_tokens:
annotation: bool
default: true
truncate_prompt_tokens:
annotation: Optional[int]
default: null
skip_special_tokens:
annotation: bool
default: true
spaces_between_special_tokens:
annotation: bool
default: true
# Returning controls
return_log_probs:
annotation: bool
default: false
return_context_logits:
annotation: bool
default: false
return_generation_logits:
annotation: bool
default: false
exclude_input_from_output:
annotation: bool
default: true
return_encoder_output:
annotation: bool
default: false
return_annotation: None
properties: {}

View File

@@ -66,13 +66,13 @@ class TestLogitsProcessor(ApiStabilityTestHarness):
def create_snapshot_from_inspect(self):
method_snapshot = MethodSnapshot.from_inspect(
"__call__", MethodType(self.TEST_CLASS.__call__, object()))
MethodType(self.TEST_CLASS.__call__, object()))
return ClassSnapshot(methods={"__call__": method_snapshot},
properties={})
def create_snapshot_from_docstring(self):
method_snapshot = MethodSnapshot.from_docstring(
"__call__", MethodType(self.TEST_CLASS.__call__, object()))
MethodType(self.TEST_CLASS.__call__, object()))
return ClassSnapshot(methods={"__call__": method_snapshot},
properties={})
@@ -83,13 +83,13 @@ class TestBatchedLogitsProcessor(ApiStabilityTestHarness):
def create_snapshot_from_inspect(self):
method_snapshot = MethodSnapshot.from_inspect(
"__call__", MethodType(self.TEST_CLASS.__call__, object()))
MethodType(self.TEST_CLASS.__call__, object()))
return ClassSnapshot(methods={"__call__": method_snapshot},
properties={})
def create_snapshot_from_docstring(self):
method_snapshot = MethodSnapshot.from_docstring(
"__call__", MethodType(self.TEST_CLASS.__call__, object()))
MethodType(self.TEST_CLASS.__call__, object()))
return ClassSnapshot(methods={"__call__": method_snapshot},
properties={})