Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-14 06:27:45 +08:00)
test: [TRTLLM-4334] Create 1.0 criteria scope from API stability references (#3069)
* committed APIs validation
* fix
* clean name
* separate
* add TODOs
* fix naming
* fix

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
This commit is contained in:
parent ea3739ee62
commit 224469b096
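The change set below adds a second tier of API references: tests/unittest/api_stability/references/ still pins the full public surface exactly, while the new tests/unittest/api_stability/references_committed/ pins the subset of APIs committed for the 1.0 release. At setup time the committed snapshot is merged into the full reference, and the live class must contain the committed snapshot (nothing committed may change) and equal the merged reference (nothing else may drift). A minimal sketch of that idea, simplified from the harness in this diff (helper names here are illustrative, not part of the commit):

import inspect

def init_params(cls) -> dict:
    """Map parameter name -> (annotation, default) for cls.__init__."""
    sig = inspect.signature(cls.__init__)
    return {name: (p.annotation, p.default)
            for name, p in sig.parameters.items() if name != "self"}

def assert_containing(live: dict, committed: dict) -> None:
    # Every committed parameter must still exist with the same signature;
    # extra, not-yet-committed parameters are allowed to evolve.
    for name, snap in committed.items():
        assert name in live, f"committed parameter '{name}' was removed"
        assert live[name] == snap, f"committed parameter '{name}' changed"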
@@ -17,7 +17,6 @@ class LLM(BaseLLM):
                 skip_tokenizer_init: bool = False,
                 trust_remote_code: bool = False,
                 tensor_parallel_size: int = 1,
                 pipeline_parallel_size: int = 1,
                 dtype: str = "auto",
                 revision: Optional[str] = None,
                 tokenizer_revision: Optional[str] = None,
@@ -26,6 +25,5 @@ class LLM(BaseLLM):
        kwargs_dict = dict(kwargs)
        kwargs_dict['backend'] = 'pytorch'
        super().__init__(model, tokenizer, tokenizer_mode, skip_tokenizer_init,
                         trust_remote_code, tensor_parallel_size,
                         pipeline_parallel_size, dtype, revision,
                         tokenizer_revision, **kwargs_dict)
                         trust_remote_code, tensor_parallel_size, dtype,
                         revision, tokenizer_revision, **kwargs_dict)
@@ -98,7 +98,6 @@ class LLM:
                 skip_tokenizer_init: bool = False,
                 trust_remote_code: bool = False,
                 tensor_parallel_size: int = 1,
                 pipeline_parallel_size: int = 1,
                 dtype: str = "auto",
                 revision: Optional[str] = None,
                 tokenizer_revision: Optional[str] = None,
@@ -116,7 +115,6 @@ class LLM:
            skip_tokenizer_init=skip_tokenizer_init,
            trust_remote_code=trust_remote_code,
            tensor_parallel_size=tensor_parallel_size,
            pipeline_parallel_size=pipeline_parallel_size,
            dtype=dtype,
            revision=revision,
            tokenizer_revision=tokenizer_revision,
@@ -650,8 +650,6 @@ LLMARGS_EXPLICIT_DOCSTRING = """
    tensor_parallel_size(int): The number of processes for tensor parallelism. Defaults to 1.

    pipeline_parallel_size(int): The number of processes for pipeline parallelism. Defaults to 1.

    dtype (str): The data type for the model weights and activations. Defaults to "auto".
        Can be "float16", "bfloat16", "float32", or "auto". If "auto", the data type will be automatically inferred from the source model.
        If the source data type is "float32", it will be converted to "float16".
@@ -662,6 +660,8 @@ LLMARGS_EXPLICIT_DOCSTRING = """
"""

LLMARGS_IMPLICIT_DOCSTRING = """
    pipeline_parallel_size(int): The number of processes for pipeline parallelism. Defaults to 1.

    context_parallel_size (int): The context parallel size. Defaults to 1.

    gpus_per_node (int, optional): The number of GPUs per node. None means automatic configure. Defaults to None.
@@ -769,8 +769,6 @@ class LlmArgs:
    tensor_parallel_size: int = 1

    pipeline_parallel_size: int = 1

    dtype: str = "auto"

    revision: Optional[str] = None
@@ -778,6 +776,8 @@ class LlmArgs:
    tokenizer_revision: Optional[str] = None

    # Below are all remaining arguments
    pipeline_parallel_size: int = 1

    context_parallel_size: int = 1

    gpus_per_node: Optional[int] = None
@@ -1,4 +1,5 @@
# autoflake: skip_file
import copy
import inspect
import os
import pathlib
@@ -28,13 +29,12 @@ def repr_annotation(field_type: type) -> str:

@dataclass(slots=True)
class ParamSnapshot:
    name: str
    annotation: type
    default: Any = None

    @classmethod
    def from_inspect(cls, param: inspect.Parameter):
        return cls(param.name, param.annotation, param.default)
        return cls(param.annotation, param.default)

    @classmethod
    def from_docstring(cls, param: docstring_parser.common.DocstringParam):
@@ -57,7 +57,7 @@ class ParamSnapshot:
        except (NameError, SyntaxError):
            default = param.default

        return cls(param.arg_name, annotation, default)
        return cls(annotation, default)

    @classmethod
    def from_dict(cls, d: dict):
@@ -77,19 +77,17 @@ class ParamSnapshot:
        return d

    def assert_equal(self, other: 'ParamSnapshot'):
        assert self.name == other.name
        assert self.annotation == other.annotation
        assert self.default == other.default

@dataclass(slots=True)
class MethodSnapshot:
    name: str
    parameters: Dict[str, ParamSnapshot]
    return_annotation: type

    @classmethod
    def from_inspect(cls, name: str, method: MethodType):
    def from_inspect(cls, method: MethodType):
        signature = inspect.signature(method)
        parameters = {}
        for param_name, param in signature.parameters.items():
@@ -99,10 +97,10 @@ class MethodSnapshot:
        return_annotation = signature.return_annotation
        if isinstance(return_annotation, str):
            return_annotation = eval(return_annotation)
        return cls(name, parameters, return_annotation)
        return cls(parameters, return_annotation)

    @classmethod
    def from_docstring(cls, name: str, method: MethodType):
    def from_docstring(cls, method: MethodType):
        doc = docstring_parser.parse(method.__doc__)
        parameters = {}
        for param in doc.params:
@@ -112,7 +110,7 @@ class MethodSnapshot:
            return_annotation = None
        else:
            return_annotation = eval(doc.returns.type_name)
        return cls(name, parameters, return_annotation)
        return cls(parameters, return_annotation)

    @classmethod
    def from_dict(cls, d: dict):
@@ -132,13 +130,23 @@ class MethodSnapshot:
        d['return_annotation'] = repr_annotation(d['return_annotation'])
        return d

    def merge(self, other: 'MethodSnapshot'):
        assert self.parameters.keys().isdisjoint(other.parameters.keys())
        self.parameters.update(copy.deepcopy(other.parameters))
        assert self.return_annotation == other.return_annotation

    def assert_equal(self, other: 'MethodSnapshot'):
        assert self.name == other.name
        assert self.parameters.keys() == other.parameters.keys()
        for name, param in self.parameters.items():
            param.assert_equal(other.parameters[name])
        assert self.return_annotation == other.return_annotation

    def assert_containing(self, other: 'MethodSnapshot'):
        for name, param in other.parameters.items():
            assert name in self.parameters
            self.parameters[name].assert_equal(param)
        assert self.return_annotation == other.return_annotation

@dataclass(slots=True)
class ClassSnapshot:
@@ -153,16 +161,14 @@ class ClassSnapshot:
                inst, predicate=inspect.ismethod):
            if method_name.startswith("_") and method_name != "__init__":
                continue
            methods[method_name] = MethodSnapshot.from_inspect(
                method_name, method)
            methods[method_name] = MethodSnapshot.from_inspect(method)
        properties = {}
        for prop_name, prop in inspect.getmembers(
                snapshot_cls, predicate=lambda x: isinstance(x, property)):
            if prop_name.startswith("_"):
                continue
            annotation = inspect.signature(prop.fget).return_annotation
            properties[prop_name] = ParamSnapshot(prop_name, annotation,
                                                  inspect._empty)
            properties[prop_name] = ParamSnapshot(annotation, inspect._empty)
        return cls(methods, properties)

    @classmethod
@@ -175,10 +181,9 @@ class ClassSnapshot:
                continue
            if method_name == "__init__":
                methods["__init__"] = MethodSnapshot.from_docstring(
                    "__init__", snapshot_cls)
                    snapshot_cls)
            else:
                methods[method_name] = MethodSnapshot.from_docstring(
                    method_name, method)
                methods[method_name] = MethodSnapshot.from_docstring(method)
        properties = {}
        doc = docstring_parser.parse(snapshot_cls.__doc__)
        for param in doc.params:
@@ -210,6 +215,19 @@ class ClassSnapshot:
        }
        return d

    def merge(self, other: 'ClassSnapshot'):
        for name, method in self.methods.items():
            if name in other.methods:
                method.merge(other.methods[name])
        new_methods = {
            name: method
            for name, method in other.methods.items()
            if name not in self.methods
        }
        self.methods.update(copy.deepcopy(new_methods))
        assert self.properties.keys().isdisjoint(other.properties.keys())
        self.properties.update(copy.deepcopy(other.properties))

    def assert_equal(self, other: 'ClassSnapshot'):
        assert self.methods.keys() == other.methods.keys()
        for name, method in self.methods.items():
@@ -218,30 +236,47 @@ class ClassSnapshot:
        for name, prop in self.properties.items():
            prop.assert_equal(other.properties[name])

    def assert_containing(self, other: 'ClassSnapshot'):
        for name, method in other.methods.items():
            assert name in self.methods
            self.methods[name].assert_containing(method)
        for name, prop in other.properties.items():
            assert name in self.properties
            self.properties[name].assert_equal(prop)

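To make the merge and containment semantics above concrete, here is a hedged usage sketch. It assumes the post-change dataclass shapes from this diff, where ParamSnapshot holds (annotation, default) and MethodSnapshot holds (parameters, return_annotation); the parameter names are illustrative:

p_model = ParamSnapshot(annotation=str, default=inspect._empty)
p_backend = ParamSnapshot(annotation=str, default=None)

full = MethodSnapshot(parameters={"backend": p_backend},
                      return_annotation=None)
committed = MethodSnapshot(parameters={"model": p_model},
                           return_annotation=None)

full.merge(committed)              # parameter sets are disjoint, so this passes
assert set(full.parameters) == {"backend", "model"}
full.assert_containing(committed)  # the committed subset is still intact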
class ApiStabilityTestHarness:
    TEST_CLASS = None
    REFERENCE_COMMITTED_DIR = f"{os.path.dirname(__file__)}/references_committed"
    REFERENCE_DIR = f"{os.path.dirname(__file__)}/references"
    REFERENCE_FILE = None

    @classmethod
    def reference_path(cls):
        return f"{cls.REFERENCE_DIR}/{cls.REFERENCE_FILE}"

    @classmethod
    def setup_class(cls):
        with open(cls.reference_path()) as f:
        with open(f"{cls.REFERENCE_DIR}/{cls.REFERENCE_FILE}") as f:
            cls.reference = ClassSnapshot.from_dict(yaml.safe_load(f))
        cls.error_msg = (
            f"API stability validation failed. "
            f"This is probably because you changed {cls.TEST_CLASS.__name__}'s APIs, please ask for reviews from the code owners."
        )
        if os.path.exists(
                f"{cls.REFERENCE_COMMITTED_DIR}/{cls.REFERENCE_FILE}"):
            with open(
                    f"{cls.REFERENCE_COMMITTED_DIR}/{cls.REFERENCE_FILE}") as f:
                cls.reference_committed = ClassSnapshot.from_dict(
                    yaml.safe_load(f))
            cls.reference.merge(cls.reference_committed)
        else:
            cls.reference_committed = None
        cls.error_msg = f"API validation failed because you changed {cls.TEST_CLASS.__name__}'s APIs, please ask for reviews from the code owners."
        cls.error_msg_committed = f"API validation failed because you changed {cls.TEST_CLASS.__name__}'s committed APIs, please ask for approval."

    def create_snapshot_from_inspect(self):
        return ClassSnapshot.from_inspect(self.TEST_CLASS)

    def test_signature(self):
        snapshot = self.create_snapshot_from_inspect()
        if self.reference_committed is not None:
            try:
                snapshot.assert_containing(self.reference_committed)
            except AssertionError as e:
                raise AssertionError(self.error_msg_committed) from e
        try:
            snapshot.assert_equal(self.reference)
        except AssertionError as e:
@@ -252,6 +287,11 @@ class ApiStabilityTestHarness:

    def test_docstring(self):
        snapshot = self.create_snapshot_from_docstring()
        if self.reference_committed is not None:
            try:
                snapshot.assert_containing(self.reference_committed)
            except AssertionError as e:
                raise AssertionError(self.error_msg_committed) from e
        try:
            snapshot.assert_equal(self.reference)
        except AssertionError as e:
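A concrete test then only has to point the harness at a class and a reference file; pytest calls setup_class once, and test_signature/test_docstring compare live snapshots against the merged reference. A hedged sketch of the wiring (the class name here is illustrative, not taken from the diff):

class TestLLM(ApiStabilityTestHarness):
    TEST_CLASS = LLM             # public class under test
    REFERENCE_FILE = "llm.yaml"  # expected in references/, and optionally in
                                 # references_committed/ for the 1.0 subset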
@@ -1,26 +1,20 @@
methods:
  __call__:
    name: __call__
    parameters:
      client_ids:
        annotation: List[Optional[int]]
        default: inspect._empty
        name: client_ids
      logits:
        annotation: List[torch.Tensor]
        default: inspect._empty
        name: logits
      req_ids:
        annotation: List[int]
        default: inspect._empty
        name: req_ids
      stream_ptr:
        annotation: int
        default: inspect._empty
        name: stream_ptr
      token_ids:
        annotation: List[List[List[int]]]
        default: inspect._empty
        name: token_ids
    return_annotation: None
properties: {}
@@ -1,46 +1,35 @@
methods:
  __init__:
    name: __init__
    parameters:
      calib_batch_size:
        annotation: int
        default: 1
        name: calib_batch_size
      calib_batches:
        annotation: int
        default: 512
        name: calib_batches
      calib_dataset:
        annotation: str
        default: cnn_dailymail
        name: calib_dataset
      calib_max_seq_length:
        annotation: int
        default: 512
        name: calib_max_seq_length
      device:
        annotation: Literal['cuda', 'cpu']
        default: cuda
        name: device
      random_seed:
        annotation: int
        default: 1234
        name: random_seed
      tokenizer_max_seq_length:
        annotation: int
        default: 2048
        name: tokenizer_max_seq_length
    return_annotation: None
  from_dict:
    name: from_dict
    parameters:
      config:
        annotation: dict
        default: inspect._empty
        name: config
    return_annotation: tensorrt_llm.llmapi.llm_utils.CalibConfig
  to_dict:
    name: to_dict
    parameters: {}
    return_annotation: dict
properties: {}
@@ -1,58 +1,26 @@
methods:
  __init__:
    name: __init__
    parameters:
      cumulative_logprob:
        annotation: Optional[float]
        default: null
        name: cumulative_logprob
      disaggregated_params:
        annotation: Optional[tensorrt_llm.disaggregated_params.DisaggregatedParams]
        default: null
        name: disaggregated_params
      finish_reason:
        annotation: Optional[Literal['stop', 'length', 'timeout', 'cancelled']]
        default: null
        name: finish_reason
      generation_logits:
        annotation: Optional[torch.Tensor]
        default: null
        name: generation_logits
      index:
        annotation: int
        default: inspect._empty
        name: index
      logprobs:
        annotation: Optional[List[float]]
        default: null
        name: logprobs
      stop_reason:
        annotation: Union[int, str, NoneType]
      disaggregated_params:
        annotation: Optional[tensorrt_llm.disaggregated_params.DisaggregatedParams]
        default: null
        name: stop_reason
      text:
        annotation: str
        default: ''
        name: text
      token_ids:
        annotation: Optional[List[int]]
        default: null
        name: token_ids
    return_annotation: None
properties:
  length:
    annotation: int
    default: inspect._empty
    name: length
  logprobs_diff:
    annotation: List[float]
    default: inspect._empty
    name: logprobs_diff
  text_diff:
    annotation: str
    default: inspect._empty
    name: text_diff
  token_ids_diff:
    annotation: List[int]
    default: inspect._empty
    name: token_ids_diff
@@ -1,22 +1,17 @@
methods:
  __init__:
    name: __init__
    parameters:
      grammar:
        annotation: Optional[str]
        default: null
        name: grammar
      json:
        annotation: Union[str, pydantic.main.BaseModel, dict, NoneType]
        default: null
        name: json
      json_object:
        annotation: bool
        default: false
        name: json_object
      regex:
        annotation: Optional[str]
        default: null
        name: regex
    return_annotation: None
properties: {}
@@ -1,182 +1,69 @@
methods:
  __init__:
    name: __init__
    parameters:
      dtype:
        annotation: str
        default: auto
        name: dtype
      kwargs:
        annotation: Any
        default: inspect._empty
        name: kwargs
      model:
        annotation: Union[str, pathlib.Path]
        default: inspect._empty
        name: model
      revision:
        annotation: Optional[str]
        default: null
        name: revision
      skip_tokenizer_init:
        annotation: bool
        default: false
        name: skip_tokenizer_init
      tensor_parallel_size:
        annotation: int
        default: 1
        name: tensor_parallel_size
      pipeline_parallel_size:
        annotation: int
        default: 1
        name: pipeline_parallel_size
      tokenizer:
        annotation: Union[str, pathlib.Path, transformers.tokenization_utils_base.PreTrainedTokenizerBase,
          tensorrt_llm.llmapi.tokenizer.TokenizerBase, NoneType]
        default: null
        name: tokenizer
      tokenizer_mode:
        annotation: Literal['auto', 'slow']
        default: auto
        name: tokenizer_mode
      tokenizer_revision:
        annotation: Optional[str]
        default: null
        name: tokenizer_revision
      trust_remote_code:
        annotation: bool
        default: false
        name: trust_remote_code
    parameters: {}
    return_annotation: None
  generate:
    name: generate
    parameters:
      # TODO [TRTLLM-3925]
      disaggregated_params:
        annotation: Optional[tensorrt_llm.disaggregated_params.DisaggregatedParams]
        default: null
        name: disaggregated_params
      inputs:
        annotation: Union[str, List[int], tensorrt_llm.inputs.data.TextPrompt, tensorrt_llm.inputs.data.TokensPrompt,
          Sequence[Union[str, List[int], tensorrt_llm.inputs.data.TextPrompt, tensorrt_llm.inputs.data.TokensPrompt]]]
        default: inspect._empty
        name: inputs
      kv_cache_retention_config:
        annotation: Optional[tensorrt_llm.bindings.executor.KvCacheRetentionConfig]
        default: null
        name: kv_cache_retention_config
      lora_request:
        annotation: Union[tensorrt_llm.executor.request.LoRARequest, Sequence[tensorrt_llm.executor.request.LoRARequest],
          NoneType]
        default: null
        name: lora_request
      prompt_adapter_request:
        annotation: Union[tensorrt_llm.executor.request.PromptAdapterRequest, Sequence[tensorrt_llm.executor.request.PromptAdapterRequest],
          NoneType]
        default: null
        name: prompt_adapter_request
      queries:
        annotation: Union[str, List[int], tensorrt_llm.inputs.data.TextPrompt, tensorrt_llm.inputs.data.TokensPrompt,
          Sequence[Union[str, List[int], tensorrt_llm.inputs.data.TextPrompt, tensorrt_llm.inputs.data.TokensPrompt]],
          NoneType]
        default: null
        name: queries
      sampling_params:
        annotation: Union[tensorrt_llm.sampling_params.SamplingParams, List[tensorrt_llm.sampling_params.SamplingParams],
          NoneType]
        default: null
        name: sampling_params
      use_tqdm:
        annotation: bool
        default: true
        name: use_tqdm
    return_annotation: Union[tensorrt_llm.llmapi.llm.RequestOutput, List[tensorrt_llm.llmapi.llm.RequestOutput]]
  generate_async:
    name: generate_async
    parameters:
      disaggregated_params:
        annotation: Optional[tensorrt_llm.disaggregated_params.DisaggregatedParams]
        default: null
        name: disaggregated_params
      inputs:
        annotation: Union[str, List[int], tensorrt_llm.inputs.data.TextPrompt, tensorrt_llm.inputs.data.TokensPrompt]
        default: inspect._empty
        name: inputs
      kv_cache_retention_config:
        annotation: Optional[tensorrt_llm.bindings.executor.KvCacheRetentionConfig]
        default: null
        name: kv_cache_retention_config
      lora_request:
        annotation: Optional[tensorrt_llm.executor.request.LoRARequest]
        default: null
        name: lora_request
      prompt_adapter_request:
        annotation: Optional[tensorrt_llm.executor.request.PromptAdapterRequest]
        default: null
        name: prompt_adapter_request
      queries:
        annotation: Union[str, List[int], tensorrt_llm.inputs.data.TextPrompt, tensorrt_llm.inputs.data.TokensPrompt,
          NoneType]
        default: null
        name: queries
      sampling_params:
        annotation: Optional[tensorrt_llm.sampling_params.SamplingParams]
        default: null
        name: sampling_params
      streaming:
        annotation: bool
        default: false
        name: streaming
    return_annotation: tensorrt_llm.llmapi.llm.RequestOutput
  get_kv_cache_events:
    name: get_kv_cache_events
    parameters:
      timeout:
        annotation: Optional[float]
        default: 2
        name: timeout
    return_annotation: List[dict]
  get_kv_cache_events_async:
    name: get_kv_cache_events_async
    parameters:
      timeout:
        annotation: Optional[float]
        default: 2
        name: timeout
    return_annotation: tensorrt_llm.executor.result.IterationResult
  get_stats:
    name: get_stats
    parameters:
      timeout:
        annotation: Optional[float]
        default: 2
        name: timeout
    return_annotation: List[dict]
  get_stats_async:
    name: get_stats_async
    parameters:
      timeout:
        annotation: Optional[float]
        default: 2
        name: timeout
    return_annotation: tensorrt_llm.executor.result.IterationResult
  save:
    name: save
    parameters:
      engine_dir:
        annotation: str
        default: inspect._empty
        name: engine_dir
    return_annotation: None
  shutdown:
    name: shutdown
    parameters: {}
    return_annotation: None
properties:
  tokenizer:
    annotation: Optional[tensorrt_llm.llmapi.tokenizer.TokenizerBase]
    default: inspect._empty
    name: tokenizer
  workspace:
    annotation: pathlib.Path
    default: inspect._empty
    name: workspace
@@ -1,220 +1,87 @@
methods:
  __init__:
    name: __init__
    parameters:
      auto_parallel:
        annotation: bool
        default: false
        name: auto_parallel
      auto_parallel_world_size:
        annotation: int
        default: 1
        name: auto_parallel_world_size
      backend:
        annotation: Optional[str]
        default: null
        name: backend
      batched_logits_processor:
        annotation: Optional[tensorrt_llm.sampling_params.BatchedLogitsProcessor]
        default: null
        name: batched_logits_processor
      batching_type:
        annotation: Optional[tensorrt_llm.bindings.executor.BatchingType]
        default: null
        name: batching_type
      build_config:
        annotation: Optional[tensorrt_llm.builder.BuildConfig]
        default: null
        name: build_config
      calib_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_utils.CalibConfig]
        default: null
        name: calib_config
      context_parallel_size:
        annotation: int
        default: 1
        name: context_parallel_size
      # Parallelism
      cp_config:
        annotation: Optional[dict]
        default: null
        name: cp_config
      decoding_config:
        annotation: Optional[tensorrt_llm.bindings.executor.DecodingConfig]
        default: null
        name: decoding_config
      dtype:
        annotation: str
        default: auto
        name: dtype
      auto_parallel:
        annotation: bool
        default: false
      auto_parallel_world_size:
        annotation: int
        default: 1
      embedding_parallel_mode:
        annotation: str
        default: SHARDING_ALONG_VOCAB
        name: embedding_parallel_mode
      enable_attention_dp:
        annotation: bool
        default: false
        name: enable_attention_dp
      # Engine building
      build_config:
        annotation: Optional[tensorrt_llm.builder.BuildConfig]
        default: null
      enable_build_cache:
        annotation: Union[tensorrt_llm.llmapi.build_cache.BuildCacheConfig, bool]
        default: false
        name: enable_build_cache
      enable_chunked_prefill:
        annotation: bool
        default: false
        name: enable_chunked_prefill
      enable_lora:
        annotation: bool
        default: false
        name: enable_lora
      enable_prompt_adapter:
        annotation: bool
        default: false
        name: enable_prompt_adapter
      enable_tqdm:
        annotation: bool
        default: false
        name: enable_tqdm
      extended_runtime_perf_knob_config:
        annotation: Optional[tensorrt_llm.bindings.executor.ExtendedRuntimePerfKnobConfig]
        default: null
        name: extended_runtime_perf_knob_config
      fast_build:
        annotation: bool
        default: false
        name: fast_build
      gather_generation_logits:
        annotation: bool
        default: false
        name: gather_generation_logits
      gpus_per_node:
        annotation: Optional[int]
      # Bindings and mirrored configs
      batching_type:
        annotation: Optional[tensorrt_llm.bindings.executor.BatchingType]
        default: null
        name: gpus_per_node
      guided_decoding_backend:
        annotation: Optional[str]
        default: null
        name: guided_decoding_backend
      iter_stats_max_iterations:
        annotation: Optional[int]
        default: null
        name: iter_stats_max_iterations
      kv_cache_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.KvCacheConfig]
        default: null
        name: kv_cache_config
      load_format:
        annotation: Literal['auto', 'dummy']
        default: auto
        name: load_format
      max_batch_size:
        annotation: Optional[int]
        default: null
        name: max_batch_size
      max_cpu_loras:
        annotation: int
        default: 4
        name: max_cpu_loras
      max_lora_rank:
        annotation: Optional[int]
        default: null
        name: max_lora_rank
      max_loras:
        annotation: int
        default: 4
        name: max_loras
      max_num_tokens:
        annotation: Optional[int]
        default: null
        name: max_num_tokens
      max_prompt_adapter_token:
        annotation: int
        default: 0
        name: max_prompt_adapter_token
      model:
        annotation: Union[str, pathlib.Path]
        default: inspect._empty
        name: model
      moe_expert_parallel_size:
        annotation: Optional[int]
        default: null
        name: moe_expert_parallel_size
      moe_tensor_parallel_size:
        annotation: Optional[int]
        default: null
        name: moe_tensor_parallel_size
      normalize_log_probs:
        annotation: bool
        default: false
        name: normalize_log_probs
      peft_cache_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.PeftCacheConfig]
        default: null
        name: peft_cache_config
      pipeline_parallel_size:
        annotation: int
        default: 1
        name: pipeline_parallel_size
      quant_config:
        annotation: Optional[tensorrt_llm.models.modeling_utils.QuantConfig]
        default: null
        name: quant_config
      request_stats_max_iterations:
        annotation: Optional[int]
        default: null
        name: request_stats_max_iterations
      revision:
        annotation: Optional[str]
        default: null
        name: revision
      scheduler_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.SchedulerConfig]
        default: null
        name: scheduler_config
      skip_tokenizer_init:
        annotation: bool
        default: false
        name: skip_tokenizer_init
      speculative_config:
        annotation: Union[tensorrt_llm.llmapi.llm_args.LookaheadDecodingConfig, tensorrt_llm.llmapi.llm_utils.MedusaDecodingConfig,
          tensorrt_llm.llmapi.llm_utils.EagleDecodingConfig, tensorrt_llm.llmapi.MTPDecodingConfig, NoneType]
      extended_runtime_perf_knob_config:
        annotation: Optional[tensorrt_llm.bindings.executor.ExtendedRuntimePerfKnobConfig]
        default: null
        name: speculative_config
      tensor_parallel_size:
        annotation: int
        default: 1
        name: tensor_parallel_size
      tokenizer:
        annotation: Union[str, pathlib.Path, transformers.tokenization_utils_base.PreTrainedTokenizerBase,
          tensorrt_llm.llmapi.tokenizer.TokenizerBase, NoneType]
      decoding_config:
        annotation: Optional[tensorrt_llm.bindings.executor.DecodingConfig]
        default: null
        name: tokenizer
      tokenizer_mode:
        annotation: Literal['auto', 'slow']
        default: auto
        name: tokenizer_mode
      tokenizer_revision:
      # Misc
      backend:
        annotation: Optional[str]
        default: null
        name: tokenizer_revision
      trust_remote_code:
      max_batch_size:
        annotation: Optional[int]
        default: null
      max_num_tokens:
        annotation: Optional[int]
        default: null
      enable_attention_dp:
        annotation: bool
        default: false
        name: trust_remote_code
      normalize_log_probs:
        annotation: bool
        default: false
      gather_generation_logits:
        annotation: bool
        default: false
      gpus_per_node:
        annotation: Optional[int]
        default: null
      iter_stats_max_iterations:
        annotation: Optional[int]
        default: null
      request_stats_max_iterations:
        annotation: Optional[int]
        default: null
      workspace:
        annotation: Optional[str]
        default: null
        name: workspace
    return_annotation: None
  from_kwargs:
    name: from_kwargs
    parameters:
      kwargs:
        annotation: Any
        default: inspect._empty
        name: kwargs
    return_annotation: tensorrt_llm.llmapi.llm_utils.LlmArgs
  to_dict:
    name: to_dict
    parameters: {}
    return_annotation: dict
properties: {}
@@ -1,26 +1,20 @@
methods:
  __call__:
    name: __call__
    parameters:
      client_id:
        annotation: Optional[int]
        default: inspect._empty
        name: client_id
      logits:
        annotation: torch.Tensor
        default: inspect._empty
        name: logits
      req_id:
        annotation: int
        default: inspect._empty
        name: req_id
      stream_ptr:
        annotation: int
        default: inspect._empty
        name: stream_ptr
      token_ids:
        annotation: List[List[int]]
        default: inspect._empty
        name: token_ids
    return_annotation: None
properties: {}
@@ -1,54 +1,41 @@
methods:
  __init__:
    name: __init__
    parameters:
      clamp_val:
        annotation: Optional[List[float]]
        default: null
        name: clamp_val
      exclude_modules:
        annotation: Optional[List[str]]
        default: null
        name: exclude_modules
      group_size:
        annotation: int
        default: 128
        name: group_size
      has_zero_point:
        annotation: bool
        default: false
        name: has_zero_point
      kv_cache_quant_algo:
        annotation: Optional[tensorrt_llm.quantization.mode.QuantAlgo]
        default: null
        name: kv_cache_quant_algo
      pre_quant_scale:
        annotation: bool
        default: false
        name: pre_quant_scale
      quant_algo:
        annotation: Optional[tensorrt_llm.quantization.mode.QuantAlgo]
        default: null
        name: quant_algo
      smoothquant_val:
        annotation: float
        default: 0.5
        name: smoothquant_val
      use_meta_recipe:
        annotation: bool
        default: false
        name: use_meta_recipe
    return_annotation: None
  from_dict:
    name: from_dict
    parameters:
      config:
        annotation: dict
        default: inspect._empty
        name: config
    return_annotation: tensorrt_llm.models.modeling_utils.QuantConfig
  to_dict:
    name: to_dict
    parameters: {}
    return_annotation: dict
properties: {}
@@ -1,50 +1,11 @@
methods:
  __init__:
    name: __init__
    parameters: {}
    return_annotation: None
  abort:
    name: abort
    parameters: {}
    return_annotation: None
  aborted:
    name: aborted
    parameters: {}
    return_annotation: bool
  aresult:
    name: aresult
    parameters: {}
    return_annotation: tensorrt_llm.executor.result.GenerationResult
  result:
    name: result
    parameters:
      timeout:
        annotation: Optional[float]
        default: None
        name: timeout
    return_annotation: tensorrt_llm.executor.result.GenerationResult
properties:
  context_logits:
    annotation: Optional[torch.Tensor]
    default: inspect._empty
    name: context_logits
  finished:
    annotation: bool
    default: inspect._empty
    name: finished
  outputs:
    annotation: List[tensorrt_llm.executor.result.CompletionOutput]
    default: inspect._empty
    name: outputs
  prompt:
    annotation: Optional[str]
    default: inspect._empty
    name: prompt
  prompt_token_ids:
    annotation: List[int]
    default: inspect._empty
    name: prompt_token_ids
  request_id:
    annotation: int
    default: inspect._empty
    name: request_id
properties: {}
@@ -1,206 +1,28 @@
methods:
  __init__:
    name: __init__
    parameters:
      add_special_tokens:
        annotation: bool
        default: true
        name: add_special_tokens
      # Experimental features
      additional_model_outputs:
        annotation: Optional[List[tensorrt_llm.sampling_params.AdditionalModelOutput]]
        default: null
        name: additional_model_outputs
      apply_batched_logits_processor:
        annotation: bool
        default: false
        name: apply_batched_logits_processor
      bad:
        annotation: Union[List[str], str, NoneType]
        default: null
        name: bad
      bad_token_ids:
        annotation: Optional[List[int]]
        default: null
        name: bad_token_ids
      beam_search_diversity_rate:
        annotation: Optional[float]
        default: null
        name: beam_search_diversity_rate
      beam_width:
        annotation: int
        default: 1
        name: beam_width
      best_of:
        annotation: Optional[int]
        default: null
        name: best_of
      detokenize:
        annotation: bool
        default: true
        name: detokenize
      early_stopping:
        annotation: Optional[int]
        default: null
        name: early_stopping
      embedding_bias:
        annotation: Optional[torch.Tensor]
        default: null
        name: embedding_bias
      end_id:
        annotation: Optional[int]
        default: null
        name: end_id
      exclude_input_from_output:
        annotation: bool
        default: true
        name: exclude_input_from_output
      frequency_penalty:
        annotation: Optional[float]
        default: null
        name: frequency_penalty
      guided_decoding:
        annotation: Optional[tensorrt_llm.sampling_params.GuidedDecodingParams]
        default: null
        name: guided_decoding
      ignore_eos:
        annotation: bool
        default: false
        name: ignore_eos
      include_stop_str_in_output:
        annotation: bool
        default: false
        name: include_stop_str_in_output
      length_penalty:
        annotation: Optional[float]
        default: null
        name: length_penalty
      logits_processor:
        annotation: Optional[tensorrt_llm.sampling_params.LogitsProcessor]
        default: null
        name: logits_processor
      lookahead_config:
        annotation: Optional[tensorrt_llm.bindings.executor.LookaheadDecodingConfig]
        default: null
        name: lookahead_config
      max_new_tokens:
        annotation: Optional[int]
        default: null
        name: max_new_tokens
      max_tokens:
        annotation: int
        default: 32
        name: max_tokens
      min_length:
        annotation: Optional[int]
        default: null
        name: min_length
      min_p:
        annotation: Optional[float]
        default: null
        name: min_p
      min_tokens:
        annotation: Optional[int]
        default: null
        name: min_tokens
      n:
        annotation: int
        default: 1
        name: n
      no_repeat_ngram_size:
        annotation: Optional[int]
        default: null
        name: no_repeat_ngram_size
      num_return_sequences:
        annotation: Optional[int]
        default: null
        name: num_return_sequences
      pad_id:
        annotation: Optional[int]
        default: null
        name: pad_id
      presence_penalty:
        annotation: Optional[float]
        default: null
        name: presence_penalty
      random_seed:
        annotation: Optional[int]
        default: null
        name: random_seed
      repetition_penalty:
        annotation: Optional[float]
        default: null
        name: repetition_penalty
      return_context_logits:
        annotation: bool
        default: false
        name: return_context_logits
      return_encoder_output:
        annotation: bool
        default: false
        name: return_encoder_output
      return_generation_logits:
        annotation: bool
        default: false
        name: return_generation_logits
      return_log_probs:
        annotation: bool
        default: false
        name: return_log_probs
      return_perf_metrics:
        annotation: bool
        default: false
        name: return_perf_metrics
      seed:
      # TODO [TRTLLM-3716]: Deprecated arguments
      beam_width:
        annotation: int
        default: 1
      max_new_tokens:
        annotation: Optional[int]
        default: null
        name: seed
      skip_special_tokens:
        annotation: bool
        default: true
        name: skip_special_tokens
      spaces_between_special_tokens:
        annotation: bool
        default: true
        name: spaces_between_special_tokens
      stop:
        annotation: Union[List[str], str, NoneType]
        default: null
        name: stop
      stop_token_ids:
        annotation: Optional[List[int]]
        default: null
        name: stop_token_ids
      temperature:
        annotation: Optional[float]
        default: null
        name: temperature
      top_k:
      min_length:
        annotation: Optional[int]
        default: null
        name: top_k
      top_p:
        annotation: Optional[float]
        default: null
        name: top_p
      top_p_decay:
        annotation: Optional[float]
        default: null
        name: top_p_decay
      top_p_min:
        annotation: Optional[float]
        default: null
        name: top_p_min
      top_p_reset_ids:
      num_return_sequences:
        annotation: Optional[int]
        default: null
        name: top_p_reset_ids
      truncate_prompt_tokens:
      random_seed:
        annotation: Optional[int]
        default: null
        name: truncate_prompt_tokens
      use_beam_search:
        annotation: bool
        default: false
        name: use_beam_search
    return_annotation: None
properties: {}
@@ -0,0 +1,27 @@
methods:
  __init__:
    parameters:
      index:
        annotation: int
        default: inspect._empty
      text:
        annotation: str
        default: ''
      token_ids:
        annotation: Optional[List[int]]
        default: null
      finish_reason:
        annotation: Optional[Literal['stop', 'length', 'timeout', 'cancelled']]
        default: null
      stop_reason:
        annotation: Union[int, str, NoneType]
        default: null
      generation_logits:
        annotation: Optional[torch.Tensor]
        default: null
      # TODO [TRTLLM-1049]
      # logprobs:
      #   annotation: Optional[SampleLogprobs]
      #   default: null
    return_annotation: None
properties: {}
tests/unittest/api_stability/references_committed/llm.yaml (new file, 79 lines)
@@ -0,0 +1,79 @@
methods:
  __init__:
    parameters:
      model:
        annotation: Union[str, pathlib.Path]
        default: inspect._empty
      tokenizer:
        annotation: Union[str, pathlib.Path, transformers.tokenization_utils_base.PreTrainedTokenizerBase,
          tensorrt_llm.llmapi.tokenizer.TokenizerBase, NoneType]
        default: null
      tokenizer_mode:
        annotation: Literal['auto', 'slow']
        default: auto
      skip_tokenizer_init:
        annotation: bool
        default: false
      trust_remote_code:
        annotation: bool
        default: false
      tensor_parallel_size:
        annotation: int
        default: 1
      dtype:
        annotation: str
        default: auto
      revision:
        annotation: Optional[str]
        default: null
      tokenizer_revision:
        annotation: Optional[str]
        default: null
      kwargs:
        annotation: Any
        default: inspect._empty
    return_annotation: None
  generate:
    parameters:
      inputs:
        annotation: Union[str, List[int], tensorrt_llm.inputs.data.TextPrompt, tensorrt_llm.inputs.data.TokensPrompt,
          Sequence[Union[str, List[int], tensorrt_llm.inputs.data.TextPrompt, tensorrt_llm.inputs.data.TokensPrompt]]]
        default: inspect._empty
      sampling_params:
        annotation: Union[tensorrt_llm.sampling_params.SamplingParams, List[tensorrt_llm.sampling_params.SamplingParams],
          NoneType]
        default: null
      lora_request:
        annotation: Union[tensorrt_llm.executor.request.LoRARequest, Sequence[tensorrt_llm.executor.request.LoRARequest],
          NoneType]
        default: null
      prompt_adapter_request:
        annotation: Union[tensorrt_llm.executor.request.PromptAdapterRequest, Sequence[tensorrt_llm.executor.request.PromptAdapterRequest],
          NoneType]
        default: null
      use_tqdm:
        annotation: bool
        default: true
    return_annotation: Union[tensorrt_llm.llmapi.llm.RequestOutput, List[tensorrt_llm.llmapi.llm.RequestOutput]]
  generate_async:
    parameters:
      inputs:
        annotation: Union[str, List[int], tensorrt_llm.inputs.data.TextPrompt, tensorrt_llm.inputs.data.TokensPrompt]
        default: inspect._empty
      sampling_params:
        annotation: Optional[tensorrt_llm.sampling_params.SamplingParams]
        default: null
      lora_request:
        annotation: Optional[tensorrt_llm.executor.request.LoRARequest]
        default: null
      prompt_adapter_request:
        annotation: Optional[tensorrt_llm.executor.request.PromptAdapterRequest]
        default: null
      streaming:
        annotation: bool
        default: false
    return_annotation: tensorrt_llm.llmapi.llm.RequestOutput
properties:
  tokenizer:
    annotation: Optional[tensorrt_llm.llmapi.tokenizer.TokenizerBase]
    default: inspect._empty
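For illustration, a committed reference like the llm.yaml above is consumed roughly as follows (a sketch of the flow in setup_class, assuming the snapshot classes from this diff are importable):

import yaml

with open("references_committed/llm.yaml") as f:
    committed = ClassSnapshot.from_dict(yaml.safe_load(f))

live = ClassSnapshot.from_inspect(LLM)
live.assert_containing(committed)  # the committed 1.0 surface must remain intact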
@@ -0,0 +1,96 @@
methods:
  __init__:
    parameters:
      # Explicit arguments
      model:
        annotation: Union[str, pathlib.Path]
        default: inspect._empty
      tokenizer:
        annotation: Union[str, pathlib.Path, transformers.tokenization_utils_base.PreTrainedTokenizerBase,
          tensorrt_llm.llmapi.tokenizer.TokenizerBase, NoneType]
        default: null
      tokenizer_mode:
        annotation: Literal['auto', 'slow']
        default: auto
      skip_tokenizer_init:
        annotation: bool
        default: false
      trust_remote_code:
        annotation: bool
        default: false
      tensor_parallel_size:
        annotation: int
        default: 1
      dtype:
        annotation: str
        default: auto
      revision:
        annotation: Optional[str]
        default: null
      tokenizer_revision:
        annotation: Optional[str]
        default: null
      # Parallelism
      pipeline_parallel_size:
        annotation: int
        default: 1
      context_parallel_size:
        annotation: int
        default: 1
      moe_tensor_parallel_size:
        annotation: Optional[int]
        default: null
      moe_expert_parallel_size:
        annotation: Optional[int]
        default: null
      # LoRA
      enable_lora:
        annotation: bool
        default: false
      max_lora_rank:
        annotation: Optional[int]
        default: null
      max_loras:
        annotation: int
        default: 4
      max_cpu_loras:
        annotation: int
        default: 4
      # Prompt tuning
      enable_prompt_adapter:
        annotation: bool
        default: false
      max_prompt_adapter_token:
        annotation: int
        default: 0
      # Logits processor and guided decoding
      batched_logits_processor:
        annotation: Optional[tensorrt_llm.sampling_params.BatchedLogitsProcessor]
        default: null
      guided_decoding_backend:
        annotation: Optional[str]
        default: null
      # Quantization and calibration
      quant_config:
        annotation: Optional[tensorrt_llm.models.modeling_utils.QuantConfig]
        default: null
      calib_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_utils.CalibConfig]
        default: null
      # Speculative decoding
      speculative_config:
        annotation: Union[tensorrt_llm.llmapi.llm_args.LookaheadDecodingConfig, tensorrt_llm.llmapi.llm_utils.MedusaDecodingConfig,
          tensorrt_llm.llmapi.llm_utils.EagleDecodingConfig, tensorrt_llm.llmapi.MTPDecodingConfig, NoneType]
        default: null
      # Misc
      load_format:
        annotation: Literal['auto', 'dummy']
        default: auto
      enable_tqdm:
        annotation: bool
        default: false
      enable_chunked_prefill:
        annotation: bool
        default: false
    return_annotation: None
properties: {}
@@ -0,0 +1,33 @@
methods:
  aresult:
    parameters: {}
    return_annotation: tensorrt_llm.executor.result.GenerationResult
  result:
    parameters:
      timeout:
        annotation: Optional[float]
        default: None
    return_annotation: tensorrt_llm.executor.result.GenerationResult
properties:
  request_id:
    annotation: int
    default: inspect._empty
  prompt:
    annotation: Optional[str]
    default: inspect._empty
  prompt_token_ids:
    annotation: List[int]
    default: inspect._empty
  outputs:
    annotation: List[tensorrt_llm.executor.result.CompletionOutput]
    default: inspect._empty
  context_logits:
    annotation: Optional[torch.Tensor]
    default: inspect._empty
  finished:
    annotation: bool
    default: inspect._empty
  # TODO [TRTLLM-876]
  # prompt_logprobs:
  #   annotation: Optional[PromptLogprobs]
  #   default: inspect._empty
@@ -0,0 +1,142 @@
methods:
  __init__:
    parameters:
      # General
      n:
        annotation: int
        default: 1
      best_of:
        annotation: Optional[int]
        default: null
      use_beam_search:
        annotation: bool
        default: false
      beam_search_diversity_rate:
        annotation: Optional[float]
        default: null
      early_stopping:
        annotation: Optional[int]
        default: null
      max_tokens:
        annotation: int
        default: 32
      min_tokens:
        annotation: Optional[int]
        default: null
      end_id:
        annotation: Optional[int]
        default: null
      pad_id:
        annotation: Optional[int]
        default: null
      # Sampling
      seed:
        annotation: Optional[int]
        default: null
      temperature:
        annotation: Optional[float]
        default: null
      top_k:
        annotation: Optional[int]
        default: null
      top_p:
        annotation: Optional[float]
        default: null
      top_p_decay:
        annotation: Optional[float]
        default: null
      top_p_min:
        annotation: Optional[float]
        default: null
      top_p_reset_ids:
        annotation: Optional[int]
        default: null
      min_p:
        annotation: Optional[float]
        default: null
      # Penalties
      repetition_penalty:
        annotation: Optional[float]
        default: null
      presence_penalty:
        annotation: Optional[float]
        default: null
      frequency_penalty:
        annotation: Optional[float]
        default: null
      length_penalty:
        annotation: Optional[float]
        default: null
      no_repeat_ngram_size:
        annotation: Optional[int]
        default: null
      # Stop words and bad words
      stop:
        annotation: Union[List[str], str, NoneType]
        default: null
      stop_token_ids:
        annotation: Optional[List[int]]
        default: null
      include_stop_str_in_output:
        annotation: bool
        default: false
      bad:
        annotation: Union[List[str], str, NoneType]
        default: null
      bad_token_ids:
        annotation: Optional[List[int]]
        default: null
      # Logits processor and guided decoding
      logits_processor:
        annotation: Optional[tensorrt_llm.sampling_params.LogitsProcessor]
        default: null
      apply_batched_logits_processor:
        annotation: bool
        default: false
      guided_decoding:
        annotation: Optional[tensorrt_llm.sampling_params.GuidedDecodingParams]
        default: null
      embedding_bias:
        annotation: Optional[torch.Tensor]
        default: null
      # Speculative decoding
      lookahead_config:
        annotation: Optional[tensorrt_llm.bindings.executor.LookaheadDecodingConfig]
        default: null
      # Tokenizer behavior
      ignore_eos:
        annotation: bool
        default: false
      detokenize:
        annotation: bool
        default: true
      add_special_tokens:
        annotation: bool
        default: true
      truncate_prompt_tokens:
        annotation: Optional[int]
        default: null
      skip_special_tokens:
        annotation: bool
        default: true
      spaces_between_special_tokens:
        annotation: bool
        default: true
      # Returning controls
      return_log_probs:
        annotation: bool
        default: false
      return_context_logits:
        annotation: bool
        default: false
      return_generation_logits:
        annotation: bool
        default: false
      exclude_input_from_output:
        annotation: bool
        default: true
      return_encoder_output:
        annotation: bool
        default: false
    return_annotation: None
properties: {}
@@ -66,13 +66,13 @@ class TestLogitsProcessor(ApiStabilityTestHarness):

    def create_snapshot_from_inspect(self):
        method_snapshot = MethodSnapshot.from_inspect(
            "__call__", MethodType(self.TEST_CLASS.__call__, object()))
            MethodType(self.TEST_CLASS.__call__, object()))
        return ClassSnapshot(methods={"__call__": method_snapshot},
                             properties={})

    def create_snapshot_from_docstring(self):
        method_snapshot = MethodSnapshot.from_docstring(
            "__call__", MethodType(self.TEST_CLASS.__call__, object()))
            MethodType(self.TEST_CLASS.__call__, object()))
        return ClassSnapshot(methods={"__call__": method_snapshot},
                             properties={})

@@ -83,13 +83,13 @@ class TestBatchedLogitsProcessor(ApiStabilityTestHarness):

    def create_snapshot_from_inspect(self):
        method_snapshot = MethodSnapshot.from_inspect(
            "__call__", MethodType(self.TEST_CLASS.__call__, object()))
            MethodType(self.TEST_CLASS.__call__, object()))
        return ClassSnapshot(methods={"__call__": method_snapshot},
                             properties={})

    def create_snapshot_from_docstring(self):
        method_snapshot = MethodSnapshot.from_docstring(
            "__call__", MethodType(self.TEST_CLASS.__call__, object()))
            MethodType(self.TEST_CLASS.__call__, object()))
        return ClassSnapshot(methods={"__call__": method_snapshot},
                             properties={})