mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-13 22:18:36 +08:00
Support max_stats_len in autodeploy
Signed-off-by: Hui Gao <huig@nvidia.com>
This commit is contained in:
parent
60a621e4e1
commit
94bb18a991
@@ -384,6 +384,12 @@ class LlmArgs(AutoDeployConfig, BaseLlmArgs, BaseSettings):
|
||||
|
||||
_quant_config: Optional[QuantConfig] = PrivateAttr(default=None)
|
||||
|
||||
max_stats_len: int = Field(
|
||||
default=1000,
|
||||
description="The max number of performance statistic entries.",
|
||||
status="prototype",
|
||||
)
|
||||
|
||||
@property
|
||||
def quant_config(self) -> QuantConfig:
|
||||
if self._quant_config is None:
|
||||
|
||||
@@ -490,10 +490,12 @@ class ADEngine(ModelEngine):
|
||||
self.max_beam_width = ad_config.max_beam_width
|
||||
self.spec_config = ad_config.speculative_config
|
||||
self._disable_overlap_scheduler = ad_config.disable_overlap_scheduler
|
||||
self.llm_args.max_stats_len = ad_config.max_stats_len
|
||||
else:
|
||||
self.max_beam_width = 1
|
||||
self.spec_config = None
|
||||
self._disable_overlap_scheduler = False
|
||||
self.llm_args.max_stats_len = 1000
|
||||
|
||||
# check for max total draft tokens
|
||||
if self.spec_config is not None:
|
||||
|
||||
Loading…
Reference in New Issue
Block a user