Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-13 22:18:36 +08:00)

Merge 4c0cff6f65 into 6df2c8a074
Commit 4a256ca1da
@@ -384,6 +384,12 @@ class LlmArgs(AutoDeployConfig, BaseLlmArgs, BaseSettings):
    _quant_config: Optional[QuantConfig] = PrivateAttr(default=None)

    max_stats_len: int = Field(
        default=1000,
        description="The max number of performance statistic entries.",
        status="prototype",
    )

    @property
    def quant_config(self) -> QuantConfig:
        if self._quant_config is None:
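For illustration only, a minimal standalone sketch of how a pydantic field declared this way behaves, using a stand-in model rather than the real AutoDeploy LlmArgs (the status="prototype" keyword is TRT-LLM-specific field metadata and is omitted here):

    from pydantic import BaseModel, Field

    class StatsConfig(BaseModel):
        # Mirrors the new field: bound on how many iteration perf-stat entries are kept.
        max_stats_len: int = Field(
            default=1000,
            description="The max number of performance statistic entries.",
        )

    cfg = StatsConfig()
    print(cfg.max_stats_len)   # 1000, the default
    cfg = StatsConfig(max_stats_len=50)
    print(cfg.max_stats_len)   # 50, user override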
@@ -490,10 +490,12 @@ class ADEngine(ModelEngine):
            self.max_beam_width = ad_config.max_beam_width
            self.spec_config = ad_config.speculative_config
            self._disable_overlap_scheduler = ad_config.disable_overlap_scheduler
            self.llm_args.max_stats_len = ad_config.max_stats_len
        else:
            self.max_beam_width = 1
            self.spec_config = None
            self._disable_overlap_scheduler = False
            self.llm_args.max_stats_len = 1000

        # check for max total draft tokens
        if self.spec_config is not None:
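A rough sketch of the forwarding pattern above, with made-up stand-in objects rather than the real ADEngine/AutoDeployConfig: when an AutoDeploy config is supplied its max_stats_len is copied onto the engine's llm_args, otherwise the engine falls back to the same default of 1000.

    from types import SimpleNamespace

    def init_stats_len(llm_args, ad_config=None):
        # Copy the configured bound onto llm_args, or fall back to the default,
        # mirroring the if/else branch in ADEngine.__init__ shown above.
        llm_args.max_stats_len = ad_config.max_stats_len if ad_config is not None else 1000
        return llm_args

    print(init_stats_len(SimpleNamespace(), SimpleNamespace(max_stats_len=200)).max_stats_len)  # 200
    print(init_stats_len(SimpleNamespace(), None).max_stats_len)                                # 1000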
@@ -143,7 +143,6 @@ class PyExecutor:
        super(PyExecutor, self).__init__()
        self.device_id = torch.cuda.current_device()
        self.global_rank = dist.rank

        # Store the execution stream for model forward operations.
        # This stream is used for proper synchronization with KVCacheTransferManager.
        # execution_stream can be provided by create_py_executor
@@ -181,6 +180,7 @@
        self.max_draft_len = max_draft_len
        self.max_total_draft_tokens = max_total_draft_tokens
        self.llm_args = self.model_engine.llm_args
        self.max_stats_len = max(self.llm_args.max_stats_len, 1)
        self.max_num_tokens = self.llm_args.max_num_tokens
        self.print_log = self.llm_args.print_iter_log
        self.enable_iter_perf_stats = self.llm_args.enable_iter_perf_stats
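The max(..., 1) clamp means a user-supplied value of 0 or a negative number still leaves room for one stats entry. A tiny illustrative snippet with stand-in names, not the real PyExecutor:

    def effective_stats_len(configured: int) -> int:
        # Clamp so the stats buffer can always hold at least one entry,
        # even if max_stats_len is configured as 0 or negative.
        return max(configured, 1)

    for v in (1000, 1, 0, -5):
        print(v, "->", effective_stats_len(v))   # 1000, 1, 1, 1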
@@ -866,6 +866,8 @@ class PyExecutor:
                           req_stats: Optional[List[RequestStats]] = None):
        with self.stats_lock:
            if len(self.stats) > self.max_stats_len:
                self.stats.pop(0)
            self.stats.append((stats, req_stats))

    def _process_iter_stats(
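A self-contained sketch of the bounded, lock-protected stats buffer used above (stand-in class, not the real PyExecutor). collections.deque(maxlen=...) would give the same trim-oldest behavior without the explicit pop, at the cost of changing the container type.

    import threading

    class StatsBuffer:
        def __init__(self, max_stats_len: int = 1000):
            self.max_stats_len = max(max_stats_len, 1)
            self.stats = []
            self.stats_lock = threading.Lock()

        def append(self, stats, req_stats=None):
            # Same shape as PyExecutor: drop the oldest entry once the bound is
            # exceeded, then record the (iteration stats, request stats) pair.
            with self.stats_lock:
                if len(self.stats) > self.max_stats_len:
                    self.stats.pop(0)
                self.stats.append((stats, req_stats))

    buf = StatsBuffer(max_stats_len=2)
    for i in range(5):
        buf.append({"iter": i})
    print(len(buf.stats))   # bounded: never grows past max_stats_len + 1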
@@ -2963,6 +2963,12 @@ class TorchLlmArgs(BaseLlmArgs):
        status="prototype",
    )

    max_stats_len: int = Field(
        default=1000,
        description="The max number of performance statistic entries.",
        status="prototype",
    )

    @property
    def quant_config(self) -> QuantConfig:
        if self._quant_config is None:
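With this change the knob should be settable through the LLM API like any other TorchLlmArgs field; the api-stability reference below records it as an LLM.__init__ parameter. A hedged usage sketch: the model path is a placeholder, and llm.get_stats() with a timeout argument is assumed to behave as in current releases (returning recent iteration stats, whose retention this field now bounds).

    from tensorrt_llm import LLM

    # Keep at most 100 iteration perf-stat entries instead of the default 1000.
    llm = LLM(
        model="/path/to/model",          # placeholder path
        enable_iter_perf_stats=True,     # stats collection must be on for entries to accumulate
        max_stats_len=100,               # the new knob added in this change
    )

    outputs = llm.generate(["Hello, world"])
    stats = llm.get_stats(timeout=2)     # at most ~100 retained iteration entries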
@@ -227,6 +227,10 @@ methods:
        annotation: Optional[Dict[str, str]]
        default: null
        status: prototype
      max_stats_len:
        annotation: int
        default: 1000
        status: prototype
    return_annotation: None
  generate:
    parameters: