HuiGao-NV 2026-01-13 13:54:01 +00:00 committed by GitHub
commit 4a256ca1da
5 changed files with 21 additions and 1 deletion

View File

@@ -384,6 +384,12 @@ class LlmArgs(AutoDeployConfig, BaseLlmArgs, BaseSettings):
    _quant_config: Optional[QuantConfig] = PrivateAttr(default=None)

    max_stats_len: int = Field(
        default=1000,
        description="The max number of performance statistic entries.",
        status="prototype",
    )

    @property
    def quant_config(self) -> QuantConfig:
        if self._quant_config is None:
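For reference, a minimal self-contained sketch of how the field added above behaves. StatsConfig is a hypothetical stand-in for the real LlmArgs, and the repository-specific status="prototype" metadata is omitted since it is not a standard Pydantic Field argument:

from pydantic import BaseModel, Field

class StatsConfig(BaseModel):
    # Hypothetical stand-in for LlmArgs; only the new field is modeled.
    max_stats_len: int = Field(
        default=1000,
        description="The max number of performance statistic entries.",
    )

print(StatsConfig().max_stats_len)                   # 1000 (default)
print(StatsConfig(max_stats_len=250).max_stats_len)  # 250 (user override)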

View File

@@ -490,10 +490,12 @@ class ADEngine(ModelEngine):
            self.max_beam_width = ad_config.max_beam_width
            self.spec_config = ad_config.speculative_config
            self._disable_overlap_scheduler = ad_config.disable_overlap_scheduler
            self.llm_args.max_stats_len = ad_config.max_stats_len
        else:
            self.max_beam_width = 1
            self.spec_config = None
            self._disable_overlap_scheduler = False
            self.llm_args.max_stats_len = 1000

        # check for max total draft tokens
        if self.spec_config is not None:
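The hunk above threads the new limit from the AutoDeploy config into llm_args, falling back to the same 1000 default when no config is given. A standalone illustration of that pattern, with hypothetical AdConfigStub/LlmArgsStub stand-ins for ad_config and self.llm_args:

from dataclasses import dataclass
from typing import Optional

@dataclass
class AdConfigStub:
    # Hypothetical stand-in for ad_config.
    max_stats_len: int = 1000

@dataclass
class LlmArgsStub:
    # Hypothetical stand-in for self.llm_args.
    max_stats_len: int = 1000

def configure(llm_args: LlmArgsStub, ad_config: Optional[AdConfigStub]) -> None:
    # Copy the limit from the deployment config when one is given,
    # otherwise fall back to the Field default.
    if ad_config is not None:
        llm_args.max_stats_len = ad_config.max_stats_len
    else:
        llm_args.max_stats_len = 1000

args = LlmArgsStub()
configure(args, AdConfigStub(max_stats_len=200))
print(args.max_stats_len)  # 200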

View File

@@ -143,7 +143,6 @@ class PyExecutor:
        super(PyExecutor, self).__init__()
        self.device_id = torch.cuda.current_device()
        self.global_rank = dist.rank
        # Store the execution stream for model forward operations.
        # This stream is used for proper synchronization with KVCacheTransferManager.
        # execution_stream can be provided by create_py_executor
@@ -181,6 +180,7 @@ class PyExecutor:
        self.max_draft_len = max_draft_len
        self.max_total_draft_tokens = max_total_draft_tokens
        self.llm_args = self.model_engine.llm_args
        self.max_stats_len = max(self.llm_args.max_stats_len, 1)
        self.max_num_tokens = self.llm_args.max_num_tokens
        self.print_log = self.llm_args.print_iter_log
        self.enable_iter_perf_stats = self.llm_args.enable_iter_perf_stats
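The max(..., 1) clamp guarantees the buffer keeps at least one entry even if a caller configures zero or a negative limit. In isolation:

def clamp_stats_len(configured: int) -> int:
    # Keep the stats history bound strictly positive.
    return max(configured, 1)

print(clamp_stats_len(1000))  # 1000
print(clamp_stats_len(0))     # 1
print(clamp_stats_len(-5))    # 1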
@@ -866,6 +866,8 @@ class PyExecutor:
                           req_stats: Optional[List[RequestStats]] = None):
        with self.stats_lock:
            if len(self.stats) > self.max_stats_len:
                self.stats.pop(0)
            self.stats.append((stats, req_stats))

    def _process_iter_stats(
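The append path above evicts the oldest entry with list.pop(0), which is O(n) per eviction (and, since the check precedes the append, the list can briefly hold max_stats_len + 1 entries). As an alternative sketch only, not what this commit does, collections.deque with maxlen gives the same keep-the-last-N behavior with O(1) eviction:

import threading
from collections import deque

class StatsBuffer:
    # Alternative sketch: deque(maxlen=...) evicts the oldest entry
    # automatically once the buffer is full.
    def __init__(self, max_stats_len: int = 1000):
        self._lock = threading.Lock()
        self._stats = deque(maxlen=max(max_stats_len, 1))

    def append(self, stats, req_stats=None):
        with self._lock:
            self._stats.append((stats, req_stats))

buf = StatsBuffer(max_stats_len=2)
for i in range(5):
    buf.append({"iter": i})
print(list(buf._stats))  # only the last two (stats, req_stats) tuples remain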

View File

@@ -2963,6 +2963,12 @@ class TorchLlmArgs(BaseLlmArgs):
        status="prototype",
    )

    max_stats_len: int = Field(
        default=1000,
        description="The max number of performance statistic entries.",
        status="prototype",
    )

    @property
    def quant_config(self) -> QuantConfig:
        if self._quant_config is None:

View File

@@ -227,6 +227,10 @@ methods:
        annotation: Optional[Dict[str, str]]
        default: null
        status: prototype
      max_stats_len:
        annotation: int
        default: 1000
        status: prototype
    return_annotation: None
  generate:
    parameters:
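End to end, the new knob would be exercised from the LLM API roughly as below. This is a hedged usage sketch: the model path is a placeholder, and generate/get_stats belong to the existing stats machinery rather than to this diff.

from tensorrt_llm import LLM

# Placeholder model; any supported checkpoint works the same way.
llm = LLM(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    enable_iter_perf_stats=True,  # existing flag; turns on per-iteration stats
    max_stats_len=200,            # new prototype option: keep at most ~200 entries
)

outputs = llm.generate(["Hello, world"])
stats = llm.get_stats(timeout=2)  # assumed existing API; history is now bounded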