diff --git a/tensorrt_llm/_torch/auto_deploy/llm_args.py b/tensorrt_llm/_torch/auto_deploy/llm_args.py
index aa9f0147cd..7c46a48df3 100644
--- a/tensorrt_llm/_torch/auto_deploy/llm_args.py
+++ b/tensorrt_llm/_torch/auto_deploy/llm_args.py
@@ -384,6 +384,12 @@ class LlmArgs(AutoDeployConfig, BaseLlmArgs, BaseSettings):
 
     _quant_config: Optional[QuantConfig] = PrivateAttr(default=None)
 
+    max_stats_len: int = Field(
+        default=1000,
+        description="The maximum number of performance statistics entries to retain.",
+        status="prototype",
+    )
+
     @property
     def quant_config(self) -> QuantConfig:
         if self._quant_config is None:
diff --git a/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py b/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py
index 48066cb256..a81e0f3f5c 100644
--- a/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py
+++ b/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py
@@ -490,10 +490,12 @@ class ADEngine(ModelEngine):
             self.max_beam_width = ad_config.max_beam_width
             self.spec_config = ad_config.speculative_config
             self._disable_overlap_scheduler = ad_config.disable_overlap_scheduler
+            self.llm_args.max_stats_len = ad_config.max_stats_len
         else:
             self.max_beam_width = 1
             self.spec_config = None
             self._disable_overlap_scheduler = False
+            self.llm_args.max_stats_len = 1000
 
         # check for max total draft tokens
         if self.spec_config is not None:
diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py
index 4129973363..94fa5020ca 100644
--- a/tensorrt_llm/_torch/pyexecutor/py_executor.py
+++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -143,7 +143,6 @@ class PyExecutor:
         super(PyExecutor, self).__init__()
         self.device_id = torch.cuda.current_device()
         self.global_rank = dist.rank
-
         # Store the execution stream for model forward operations.
         # This stream is used for proper synchronization with KVCacheTransferManager.
         # execution_stream can be provided by create_py_executor
@@ -181,6 +180,7 @@ class PyExecutor:
         self.max_draft_len = max_draft_len
         self.max_total_draft_tokens = max_total_draft_tokens
         self.llm_args = self.model_engine.llm_args
+        self.max_stats_len = max(self.llm_args.max_stats_len, 1)
         self.max_num_tokens = self.llm_args.max_num_tokens
         self.print_log = self.llm_args.print_iter_log
         self.enable_iter_perf_stats = self.llm_args.enable_iter_perf_stats
@@ -866,6 +866,8 @@
                           req_stats: Optional[List[RequestStats]] = None):
 
         with self.stats_lock:
+            if len(self.stats) >= self.max_stats_len:
+                self.stats.pop(0)
             self.stats.append((stats, req_stats))
 
     def _process_iter_stats(
diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py
index 3f15252b84..9a13eab29a 100644
--- a/tensorrt_llm/llmapi/llm_args.py
+++ b/tensorrt_llm/llmapi/llm_args.py
@@ -2963,6 +2963,12 @@ class TorchLlmArgs(BaseLlmArgs):
         status="prototype",
     )
 
+    max_stats_len: int = Field(
+        default=1000,
+        description="The maximum number of performance statistics entries to retain.",
+        status="prototype",
+    )
+
     @property
     def quant_config(self) -> QuantConfig:
         if self._quant_config is None:
diff --git a/tests/unittest/api_stability/references/llm.yaml b/tests/unittest/api_stability/references/llm.yaml
index 4b6f8cedab..2e3457f3d4 100644
--- a/tests/unittest/api_stability/references/llm.yaml
+++ b/tests/unittest/api_stability/references/llm.yaml
@@ -227,6 +227,10 @@ methods:
      annotation: Optional[Dict[str, str]]
      default: null
      status: prototype
+    max_stats_len:
+      annotation: int
+      default: 1000
+      status: prototype
    return_annotation: None
  generate:
    parameters:
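
For reference, the capped-append behavior introduced in `PyExecutor._append_iter_stats` can be exercised in isolation. The sketch below is illustrative only: `CappedStatsBuffer` and its attributes are hypothetical stand-ins for the executor's `self.stats` list, `self.stats_lock`, and the new `max_stats_len` bound, not code from the repository.

```python
import threading
from typing import Any, List, Optional, Tuple


class CappedStatsBuffer:
    """Hypothetical stand-in for the bounded stats list kept by PyExecutor."""

    def __init__(self, max_stats_len: int = 1000):
        # Mirror the executor's guard: the cap is always at least 1.
        self.max_stats_len = max(max_stats_len, 1)
        self.stats: List[Tuple[Any, Optional[Any]]] = []
        self.stats_lock = threading.Lock()

    def append(self, stats: Any, req_stats: Optional[Any] = None) -> None:
        with self.stats_lock:
            # Drop the oldest entry once the cap is reached so the list
            # never holds more than max_stats_len entries.
            if len(self.stats) >= self.max_stats_len:
                self.stats.pop(0)
            self.stats.append((stats, req_stats))


if __name__ == "__main__":
    buf = CappedStatsBuffer(max_stats_len=3)
    for i in range(10):
        buf.append({"iter": i})
    # Only the three most recent iterations remain.
    print(buf.stats)
```

A `collections.deque(maxlen=max_stats_len)` would enforce the same bound without an explicit `pop(0)`; the sketch keeps the list-based shape used in the diff so the mapping to `_append_iter_stats` stays obvious.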