Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-13 22:18:36 +08:00)

Merge 4c0cff6f65 into 6df2c8a074
Commit 4a256ca1da
@@ -384,6 +384,12 @@ class LlmArgs(AutoDeployConfig, BaseLlmArgs, BaseSettings):
    _quant_config: Optional[QuantConfig] = PrivateAttr(default=None)

    max_stats_len: int = Field(
        default=1000,
        description="The max number of performance statistic entries.",
        status="prototype",
    )

    @property
    def quant_config(self) -> QuantConfig:
        if self._quant_config is None:
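For illustration only, a minimal standalone sketch of how a pydantic field declared this way behaves, using a stand-in model rather than the real AutoDeploy LlmArgs (the status="prototype" keyword is TRT-LLM-specific field metadata and is omitted here):

    from pydantic import BaseModel, Field

    class StatsConfig(BaseModel):
        # Mirrors the new field: bound on how many iteration perf-stat entries are kept.
        max_stats_len: int = Field(
            default=1000,
            description="The max number of performance statistic entries.",
        )

    cfg = StatsConfig()
    print(cfg.max_stats_len)   # 1000, the default
    cfg = StatsConfig(max_stats_len=50)
    print(cfg.max_stats_len)   # 50, user override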
@@ -490,10 +490,12 @@ class ADEngine(ModelEngine):
            self.max_beam_width = ad_config.max_beam_width
            self.spec_config = ad_config.speculative_config
            self._disable_overlap_scheduler = ad_config.disable_overlap_scheduler
            self.llm_args.max_stats_len = ad_config.max_stats_len
        else:
            self.max_beam_width = 1
            self.spec_config = None
            self._disable_overlap_scheduler = False
            self.llm_args.max_stats_len = 1000

        # check for max total draft tokens
        if self.spec_config is not None:
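A rough sketch of the forwarding pattern above, with made-up stand-in objects rather than the real ADEngine/AutoDeployConfig: when an AutoDeploy config is supplied its max_stats_len is copied onto the engine's llm_args, otherwise the engine falls back to the same default of 1000.

    from types import SimpleNamespace

    def init_stats_len(llm_args, ad_config=None):
        # Copy the configured bound onto llm_args, or fall back to the default,
        # mirroring the if/else branch in ADEngine.__init__ shown above.
        llm_args.max_stats_len = ad_config.max_stats_len if ad_config is not None else 1000
        return llm_args

    print(init_stats_len(SimpleNamespace(), SimpleNamespace(max_stats_len=200)).max_stats_len)  # 200
    print(init_stats_len(SimpleNamespace(), None).max_stats_len)                                # 1000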
@@ -143,7 +143,6 @@ class PyExecutor:
        super(PyExecutor, self).__init__()
        self.device_id = torch.cuda.current_device()
        self.global_rank = dist.rank

        # Store the execution stream for model forward operations.
        # This stream is used for proper synchronization with KVCacheTransferManager.
        # execution_stream can be provided by create_py_executor
@@ -181,6 +180,7 @@
        self.max_draft_len = max_draft_len
        self.max_total_draft_tokens = max_total_draft_tokens
        self.llm_args = self.model_engine.llm_args
        self.max_stats_len = max(self.llm_args.max_stats_len, 1)
        self.max_num_tokens = self.llm_args.max_num_tokens
        self.print_log = self.llm_args.print_iter_log
        self.enable_iter_perf_stats = self.llm_args.enable_iter_perf_stats
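The max(..., 1) clamp means a user-supplied value of 0 or a negative number still leaves room for one stats entry. A tiny illustrative snippet with stand-in names, not the real PyExecutor:

    def effective_stats_len(configured: int) -> int:
        # Clamp so the stats buffer can always hold at least one entry,
        # even if max_stats_len is configured as 0 or negative.
        return max(configured, 1)

    for v in (1000, 1, 0, -5):
        print(v, "->", effective_stats_len(v))   # 1000, 1, 1, 1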
@@ -866,6 +866,8 @@ class PyExecutor:
                           req_stats: Optional[List[RequestStats]] = None):
        with self.stats_lock:
            if len(self.stats) > self.max_stats_len:
                self.stats.pop(0)
            self.stats.append((stats, req_stats))

    def _process_iter_stats(
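A self-contained sketch of the bounded, lock-protected stats buffer used above (stand-in class, not the real PyExecutor). collections.deque(maxlen=...) would give the same trim-oldest behavior without the explicit pop, at the cost of changing the container type.

    import threading

    class StatsBuffer:
        def __init__(self, max_stats_len: int = 1000):
            self.max_stats_len = max(max_stats_len, 1)
            self.stats = []
            self.stats_lock = threading.Lock()

        def append(self, stats, req_stats=None):
            # Same shape as PyExecutor: drop the oldest entry once the bound is
            # exceeded, then record the (iteration stats, request stats) pair.
            with self.stats_lock:
                if len(self.stats) > self.max_stats_len:
                    self.stats.pop(0)
                self.stats.append((stats, req_stats))

    buf = StatsBuffer(max_stats_len=2)
    for i in range(5):
        buf.append({"iter": i})
    print(len(buf.stats))   # bounded: never grows past max_stats_len + 1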
@@ -2963,6 +2963,12 @@ class TorchLlmArgs(BaseLlmArgs):
        status="prototype",
    )

    max_stats_len: int = Field(
        default=1000,
        description="The max number of performance statistic entries.",
        status="prototype",
    )

    @property
    def quant_config(self) -> QuantConfig:
        if self._quant_config is None:
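With this change the knob should be settable through the LLM API like any other TorchLlmArgs field; the api-stability reference below records it as an LLM.__init__ parameter. A hedged usage sketch: the model path is a placeholder, and llm.get_stats() with a timeout argument is assumed to behave as in current releases (returning recent iteration stats, whose retention this field now bounds).

    from tensorrt_llm import LLM

    # Keep at most 100 iteration perf-stat entries instead of the default 1000.
    llm = LLM(
        model="/path/to/model",          # placeholder path
        enable_iter_perf_stats=True,     # stats collection must be on for entries to accumulate
        max_stats_len=100,               # the new knob added in this change
    )

    outputs = llm.generate(["Hello, world"])
    stats = llm.get_stats(timeout=2)     # at most ~100 retained iteration entries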
@@ -227,6 +227,10 @@ methods:
        annotation: Optional[Dict[str, str]]
        default: null
        status: prototype
      max_stats_len:
        annotation: int
        default: 1000
        status: prototype
    return_annotation: None
  generate:
    parameters: