[None][chore] expose tokens_per_block into KvCacheConfig (#5911)

Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com>
Signed-off-by: Yan Chunwei <328693+Superjomn@users.noreply.github.com>
2026-01-14 06:27:45 +08:00 · 2025-09-08 09:14:10 +08:00 · 2025-09-08 09:14:10 +08:00 · 205c3a144c
commit 205c3a144c
parent 7c76dde76d
2 changed files with 6 additions and 7 deletions
--- a/tensorrt_llm/_torch/pyexecutor/config.py
+++ b/tensorrt_llm/_torch/pyexecutor/config.py
@@ -5,7 +5,6 @@ from tensorrt_llm._torch.models.checkpoints.base_checkpoint_loader import \
    BaseCheckpointLoader
 from tensorrt_llm.bindings.executor import ExecutorConfig

-from ...builder import BuildConfig
 from ...llmapi.llm_args import LoadFormat, SamplerType
 from ...logger import logger
 from ...mapping import Mapping
@@ -119,7 +118,6 @@ EXETENDED_EXECUTOR_CONFIG_FIELDS = [
    'backend',
    'pytorch_backend_config',
    'max_seq_len',
-    'tokens_per_block',
    'mapping',
    'hf_model_dir',
    'mm_encoder_only',
@@ -131,7 +129,6 @@ def update_executor_config(
        backend: Optional[str] = None,
        pytorch_backend_config: Optional[PyTorchConfig] = None,
        mapping: Optional[Mapping] = None,
-        build_config: Optional[BuildConfig] = None,
        speculative_config: Optional["DecodingBaseConfig"] = None,
        hf_model_dir: Optional[str] = None,
        max_input_len: Optional[int] = None,
@@ -156,10 +153,6 @@ def update_executor_config(

    logger.info(f"{executor_config.pytorch_backend_config}")

-    build_config = build_config or BuildConfig()
-    # TODO: move to pure-Python KvCacheConfig, and remove dependency on build_config.
-    executor_config.tokens_per_block = executor_config.tokens_per_block or build_config.plugin_config.tokens_per_block
-
    executor_config.hf_model_dir = hf_model_dir

    if max_input_len is not None:
--- a/tensorrt_llm/llmapi/llm_args.py
+++ b/tensorrt_llm/llmapi/llm_args.py
@@ -1041,6 +1041,9 @@ class KvCacheConfig(StrictBaseModel, PybindMirror):
            "The data type to use for the Mamba SSM cache. If set to 'auto', the data type will be inferred from the model config."
        )

+    tokens_per_block: int = Field(default=32,
+                                  description="The number of tokens per block.")
+
    def _to_pybind(self):
        return _KvCacheConfig(
            enable_block_reuse=self.enable_block_reuse,
@@ -1946,6 +1949,9 @@ class BaseLlmArgs(StrictBaseModel):
            from tensorrt_llm._torch.speculative import suggest_spec_config
            spec_config = suggest_spec_config(max_batch_size)

+        if self.kv_cache_config is not None:
+            executor_config.tokens_per_block = self.kv_cache_config.tokens_per_block
+
        update_executor_config(
            executor_config,
            backend=self.backend,