[None][chore] expose tokens_per_block into KvCacheConfig (#5911)

Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com>
Signed-off-by: Yan Chunwei <328693+Superjomn@users.noreply.github.com>
Author: Yan Chunwei
Date: 2025-09-08 09:14:10 +08:00 (committed by GitHub)
parent 7c76dde76d
commit 205c3a144c
2 changed files with 6 additions and 7 deletions
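
With this change, the KV cache block size is configured through the pure-Python `KvCacheConfig` (new `tokens_per_block` field, default 32) instead of being pulled from `BuildConfig.plugin_config` inside `update_executor_config`. A minimal usage sketch of the resulting LLM-API surface; the model path is a placeholder, not part of this diff:

```python
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig

# tokens_per_block now lives on the pure-Python KvCacheConfig (default: 32).
kv_cache_config = KvCacheConfig(tokens_per_block=32)

llm = LLM(model="/path/to/model",  # placeholder path
          kv_cache_config=kv_cache_config)
```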

tensorrt_llm/_torch/pyexecutor/config.py

@@ -5,7 +5,6 @@ from tensorrt_llm._torch.models.checkpoints.base_checkpoint_loader import \
     BaseCheckpointLoader
 from tensorrt_llm.bindings.executor import ExecutorConfig
 
-from ...builder import BuildConfig
 from ...llmapi.llm_args import LoadFormat, SamplerType
 from ...logger import logger
 from ...mapping import Mapping
@@ -119,7 +118,6 @@ EXETENDED_EXECUTOR_CONFIG_FIELDS = [
     'backend',
     'pytorch_backend_config',
     'max_seq_len',
-    'tokens_per_block',
     'mapping',
     'hf_model_dir',
     'mm_encoder_only',
@@ -131,7 +129,6 @@ def update_executor_config(
         backend: Optional[str] = None,
         pytorch_backend_config: Optional[PyTorchConfig] = None,
         mapping: Optional[Mapping] = None,
-        build_config: Optional[BuildConfig] = None,
         speculative_config: Optional["DecodingBaseConfig"] = None,
         hf_model_dir: Optional[str] = None,
         max_input_len: Optional[int] = None,
@@ -156,10 +153,6 @@ def update_executor_config(
         logger.info(f"{executor_config.pytorch_backend_config}")
 
-    build_config = build_config or BuildConfig()
-    # TODO: move to pure-Python KvCacheConfig, and remove dependency on build_config.
-    executor_config.tokens_per_block = executor_config.tokens_per_block or build_config.plugin_config.tokens_per_block
-
     executor_config.hf_model_dir = hf_model_dir
 
     if max_input_len is not None:
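
With the fallback above removed, `update_executor_config` no longer reads `tokens_per_block` from `build_config.plugin_config`; the caller is now expected to set it on the `ExecutorConfig` beforehand, which is exactly what the `llm_args.py` hunk below does. A sketch of the caller-side contract, assuming an already-built `executor_config` and `kv_cache_config` (setup elided):

```python
# The caller now owns tokens_per_block: copy it from KvCacheConfig onto the
# executor config before update_executor_config() runs, mirroring BaseLlmArgs.
executor_config.tokens_per_block = kv_cache_config.tokens_per_block

update_executor_config(
    executor_config,
    backend="pytorch",         # kwargs per the signature shown above
    hf_model_dir=hf_model_dir,
)
```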

tensorrt_llm/llmapi/llm_args.py

@@ -1041,6 +1041,9 @@ class KvCacheConfig(StrictBaseModel, PybindMirror):
         "The data type to use for the Mamba SSM cache. If set to 'auto', the data type will be inferred from the model config."
     )
 
+    tokens_per_block: int = Field(default=32,
+                                  description="The number of tokens per block.")
+
     def _to_pybind(self):
         return _KvCacheConfig(
             enable_block_reuse=self.enable_block_reuse,
@@ -1946,6 +1949,9 @@ class BaseLlmArgs(StrictBaseModel):
                 from tensorrt_llm._torch.speculative import suggest_spec_config
                 spec_config = suggest_spec_config(max_batch_size)
 
+        if self.kv_cache_config is not None:
+            executor_config.tokens_per_block = self.kv_cache_config.tokens_per_block
+
         update_executor_config(
             executor_config,
             backend=self.backend,
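
Note that the new field is not forwarded through `_to_pybind()`; it reaches the runtime only via the `executor_config.tokens_per_block` assignment above. Since `KvCacheConfig` is a Pydantic model (`StrictBaseModel`), the field validates and defaults like any other; a quick sketch:

```python
from tensorrt_llm.llmapi import KvCacheConfig

cfg = KvCacheConfig(tokens_per_block=64)
assert cfg.tokens_per_block == 64

# Default mirrors the Field(default=32) declaration in the diff.
assert KvCacheConfig().tokens_per_block == 32
```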