mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
[None][chore] expose tokens_per_block into KvCacheConfig (#5911)
Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com>
Signed-off-by: Yan Chunwei <328693+Superjomn@users.noreply.github.com>
This commit is contained in:
parent
7c76dde76d
commit
205c3a144c
@ -5,7 +5,6 @@ from tensorrt_llm._torch.models.checkpoints.base_checkpoint_loader import \
|
||||
BaseCheckpointLoader
|
||||
from tensorrt_llm.bindings.executor import ExecutorConfig
|
||||
|
||||
from ...builder import BuildConfig
|
||||
from ...llmapi.llm_args import LoadFormat, SamplerType
|
||||
from ...logger import logger
|
||||
from ...mapping import Mapping
|
||||
@ -119,7 +118,6 @@ EXETENDED_EXECUTOR_CONFIG_FIELDS = [
|
||||
'backend',
|
||||
'pytorch_backend_config',
|
||||
'max_seq_len',
|
||||
'tokens_per_block',
|
||||
'mapping',
|
||||
'hf_model_dir',
|
||||
'mm_encoder_only',
|
||||
@ -131,7 +129,6 @@ def update_executor_config(
|
||||
backend: Optional[str] = None,
|
||||
pytorch_backend_config: Optional[PyTorchConfig] = None,
|
||||
mapping: Optional[Mapping] = None,
|
||||
build_config: Optional[BuildConfig] = None,
|
||||
speculative_config: Optional["DecodingBaseConfig"] = None,
|
||||
hf_model_dir: Optional[str] = None,
|
||||
max_input_len: Optional[int] = None,
|
||||
@ -156,10 +153,6 @@ def update_executor_config(
|
||||
|
||||
logger.info(f"{executor_config.pytorch_backend_config}")
|
||||
|
||||
build_config = build_config or BuildConfig()
|
||||
# TODO: move to pure-Python KvCacheConfig, and remove dependency on build_config.
|
||||
executor_config.tokens_per_block = executor_config.tokens_per_block or build_config.plugin_config.tokens_per_block
|
||||
|
||||
executor_config.hf_model_dir = hf_model_dir
|
||||
|
||||
if max_input_len is not None:
|
||||
|
||||
@ -1041,6 +1041,9 @@ class KvCacheConfig(StrictBaseModel, PybindMirror):
|
||||
"The data type to use for the Mamba SSM cache. If set to 'auto', the data type will be inferred from the model config."
|
||||
)
|
||||
|
||||
tokens_per_block: int = Field(default=32,
|
||||
description="The number of tokens per block.")
|
||||
|
||||
def _to_pybind(self):
|
||||
return _KvCacheConfig(
|
||||
enable_block_reuse=self.enable_block_reuse,
|
||||
@ -1946,6 +1949,9 @@ class BaseLlmArgs(StrictBaseModel):
|
||||
from tensorrt_llm._torch.speculative import suggest_spec_config
|
||||
spec_config = suggest_spec_config(max_batch_size)
|
||||
|
||||
if self.kv_cache_config is not None:
|
||||
executor_config.tokens_per_block = self.kv_cache_config.tokens_per_block
|
||||
|
||||
update_executor_config(
|
||||
executor_config,
|
||||
backend=self.backend,
|
||||
|
||||
Loading…
Reference in New Issue
Block a user