Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-02-16 07:53:55 +08:00)
[TRTLLM-10752][chore] set default val of max_num_tokens_in_buffer as max_seq_len or max_input_len (#11082)
Signed-off-by: Chuang Zhu <111838961+chuangz0@users.noreply.github.com>
parent a7494a5ff4
commit a9d4927235
@@ -449,6 +449,10 @@ def create_py_executor(
     max_num_tokens = model_engine.max_num_tokens
     sparse_attention_config = model_engine.sparse_attention_config
 
+    # Set default value for cache_transceiver_config.max_tokens_in_buffer
+    if cache_transceiver_config and cache_transceiver_config.max_tokens_in_buffer is None:
+        cache_transceiver_config.max_tokens_in_buffer = net_max_seq_len
+
     config = model_engine.model.model_config.pretrained_config
     if is_mla(config):
         if model_engine.model.model_config.enable_flash_mla:
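For context, below is a minimal standalone sketch of the defaulting behavior this hunk introduces. The CacheTransceiverConfig dataclass and apply_default helper are illustrative stand-ins rather than the actual TensorRT-LLM definitions; the point is only that max_tokens_in_buffer gets filled in from the engine's sequence-length limit when the user has not set it explicitly, and is left untouched otherwise.

# Illustrative sketch only; names do not match the real TensorRT-LLM classes.
from dataclasses import dataclass
from typing import Optional


@dataclass
class CacheTransceiverConfig:
    # Stand-in for the real config: None means "let the executor pick a default".
    max_tokens_in_buffer: Optional[int] = None


def apply_default(cfg: Optional[CacheTransceiverConfig],
                  net_max_seq_len: int) -> Optional[CacheTransceiverConfig]:
    # Mirrors the added logic: only set the buffer size when the config
    # exists and no explicit value was provided by the user.
    if cfg and cfg.max_tokens_in_buffer is None:
        cfg.max_tokens_in_buffer = net_max_seq_len
    return cfg


if __name__ == "__main__":
    # No explicit value -> defaults to the network's max sequence length.
    print(apply_default(CacheTransceiverConfig(), net_max_seq_len=8192))
    # Explicit value -> preserved unchanged.
    print(apply_default(CacheTransceiverConfig(max_tokens_in_buffer=4096),
                        net_max_seq_len=8192))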